rag-service/RAG/src/modules/ingest/service.ts
2026-04-05 17:49:35 +02:00

91 lines
3.6 KiB
TypeScript

import path from "node:path";
import { parseDocument, isSupportedDocument } from "../parsers/parser-registry.js";
import { chunkDocument, codeChunkingPolicy, documentalChunkingPolicy } from "../process/chunking.js";
import type { EmbeddingProvider } from "../embeddings/provider.js";
import type { VectorStoreClient } from "../vectorstore/client.js";
import type { IngestResult, IngestSourceInput, IngestedChunk } from "../../shared/types/rag.js";
import { buildChunkId, buildDocumentId, buildQdrantPointId, buildSourceId, normalizeDocumentKey } from "../../shared/utils/ids.js";
import { listFilesRecursively } from "../../shared/utils/files.js";
import { env } from "../../config/env.js";
/**
 * Ingests documents from a single file or a folder tree: each supported file
 * is parsed, split into chunks, embedded, and upserted into the vector store.
 */
export class IngestService {
  /**
   * @param embeddingProvider Produces one embedding vector per chunk text.
   * @param vectorStore Receives the resulting points through `upsert`.
   */
  constructor(
    private readonly embeddingProvider: EmbeddingProvider,
    private readonly vectorStore: VectorStoreClient
  ) {}

  /**
   * Runs the ingest pipeline for `source`.
   *
   * Files are handled one at a time. A file is skipped (and not counted in
   * `documentsProcessed`) when its parsed content is blank or when chunking
   * yields no chunks, so neither the embedding provider nor the vector store
   * is ever called with an empty batch.
   *
   * @param source Source descriptor; `sourceId` is derived from the source
   *   type and reference when the caller does not supply one.
   * @returns Summary of the run. `filesDiscovered` is the number of files that
   *   passed the supported-document filter.
   * @throws Error when the embedding provider does not return exactly one
   *   vector per chunk; nothing is upserted for that document.
   */
  async ingest(source: IngestSourceInput): Promise<IngestResult> {
    const sourceId = source.sourceId ?? buildSourceId(source.sourceType, source.sourceRef);
    const discoveredFiles = await this.resolveInputFiles(source);
    const supportedFiles = discoveredFiles.filter(isSupportedDocument);
    const isFolderSource = source.sourceType === "folder";
    // Loop-invariant: folder documents are keyed by their path relative to this root.
    const folderRoot = isFolderSource ? path.resolve(source.sourceRef) : "";

    let documentsProcessed = 0;
    let chunksStored = 0;

    for (const filePath of supportedFiles) {
      const parsed = await parseDocument(filePath);
      if (!parsed.content.trim()) {
        continue;
      }

      const documentKey = normalizeDocumentKey(
        isFolderSource ? path.relative(folderRoot, path.resolve(filePath)) : path.basename(filePath)
      );
      const documentId = buildDocumentId(sourceId, documentKey);
      const chunkingPolicy = parsed.chunkMode === "codigo" ? codeChunkingPolicy : documentalChunkingPolicy;
      const chunks = chunkDocument(parsed.title, parsed.content, chunkingPolicy);
      if (chunks.length === 0) {
        // Nothing to embed or store; treat it like a blank document.
        continue;
      }

      const embeddings = await this.embeddingProvider.embed(chunks.map((chunk) => chunk.content));
      if (embeddings.length !== chunks.length) {
        throw new Error(
          `Embedding provider returned ${embeddings.length} vector(s) for ${chunks.length} chunk(s) of "${documentKey}"`
        );
      }

      // One timestamp per document so all of its chunks agree on `updated_at`.
      const updatedAt = new Date().toISOString();
      const qdrantChunks: IngestedChunk[] = chunks.map((chunk, index) => {
        const vector = embeddings[index];
        if (vector == null) {
          throw new Error(
            `Embedding provider returned no embedding for chunk ${chunk.index} of "${documentKey}"`
          );
        }
        const chunkId = buildChunkId(documentId, chunkingPolicy.mode, chunk.index);
        return {
          id: buildQdrantPointId(chunkId),
          vector,
          payload: {
            chunk_id: chunkId,
            source_id: sourceId,
            source_type: source.sourceType,
            source_ref: source.sourceRef,
            document_id: documentId,
            document_key: documentKey,
            title: parsed.title,
            mime_type: parsed.mimeType,
            section_title: chunk.sectionTitle,
            chunk_mode: chunkingPolicy.mode,
            chunk_index: chunk.index,
            start_line: chunk.startLine,
            end_line: chunk.endLine,
            content: chunk.content,
            embedding_provider: this.embeddingProvider.providerName,
            embedding_model: this.embeddingProvider.modelName,
            embedding_dimensions: vector.length,
            status: "active",
            updated_at: updatedAt,
            tags: source.tags ?? []
          }
        };
      });

      await this.vectorStore.upsert(qdrantChunks);
      documentsProcessed += 1;
      chunksStored += qdrantChunks.length;
    }

    return {
      accepted: true,
      source,
      filesDiscovered: supportedFiles.length,
      documentsProcessed,
      chunksStored,
      collectionName: env.qdrantCollection
    };
  }

  /**
   * Lists the candidate files for `source`: the referenced path itself for a
   * "file" source, otherwise every file found recursively under `sourceRef`.
   */
  private async resolveInputFiles(source: IngestSourceInput): Promise<string[]> {
    if (source.sourceType === "file") {
      return [source.sourceRef];
    }
    return listFilesRecursively(source.sourceRef);
  }
}