254 lines
9.5 KiB
TypeScript
254 lines
9.5 KiB
TypeScript
import type { EmbeddingProvider } from "../embeddings/provider.js";
|
|
import type { VectorStoreClient } from "../vectorstore/client.js";
|
|
import type { ChunkMode, RetrieveIntent, RetrieveResponse, RetrieveScope } from "../../shared/types/rag.js";
|
|
|
|
/**
 * Returns the distinct, truthy entries of `values` in first-occurrence order.
 * Empty strings are discarded.
 */
function unique(values: string[]): string[] {
  const seen = new Set<string>();
  for (const value of values) {
    if (value) {
      seen.add(value);
    }
  }
  return Array.from(seen);
}
|
|
|
|
/**
 * Trims the query and collapses every run of whitespace into a single space.
 */
function normalizeQuery(value: string): string {
  const words = value.trim().split(/\s+/);
  return words.join(" ");
}
|
|
|
|
/**
 * Computes a heuristic score adjustment from keywords found in `content`.
 *
 * Each rule whose pattern matches (case-insensitively, anywhere in the text)
 * contributes its delta; the last rule is a penalty for metadata-style header
 * lines ("**Proyecto:**", "**Modulo:**", "**Ultima actualizacion:**").
 *
 * @param content Text to inspect.
 * @returns Sum of the deltas of all matching rules (0 when nothing matches).
 */
function scoreBoostFromContent(content: string): number {
  const rules: ReadonlyArray<readonly [RegExp, number]> = [
    [/regla|importante|critico|critica/i, 0.08],
    [/pendiente|backlog|task|objetivo/i, 0.06],
    [/workspace|proyecto|documentacion/i, 0.04],
    [/objetivo|objetivos principales|principios del sistema|resultado esperado|vision del sistema/i, 0.16],
    [/\*\*proyecto:\*\*|\*\*modulo:\*\*|\*\*ultima actualizacion:\*\*/i, -0.18],
  ];

  // Rules are applied in declaration order so the floating-point sum is reproducible.
  return rules.reduce(
    (total, [pattern, delta]) => (pattern.test(content) ? total + delta : total),
    0
  );
}
|
|
|
|
/**
 * Computes a heuristic score adjustment from keywords found in `title`.
 *
 * Patterns are unanchored and case-insensitive, so they match anywhere in the
 * title (e.g. the `ids` rule matches any title containing those three letters).
 *
 * @param title Title to inspect.
 * @returns Sum of the boosts of all matching rules (0 when nothing matches).
 */
function scoreBoostFromTitle(title: string): number {
  const rules: ReadonlyArray<readonly [RegExp, number]> = [
    [/pendientes_generales/i, 0.1],
    [/ids/i, 0.12],
    [/service|client|chunking|provider|server|app/i, 0.05],
    [/readme|indice_documentacion|historial_sesiones|task/i, 0.03],
  ];

  let total = 0;
  for (const [pattern, bonus] of rules) {
    if (pattern.test(title)) {
      total += bonus;
    }
  }
  return total;
}
|
|
|
|
/**
 * Computes a boost for items whose title/content talk about the same kind of
 * thing the query asks for.
 *
 * Every rule pairs a pattern tested against the lower-cased query with a
 * pattern tested against the lower-cased "title content" text of the item; the
 * rule's weight is added only when both match. Rules are cumulative.
 *
 * @param query Raw user query.
 * @param item Retrieved item; only `title` and `content` are read.
 * @returns Sum of the weights of all rules satisfied by both query and item.
 */
function scoreBoostFromQueryAlignment(query: string, item: RetrieveResponse["items"][number]): number {
  const haystack = `${item.title} ${item.content}`.toLowerCase();
  const needle = query.toLowerCase();

  const rules: ReadonlyArray<readonly [RegExp, RegExp, number]> = [
    [/pendiente|backlog|por hacer|siguiente/i, /pendiente|backlog|lineas de trabajo|proximos pasos/i, 0.18],
    [/regla|norma|protocolo|instruccion/i, /regla|norma|protocolo|instruccion/i, 0.18],
    [/sesion|historial|registro/i, /sesion|historial|registro/i, 0.16],
    [/indice|documentacion|mapa/i, /indice|documentacion|mapa/i, 0.14],
    [
      /funcion|metodo|clase|endpoint|service|client|chunk|codigo|linea|source_id|document_id|qdrant|embedding/i,
      /function|class|endpoint|service|client|chunk|codigo|source_id|document_id|qdrant|embedding|def\s+/i,
      0.2,
    ],
    [
      /source_id|document_id|chunk_id|ids/i,
      /buildsourceid|builddocumentid|buildchunkid|ids\.ts|source_id|document_id|chunk_id/i,
      0.28,
    ],
    [/source_id/i, /buildsourceid|source_id/i, 0.22],
    [/document_id/i, /builddocumentid|document_id/i, 0.22],
    [/chunk_id/i, /buildchunkid|chunk_id/i, 0.22],
  ];

  let total = 0;
  for (const [queryPattern, sourcePattern, weight] of rules) {
    const aligned = queryPattern.test(needle) && sourcePattern.test(haystack);
    if (aligned) {
      total += weight;
    }
  }
  return total;
}
|
|
|
|
/**
 * Returns a new array with `items` ordered by descending adjusted score.
 *
 * The adjusted score is the item's own `score` plus the title, content and
 * query-alignment boosts. It is computed once per item up front rather than
 * inside the comparator, so the regex-based boost functions are not re-run on
 * every comparison. The input array is not mutated.
 *
 * @param query Raw user query, forwarded to the query-alignment boost.
 * @param items Items to rank.
 * @returns A re-ordered copy of `items`.
 */
function rankItems(query: string, items: RetrieveResponse["items"]): RetrieveResponse["items"] {
  const scored = items.map((item) => ({
    item,
    rank:
      item.score +
      scoreBoostFromTitle(item.title) +
      scoreBoostFromContent(item.content) +
      scoreBoostFromQueryAlignment(query, item),
  }));

  // `map` produced a fresh array, so sorting it in place leaves `items` untouched.
  scored.sort((left, right) => right.rank - left.rank);

  return scored.map((entry) => entry.item);
}
|
|
|
|
/**
 * Builds the list of sub-queries used for a "bootstrap" retrieval.
 *
 * With a non-empty (normalized) query, returns the query itself followed by
 * four variants that append fixed overview-oriented suffixes, de-duplicated.
 * With an empty query, returns four fixed generic overview queries.
 */
function buildBootstrapQueries(query: string): string[] {
  const cleaned = normalizeQuery(query);

  if (cleaned) {
    const suffixes = [
      "mapa general",
      "documentacion principal",
      "pendientes y lineas de trabajo",
      "reglas y estructura del workspace",
    ];
    return unique([cleaned, ...suffixes.map((suffix) => `${cleaned} ${suffix}`)]);
  }

  // No usable query text: fall back to generic overview prompts.
  return [
    "dame un mapa inicial del dominio y sus ideas principales",
    "documentacion principal y estructura general",
    "temas base y puntos importantes a tener presentes",
    "referencias principales para profundizar despues",
  ];
}
|
|
|
|
/**
 * Builds the list of sub-queries used for a "specific" retrieval.
 *
 * The normalized query always comes first. For each expansion rule whose
 * trigger pattern matches the normalized query, one variant per suffix is
 * appended ("<query> <suffix>"). The result is de-duplicated, and an empty
 * normalized query yields an empty list because `unique` drops empty strings.
 */
function buildSpecificQueries(query: string): string[] {
  const cleaned = normalizeQuery(query);

  const expansions: ReadonlyArray<readonly [RegExp, readonly string[]]> = [
    [
      /pendiente|backlog|por hacer|siguiente/i,
      [
        "pendientes backlog proximos pasos",
        "lineas de trabajo prioritarias",
        "sistema basico RAG estructura MCP Retell",
      ],
    ],
    [/regla|norma|protocolo|instruccion/i, ["reglas protocolo instrucciones"]],
    [/sesion|historial|registro/i, ["historial sesiones registro"]],
    [/indice|documentacion|mapa/i, ["indice documentacion mapa"]],
    [
      /caracteristica|como funciona|objetivo|modulo|arquitectura|stack/i,
      [
        "sistema rag base arquitectura stack tecnico",
        "ingesta procesado salida embeddings qdrant",
        "objetivo principios casos de uso",
        "vision del sistema objetivos principales resultado esperado",
      ],
    ],
    [
      /funcion|metodo|clase|endpoint|service|client|source_id|document_id|chunk|qdrant|embedding|codigo/i,
      [
        "funcion metodo clase service client",
        "endpoint api qdrant embeddings retrieve answer ingest",
        "source_id document_id chunk_id codigo",
        "buildSourceId buildDocumentId buildChunkId ids.ts",
      ],
    ],
  ];

  const variants: string[] = [cleaned];
  for (const [trigger, suffixes] of expansions) {
    if (!trigger.test(cleaned)) {
      continue;
    }
    for (const suffix of suffixes) {
      variants.push(`${cleaned} ${suffix}`);
    }
  }

  return unique(variants);
}
|
|
|
|
/**
 * Builds the one-line summary for a "specific" retrieval.
 *
 * @param topics Topic labels; at most the first four are listed.
 * @param itemsCount Number of retrieved fragments.
 * @returns A "nothing found" message when `itemsCount` is 0, otherwise a
 *          sentence with the count and the topics (or a generic placeholder
 *          when there are none).
 */
function buildSpecificSummary(topics: string[], itemsCount: number): string {
  if (itemsCount !== 0) {
    const listed = topics.slice(0, 4).join(", ");
    const subject = listed || "tema consultado";
    return `Se recuperaron ${itemsCount} fragmentos relevantes sobre: ${subject}.`;
  }

  return "No se recupero contexto relevante para la consulta.";
}
|
|
|
|
/**
 * Builds the summary paragraph for a "bootstrap" retrieval.
 *
 * @param topics Topic labels; at most the first four are listed.
 * @param criticalPoints Labels of items flagged as critical; at most the first three are listed.
 * @param followUpRefs References for further reading; at most the first four are listed.
 * @param itemsCount Number of retrieved fragments.
 * @returns A "not enough context" message when `itemsCount` is 0, otherwise a
 *          four-sentence summary; each empty list is replaced by a fixed placeholder.
 */
function buildBootstrapSummary(topics: string[], criticalPoints: string[], followUpRefs: string[], itemsCount: number): string {
  if (itemsCount === 0) {
    return "No se recupero contexto suficiente para construir el mapa inicial del dominio.";
  }

  const firstJoined = (values: string[], limit: number, fallback: string): string =>
    values.slice(0, limit).join(", ") || fallback;

  const sentences = [
    `Mapa inicial construido con ${itemsCount} fragmentos.`,
    `Temas base: ${firstJoined(topics, 4, "los documentos principales")}.`,
    `Puntos a tener presentes: ${firstJoined(criticalPoints, 3, "sin puntos criticos destacados")}.`,
    `Referencias principales para profundizar: ${firstJoined(followUpRefs, 4, "no disponibles")}.`,
  ];
  return sentences.join(" ");
}
|
|
|
|
/**
 * Embeds a single query string and returns its vector.
 *
 * @param provider Embedding provider; called once with a one-element batch.
 * @param query Text to embed.
 * @returns The first vector returned by the provider.
 * @throws Error when the provider returns no vector for the query, so the
 *         failure surfaces here instead of `undefined` being passed on as a
 *         vector to the caller.
 */
async function embedQuery(provider: EmbeddingProvider, query: string): Promise<number[]> {
  const [vector] = await provider.embed([query]);

  if (vector == null) {
    throw new Error("Embedding provider returned no vector for the query.");
  }

  return vector;
}
|
|
|
|
/**
 * Retrieves items for a query by embedding one or more sub-queries, searching
 * the vector store with each, merging the hits per chunk and re-ranking them.
 */
export class RetrieveService {
  constructor(
    private readonly embeddingProvider: EmbeddingProvider,
    private readonly vectorStore: VectorStoreClient
  ) {}

  /**
   * Runs a retrieval and assembles the full response.
   *
   * @param mode Chunk mode, forwarded to the vector store.
   * @param intent `"bootstrap"` selects the overview flow; any other value selects the specific flow.
   * @param query Raw user query.
   * @param scope Optional scope, forwarded to the vector store and echoed in the response.
   * @returns The ranked items plus derived topics, critical points, follow-up references and a summary.
   */
  async retrieve(mode: ChunkMode, intent: RetrieveIntent, query: string, scope?: RetrieveScope): Promise<RetrieveResponse> {
    const items = intent === "bootstrap"
      ? await this.retrieveBootstrap(mode, query, scope)
      : await this.retrieveSpecific(mode, query, scope);

    const topics = unique(items.map((item) => item.title)).slice(0, 8);

    // De-duplicated like `topics` and `followUpRefs`: several chunks with the
    // same title would otherwise repeat that title in the list and the summary.
    const criticalPoints = unique(
      items
        .filter((item) => /critico|critical|importante|pendiente/i.test(item.content))
        .map((item) => item.title)
    ).slice(0, 5);

    const followUpRefs = unique(items.map((item) => item.documentId)).slice(0, 8);

    const summary = intent === "bootstrap"
      ? buildBootstrapSummary(topics, criticalPoints, followUpRefs, items.length)
      : buildSpecificSummary(topics, items.length);

    return {
      mode,
      intent,
      summary,
      topics,
      criticalPoints,
      items,
      followUpRefs,
      scope
    };
  }

  /**
   * Specific flow: expands the query, searches with every variant and returns
   * the 8 best-ranked merged items.
   */
  private async retrieveSpecific(mode: ChunkMode, query: string, scope?: RetrieveScope): Promise<RetrieveResponse["items"]> {
    const merged = await this.searchAndMerge(buildSpecificQueries(query), mode, scope);
    return rankItems(query, [...merged.values()]).slice(0, 8);
  }

  /**
   * Bootstrap flow: searches with the overview sub-queries and returns the 12
   * best-ranked merged items. When the searches return nothing and the scope
   * names a source id, source ref or at least one tag, falls back to browsing
   * that scope directly.
   */
  private async retrieveBootstrap(mode: ChunkMode, query: string, scope?: RetrieveScope): Promise<RetrieveResponse["items"]> {
    const merged = await this.searchAndMerge(buildBootstrapQueries(query), mode, scope);

    if (merged.size === 0 && scope && (scope.sourceId || scope.sourceRef || (scope.tags && scope.tags.length > 0))) {
      const explored = await this.vectorStore.browseScope(12, mode, scope);
      for (const item of explored) {
        merged.set(item.chunkId, item);
      }
    }

    return rankItems(query, [...merged.values()]).slice(0, 12);
  }

  /**
   * Embeds and searches each sub-query in order (6 results per search) and
   * merges the hits by `chunkId`, keeping the highest-scoring hit per chunk.
   */
  private async searchAndMerge(
    queries: string[],
    mode: ChunkMode,
    scope?: RetrieveScope
  ): Promise<Map<string, RetrieveResponse["items"][number]>> {
    const merged = new Map<string, RetrieveResponse["items"][number]>();

    // Sub-queries are processed one at a time, as in the original flows.
    for (const subquery of queries) {
      const queryVector = await embedQuery(this.embeddingProvider, subquery);
      const results = await this.vectorStore.search(queryVector, 6, mode, scope);

      for (const item of results) {
        const existing = merged.get(item.chunkId);
        if (!existing || item.score > existing.score) {
          merged.set(item.chunkId, item);
        }
      }
    }

    return merged;
  }
}
|