Fix oversized documental chunks and ingest GStreamer corpus

2026-04-09 15:47:27 +02:00 · 2026-04-09 15:47:27 +02:00 · 81065063cd
commit 81065063cd
parent 60d1b9679d
4 changed files with 95 additions and 41 deletions
--- a/RAG/docs/HISTORIAL_SESIONES.md
+++ b/RAG/docs/HISTORIAL_SESIONES.md
@@ -52,3 +52,7 @@ Dar continuidad al RAG en `RAG/` a partir del estado actual documentado.
 - Correccion en `IngestService` (`resolveInputFiles` y `normalizeDocumentKey`) para escanear archivos desde la ruta temporal extraída (`readPath`) en lugar del identificador lógico al subir carpetas completas, evitando error de `ENOENT`.
 - Revision inicial del corpus `/_imports/gstreamer-rag-text` como futura base documental especializada para GStreamer.
 - Creacion de `RAG/docs/TASK_INGESTA_GSTREAMER.md` con el plan operativo para ingerirlo bajo un scope unico, validar retrieval y prepararlo para uso posterior con modelo local.
 - Diagnostico y correccion del fallo real de ingesta masiva en corpus documentales: algunos ficheros generaban chunks sobredimensionados que acababan rompiendo la llamada a embeddings.
 - Correccion aplicada en `src/modules/process/chunking.ts` y endurecimiento defensivo de `src/modules/embeddings/provider.ts`.
 - Ingesta completada del corpus GStreamer bajo el scope unico `gstreamer-official` / `corpus:gstreamer:official:v1` con `3117` documentos y `22003` chunks.
 - Validacion funcional en produccion mediante `GET /sources` y `POST /retrieve` para bootstrap y consulta especifica sobre request pads.
--- a/RAG/docs/TASK_INGESTA_GSTREAMER.md
+++ b/RAG/docs/TASK_INGESTA_GSTREAMER.md
@@ -4,7 +4,7 @@
 **Modulo:** RAG  
 **Ultima actualizacion:** 2026-04-09  
 **Ultima modificacion por:** Agente RAG 2  
-**Estado:** Planificado
+**Estado:** Ejecutado y validado
 ---
@@ -197,3 +197,65 @@ Si el usuario valida este plan, la siguiente fase sera:
 2. ejecutar la ingesta real del corpus GStreamer
 3. validar retrieval y bootstrap
 4. dejar preparado el terreno para la integracion con el modelo local
 ---
 ## Ejecucion realizada
 Resultado real de la ejecucion controlada:
 - metodo usado: ingesta local controlada contra la misma instancia remota de `Qdrant` que usa el servicio
 - scope final:
  - `sourceId`: `corpus:gstreamer:official:v1`
  - `sourceRef`: `gstreamer-official`
  - `tags`: `gstreamer`, `official-docs`, `multimedia`, `documental`
 - documentos procesados: `3117`
 - chunks almacenados: `22003`
 ### Incidencia detectada y corregida
 Durante la ingesta aparecio el error:
 ```text
 Cannot read properties of undefined (reading 'map')
 ```
 Diagnostico:
 - no era un fallo generico del corpus completo
 - se reproducia en ficheros documentales con bloques desmesurados
 - caso detectado: `decklink/decklinkvideosink.txt`
 - ese fichero generaba un chunk de aproximadamente `168061` caracteres, demasiado grande para el proveedor de embeddings, que devolvia una respuesta sin `data`; de ahi el error al ejecutar `response.data.map(...)`
 Correccion aplicada en codigo:
 - ajuste del chunking documental para trocear correctamente buffers sobredimensionados antes de enviarlos a embeddings
 - validacion defensiva adicional en `OpenRouterEmbeddingProvider.embed()` para detectar respuestas invalidas sin `data`
 ### Validacion funcional realizada
 1. `GET /sources`
 - el scope `gstreamer-official` aparece correctamente en produccion
 2. `POST /retrieve` con `intent=bootstrap`
 - devuelve un mapa inicial coherente del dominio GStreamer
 - referencias principales recuperadas:
  - `application-development/introduction/basics.txt`
  - `application-development/basics/index.txt`
  - `application-development/introduction/index.txt`
  - `tutorials/basic/index.txt`
 3. `POST /retrieve` con consulta especifica sobre request pads
 - devuelve contenido util y oficial sobre:
  - `request pads`
  - `gst_element_get_request_pad`
  - `gst_element_request_pad_simple`
  - tutoriales de pad availability y multithreading
 ### Estado resultante
 El corpus de GStreamer ha quedado listo como base documental especializada dentro del RAG para:
 - bootstrap general del dominio
 - consultas documentales tecnicas
 - apoyo posterior a un modelo local en revision de codigo que use GStreamer
--- a/RAG/src/modules/embeddings/provider.ts
+++ b/RAG/src/modules/embeddings/provider.ts
@@ -34,6 +34,10 @@ export class OpenRouterEmbeddingProvider implements EmbeddingProvider {
      encoding_format: "float"
    });
    if (!Array.isArray(response.data)) {
      throw new Error("Embedding provider returned an invalid response without data array");
    }
    return response.data.map((item) => item.embedding);
  }
 }
--- a/RAG/src/modules/process/chunking.ts
+++ b/RAG/src/modules/process/chunking.ts
@@ -100,17 +100,21 @@ function buildDocumentalChunks(title: string, content: string, policy: ChunkingP
  const chunks: ChunkedDocument[] = [];
  let index = 0;
  function pushDocumentalPiece(sectionTitle: string, piece: string, startLine: number) {
    chunks.push({
      index: index++,
      title,
      sectionTitle,
      content: piece,
      startLine,
      endLine: startLine + piece.split("\n").length - 1
    });
  }
  for (const section of sections) {
    const sectionLineCount = section.body.split("\n").length;
    if (section.body.length <= policy.maxCharacters) {
-      chunks.push({
+      pushDocumentalPiece(section.sectionTitle, section.body, section.startLine);
        index: index++,
        title,
        sectionTitle: section.sectionTitle,
        content: section.body,
        startLine: section.startLine,
        endLine: section.startLine + sectionLineCount - 1
      });
      continue;
    }
@@ -129,27 +133,21 @@ function buildDocumentalChunks(title: string, content: string, policy: ChunkingP
      }
      if (buffer) {
-        chunks.push({
+        if (buffer.length > policy.maxCharacters) {
-          index: index++,
+          let currentStartLine = bufferStartLine;
-          title,
+          for (const piece of chunkOversizedText(buffer, policy.maxCharacters, policy.overlapCharacters)) {
-          sectionTitle: section.sectionTitle,
+            pushDocumentalPiece(section.sectionTitle, piece, currentStartLine);
-          content: buffer,
+            currentStartLine += piece.split("\n").length;
-          startLine: bufferStartLine,
+          }
-          endLine: bufferStartLine + buffer.split("\n").length - 1
+        } else {
-        });
+          pushDocumentalPiece(section.sectionTitle, buffer, bufferStartLine);
        }
        bufferStartLine = bufferStartLine + buffer.split("\n").length + 1;
        buffer = paragraph;
        consumedLines = paragraphLines;
      } else {
        for (const piece of chunkOversizedText(paragraph, policy.maxCharacters, policy.overlapCharacters)) {
-          chunks.push({
+          pushDocumentalPiece(section.sectionTitle, piece, bufferStartLine);
            index: index++,
            title,
            sectionTitle: section.sectionTitle,
            content: piece,
            startLine: bufferStartLine,
            endLine: bufferStartLine + piece.split("\n").length - 1
          });
        }
        bufferStartLine += paragraphLines;
      }
@@ -158,24 +156,10 @@ function buildDocumentalChunks(title: string, content: string, policy: ChunkingP
    if (buffer) {
      if (buffer.length > policy.maxCharacters) {
        for (const piece of chunkOversizedText(buffer, policy.maxCharacters, policy.overlapCharacters)) {
-          chunks.push({
+          pushDocumentalPiece(section.sectionTitle, piece, bufferStartLine);
            index: index++,
            title,
            sectionTitle: section.sectionTitle,
            content: piece,
            startLine: bufferStartLine,
            endLine: bufferStartLine + piece.split("\n").length - 1
          });
        }
      } else {
-        chunks.push({
+        pushDocumentalPiece(section.sectionTitle, buffer, bufferStartLine);
          index: index++,
          title,
          sectionTitle: section.sectionTitle,
          content: buffer,
          startLine: bufferStartLine,
          endLine: bufferStartLine + buffer.split("\n").length - 1
        });
      }
    }
  }