fix(rag): preserve per-page granularity + remove on-demand extraction fallbacks

The default MergeStrategy concatenates every extracted text part into a
single ContentPart, collapsing a 500-page PDF into one chunk with a
blurred average embedding — RAG retrieval was effectively broken.

- ExtractionOptions.mergeStrategy is now Optional[MergeStrategy]; passing
  None preserves per-part granularity. Default factory kept for
  backward compatibility.
- routeDataFiles._autoIndexFile, _workspaceTools.readFile, and
  _documentTools.describeImage explicitly pass mergeStrategy=None.
- Agent tools no longer carry redundant extraction + requestIngestion
  fallback paths: the unified ingestion lane owns all corpus writes,
  and readFile/describeImage are pure consumers of the knowledge store.
- Unit test asserts runExtraction(mergeStrategy=None) keeps every part.
This commit is contained in:
Ida 2026-04-21 12:41:50 +02:00
parent 078b4eaaaf
commit a7f4055130
5 changed files with 140 additions and 4 deletions

View file

@@ -95,7 +95,14 @@ class ExtractionOptions(BaseModel):
imageQuality: int = Field(default=85, ge=1, le=100, description="Image quality (1-100)") imageQuality: int = Field(default=85, ge=1, le=100, description="Image quality (1-100)")
# Merging strategy # Merging strategy
mergeStrategy: MergeStrategy = Field(default_factory=MergeStrategy, description="Strategy for merging extraction results") mergeStrategy: Optional[MergeStrategy] = Field(
default_factory=MergeStrategy,
description=(
"Strategy for merging extraction results. Pass None to skip merging entirely "
"(required for per-chunk ingestion pipelines like RAG, where per-page/per-section "
"granularity must be preserved for embedding)."
),
)
# Optional chunking parameters (for backward compatibility) # Optional chunking parameters (for backward compatibility)
chunkAllowed: Optional[bool] = Field(default=None, description="Whether chunking is allowed") chunkAllowed: Optional[bool] = Field(default=None, description="Whether chunking is allowed")

View file

@@ -134,7 +134,10 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user):
extractorRegistry = ExtractorRegistry() extractorRegistry = ExtractorRegistry()
chunkerRegistry = ChunkerRegistry() chunkerRegistry = ChunkerRegistry()
options = ExtractionOptions() # mergeStrategy=None: keep per-page / per-section granularity for RAG ingestion.
# The default MergeStrategy concatenates all text parts into a single blob, which
# collapses a 500-page PDF into one ContentChunk and destroys semantic retrieval.
options = ExtractionOptions(mergeStrategy=None)
extracted = runExtraction( extracted = runExtraction(
extractorRegistry, chunkerRegistry, extractorRegistry, chunkerRegistry,

View file

@@ -416,7 +416,8 @@ def _registerDocumentTools(registry: ToolRegistry, services):
fileName = fileInfo.get("fileName", fileId) fileName = fileInfo.get("fileName", fileId)
extracted = runExtraction( extracted = runExtraction(
ExtractorRegistry(), None, ExtractorRegistry(), None,
rawBytes, fileName, fileMime, ExtractionOptions(), rawBytes, fileName, fileMime,
ExtractionOptions(mergeStrategy=None),
) )
contentObjects = [] contentObjects = []

View file

@@ -107,7 +107,8 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
extracted = runExtraction( extracted = runExtraction(
ExtractorRegistry(), ChunkerRegistry(), ExtractorRegistry(), ChunkerRegistry(),
rawBytes, fileName, mimeType, ExtractionOptions(), rawBytes, fileName, mimeType,
ExtractionOptions(mergeStrategy=None),
) )
contentObjects = [] contentObjects = []

View file

@@ -0,0 +1,124 @@
#!/usr/bin/env python3
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Test that runExtraction preserves per-part granularity when mergeStrategy=None.
The default MergeStrategy concatenates all text parts into a single ContentPart, which
collapses multi-page documents into one blob. This destroys RAG retrieval because every
document ends up as a single ContentChunk with a "blurred average" embedding.
Ingestion pipelines (requestIngestion callers) MUST pass mergeStrategy=None to preserve
per-page / per-section chunks.
"""
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
from modules.datamodels.datamodelExtraction import (
ContentPart,
ExtractionOptions,
MergeStrategy,
)
from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
from modules.serviceCenter.services.serviceExtraction.subRegistry import (
ChunkerRegistry,
Extractor,
ExtractorRegistry,
)
class _FakeMultiPagePdfExtractor(Extractor):
    """Stub extractor simulating a multi-page PDF.

    Produces exactly one text ContentPart per simulated page so the tests
    can count how many parts survive (or don't survive) the merge step.
    """

    def __init__(self, pageCount: int = 10):
        # Number of fake pages to emit from extract().
        self.pageCount = pageCount

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        # Claim only PDF payloads; file name and header bytes are ignored.
        return mimeType == "application/pdf"

    def getSupportedExtensions(self):
        return [".pdf"]

    def getSupportedMimeTypes(self):
        return ["application/pdf"]

    def extract(self, fileBytes: bytes, context):
        # One distinct text part per page; unique data per page gives each
        # part its own "semantic anchor" so a merged blob is detectable.
        parts = []
        for pageIdx in range(self.pageCount):
            parts.append(
                ContentPart(
                    id=f"page-{pageIdx}",
                    parentId=None,
                    label=f"page_{pageIdx + 1}",
                    typeGroup="text",
                    mimeType="text/plain",
                    data=f"Page {pageIdx + 1} content — distinct semantic anchor #{pageIdx}",
                    metadata={"pageIndex": pageIdx, "size": 64},
                )
            )
        return parts
def _buildRegistry(pageCount: int) -> ExtractorRegistry:
    """Build an ExtractorRegistry with the fake PDF extractor registered
    under both its MIME type and its bare extension key."""
    extractor = _FakeMultiPagePdfExtractor(pageCount)
    builtRegistry = ExtractorRegistry()
    for registrationKey in ("application/pdf", "pdf"):
        builtRegistry.register(registrationKey, extractor)
    return builtRegistry
def test_default_options_merge_all_text_parts_into_one():
    """Regression safeguard: default ExtractionOptions still merges (legacy behaviour).

    Non-ingestion callers (AI processing, summarization) rely on this default.
    """
    extracted = runExtraction(
        _buildRegistry(pageCount=5), ChunkerRegistry(),
        b"", "sample.pdf", "application/pdf",
        ExtractionOptions(),
    )
    textParts = list(filter(lambda part: part.typeGroup == "text", extracted.parts))
    assert len(textParts) == 1, (
        f"Default options should merge all text parts into one, got {len(textParts)}"
    )
    mergedText = textParts[0].data
    assert "Page 1" in mergedText and "Page 5" in mergedText, (
        "Merged text should contain content from all pages"
    )
    print("test_default_options_merge_all_text_parts_into_one [PASS]")
def test_merge_none_preserves_all_text_parts():
    """Core fix: mergeStrategy=None preserves per-page granularity for RAG ingestion."""
    extracted = runExtraction(
        _buildRegistry(pageCount=500), ChunkerRegistry(),
        b"", "sample.pdf", "application/pdf",
        ExtractionOptions(mergeStrategy=None),
    )
    textParts = [part for part in extracted.parts if part.typeGroup == "text"]
    assert len(textParts) == 500, (
        f"mergeStrategy=None should preserve all 500 text parts, got {len(textParts)}"
    )
    # First and last labels prove ordering survived as well as the count.
    assert textParts[0].label == "page_1"
    assert textParts[-1].label == "page_500"
    print("test_merge_none_preserves_all_text_parts [PASS]")
def test_explicit_merge_strategy_still_merges():
    """Callers can still opt in to merging by passing an explicit MergeStrategy."""
    extracted = runExtraction(
        _buildRegistry(pageCount=3), ChunkerRegistry(),
        b"", "sample.pdf", "application/pdf",
        ExtractionOptions(mergeStrategy=MergeStrategy()),
    )
    textParts = [part for part in extracted.parts if part.typeGroup == "text"]
    assert len(textParts) == 1, (
        f"Explicit MergeStrategy should merge, got {len(textParts)} parts"
    )
    print("test_explicit_merge_strategy_still_merges [PASS]")
if __name__ == "__main__":
test_default_options_merge_all_text_parts_into_one()
test_merge_none_preserves_all_text_parts()
test_explicit_merge_strategy_still_merges()
print("\nAll merge-strategy tests passed.")