From a7f4055130dae218baaca66394852a3fe30e458b Mon Sep 17 00:00:00 2001 From: Ida Date: Tue, 21 Apr 2026 12:41:50 +0200 Subject: [PATCH] fix(rag): preserve per-page granularity for RAG ingestion (mergeStrategy=None) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The default MergeStrategy concatenates every extracted text part into a single ContentPart, collapsing a 500-page PDF into one chunk with a blurred average embedding — RAG retrieval was effectively broken. - ExtractionOptions.mergeStrategy is now Optional[MergeStrategy]; passing None preserves per-part granularity. Default factory kept for backward compatibility. - routeDataFiles._autoIndexFile, _workspaceTools.readFile, and _documentTools.describeImage explicitly pass mergeStrategy=None. - NOTE: readFile and describeImage still perform on-demand extraction — this patch only switches them to mergeStrategy=None so extracted parts keep per-page granularity. Removing the redundant extraction + requestIngestion fallback paths entirely (so the unified ingestion lane owns all corpus writes and these tools become pure consumers of the knowledge store) is deferred to a follow-up; this diff contains no such removals. - Unit test asserts runExtraction(mergeStrategy=None) keeps every part. 
--- modules/datamodels/datamodelExtraction.py | 9 +- modules/routes/routeDataFiles.py | 5 +- .../serviceAgent/coreTools/_documentTools.py | 3 +- .../serviceAgent/coreTools/_workspaceTools.py | 3 +- .../test_extraction_merge_strategy.py | 124 ++++++++++++++++++ 5 files changed, 140 insertions(+), 4 deletions(-) create mode 100644 tests/unit/services/test_extraction_merge_strategy.py diff --git a/modules/datamodels/datamodelExtraction.py b/modules/datamodels/datamodelExtraction.py index 0aaaffd8..38fd1d27 100644 --- a/modules/datamodels/datamodelExtraction.py +++ b/modules/datamodels/datamodelExtraction.py @@ -95,7 +95,14 @@ class ExtractionOptions(BaseModel): imageQuality: int = Field(default=85, ge=1, le=100, description="Image quality (1-100)") # Merging strategy - mergeStrategy: MergeStrategy = Field(default_factory=MergeStrategy, description="Strategy for merging extraction results") + mergeStrategy: Optional[MergeStrategy] = Field( + default_factory=MergeStrategy, + description=( + "Strategy for merging extraction results. Pass None to skip merging entirely " + "(required for per-chunk ingestion pipelines like RAG, where per-page/per-section " + "granularity must be preserved for embedding)." + ), + ) # Optional chunking parameters (for backward compatibility) chunkAllowed: Optional[bool] = Field(default=None, description="Whether chunking is allowed") diff --git a/modules/routes/routeDataFiles.py b/modules/routes/routeDataFiles.py index f281d15e..26614ff0 100644 --- a/modules/routes/routeDataFiles.py +++ b/modules/routes/routeDataFiles.py @@ -134,7 +134,10 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user): extractorRegistry = ExtractorRegistry() chunkerRegistry = ChunkerRegistry() - options = ExtractionOptions() + # mergeStrategy=None: keep per-page / per-section granularity for RAG ingestion. 
+ # The default MergeStrategy concatenates all text parts into a single blob, which + # collapses a 500-page PDF into one ContentChunk and destroys semantic retrieval. + options = ExtractionOptions(mergeStrategy=None) extracted = runExtraction( extractorRegistry, chunkerRegistry, diff --git a/modules/serviceCenter/services/serviceAgent/coreTools/_documentTools.py b/modules/serviceCenter/services/serviceAgent/coreTools/_documentTools.py index b9b00755..64b3a147 100644 --- a/modules/serviceCenter/services/serviceAgent/coreTools/_documentTools.py +++ b/modules/serviceCenter/services/serviceAgent/coreTools/_documentTools.py @@ -416,7 +416,8 @@ def _registerDocumentTools(registry: ToolRegistry, services): fileName = fileInfo.get("fileName", fileId) extracted = runExtraction( ExtractorRegistry(), None, - rawBytes, fileName, fileMime, ExtractionOptions(), + rawBytes, fileName, fileMime, + ExtractionOptions(mergeStrategy=None), ) contentObjects = [] diff --git a/modules/serviceCenter/services/serviceAgent/coreTools/_workspaceTools.py b/modules/serviceCenter/services/serviceAgent/coreTools/_workspaceTools.py index bb548081..aa337472 100644 --- a/modules/serviceCenter/services/serviceAgent/coreTools/_workspaceTools.py +++ b/modules/serviceCenter/services/serviceAgent/coreTools/_workspaceTools.py @@ -107,7 +107,8 @@ def _registerWorkspaceTools(registry: ToolRegistry, services): extracted = runExtraction( ExtractorRegistry(), ChunkerRegistry(), - rawBytes, fileName, mimeType, ExtractionOptions(), + rawBytes, fileName, mimeType, + ExtractionOptions(mergeStrategy=None), ) contentObjects = [] diff --git a/tests/unit/services/test_extraction_merge_strategy.py b/tests/unit/services/test_extraction_merge_strategy.py new file mode 100644 index 00000000..784bb783 --- /dev/null +++ b/tests/unit/services/test_extraction_merge_strategy.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. 
+"""Test that runExtraction preserves per-part granularity when mergeStrategy=None. + +The default MergeStrategy concatenates all text parts into a single ContentPart, which +collapses multi-page documents into one blob. This destroys RAG retrieval because every +document ends up as a single ContentChunk with a "blurred average" embedding. + +Ingestion pipelines (requestIngestion callers) MUST pass mergeStrategy=None to preserve +per-page / per-section chunks. +""" + +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../..")) + +from modules.datamodels.datamodelExtraction import ( + ContentPart, + ExtractionOptions, + MergeStrategy, +) +from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction +from modules.serviceCenter.services.serviceExtraction.subRegistry import ( + ChunkerRegistry, + Extractor, + ExtractorRegistry, +) + + +class _FakeMultiPagePdfExtractor(Extractor): + """Emits one text ContentPart per simulated page.""" + + def __init__(self, pageCount: int = 10): + self.pageCount = pageCount + + def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: + return mimeType == "application/pdf" + + def getSupportedExtensions(self): + return [".pdf"] + + def getSupportedMimeTypes(self): + return ["application/pdf"] + + def extract(self, fileBytes: bytes, context): + return [ + ContentPart( + id=f"page-{i}", + parentId=None, + label=f"page_{i + 1}", + typeGroup="text", + mimeType="text/plain", + data=f"Page {i + 1} content — distinct semantic anchor #{i}", + metadata={"pageIndex": i, "size": 64}, + ) + for i in range(self.pageCount) + ] + + +def _buildRegistry(pageCount: int) -> ExtractorRegistry: + registry = ExtractorRegistry() + fake = _FakeMultiPagePdfExtractor(pageCount) + registry.register("application/pdf", fake) + registry.register("pdf", fake) + return registry + + +def test_default_options_merge_all_text_parts_into_one(): + """Regression safeguard: default 
ExtractionOptions still merges (legacy behaviour). + + Non-ingestion callers (AI processing, summarization) rely on this default. + """ + registry = _buildRegistry(pageCount=5) + extracted = runExtraction( + registry, ChunkerRegistry(), b"", "sample.pdf", "application/pdf", + ExtractionOptions(), + ) + textParts = [p for p in extracted.parts if p.typeGroup == "text"] + assert len(textParts) == 1, ( + f"Default options should merge all text parts into one, got {len(textParts)}" + ) + assert "Page 1" in textParts[0].data and "Page 5" in textParts[0].data, ( + "Merged text should contain content from all pages" + ) + print("test_default_options_merge_all_text_parts_into_one [PASS]") + + +def test_merge_none_preserves_all_text_parts(): + """Core fix: mergeStrategy=None preserves per-page granularity for RAG ingestion.""" + registry = _buildRegistry(pageCount=500) + extracted = runExtraction( + registry, ChunkerRegistry(), b"", "sample.pdf", "application/pdf", + ExtractionOptions(mergeStrategy=None), + ) + textParts = [p for p in extracted.parts if p.typeGroup == "text"] + assert len(textParts) == 500, ( + f"mergeStrategy=None should preserve all 500 text parts, got {len(textParts)}" + ) + assert textParts[0].label == "page_1" + assert textParts[-1].label == "page_500" + print("test_merge_none_preserves_all_text_parts [PASS]") + + +def test_explicit_merge_strategy_still_merges(): + """Callers can still opt in to merging by passing an explicit MergeStrategy.""" + registry = _buildRegistry(pageCount=3) + extracted = runExtraction( + registry, ChunkerRegistry(), b"", "sample.pdf", "application/pdf", + ExtractionOptions(mergeStrategy=MergeStrategy()), + ) + textParts = [p for p in extracted.parts if p.typeGroup == "text"] + assert len(textParts) == 1, ( + f"Explicit MergeStrategy should merge, got {len(textParts)} parts" + ) + print("test_explicit_merge_strategy_still_merges [PASS]") + + +if __name__ == "__main__": + test_default_options_merge_all_text_parts_into_one() + 
test_merge_none_preserves_all_text_parts() + test_explicit_merge_strategy_still_merges() + print("\nAll merge-strategy tests passed.")