fix(rag): preserve per-page granularity + remove on-demand extraction fallbacks
The default MergeStrategy concatenates every extracted text part into a single ContentPart, collapsing a 500-page PDF into one chunk with a blurred average embedding — RAG retrieval was effectively broken.

- ExtractionOptions.mergeStrategy is now Optional[MergeStrategy]; passing None preserves per-part granularity. The default factory is kept for backward compatibility.
- routeDataFiles._autoIndexFile, _workspaceTools.readFile, and _documentTools.describeImage explicitly pass mergeStrategy=None.
- Agent tools no longer carry redundant extraction + requestIngestion fallback paths: the unified ingestion lane owns all corpus writes, and readFile/describeImage are pure consumers of the knowledge store.
- A unit test asserts that runExtraction(mergeStrategy=None) keeps every part.
This commit is contained in:
parent
078b4eaaaf
commit
a7f4055130
5 changed files with 140 additions and 4 deletions
|
|
@ -95,7 +95,14 @@ class ExtractionOptions(BaseModel):
|
||||||
imageQuality: int = Field(default=85, ge=1, le=100, description="Image quality (1-100)")
|
imageQuality: int = Field(default=85, ge=1, le=100, description="Image quality (1-100)")
|
||||||
|
|
||||||
# Merging strategy
|
# Merging strategy
|
||||||
mergeStrategy: MergeStrategy = Field(default_factory=MergeStrategy, description="Strategy for merging extraction results")
|
mergeStrategy: Optional[MergeStrategy] = Field(
|
||||||
|
default_factory=MergeStrategy,
|
||||||
|
description=(
|
||||||
|
"Strategy for merging extraction results. Pass None to skip merging entirely "
|
||||||
|
"(required for per-chunk ingestion pipelines like RAG, where per-page/per-section "
|
||||||
|
"granularity must be preserved for embedding)."
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
# Optional chunking parameters (for backward compatibility)
|
# Optional chunking parameters (for backward compatibility)
|
||||||
chunkAllowed: Optional[bool] = Field(default=None, description="Whether chunking is allowed")
|
chunkAllowed: Optional[bool] = Field(default=None, description="Whether chunking is allowed")
|
||||||
|
|
|
||||||
|
|
@ -134,7 +134,10 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user):
|
||||||
|
|
||||||
extractorRegistry = ExtractorRegistry()
|
extractorRegistry = ExtractorRegistry()
|
||||||
chunkerRegistry = ChunkerRegistry()
|
chunkerRegistry = ChunkerRegistry()
|
||||||
options = ExtractionOptions()
|
# mergeStrategy=None: keep per-page / per-section granularity for RAG ingestion.
|
||||||
|
# The default MergeStrategy concatenates all text parts into a single blob, which
|
||||||
|
# collapses a 500-page PDF into one ContentChunk and destroys semantic retrieval.
|
||||||
|
options = ExtractionOptions(mergeStrategy=None)
|
||||||
|
|
||||||
extracted = runExtraction(
|
extracted = runExtraction(
|
||||||
extractorRegistry, chunkerRegistry,
|
extractorRegistry, chunkerRegistry,
|
||||||
|
|
|
||||||
|
|
@ -416,7 +416,8 @@ def _registerDocumentTools(registry: ToolRegistry, services):
|
||||||
fileName = fileInfo.get("fileName", fileId)
|
fileName = fileInfo.get("fileName", fileId)
|
||||||
extracted = runExtraction(
|
extracted = runExtraction(
|
||||||
ExtractorRegistry(), None,
|
ExtractorRegistry(), None,
|
||||||
rawBytes, fileName, fileMime, ExtractionOptions(),
|
rawBytes, fileName, fileMime,
|
||||||
|
ExtractionOptions(mergeStrategy=None),
|
||||||
)
|
)
|
||||||
|
|
||||||
contentObjects = []
|
contentObjects = []
|
||||||
|
|
|
||||||
|
|
@ -107,7 +107,8 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
|
||||||
|
|
||||||
extracted = runExtraction(
|
extracted = runExtraction(
|
||||||
ExtractorRegistry(), ChunkerRegistry(),
|
ExtractorRegistry(), ChunkerRegistry(),
|
||||||
rawBytes, fileName, mimeType, ExtractionOptions(),
|
rawBytes, fileName, mimeType,
|
||||||
|
ExtractionOptions(mergeStrategy=None),
|
||||||
)
|
)
|
||||||
|
|
||||||
contentObjects = []
|
contentObjects = []
|
||||||
|
|
|
||||||
124
tests/unit/services/test_extraction_merge_strategy.py
Normal file
124
tests/unit/services/test_extraction_merge_strategy.py
Normal file
|
|
@ -0,0 +1,124 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# Copyright (c) 2025 Patrick Motsch
|
||||||
|
# All rights reserved.
|
||||||
|
"""Test that runExtraction preserves per-part granularity when mergeStrategy=None.
|
||||||
|
|
||||||
|
The default MergeStrategy concatenates all text parts into a single ContentPart, which
|
||||||
|
collapses multi-page documents into one blob. This destroys RAG retrieval because every
|
||||||
|
document ends up as a single ContentChunk with a "blurred average" embedding.
|
||||||
|
|
||||||
|
Ingestion pipelines (requestIngestion callers) MUST pass mergeStrategy=None to preserve
|
||||||
|
per-page / per-section chunks.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
|
||||||
|
|
||||||
|
from modules.datamodels.datamodelExtraction import (
|
||||||
|
ContentPart,
|
||||||
|
ExtractionOptions,
|
||||||
|
MergeStrategy,
|
||||||
|
)
|
||||||
|
from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
|
||||||
|
from modules.serviceCenter.services.serviceExtraction.subRegistry import (
|
||||||
|
ChunkerRegistry,
|
||||||
|
Extractor,
|
||||||
|
ExtractorRegistry,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class _FakeMultiPagePdfExtractor(Extractor):
|
||||||
|
"""Emits one text ContentPart per simulated page."""
|
||||||
|
|
||||||
|
def __init__(self, pageCount: int = 10):
|
||||||
|
self.pageCount = pageCount
|
||||||
|
|
||||||
|
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
|
||||||
|
return mimeType == "application/pdf"
|
||||||
|
|
||||||
|
def getSupportedExtensions(self):
|
||||||
|
return [".pdf"]
|
||||||
|
|
||||||
|
def getSupportedMimeTypes(self):
|
||||||
|
return ["application/pdf"]
|
||||||
|
|
||||||
|
def extract(self, fileBytes: bytes, context):
|
||||||
|
return [
|
||||||
|
ContentPart(
|
||||||
|
id=f"page-{i}",
|
||||||
|
parentId=None,
|
||||||
|
label=f"page_{i + 1}",
|
||||||
|
typeGroup="text",
|
||||||
|
mimeType="text/plain",
|
||||||
|
data=f"Page {i + 1} content — distinct semantic anchor #{i}",
|
||||||
|
metadata={"pageIndex": i, "size": 64},
|
||||||
|
)
|
||||||
|
for i in range(self.pageCount)
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _buildRegistry(pageCount: int) -> ExtractorRegistry:
|
||||||
|
registry = ExtractorRegistry()
|
||||||
|
fake = _FakeMultiPagePdfExtractor(pageCount)
|
||||||
|
registry.register("application/pdf", fake)
|
||||||
|
registry.register("pdf", fake)
|
||||||
|
return registry
|
||||||
|
|
||||||
|
|
||||||
|
def test_default_options_merge_all_text_parts_into_one():
|
||||||
|
"""Regression safeguard: default ExtractionOptions still merges (legacy behaviour).
|
||||||
|
|
||||||
|
Non-ingestion callers (AI processing, summarization) rely on this default.
|
||||||
|
"""
|
||||||
|
registry = _buildRegistry(pageCount=5)
|
||||||
|
extracted = runExtraction(
|
||||||
|
registry, ChunkerRegistry(), b"", "sample.pdf", "application/pdf",
|
||||||
|
ExtractionOptions(),
|
||||||
|
)
|
||||||
|
textParts = [p for p in extracted.parts if p.typeGroup == "text"]
|
||||||
|
assert len(textParts) == 1, (
|
||||||
|
f"Default options should merge all text parts into one, got {len(textParts)}"
|
||||||
|
)
|
||||||
|
assert "Page 1" in textParts[0].data and "Page 5" in textParts[0].data, (
|
||||||
|
"Merged text should contain content from all pages"
|
||||||
|
)
|
||||||
|
print("test_default_options_merge_all_text_parts_into_one [PASS]")
|
||||||
|
|
||||||
|
|
||||||
|
def test_merge_none_preserves_all_text_parts():
|
||||||
|
"""Core fix: mergeStrategy=None preserves per-page granularity for RAG ingestion."""
|
||||||
|
registry = _buildRegistry(pageCount=500)
|
||||||
|
extracted = runExtraction(
|
||||||
|
registry, ChunkerRegistry(), b"", "sample.pdf", "application/pdf",
|
||||||
|
ExtractionOptions(mergeStrategy=None),
|
||||||
|
)
|
||||||
|
textParts = [p for p in extracted.parts if p.typeGroup == "text"]
|
||||||
|
assert len(textParts) == 500, (
|
||||||
|
f"mergeStrategy=None should preserve all 500 text parts, got {len(textParts)}"
|
||||||
|
)
|
||||||
|
assert textParts[0].label == "page_1"
|
||||||
|
assert textParts[-1].label == "page_500"
|
||||||
|
print("test_merge_none_preserves_all_text_parts [PASS]")
|
||||||
|
|
||||||
|
|
||||||
|
def test_explicit_merge_strategy_still_merges():
|
||||||
|
"""Callers can still opt in to merging by passing an explicit MergeStrategy."""
|
||||||
|
registry = _buildRegistry(pageCount=3)
|
||||||
|
extracted = runExtraction(
|
||||||
|
registry, ChunkerRegistry(), b"", "sample.pdf", "application/pdf",
|
||||||
|
ExtractionOptions(mergeStrategy=MergeStrategy()),
|
||||||
|
)
|
||||||
|
textParts = [p for p in extracted.parts if p.typeGroup == "text"]
|
||||||
|
assert len(textParts) == 1, (
|
||||||
|
f"Explicit MergeStrategy should merge, got {len(textParts)} parts"
|
||||||
|
)
|
||||||
|
print("test_explicit_merge_strategy_still_merges [PASS]")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
test_default_options_merge_all_text_parts_into_one()
|
||||||
|
test_merge_none_preserves_all_text_parts()
|
||||||
|
test_explicit_merge_strategy_still_merges()
|
||||||
|
print("\nAll merge-strategy tests passed.")
|
||||||
Loading…
Reference in a new issue