gateway/tests/unit/services/test_extraction_merge_strategy.py
Ida a7f4055130 fix(rag): preserve per-page granularity + remove on-demand extraction fallbacks
The default MergeStrategy concatenates every extracted text part into a
single ContentPart, collapsing a 500-page PDF into one chunk with a
blurred average embedding — RAG retrieval was effectively broken.

- ExtractionOptions.mergeStrategy is now Optional[MergeStrategy]; passing
  None preserves per-part granularity. Default factory kept for
  backward compatibility.
- routeDataFiles._autoIndexFile, _workspaceTools.readFile, and
  _documentTools.describeImage explicitly pass mergeStrategy=None.
- Agent tools no longer carry redundant extraction + requestIngestion
  fallback paths: the unified ingestion lane owns all corpus writes,
  and readFile/describeImage are pure consumers of the knowledge store.
- Unit test asserts runExtraction(mergeStrategy=None) keeps every part.
2026-04-29 14:39:40 +02:00

124 lines
4.4 KiB
Python

#!/usr/bin/env python3
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Test that runExtraction preserves per-part granularity when mergeStrategy=None.
The default MergeStrategy concatenates all text parts into a single ContentPart, which
collapses multi-page documents into one blob. This destroys RAG retrieval because every
document ends up as a single ContentChunk with a "blurred average" embedding.
Ingestion pipelines (requestIngestion callers) MUST pass mergeStrategy=None to preserve
per-page / per-section chunks.
"""
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
from modules.datamodels.datamodelExtraction import (
ContentPart,
ExtractionOptions,
MergeStrategy,
)
from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
from modules.serviceCenter.services.serviceExtraction.subRegistry import (
ChunkerRegistry,
Extractor,
ExtractorRegistry,
)
class _FakeMultiPagePdfExtractor(Extractor):
"""Emits one text ContentPart per simulated page."""
def __init__(self, pageCount: int = 10):
self.pageCount = pageCount
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
return mimeType == "application/pdf"
def getSupportedExtensions(self):
return [".pdf"]
def getSupportedMimeTypes(self):
return ["application/pdf"]
def extract(self, fileBytes: bytes, context):
return [
ContentPart(
id=f"page-{i}",
parentId=None,
label=f"page_{i + 1}",
typeGroup="text",
mimeType="text/plain",
data=f"Page {i + 1} content — distinct semantic anchor #{i}",
metadata={"pageIndex": i, "size": 64},
)
for i in range(self.pageCount)
]
def _buildRegistry(pageCount: int) -> ExtractorRegistry:
registry = ExtractorRegistry()
fake = _FakeMultiPagePdfExtractor(pageCount)
registry.register("application/pdf", fake)
registry.register("pdf", fake)
return registry
def test_default_options_merge_all_text_parts_into_one():
"""Regression safeguard: default ExtractionOptions still merges (legacy behaviour).
Non-ingestion callers (AI processing, summarization) rely on this default.
"""
registry = _buildRegistry(pageCount=5)
extracted = runExtraction(
registry, ChunkerRegistry(), b"", "sample.pdf", "application/pdf",
ExtractionOptions(),
)
textParts = [p for p in extracted.parts if p.typeGroup == "text"]
assert len(textParts) == 1, (
f"Default options should merge all text parts into one, got {len(textParts)}"
)
assert "Page 1" in textParts[0].data and "Page 5" in textParts[0].data, (
"Merged text should contain content from all pages"
)
print("test_default_options_merge_all_text_parts_into_one [PASS]")
def test_merge_none_preserves_all_text_parts():
"""Core fix: mergeStrategy=None preserves per-page granularity for RAG ingestion."""
registry = _buildRegistry(pageCount=500)
extracted = runExtraction(
registry, ChunkerRegistry(), b"", "sample.pdf", "application/pdf",
ExtractionOptions(mergeStrategy=None),
)
textParts = [p for p in extracted.parts if p.typeGroup == "text"]
assert len(textParts) == 500, (
f"mergeStrategy=None should preserve all 500 text parts, got {len(textParts)}"
)
assert textParts[0].label == "page_1"
assert textParts[-1].label == "page_500"
print("test_merge_none_preserves_all_text_parts [PASS]")
def test_explicit_merge_strategy_still_merges():
"""Callers can still opt in to merging by passing an explicit MergeStrategy."""
registry = _buildRegistry(pageCount=3)
extracted = runExtraction(
registry, ChunkerRegistry(), b"", "sample.pdf", "application/pdf",
ExtractionOptions(mergeStrategy=MergeStrategy()),
)
textParts = [p for p in extracted.parts if p.typeGroup == "text"]
assert len(textParts) == 1, (
f"Explicit MergeStrategy should merge, got {len(textParts)} parts"
)
print("test_explicit_merge_strategy_still_merges [PASS]")
if __name__ == "__main__":
test_default_options_merge_all_text_parts_into_one()
test_merge_none_preserves_all_text_parts()
test_explicit_merge_strategy_still_merges()
print("\nAll merge-strategy tests passed.")