#!/usr/bin/env python3
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Check that runExtraction keeps per-part granularity when mergeStrategy=None.

With the default MergeStrategy every text part is concatenated into a single
ContentPart, so a multi-page document collapses into one blob. For RAG
retrieval that is harmful: each document becomes one ContentChunk whose
embedding is a "blurred average" of all pages.

Ingestion pipelines (requestIngestion callers) therefore MUST pass
mergeStrategy=None so that per-page / per-section chunks survive.
"""

import os
import sys

# Put the directory three levels above this file on the import path; the
# `modules.*` imports below are expected to resolve from there.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))

from modules.datamodels.datamodelExtraction import (
    ContentPart,
    ExtractionOptions,
    MergeStrategy,
)
from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
from modules.serviceCenter.services.serviceExtraction.subRegistry import (
    ChunkerRegistry,
    Extractor,
    ExtractorRegistry,
)


class _FakeMultiPagePdfExtractor(Extractor):
    """Test double that produces one text ContentPart for each simulated page."""

    def __init__(self, pageCount: int = 10):
        self.pageCount = pageCount

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Claim the input only when its MIME type is application/pdf."""
        return mimeType == "application/pdf"

    def getSupportedExtensions(self):
        """Return the file extensions this fake handles."""
        return [".pdf"]

    def getSupportedMimeTypes(self):
        """Return the MIME types this fake handles."""
        return ["application/pdf"]

    def extract(self, fileBytes: bytes, context):
        """Build ``pageCount`` text parts; the input bytes and context are ignored."""
        pages = []
        for i in range(self.pageCount):
            pages.append(
                ContentPart(
                    id=f"page-{i}",
                    parentId=None,
                    label=f"page_{i + 1}",
                    typeGroup="text",
                    mimeType="text/plain",
                    data=f"Page {i + 1} content — distinct semantic anchor #{i}",
                    metadata={"pageIndex": i, "size": 64},
                )
            )
        return pages


def _buildRegistry(pageCount: int) -> ExtractorRegistry:
    """Return a registry whose PDF entries point at one shared fake extractor."""
    pdfExtractor = _FakeMultiPagePdfExtractor(pageCount)
    extractorRegistry = ExtractorRegistry()
    # Same instance is registered under both the MIME type and the short key.
    for registryKey in ("application/pdf", "pdf"):
        extractorRegistry.register(registryKey, pdfExtractor)
    return extractorRegistry


def _collectTextParts(pageCount: int, options: ExtractionOptions):
    """Run the fake PDF extraction with ``options`` and return the text parts only."""
    outcome = runExtraction(
        _buildRegistry(pageCount),
        ChunkerRegistry(),
        b"",
        "sample.pdf",
        "application/pdf",
        options,
    )
    return [part for part in outcome.parts if part.typeGroup == "text"]


def test_default_options_merge_all_text_parts_into_one() -> None:
    """Guard the legacy default: plain ExtractionOptions() still merges text parts.

    Callers outside ingestion (AI processing, summarization) depend on this.
    """
    textParts = _collectTextParts(5, ExtractionOptions())

    assert len(textParts) == 1, (
        f"Default options should merge all text parts into one, got {len(textParts)}"
    )

    mergedText = textParts[0].data
    assert all(marker in mergedText for marker in ("Page 1", "Page 5")), (
        "Merged text should contain content from all pages"
    )
    print("test_default_options_merge_all_text_parts_into_one [PASS]")


def test_merge_none_preserves_all_text_parts() -> None:
    """Core behaviour: mergeStrategy=None keeps every page as its own part."""
    textParts = _collectTextParts(500, ExtractionOptions(mergeStrategy=None))

    assert len(textParts) == 500, (
        f"mergeStrategy=None should preserve all 500 text parts, got {len(textParts)}"
    )
    # First and last labels confirm the original page order is kept.
    assert textParts[0].label == "page_1"
    assert textParts[-1].label == "page_500"
    print("test_merge_none_preserves_all_text_parts [PASS]")


def test_explicit_merge_strategy_still_merges() -> None:
    """Passing an explicit MergeStrategy opts back in to merging."""
    textParts = _collectTextParts(3, ExtractionOptions(mergeStrategy=MergeStrategy()))

    assert len(textParts) == 1, (
        f"Explicit MergeStrategy should merge, got {len(textParts)} parts"
    )
    print("test_explicit_merge_strategy_still_merges [PASS]")


if __name__ == "__main__":
    # Direct execution runs the tests in declaration order without pytest.
    for testCase in (
        test_default_options_merge_all_text_parts_into_one,
        test_merge_none_preserves_all_text_parts,
        test_explicit_merge_strategy_still_merges,
    ):
        testCase()
    print("\nAll merge-strategy tests passed.")