#!/usr/bin/env python3
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Test script for JSON extraction response detection and merging.

The methods under test (``_isJsonExtractionResponse``,
``_mergeJsonExtractionResponses``, etc.) are pure data-manipulation and do NOT
touch ``self._context`` / ``self._get_service`` / the DB. We therefore bypass
``ExtractionService.__init__`` (which would require a live
``ServiceCenterContext`` + service-resolver) by instantiating with ``__new__``
— same as constructing a stub without dependency wiring.

Run: python gateway/tests/unit/services/test_json_extraction_merging.py
"""

import json
import sys
import os

# Add gateway to path (must happen before the project imports below).
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '../../..'))

from modules.datamodels.datamodelExtraction import ContentPart
from modules.serviceCenter.services.serviceExtraction.mainServiceExtraction import ExtractionService


def _bare_service():
    """Allocate an ``ExtractionService`` via ``__new__`` so ``__init__`` never runs."""
    return ExtractionService.__new__(ExtractionService)


def _make_part(part_id, type_group, mime_type, data):
    """Build a ``ContentPart`` whose ``label`` is the same string as its ``id``."""
    return ContentPart(
        id=part_id,
        label=part_id,
        typeGroup=type_group,
        mimeType=mime_type,
        data=data,
    )


def _text_part(part_id, data):
    """Build a ``text`` / ``text/plain`` content part."""
    return _make_part(part_id, "text", "text/plain", data)


def _table_part(part_id, data):
    """Build a ``table`` / ``application/json`` content part."""
    return _make_part(part_id, "table", "application/json", data)


def test_detects_json_with_code_fences():
    """A fenced ``extracted_content`` payload is detected as an extraction response."""
    print("Test 1: Detecting JSON with code fences...")
    service = _bare_service()
    fenced = _text_part(
        "test1",
        '```json\n{"extracted_content": {"text": "Sample text", "tables": []}}\n```',
    )

    detected = service._isJsonExtractionResponse([fenced])

    # Equality (not identity) comparison is kept on purpose: it is the exact
    # check this test has always performed.
    assert detected == True, "Should detect JSON with code fences"
    print(" [PASS]")


def test_detects_json_without_code_fences():
    """A bare (unfenced) ``extracted_content`` payload is detected as well."""
    print("Test 2: Detecting JSON without code fences...")
    service = _bare_service()
    unfenced = _text_part(
        "test2",
        '{"extracted_content": {"text": "Sample text", "tables": []}}',
    )

    detected = service._isJsonExtractionResponse([unfenced])

    assert detected == True, "Should detect JSON without code fences"
    print(" [PASS]")


def test_rejects_non_extraction_json():
    """Valid JSON lacking an ``extracted_content`` key is not an extraction response."""
    print("Test 3: Rejecting non-extraction JSON...")
    service = _bare_service()
    other_json = _text_part("test3", '{"documents": [{"sections": []}]}')

    detected = service._isJsonExtractionResponse([other_json])

    assert detected == False, "Should reject non-extraction JSON"
    print(" [PASS]")


def test_rejects_non_json_content():
    """Plain prose is not an extraction response."""
    print("Test 4: Rejecting non-JSON content...")
    service = _bare_service()
    prose = _text_part("test4", "This is plain text, not JSON")

    detected = service._isJsonExtractionResponse([prose])

    assert detected == False, "Should reject non-JSON content"
    print(" [PASS]")


def test_merges_tables_with_same_headers():
    """Tables sharing identical headers collapse into one table holding every row."""
    print("Test 5: Merging tables with same headers...")
    service = _bare_service()
    first = _table_part(
        "test1",
        '```json\n{"extracted_content": {"tables": [{"headers": ["Name", "Amount"], "rows": [["Alice", "100"], ["Bob", "200"]]}]}}\n```',
    )
    second = _table_part(
        "test2",
        '```json\n{"extracted_content": {"tables": [{"headers": ["Name", "Amount"], "rows": [["Charlie", "300"], ["Alice", "100"]]}]}}\n```',
    )

    merged = service._mergeJsonExtractionResponses([first, second])

    # Should have one table group with merged rows
    tables = merged["extracted_content"]["tables"]
    assert len(tables) == 1, f"Should have one merged table, got {len(merged['extracted_content']['tables'])}"

    table = tables[0]
    assert table["headers"] == ["Name", "Amount"], f"Headers should match, got {table['headers']}"

    # Per the documented merge contract ("Tables: Combines all table rows,
    # ... duplicates preserved" — see _mergeJsonExtractionResponses
    # docstring), identical rows from different parts are NOT deduplicated.
    # Alice appears in both parts, so the merged table has 4 rows.
    rows = table["rows"]
    assert len(rows) == 4, f"Should have 4 rows (duplicates preserved), got {len(table['rows'])}"
    assert ["Alice", "100"] in rows, "Alice row should be present"
    assert ["Bob", "200"] in rows, "Bob row should be present"
    assert ["Charlie", "300"] in rows, "Charlie row should be present"
    assert rows.count(["Alice", "100"]) == 2, "Alice row must be preserved twice (no dedup)"
    print(" [PASS]")


def test_merges_multiple_json_blocks_separated_by_dash():
    """Several fenced JSON blocks joined by ``---`` inside one part are merged together."""
    print("Test 6: Merging multiple JSON blocks separated by ---...")
    service = _bare_service()

    # A single content part carrying two JSON blocks separated by ---
    multi_block = _table_part(
        "test1",
        '```json\n{"extracted_content": {"tables": [{"headers": ["Name", "Amount"], "rows": [["Alice", "100"]]}]}}\n```\n---\n```json\n{"extracted_content": {"tables": [{"headers": ["Name", "Amount"], "rows": [["Bob", "200"]]}]}}\n```',
    )

    merged = service._mergeJsonExtractionResponses([multi_block])

    # Should have one table with merged rows from both JSON blocks
    tables = merged["extracted_content"]["tables"]
    assert len(tables) == 1, f"Should have one merged table, got {len(merged['extracted_content']['tables'])}"

    table = tables[0]
    assert table["headers"] == ["Name", "Amount"], f"Headers should match, got {table['headers']}"
    rows = table["rows"]
    assert len(rows) == 2, f"Should have 2 rows, got {len(table['rows'])}"
    assert ["Alice", "100"] in rows, "Alice row should be present"
    assert ["Bob", "200"] in rows, "Bob row should be present"
    print(" [PASS]")


def test_merges_text_content():
    """Text from every part shows up in the merged ``text`` field."""
    print("Test 7: Merging text content...")
    service = _bare_service()
    first = _text_part(
        "test1",
        '```json\n{"extracted_content": {"text": "First paragraph."}}\n```',
    )
    second = _text_part(
        "test2",
        '```json\n{"extracted_content": {"text": "Second paragraph."}}\n```',
    )

    merged = service._mergeJsonExtractionResponses([first, second])

    # Only containment is checked here, not the separator between paragraphs.
    merged_text = merged["extracted_content"]["text"]
    assert "First paragraph." in merged_text, "First paragraph should be present"
    assert "Second paragraph." in merged_text, "Second paragraph should be present"
    print(" [PASS]")


def test_merges_headings_and_lists():
    """Headings and lists from each part are appended in part order."""
    print("Test 8: Merging headings and lists...")
    service = _bare_service()
    first = _text_part(
        "test1",
        '```json\n{"extracted_content": {"headings": [{"level": 1, "text": "Title 1"}], "lists": [{"type": "bullet", "items": ["Item 1"]}]}}\n```',
    )
    second = _text_part(
        "test2",
        '```json\n{"extracted_content": {"headings": [{"level": 2, "text": "Subtitle 1"}], "lists": [{"type": "bullet", "items": ["Item 2"]}]}}\n```',
    )

    merged = service._mergeJsonExtractionResponses([first, second])

    # Should have 2 headings
    headings = merged["extracted_content"]["headings"]
    assert len(headings) == 2, f"Should have 2 headings, got {len(merged['extracted_content']['headings'])}"
    assert headings[0]["text"] == "Title 1", "First heading should be Title 1"
    assert headings[1]["text"] == "Subtitle 1", "Second heading should be Subtitle 1"

    # Should have 2 lists
    lists = merged["extracted_content"]["lists"]
    assert len(lists) == 2, f"Should have 2 lists, got {len(merged['extracted_content']['lists'])}"
    assert lists[0]["items"] == ["Item 1"], "First list should have Item 1"
    assert lists[1]["items"] == ["Item 2"], "Second list should have Item 2"
    print(" [PASS]")


def test_handles_empty_content_parts():
    """A part with empty ``data`` does not disturb the content of the other parts."""
    print("Test 9: Handling empty content parts...")
    service = _bare_service()
    populated = _text_part(
        "test1",
        '```json\n{"extracted_content": {"text": "Some text"}}\n```',
    )
    empty = _text_part("test2", "")  # Empty part

    merged = service._mergeJsonExtractionResponses([populated, empty])

    # Should still have the text from part1
    assert merged["extracted_content"]["text"] == "Some text", "Should have text from part1"
    print(" [PASS]")


def test_merges_tables_with_different_headers():
    """Tables whose headers differ stay separate, in part order."""
    print("Test 10: Keeping tables with different headers separate...")
    service = _bare_service()
    dated = _table_part(
        "test1",
        '```json\n{"extracted_content": {"tables": [{"headers": ["Date", "Amount"], "rows": [["2024-01-01", "100"]]}]}}\n```',
    )
    contacts = _table_part(
        "test2",
        '```json\n{"extracted_content": {"tables": [{"headers": ["Name", "Email"], "rows": [["Alice", "alice@example.com"]]}]}}\n```',
    )

    merged = service._mergeJsonExtractionResponses([dated, contacts])

    # Should have 2 separate tables (different headers)
    tables = merged["extracted_content"]["tables"]
    assert len(tables) == 2, f"Should have 2 separate tables, got {len(merged['extracted_content']['tables'])}"

    # Check first table
    leading = tables[0]
    assert leading["headers"] == ["Date", "Amount"], "First table should have Date/Amount headers"
    assert len(leading["rows"]) == 1, "First table should have 1 row"

    # Check second table
    trailing = tables[1]
    assert trailing["headers"] == ["Name", "Email"], "Second table should have Name/Email headers"
    assert len(trailing["rows"]) == 1, "Second table should have 1 row"
    print(" [PASS]")


def test_real_world_scenario():
    """Three documents (one holding two ``---``-separated blocks) merge into one table."""
    print("Test 11: Real-world scenario (multiple documents, multiple JSON blocks)...")
    service = _bare_service()

    # Simulate 3 documents, each with a table extraction response
    doc_one = _table_part(
        "doc1",
        '```json\n{"extracted_content": {"tables": [{"headers": ["Transaction ID", "Date", "Amount"], "rows": [["TXN001", "2024-01-01", "100.00"], ["TXN002", "2024-01-02", "200.00"]]}]}}\n```',
    )
    doc_two = _table_part(
        "doc2",
        '```json\n{"extracted_content": {"tables": [{"headers": ["Transaction ID", "Date", "Amount"], "rows": [["TXN003", "2024-01-03", "300.00"], ["TXN001", "2024-01-01", "100.00"]]}]}}\n```',
    )
    # Part 3 has multiple JSON blocks separated by ---
    doc_three = _table_part(
        "doc3",
        '```json\n{"extracted_content": {"tables": [{"headers": ["Transaction ID", "Date", "Amount"], "rows": [["TXN004", "2024-01-04", "400.00"]]}]}}\n```\n---\n```json\n{"extracted_content": {"tables": [{"headers": ["Transaction ID", "Date", "Amount"], "rows": [["TXN005", "2024-01-05", "500.00"]]}]}}\n```',
    )

    merged = service._mergeJsonExtractionResponses([doc_one, doc_two, doc_three])

    # Should have one merged table with all transactions
    tables = merged["extracted_content"]["tables"]
    assert len(tables) == 1, f"Should have one merged table, got {len(merged['extracted_content']['tables'])}"

    table = tables[0]
    assert table["headers"] == ["Transaction ID", "Date", "Amount"], "Headers should match"

    # Per the documented merge contract, duplicate rows are preserved.
    # TXN001 occurs in both doc1 and doc2 -> 6 rows total.
    rows = table["rows"]
    assert len(rows) == 6, f"Should have 6 rows (duplicates preserved), got {len(table['rows'])}"

    seen_ids = [entry[0] for entry in rows]
    for txn in ("TXN001", "TXN002", "TXN003", "TXN004", "TXN005"):
        assert txn in seen_ids, f"{txn} should be present"

    # TXN001 must appear twice (no dedup at merge time — dedup is the
    # responsibility of downstream consumers if needed).
    assert seen_ids.count("TXN001") == 2, "TXN001 must appear twice (duplicates preserved)"
    print(" [PASS]")


def main():
    """Run every test in order, print a summary, and return a process exit code.

    Returns:
        ``0`` when no test failed or errored, ``1`` otherwise.
    """
    banner = "=" * 60
    print(banner)
    print("Testing JSON Extraction Response Detection and Merging")
    print(banner)
    print()

    tests = (
        test_detects_json_with_code_fences,
        test_detects_json_without_code_fences,
        test_rejects_non_extraction_json,
        test_rejects_non_json_content,
        test_merges_tables_with_same_headers,
        test_merges_multiple_json_blocks_separated_by_dash,
        test_merges_text_content,
        test_merges_headings_and_lists,
        test_handles_empty_content_parts,
        test_merges_tables_with_different_headers,
        test_real_world_scenario,
    )

    passed = 0
    failed = 0
    for test in tests:
        try:
            test()
        except AssertionError as e:
            # A failed expectation: report the assertion message only.
            print(f" [FAIL] {e}")
            failed += 1
        except Exception as e:
            # Anything else is an unexpected error: show the full traceback
            # so the cause is visible, then keep running the remaining tests.
            print(f" [ERROR] {e}")
            import traceback
            traceback.print_exc()
            failed += 1
        else:
            passed += 1

    print()
    print(banner)
    print(f"Results: {passed} passed, {failed} failed")
    print(banner)

    return 1 if failed else 0


if __name__ == "__main__":
    sys.exit(main())