# gateway/tests/unit/services/test_json_extraction_merging.py

#!/usr/bin/env python3
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Test script for JSON extraction response detection and merging.

The methods under test (``_isJsonExtractionResponse``,
``_mergeJsonExtractionResponses``, etc.) are pure data-manipulation and
do NOT touch ``self._context`` / ``self._get_service`` / the DB. We
therefore bypass ``ExtractionService.__init__`` (which would require a
live ``ServiceCenterContext`` + service-resolver) by instantiating with
``__new__`` — same as constructing a stub without dependency wiring.

Run: python gateway/tests/unit/services/test_json_extraction_merging.py
"""

import json
import sys
import os

# Add gateway to path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '../../..'))

from modules.datamodels.datamodelExtraction import ContentPart
from modules.serviceCenter.services.serviceExtraction.mainServiceExtraction import ExtractionService


def test_detects_json_with_code_fences():
    """Check that a fenced ``extracted_content`` JSON payload is detected.

    Builds one text ``ContentPart`` whose data is an ``extracted_content``
    JSON object wrapped in a Markdown json code fence and expects
    ``_isJsonExtractionResponse`` to return ``True``.
    """
    print("Test 1: Detecting JSON with code fences...")
    # __new__ bypasses ExtractionService.__init__, so no dependency wiring is needed.
    service = ExtractionService.__new__(ExtractionService)

    fenced_part = ContentPart(
        id="test1",
        label="test1",
        typeGroup="text",
        mimeType="text/plain",
        data='```json\n{"extracted_content": {"text": "Sample text", "tables": []}}\n```',
    )

    result = service._isJsonExtractionResponse([fenced_part])
    # Identity check pins an actual bool instead of any value that equals True.
    assert result is True, "Should detect JSON with code fences"
    print("  [PASS]")


def test_detects_json_without_code_fences():
    """Check that a bare ``extracted_content`` JSON payload is detected.

    Builds one text ``ContentPart`` whose data is the raw JSON object (no
    Markdown fence) and expects ``_isJsonExtractionResponse`` to return
    ``True``.
    """
    print("Test 2: Detecting JSON without code fences...")
    # __new__ bypasses ExtractionService.__init__, so no dependency wiring is needed.
    service = ExtractionService.__new__(ExtractionService)

    bare_part = ContentPart(
        id="test2",
        label="test2",
        typeGroup="text",
        mimeType="text/plain",
        data='{"extracted_content": {"text": "Sample text", "tables": []}}',
    )

    result = service._isJsonExtractionResponse([bare_part])
    # Identity check pins an actual bool instead of any value that equals True.
    assert result is True, "Should detect JSON without code fences"
    print("  [PASS]")


def test_rejects_non_extraction_json():
    """Check that JSON lacking an ``extracted_content`` key is rejected.

    Builds one text ``ContentPart`` holding valid JSON with a ``documents``
    key only and expects ``_isJsonExtractionResponse`` to return ``False``.
    """
    print("Test 3: Rejecting non-extraction JSON...")
    # __new__ bypasses ExtractionService.__init__, so no dependency wiring is needed.
    service = ExtractionService.__new__(ExtractionService)

    other_json_part = ContentPart(
        id="test3",
        label="test3",
        typeGroup="text",
        mimeType="text/plain",
        data='{"documents": [{"sections": []}]}',
    )

    result = service._isJsonExtractionResponse([other_json_part])
    # Identity check pins an actual bool instead of any falsy-equal value (e.g. 0).
    assert result is False, "Should reject non-extraction JSON"
    print("  [PASS]")


def test_rejects_non_json_content():
    """Check that plain, non-JSON text is rejected.

    Builds one text ``ContentPart`` holding an ordinary sentence and expects
    ``_isJsonExtractionResponse`` to return ``False``.
    """
    print("Test 4: Rejecting non-JSON content...")
    # __new__ bypasses ExtractionService.__init__, so no dependency wiring is needed.
    service = ExtractionService.__new__(ExtractionService)

    plain_text_part = ContentPart(
        id="test4",
        label="test4",
        typeGroup="text",
        mimeType="text/plain",
        data="This is plain text, not JSON",
    )

    result = service._isJsonExtractionResponse([plain_text_part])
    # Identity check pins an actual bool instead of any falsy-equal value (e.g. 0).
    assert result is False, "Should reject non-JSON content"
    print("  [PASS]")


def test_merges_tables_with_same_headers():
    """Verify that tables sharing identical headers collapse into one table.

    Two parts each carry a ``Name``/``Amount`` table; the ``Alice`` row is
    present in both. The merged result is expected to hold a single table
    with all four rows, i.e. the repeated row is kept twice.
    """
    print("Test 5: Merging tables with same headers...")

    # Attributes shared by both fixture parts.
    table_kwargs = {"typeGroup": "table", "mimeType": "application/json"}
    first = ContentPart(
        id="test1",
        label="test1",
        data='```json\n{"extracted_content": {"tables": [{"headers": ["Name", "Amount"], "rows": [["Alice", "100"], ["Bob", "200"]]}]}}\n```',
        **table_kwargs,
    )
    second = ContentPart(
        id="test2",
        label="test2",
        data='```json\n{"extracted_content": {"tables": [{"headers": ["Name", "Amount"], "rows": [["Charlie", "300"], ["Alice", "100"]]}]}}\n```',
        **table_kwargs,
    )

    # Skip __init__: the merge helper needs no wired-up service dependencies.
    service = ExtractionService.__new__(ExtractionService)
    merged = service._mergeJsonExtractionResponses([first, second])

    # Exactly one table group is expected after merging.
    tables = merged["extracted_content"]["tables"]
    assert len(tables) == 1, f"Should have one merged table, got {len(merged['extracted_content']['tables'])}"

    table = tables[0]
    assert table["headers"] == ["Name", "Amount"], f"Headers should match, got {table['headers']}"

    # The test expects duplicates to survive the merge: Alice is in both
    # parts, so 2 + 2 input rows must yield 4 output rows.
    rows = table["rows"]
    assert len(rows) == 4, f"Should have 4 rows (duplicates preserved), got {len(table['rows'])}"
    assert ["Alice", "100"] in rows, "Alice row should be present"
    assert ["Bob", "200"] in rows, "Bob row should be present"
    assert ["Charlie", "300"] in rows, "Charlie row should be present"
    assert rows.count(["Alice", "100"]) == 2, "Alice row must be preserved twice (no dedup)"
    print("  [PASS]")


def test_merges_multiple_json_blocks_separated_by_dash():
    """Verify that several ``---``-separated JSON blocks in one part merge.

    A single part holds two fenced JSON blocks divided by a ``---`` line,
    each with a one-row ``Name``/``Amount`` table. The merged result is
    expected to contain one table with both rows.
    """
    print("Test 6: Merging multiple JSON blocks separated by ---...")

    # One part, two fenced JSON payloads with a '---' line between them.
    multi_block_part = ContentPart(
        id="test1",
        label="test1",
        typeGroup="table",
        mimeType="application/json",
        data='```json\n{"extracted_content": {"tables": [{"headers": ["Name", "Amount"], "rows": [["Alice", "100"]]}]}}\n```\n---\n```json\n{"extracted_content": {"tables": [{"headers": ["Name", "Amount"], "rows": [["Bob", "200"]]}]}}\n```',
    )

    # Skip __init__: the merge helper needs no wired-up service dependencies.
    service = ExtractionService.__new__(ExtractionService)
    merged = service._mergeJsonExtractionResponses([multi_block_part])

    # Both blocks share headers, so a single table is expected.
    tables = merged["extracted_content"]["tables"]
    assert len(tables) == 1, f"Should have one merged table, got {len(merged['extracted_content']['tables'])}"

    table = tables[0]
    assert table["headers"] == ["Name", "Amount"], f"Headers should match, got {table['headers']}"

    rows = table["rows"]
    assert len(rows) == 2, f"Should have 2 rows, got {len(table['rows'])}"
    assert ["Alice", "100"] in rows, "Alice row should be present"
    assert ["Bob", "200"] in rows, "Bob row should be present"
    print("  [PASS]")


def test_merges_text_content():
    """Verify that text from every part appears in the merged text.

    Two parts each carry one paragraph under ``extracted_content.text``;
    both paragraphs must be found in the merged ``text`` value. The
    separator used between them is not checked here.
    """
    print("Test 7: Merging text content...")

    # Attributes shared by both fixture parts.
    text_kwargs = {"typeGroup": "text", "mimeType": "text/plain"}
    first = ContentPart(
        id="test1",
        label="test1",
        data='```json\n{"extracted_content": {"text": "First paragraph."}}\n```',
        **text_kwargs,
    )
    second = ContentPart(
        id="test2",
        label="test2",
        data='```json\n{"extracted_content": {"text": "Second paragraph."}}\n```',
        **text_kwargs,
    )

    # Skip __init__: the merge helper needs no wired-up service dependencies.
    service = ExtractionService.__new__(ExtractionService)
    merged = service._mergeJsonExtractionResponses([first, second])

    # Only containment is asserted, not the exact joined form.
    merged_text = merged["extracted_content"]["text"]
    assert "First paragraph." in merged_text, "First paragraph should be present"
    assert "Second paragraph." in merged_text, "Second paragraph should be present"
    print("  [PASS]")


def test_merges_headings_and_lists():
    """Verify that headings and lists from all parts are collected in order.

    Each of the two parts contributes one heading and one bullet list. The
    merged result is expected to hold two headings and two lists, with the
    first part's entries ahead of the second part's.
    """
    print("Test 8: Merging headings and lists...")

    # Attributes shared by both fixture parts.
    text_kwargs = {"typeGroup": "text", "mimeType": "text/plain"}
    first = ContentPart(
        id="test1",
        label="test1",
        data='```json\n{"extracted_content": {"headings": [{"level": 1, "text": "Title 1"}], "lists": [{"type": "bullet", "items": ["Item 1"]}]}}\n```',
        **text_kwargs,
    )
    second = ContentPart(
        id="test2",
        label="test2",
        data='```json\n{"extracted_content": {"headings": [{"level": 2, "text": "Subtitle 1"}], "lists": [{"type": "bullet", "items": ["Item 2"]}]}}\n```',
        **text_kwargs,
    )

    # Skip __init__: the merge helper needs no wired-up service dependencies.
    service = ExtractionService.__new__(ExtractionService)
    merged = service._mergeJsonExtractionResponses([first, second])

    # Headings: one per part, input order retained.
    headings = merged["extracted_content"]["headings"]
    assert len(headings) == 2, f"Should have 2 headings, got {len(merged['extracted_content']['headings'])}"
    assert headings[0]["text"] == "Title 1", "First heading should be Title 1"
    assert headings[1]["text"] == "Subtitle 1", "Second heading should be Subtitle 1"

    # Lists: one per part, input order retained.
    lists = merged["extracted_content"]["lists"]
    assert len(lists) == 2, f"Should have 2 lists, got {len(merged['extracted_content']['lists'])}"
    assert lists[0]["items"] == ["Item 1"], "First list should have Item 1"
    assert lists[1]["items"] == ["Item 2"], "Second list should have Item 2"
    print("  [PASS]")


def test_handles_empty_content_parts():
    """Verify that a part with empty data does not disturb the merge.

    The first part carries text, the second has an empty ``data`` string.
    The merged text is expected to equal the first part's text exactly.
    """
    print("Test 9: Handling empty content parts...")

    # Attributes shared by both fixture parts.
    text_kwargs = {"typeGroup": "text", "mimeType": "text/plain"}
    populated = ContentPart(
        id="test1",
        label="test1",
        data='```json\n{"extracted_content": {"text": "Some text"}}\n```',
        **text_kwargs,
    )
    # Deliberately empty payload.
    blank = ContentPart(
        id="test2",
        label="test2",
        data="",
        **text_kwargs,
    )

    # Skip __init__: the merge helper needs no wired-up service dependencies.
    service = ExtractionService.__new__(ExtractionService)
    merged = service._mergeJsonExtractionResponses([populated, blank])

    # Only the populated part should contribute to the text.
    merged_text = merged["extracted_content"]["text"]
    assert merged_text == "Some text", "Should have text from part1"
    print("  [PASS]")


def test_merges_tables_with_different_headers():
    """Verify that tables with differing headers stay as separate tables.

    One part holds a ``Date``/``Amount`` table, the other a ``Name``/``Email``
    table. The merged result is expected to keep both tables, in input
    order, each with its single row.
    """
    print("Test 10: Keeping tables with different headers separate...")

    # Attributes shared by both fixture parts.
    table_kwargs = {"typeGroup": "table", "mimeType": "application/json"}
    dated = ContentPart(
        id="test1",
        label="test1",
        data='```json\n{"extracted_content": {"tables": [{"headers": ["Date", "Amount"], "rows": [["2024-01-01", "100"]]}]}}\n```',
        **table_kwargs,
    )
    contacts = ContentPart(
        id="test2",
        label="test2",
        data='```json\n{"extracted_content": {"tables": [{"headers": ["Name", "Email"], "rows": [["Alice", "alice@example.com"]]}]}}\n```',
        **table_kwargs,
    )

    # Skip __init__: the merge helper needs no wired-up service dependencies.
    service = ExtractionService.__new__(ExtractionService)
    merged = service._mergeJsonExtractionResponses([dated, contacts])

    # Differing headers must not be merged together.
    tables = merged["extracted_content"]["tables"]
    assert len(tables) == 2, f"Should have 2 separate tables, got {len(merged['extracted_content']['tables'])}"

    # Table from the first part.
    table1 = tables[0]
    assert table1["headers"] == ["Date", "Amount"], "First table should have Date/Amount headers"
    assert len(table1["rows"]) == 1, "First table should have 1 row"

    # Table from the second part.
    table2 = tables[1]
    assert table2["headers"] == ["Name", "Email"], "Second table should have Name/Email headers"
    assert len(table2["rows"]) == 1, "Second table should have 1 row"
    print("  [PASS]")


def test_real_world_scenario():
    """Exercise the merge with three documents sharing one table layout.

    Documents 1 and 2 each contribute two transaction rows (``TXN001`` is in
    both); document 3 contributes two more rows via two ``---``-separated
    JSON blocks. The merged result is expected to be a single table with
    six rows, ``TXN001`` appearing twice.
    """
    print("Test 11: Real-world scenario (multiple documents, multiple JSON blocks)...")

    # Attributes shared by all three fixture parts.
    table_kwargs = {"typeGroup": "table", "mimeType": "application/json"}
    doc1 = ContentPart(
        id="doc1",
        label="doc1",
        data='```json\n{"extracted_content": {"tables": [{"headers": ["Transaction ID", "Date", "Amount"], "rows": [["TXN001", "2024-01-01", "100.00"], ["TXN002", "2024-01-02", "200.00"]]}]}}\n```',
        **table_kwargs,
    )
    doc2 = ContentPart(
        id="doc2",
        label="doc2",
        data='```json\n{"extracted_content": {"tables": [{"headers": ["Transaction ID", "Date", "Amount"], "rows": [["TXN003", "2024-01-03", "300.00"], ["TXN001", "2024-01-01", "100.00"]]}]}}\n```',
        **table_kwargs,
    )
    # Third document: two JSON blocks in one part, split by a '---' line.
    doc3 = ContentPart(
        id="doc3",
        label="doc3",
        data='```json\n{"extracted_content": {"tables": [{"headers": ["Transaction ID", "Date", "Amount"], "rows": [["TXN004", "2024-01-04", "400.00"]]}]}}\n```\n---\n```json\n{"extracted_content": {"tables": [{"headers": ["Transaction ID", "Date", "Amount"], "rows": [["TXN005", "2024-01-05", "500.00"]]}]}}\n```',
        **table_kwargs,
    )

    # Skip __init__: the merge helper needs no wired-up service dependencies.
    service = ExtractionService.__new__(ExtractionService)
    merged = service._mergeJsonExtractionResponses([doc1, doc2, doc3])

    # All blocks share headers, so everything lands in one table.
    tables = merged["extracted_content"]["tables"]
    assert len(tables) == 1, f"Should have one merged table, got {len(merged['extracted_content']['tables'])}"
    table = tables[0]
    assert table["headers"] == ["Transaction ID", "Date", "Amount"], "Headers should match"

    # 2 + 2 + 1 + 1 input rows; the test expects no deduplication, so 6 total.
    assert len(table["rows"]) == 6, f"Should have 6 rows (duplicates preserved), got {len(table['rows'])}"

    # First column of every row is the transaction id.
    transaction_ids = [row[0] for row in table["rows"]]
    expected_ids = ("TXN001", "TXN002", "TXN003", "TXN004", "TXN005")
    for txn in expected_ids:
        assert txn in transaction_ids, f"{txn} should be present"

    # TXN001 comes from both doc1 and doc2 and must therefore show up twice.
    assert transaction_ids.count("TXN001") == 2, "TXN001 must appear twice (duplicates preserved)"

    print("  [PASS]")


def main():
    """Run every test function in order and print a pass/fail summary.

    A test that raises ``AssertionError`` is reported as ``[FAIL]``; any
    other exception is reported as ``[ERROR]`` together with its traceback.
    Both count as failures, and the remaining tests still run.

    Returns:
        0 when no test failed, otherwise 1 (suitable as a process exit code).
    """
    import traceback

    banner = "=" * 60
    print(banner)
    print("Testing JSON Extraction Response Detection and Merging")
    print(banner)
    print()

    tests = (
        test_detects_json_with_code_fences,
        test_detects_json_without_code_fences,
        test_rejects_non_extraction_json,
        test_rejects_non_json_content,
        test_merges_tables_with_same_headers,
        test_merges_multiple_json_blocks_separated_by_dash,
        test_merges_text_content,
        test_merges_headings_and_lists,
        test_handles_empty_content_parts,
        test_merges_tables_with_different_headers,
        test_real_world_scenario,
    )

    passed = 0
    failed = 0

    for test in tests:
        try:
            test()
        except AssertionError as e:
            # A failed expectation: the message is enough, no traceback.
            print(f"  [FAIL] {e}")
            failed += 1
        except Exception as e:
            # Anything else is unexpected; show where it came from.
            print(f"  [ERROR] {e}")
            traceback.print_exc()
            failed += 1
        else:
            passed += 1
        # Blank line between the outputs of consecutive tests.
        print()

    print(banner)
    print(f"Results: {passed} passed, {failed} failed")
    print(banner)

    return 1 if failed else 0


if __name__ == "__main__":
    # Script entry point: exit with main()'s result as the process status.
    raise SystemExit(main())