gateway/tests/functional/test08_json_finalization.py
ValueOn AG c8b7517209 refactor: modules/services/ abgelöst durch serviceCenter + serviceHub
serviceCenter = DI-Container (Resolver, Registry, Context) für Service-Instanziierung
serviceHub = Consumer-facing Aggregation (DB-Interfaces, Runtime-State, lazy Service-Resolution via serviceCenter)

- modules/serviceHub/ erstellt: ServiceHub, PublicService, getInterface()
- 22 Consumer-Dateien migriert (routes, features, tests): Imports von modules.services auf serviceHub bzw. serviceCenter umgestellt
- resolver.py: legacy fallback auf altes services/ entfernt
- modules/services/ komplett gelöscht (83 Dateien inkl. dead code mainAiChat.py)
- pre-extraction: progress callback durch chunk-pipeline propagiert, operationType DATA_EXTRACT->DATA_ANALYSE für günstigeres Modell
2026-03-14 11:51:45 +01:00

596 lines
24 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Test JSON finalization process after accumulation is complete.
This test suite validates the finalization process that happens after receiving
the full accumulated JSON from the AI service. It tests:
1. Finalization with real-world accumulated JSON from debug files
2. Cleaning of markdown code fences that got embedded in JSON values
3. Finalization with complete, clean JSON
4. Building final result from sections (simulating _buildFinalResultFromSections)
5. End-to-end finalization process simulating the failure scenario
Key Findings:
- Row 373 in the prime numbers table had corruption: "349```json\n19" instead of "34919"
- This corruption can cause final result serialization to fail or produce invalid JSON
- The cleanCorruptionFromSections() helper function successfully cleans this corruption
- After cleaning, the final result can be serialized and parsed correctly
Note: The cleanCorruptionFromSections() function should be integrated into the
actual codebase (e.g., in mainServiceAi.py before building final result) to
prevent corruption from causing final result production to fail.
"""
import json
import sys
import os
# Ensure the gateway directory (two levels up from tests/functional/) is importable.
_gateway_path = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)
)
if _gateway_path not in sys.path:
    sys.path.insert(0, _gateway_path)
# Import after path setup
from modules.serviceCenter.services.serviceAi.subJsonResponseHandling import JsonResponseHandler # type: ignore
from modules.shared.jsonUtils import extractSectionsFromDocument, extractJsonString, repairBrokenJson # type: ignore
def cleanCorruptionFromSections(sections: list) -> list:
    """
    Return a copy of *sections* with corruption (e.g. markdown code fences)
    stripped from every string value, at any nesting depth.

    This simulates the cleanup that should happen before building the
    final result.
    """
    # Delegate the per-section work to the recursive cleaner.
    return [_cleanCorruptionRecursive(section) for section in sections]
def _cleanCorruptionRecursive(obj: any) -> any:
"""Recursively clean corruption from nested structures."""
if isinstance(obj, dict):
cleaned = {}
for key, value in obj.items():
cleaned[key] = _cleanCorruptionRecursive(value)
return cleaned
elif isinstance(obj, list):
cleaned = []
for item in obj:
cleaned.append(_cleanCorruptionRecursive(item))
return cleaned
elif isinstance(obj, str):
# Clean markdown code fences and other corruption
cleaned = obj.replace('```json', '').replace('```', '').replace('\n', '').strip()
# Try to reconstruct numbers if they were split by corruption
# E.g., "349```json\n19" -> "34919"
if cleaned and cleaned[0].isdigit():
# Remove any non-digit characters in the middle and reconstruct
parts = cleaned.split()
if len(parts) > 1:
# Try to merge consecutive number parts
merged = ''.join(parts)
if merged.isdigit():
cleaned = merged
return cleaned
else:
return obj
def testFinalizationWithRealWorldAccumulatedJson():
    """Test finalization process with real-world accumulated JSON from debug files.

    Walks the full pipeline on a captured debug file: extract -> clean
    encoding -> parse (with repair fallback) -> completeness check ->
    finalize -> extract sections, then verifies the prime-numbers table
    section (400 rows, row 373 corruption check). Skips silently if the
    debug file is not present in this environment.
    """
    print("\n" + "="*60)
    print("TEST: Finalization with Real-World Accumulated JSON")
    print("="*60)
    # Load the accumulated JSON from debug file (path is relative to this test file).
    debugFile = os.path.join(
        os.path.dirname(__file__),
        "..", "..", "..", "local", "debug", "prompts",
        "20251130-205629-015-document_generation_accumulated_json_iteration_2.json"
    )
    if not os.path.exists(debugFile):
        print(f"❌ Debug file not found: {debugFile}")
        print(" Skipping test - file may not exist in this environment")
        return
    # Read the JSON file
    with open(debugFile, 'r', encoding='utf-8') as f:
        jsonContent = f.read()
    print(f"Loaded JSON file: {len(jsonContent)} chars")
    # Step 1: Extract JSON string (handles code fences, normalization)
    extractedJson = extractJsonString(jsonContent)
    print(f"After extractJsonString: {len(extractedJson)} chars")
    # Step 2: Clean encoding issues
    cleanedJson = JsonResponseHandler.cleanEncodingIssues(extractedJson)
    print(f"After cleanEncodingIssues: {len(cleanedJson)} chars")
    # Step 3: Try to parse
    try:
        parsedJson = json.loads(cleanedJson)
        print("✅ JSON parsing succeeded")
    except json.JSONDecodeError as e:
        print(f"❌ JSON parsing failed: {e}")
        print(" Attempting repair...")
        # Try to repair. NOTE(review): repairBrokenJson appears to return a
        # parsed object (not a string) - confirm against its implementation.
        repairedJson = repairBrokenJson(cleanedJson)
        if repairedJson:
            parsedJson = repairedJson
            print("✅ JSON repair succeeded")
        else:
            print("❌ JSON repair failed")
            # Find the problematic line from the decoder error for diagnostics.
            errorLine = getattr(e, 'lineno', None)
            if errorLine:
                lines = cleanedJson.split('\n')
                if errorLine <= len(lines):
                    print(f" Error at line {errorLine}: {lines[errorLine-1][:100]}")
            assert False, f"Failed to parse or repair JSON: {e}"
    # Step 4: Check completeness
    isComplete = JsonResponseHandler.isJsonComplete(parsedJson)
    print(f"JSON completeness check: {isComplete}")
    # Step 5: Finalize JSON
    finalizedJson = JsonResponseHandler.finalizeJson(parsedJson)
    print("✅ JSON finalized")
    # Step 6: Extract sections
    sections = extractSectionsFromDocument(finalizedJson)
    print(f"✅ Extracted {len(sections)} sections")
    # Step 7: Verify sections
    if sections:
        for i, section in enumerate(sections):
            sectionId = section.get('id', f'unknown_{i}')
            contentType = section.get('content_type', 'unknown')
            print(f" Section {i+1}: id={sectionId}, type={contentType}")
            # Check for the prime numbers table section
            if sectionId == 'section_prime_numbers_table':
                elements = section.get('elements', [])
                if isinstance(elements, list) and len(elements) > 0:
                    element = elements[0]
                    rows = element.get('rows', [])
                    print(f" Found {len(rows)} rows in prime numbers table")
                    # Check for corruption in rows (known issue with markdown code fences).
                    # NOTE(review): the '```json' test is subsumed by the '```' test.
                    corruptionFound = False
                    for rowIdx in range(min(373, len(rows))): # Check up to row 373
                        row = rows[rowIdx]
                        rowStr = json.dumps(row)
                        if '```json' in rowStr or '```' in rowStr:
                            corruptionFound = True
                            print(f" ⚠️ WARNING: Row {rowIdx+1} contains markdown code fences")
                            # Show the problematic value(s) in this row.
                            for valIdx, val in enumerate(row):
                                valStr = str(val)
                                if '```' in valStr:
                                    print(f" Value {valIdx}: {valStr[:80]}")
                                    # Try to clean it (diagnostic only - not written back).
                                    cleanedVal = valStr.replace('```json', '').replace('```', '').replace('\n', '').strip()
                                    print(f" Cleaned: {cleanedVal}")
                            # Only report the first corrupted row.
                            break
                    if not corruptionFound:
                        print(f" ✅ No markdown code fence corruption detected in first 373 rows")
                    # Verify row 373 specifically (the historically corrupted row).
                    if len(rows) >= 373:
                        row373 = rows[372] # Index 372 = row 373
                        print(f" Row 373: {row373[:5]}... (first 5 values)")
                    # Verify we have 400 rows
                    assert len(rows) == 400, f"Expected 400 rows, got {len(rows)}"
                    print(f" ✅ All 400 rows present")
                    # Verify last row is row 400 (row index is stored as a string).
                    lastRow = rows[-1]
                    assert lastRow[0] == "400", f"Expected last row index to be 400, got {lastRow[0]}"
                    print(f" ✅ Last row is row 400")
    else:
        print("❌ No sections extracted")
        assert False, "Should have extracted at least one section"
    # Step 8: Verify final JSON structure
    assert 'documents' in finalizedJson, "Finalized JSON should have 'documents' key"
    assert isinstance(finalizedJson['documents'], list), "documents should be a list"
    assert len(finalizedJson['documents']) > 0, "documents list should not be empty"
    print("✅ Final JSON structure is valid")
    print("\n✅ Finalization test completed successfully")
def testCleaningMarkdownCodeFences():
    """Test cleaning of markdown code fences that got embedded in JSON values.

    Builds a document whose row contains the real-world corruption
    ("349```json\\n19" instead of "34919"), runs it through
    cleanEncodingIssues, and verifies the value can be manually cleaned
    back to "34919".

    Fix: the value-cleaning loop had a try/except whose ``if``/``else``
    branches were byte-identical and a bare ``except:`` that could never
    fire meaningfully (``str.isdigit`` does not raise) - collapsed to a
    single append.
    """
    print("\n" + "="*60)
    print("TEST: Cleaning Markdown Code Fences from JSON")
    print("="*60)
    # Simulate the corruption found in the real-world JSON
    # Row 373 had: "349```json\n19" instead of "34919"
    corruptedJson = {
        "documents": [{
            "sections": [{
                "id": "section_test",
                "content_type": "table",
                "elements": [{
                    "rows": [
                        ["373", "34883", "34897", "34913", "34919", "349```json\n19", "34939"]
                    ]
                }]
            }]
        }]
    }
    jsonStr = json.dumps(corruptedJson, ensure_ascii=False)
    print(f"Original JSON string length: {len(jsonStr)} chars")
    # Test cleaning (encoding-level only; value-level corruption survives this).
    cleaned = JsonResponseHandler.cleanEncodingIssues(jsonStr)
    print(f"After cleanEncodingIssues: {len(cleaned)} chars")
    # Try to parse
    try:
        parsed = json.loads(cleaned)
        print("✅ Parsed successfully (but corruption may still be in values)")
        # Check if corruption is still present in values
        rows = parsed['documents'][0]['sections'][0]['elements'][0]['rows']
        row373 = rows[0]
        hasCorruption = any('```' in str(val) for val in row373)
        if hasCorruption:
            print("⚠️ Corruption still present in values (expected - cleanEncodingIssues only handles encoding)")
            print(f" Row 373: {row373}")
            # Manual cleaning of values: strip fences and newlines per value.
            cleanedRow373 = []
            for val in row373:
                cleanedVal = str(val).replace('```json', '').replace('```', '').replace('\n', '').strip()
                cleanedRow373.append(cleanedVal)
            print(f" Cleaned row 373: {cleanedRow373}")
            # Verify "34919" is reconstructed
            assert "34919" in cleanedRow373, "Should have reconstructed 34919"
            print("✅ Successfully reconstructed corrupted value")
        else:
            print("✅ No corruption found in values")
    except json.JSONDecodeError as e:
        print(f"❌ Parsing failed: {e}")
        assert False, f"Failed to parse cleaned JSON: {e}"
def testFinalizationWithCompleteJson():
    """Finalizing an already-complete, valid JSON document must be a no-op:
    completeness check passes, finalization returns the input unchanged,
    and section extraction yields exactly the one defined section."""
    banner = "="*60
    print("\n" + banner)
    print("TEST: Finalization with Complete JSON")
    print(banner)
    # Build a minimal but fully-formed document structure.
    tableElement = {
        "headers": ["Col1", "Col2", "Col3"],
        "rows": [
            ["1", "2", "3"],
            ["4", "5", "6"]
        ]
    }
    completeJson = {
        "metadata": {
            "split_strategy": "single_document",
            "source_documents": [],
            "extraction_method": "ai_generation"
        },
        "documents": [{
            "id": "doc_1",
            "title": "Test Document",
            "sections": [{
                "id": "section_test",
                "content_type": "table",
                "elements": [tableElement]
            }]
        }]
    }
    # Round-trip through a JSON string, as the real pipeline would.
    parsedJson = json.loads(json.dumps(completeJson, ensure_ascii=False))
    # A complete document must pass the completeness check.
    isComplete = JsonResponseHandler.isJsonComplete(parsedJson)
    assert isComplete, "Complete JSON should pass completeness check"
    print("✅ Completeness check passed")
    # Finalization must leave a complete document untouched.
    finalizedJson = JsonResponseHandler.finalizeJson(parsedJson)
    assert finalizedJson == parsedJson, "Finalized JSON should be same as input for complete JSON"
    print("✅ Finalization completed")
    # Section extraction must yield exactly our one section.
    sections = extractSectionsFromDocument(finalizedJson)
    assert len(sections) == 1, f"Expected 1 section, got {len(sections)}"
    assert sections[0]['id'] == 'section_test', "Section ID should match"
    print("✅ Section extraction successful")
    print("✅ Complete JSON finalization test passed")
def testBuildingFinalResultFromSections():
    """Test building final result from sections (simulating _buildFinalResultFromSections).

    Assembles a result document around sections containing the known row-373
    corruption, serializes it, detects the corruption, cleans it with
    cleanCorruptionFromSections(), and verifies the cleaned result.

    Fix: the serialization guard caught ``json.JSONEncodeError``, which does
    not exist in the stdlib ``json`` module - merely evaluating that except
    clause would raise ``AttributeError`` and mask the real failure.
    ``json.dumps`` raises ``TypeError`` for unserializable objects, so that
    is what we catch now.
    """
    print("\n" + "="*60)
    print("TEST: Building Final Result from Sections")
    print("="*60)
    # Create sections (as would be extracted from accumulated JSON)
    sections = [{
        "id": "section_prime_numbers_table",
        "content_type": "table",
        "elements": [{
            "headers": ["Index", "Prime 1", "Prime 2", "Prime 3"],
            "rows": [
                ["1", "2", "3", "5"],
                ["2", "7", "11", "13"],
                # Simulate corruption in row 373
                ["373", "34883", "34897", "34913", "34919", "349```json\n19", "34939"]
            ]
        }]
    }]
    # Build final result structure (simulating _buildFinalResultFromSections)
    documentMetadata = {
        "title": "Prime Numbers Table",
        "filename": "prime_numbers_table.json"
    }
    title = documentMetadata.get("title", "Generated Document")
    filename = documentMetadata.get("filename", "document.json")
    documents = [{
        "id": "doc_1",
        "title": title,
        "filename": filename,
        "sections": sections
    }]
    result = {
        "metadata": {
            "split_strategy": "single_document",
            "source_documents": [],
            "extraction_method": "ai_generation"
        },
        "documents": documents
    }
    # Try to serialize to JSON string
    try:
        finalResultStr = json.dumps(result, indent=2, ensure_ascii=False)
        print(f"✅ Final result JSON string created: {len(finalResultStr)} chars")
        # Verify it can be parsed back
        parsedBack = json.loads(finalResultStr)
        assert parsedBack['documents'][0]['title'] == title
        assert len(parsedBack['documents'][0]['sections']) == 1
        print("✅ Final result can be parsed back successfully")
        # Check if corruption is still present
        rows = parsedBack['documents'][0]['sections'][0]['elements'][0]['rows']
        row373 = rows[2] # Third row (index 2)
        hasCorruption = any('```' in str(val) for val in row373)
        if hasCorruption:
            print("⚠️ Corruption still present in final result (expected)")
            print(f" Row 373: {row373}")
            # Clean the corruption using helper function
            cleanedSections = cleanCorruptionFromSections(sections)
            # Rebuild final result with cleaned sections
            documents[0]['sections'] = cleanedSections
            result['documents'] = documents
            cleanedFinalResultStr = json.dumps(result, indent=2, ensure_ascii=False)
            # Verify cleaned result
            cleanedParsed = json.loads(cleanedFinalResultStr)
            cleanedRows = cleanedParsed['documents'][0]['sections'][0]['elements'][0]['rows']
            cleanedRow373 = cleanedRows[2]
            assert not any('```' in str(val) for val in cleanedRow373), "Cleaned row should not have corruption"
            assert "34919" in cleanedRow373, "Should have reconstructed 34919"
            print("✅ Corruption cleaned successfully")
            print(f" Cleaned row 373: {cleanedRow373}")
        else:
            print("✅ No corruption found in final result")
    except TypeError as e:
        # json.dumps raises TypeError for unserializable objects.
        print(f"❌ Failed to serialize final result: {e}")
        assert False, f"Failed to serialize final result: {e}"
    except json.JSONDecodeError as e:
        print(f"❌ Failed to parse final result back: {e}")
        assert False, f"Failed to parse final result back: {e}"
    print("✅ Final result building test completed")
def testEndToEndFinalizationWithCorruption():
    """Test end-to-end finalization process simulating the exact failure scenario.

    Loads the real corrupted debug file, runs extract -> clean -> parse
    (with repair fallback) -> section extraction -> structure completion ->
    corruption cleanup -> final-result build, and verifies the serialized
    result is parseable and corruption-free. Skips if the debug file is
    absent in this environment.

    Fix: the serialization guard caught ``json.JSONEncodeError``, which does
    not exist in the stdlib ``json`` module (evaluating the clause would
    raise ``AttributeError``); ``json.dumps`` raises ``TypeError``, so that
    is caught instead.
    """
    print("\n" + "="*60)
    print("TEST: End-to-End Finalization with Corruption (Failure Scenario)")
    print("="*60)
    # Load the real accumulated JSON (with corruption)
    debugFile = os.path.join(
        os.path.dirname(__file__),
        "..", "..", "..", "local", "debug", "prompts",
        "20251130-205629-015-document_generation_accumulated_json_iteration_2.json"
    )
    if not os.path.exists(debugFile):
        print(f"⚠️ Debug file not found: {debugFile}")
        print(" Skipping test - file may not exist in this environment")
        return
    # Step 1: Load and parse accumulated JSON
    with open(debugFile, 'r', encoding='utf-8') as f:
        jsonContent = f.read()
    extractedJson = extractJsonString(jsonContent)
    cleanedJson = JsonResponseHandler.cleanEncodingIssues(extractedJson)
    try:
        parsedJson = json.loads(cleanedJson)
    except json.JSONDecodeError as e:
        # Fall back to repair; fail the test if that also produces nothing.
        repairedJson = repairBrokenJson(cleanedJson)
        if not repairedJson:
            print(f"❌ Failed to parse or repair JSON: {e}")
            assert False, f"Failed to parse or repair JSON: {e}"
        parsedJson = repairedJson
    # Step 2: Extract sections (as done in mainServiceAi)
    sections = extractSectionsFromDocument(parsedJson)
    print(f"✅ Extracted {len(sections)} sections")
    # Step 3: Complete incomplete structures (as done in mainServiceAi)
    completedSections = JsonResponseHandler.completeIncompleteStructures(sections)
    print(f"✅ Completed structures for {len(completedSections)} sections")
    # Step 4: Check for corruption BEFORE building final result
    corruptionFound = False
    for section in completedSections:
        sectionStr = json.dumps(section)
        if '```json' in sectionStr or '```' in sectionStr:
            corruptionFound = True
            print(f"⚠️ Corruption detected in section {section.get('id', 'unknown')}")
            break
    # Step 5: Clean corruption if found (this should be done before building final result)
    if corruptionFound:
        print(" Cleaning corruption from sections...")
        cleanedSections = cleanCorruptionFromSections(completedSections)
        print("✅ Corruption cleaned from sections")
    else:
        cleanedSections = completedSections
        print("✅ No corruption found")
    # Step 6: Build final result (simulating _buildFinalResultFromSections)
    documentMetadata = {
        "title": "Prime Numbers Table",
        "filename": "prime_numbers_table.json"
    }
    title = documentMetadata.get("title", "Generated Document")
    filename = documentMetadata.get("filename", "document.json")
    documents = [{
        "id": "doc_1",
        "title": title,
        "filename": filename,
        "sections": cleanedSections
    }]
    result = {
        "metadata": {
            "split_strategy": "single_document",
            "source_documents": [],
            "extraction_method": "ai_generation"
        },
        "documents": documents
    }
    # Step 7: Serialize final result (this is where it might have failed)
    try:
        finalResultStr = json.dumps(result, indent=2, ensure_ascii=False)
        print(f"✅ Final result serialized successfully: {len(finalResultStr)} chars")
        # Step 8: Verify it can be parsed back
        parsedBack = json.loads(finalResultStr)
        assert parsedBack['documents'][0]['title'] == title
        assert len(parsedBack['documents'][0]['sections']) == len(cleanedSections)
        print("✅ Final result can be parsed back successfully")
        # Step 9: Verify no corruption in final result
        finalResultStr_check = json.dumps(parsedBack)
        if '```json' in finalResultStr_check or '```' in finalResultStr_check:
            print("⚠️ WARNING: Corruption still present in final result")
        else:
            print("✅ Final result is clean (no corruption)")
        # Step 10: Verify section content
        if parsedBack['documents'][0]['sections']:
            section = parsedBack['documents'][0]['sections'][0]
            if section.get('id') == 'section_prime_numbers_table':
                elements = section.get('elements', [])
                if elements and 'rows' in elements[0]:
                    rows = elements[0]['rows']
                    print(f"✅ Final result contains {len(rows)} rows")
                    assert len(rows) == 400, f"Expected 400 rows, got {len(rows)}"
                    # Verify row 373 is clean
                    if len(rows) >= 373:
                        row373 = rows[372]
                        row373Str = json.dumps(row373)
                        if '```' in row373Str:
                            print(f"⚠️ WARNING: Row 373 still has corruption: {row373Str[:100]}")
                        else:
                            print(f"✅ Row 373 is clean: {row373[:5]}...")
        print("\n✅ End-to-end finalization test completed successfully")
        print(f" Final result ready to write to debug file ({len(finalResultStr)} chars)")
    except TypeError as e:
        # json.dumps raises TypeError for unserializable objects.
        print(f"❌ Failed to serialize final result: {e}")
        print(" This is likely why the final_result.txt file was empty")
        assert False, f"Failed to serialize final result: {e}"
    except Exception as e:
        print(f"❌ Unexpected error: {e}")
        import traceback
        traceback.print_exc()
        assert False, f"Unexpected error: {e}"
if __name__ == "__main__":
    banner = "="*60
    print("\n" + banner)
    print("JSON FINALIZATION TEST SUITE")
    print(banner)
    print("Testing finalization process after accumulation is complete")
    print(banner)
    try:
        # Run every finalization test in order: real-world accumulated JSON,
        # fence cleaning, complete-JSON no-op, final-result build, and the
        # end-to-end corruption failure scenario.
        for testFunc in (
            testFinalizationWithRealWorldAccumulatedJson,
            testCleaningMarkdownCodeFences,
            testFinalizationWithCompleteJson,
            testBuildingFinalResultFromSections,
            testEndToEndFinalizationWithCorruption,
        ):
            testFunc()
        print("\n" + banner)
        print("✅ ALL TESTS COMPLETED")
        print(banner)
    except AssertionError as e:
        print(f"\n❌ TEST FAILED: {e}")
        sys.exit(1)
    except Exception as e:
        print(f"\n❌ ERROR: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)