gateway/tests/functional/test08_json_finalization.py
ValueOn AG c8b7517209 refactor: modules/services/ abgelöst durch serviceCenter + serviceHub
serviceCenter = DI-Container (Resolver, Registry, Context) für Service-Instanziierung
serviceHub = Consumer-facing Aggregation (DB-Interfaces, Runtime-State, lazy Service-Resolution via serviceCenter)

- modules/serviceHub/ erstellt: ServiceHub, PublicService, getInterface()
- 22 Consumer-Dateien migriert (routes, features, tests): Imports von modules.services auf serviceHub bzw. serviceCenter umgestellt
- resolver.py: legacy fallback auf altes services/ entfernt
- modules/services/ komplett gelöscht (83 Dateien inkl. dead code mainAiChat.py)
- pre-extraction: progress callback durch chunk-pipeline propagiert, operationType DATA_EXTRACT->DATA_ANALYSE für günstigeres Modell
2026-03-14 11:51:45 +01:00

596 lines
24 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Test JSON finalization process after accumulation is complete.
This test suite validates the finalization process that happens after receiving
the full accumulated JSON from the AI service. It tests:
1. Finalization with real-world accumulated JSON from debug files
2. Cleaning of markdown code fences that got embedded in JSON values
3. Finalization with complete, clean JSON
4. Building final result from sections (simulating _buildFinalResultFromSections)
5. End-to-end finalization process simulating the failure scenario
Key Findings:
- Row 373 in the prime numbers table had corruption: "349```json\n19" instead of "34919"
- This corruption can cause final result serialization to fail or produce invalid JSON
- The cleanCorruptionFromSections() helper function successfully cleans this corruption
- After cleaning, the final result can be serialized and parsed correctly
Note: The cleanCorruptionFromSections() function should be integrated into the
actual codebase (e.g., in mainServiceAi.py before building final result) to
prevent corruption from causing final result production to fail.
"""
import json
import sys
import os
# Ensure the gateway directory (two levels up from tests/functional/) is importable.
_gateway_path = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)
)
if _gateway_path not in sys.path:
    sys.path.insert(0, _gateway_path)
# Import after path setup
from modules.serviceCenter.services.serviceAi.subJsonResponseHandling import JsonResponseHandler # type: ignore
from modules.shared.jsonUtils import extractSectionsFromDocument, extractJsonString, repairBrokenJson # type: ignore
def cleanCorruptionFromSections(sections: list) -> list:
    """
    Return a copy of *sections* with corruption (e.g. markdown code fences)
    stripped from every string value, at any nesting depth.

    This simulates the cleanup that should happen before building the
    final result.
    """
    # Delegate the per-section work to the recursive cleaner.
    return [_cleanCorruptionRecursive(section) for section in sections]
def _cleanCorruptionRecursive(obj: any) -> any:
"""Recursively clean corruption from nested structures."""
if isinstance(obj, dict):
cleaned = {}
for key, value in obj.items():
cleaned[key] = _cleanCorruptionRecursive(value)
return cleaned
elif isinstance(obj, list):
cleaned = []
for item in obj:
cleaned.append(_cleanCorruptionRecursive(item))
return cleaned
elif isinstance(obj, str):
# Clean markdown code fences and other corruption
cleaned = obj.replace('```json', '').replace('```', '').replace('\n', '').strip()
# Try to reconstruct numbers if they were split by corruption
# E.g., "349```json\n19" -> "34919"
if cleaned and cleaned[0].isdigit():
# Remove any non-digit characters in the middle and reconstruct
parts = cleaned.split()
if len(parts) > 1:
# Try to merge consecutive number parts
merged = ''.join(parts)
if merged.isdigit():
cleaned = merged
return cleaned
else:
return obj
def testFinalizationWithRealWorldAccumulatedJson():
    """Test finalization process with real-world accumulated JSON from debug files.

    Walks the full pipeline on a captured debug file: extract -> clean
    encoding -> parse (with repair fallback) -> completeness check ->
    finalize -> extract sections, then verifies the prime-numbers table
    section (400 rows, row 373 corruption check). Skips silently if the
    debug file is not present in this environment.
    """
    print("\n" + "="*60)
    print("TEST: Finalization with Real-World Accumulated JSON")
    print("="*60)
    # Load the accumulated JSON from debug file (path is relative to this test file).
    debugFile = os.path.join(
        os.path.dirname(__file__),
        "..", "..", "..", "local", "debug", "prompts",
        "20251130-205629-015-document_generation_accumulated_json_iteration_2.json"
    )
    if not os.path.exists(debugFile):
        print(f"❌ Debug file not found: {debugFile}")
        print(" Skipping test - file may not exist in this environment")
        return
    # Read the JSON file
    with open(debugFile, 'r', encoding='utf-8') as f:
        jsonContent = f.read()
    print(f"Loaded JSON file: {len(jsonContent)} chars")
    # Step 1: Extract JSON string (handles code fences, normalization)
    extractedJson = extractJsonString(jsonContent)
    print(f"After extractJsonString: {len(extractedJson)} chars")
    # Step 2: Clean encoding issues
    cleanedJson = JsonResponseHandler.cleanEncodingIssues(extractedJson)
    print(f"After cleanEncodingIssues: {len(cleanedJson)} chars")
    # Step 3: Try to parse
    try:
        parsedJson = json.loads(cleanedJson)
        print("✅ JSON parsing succeeded")
    except json.JSONDecodeError as e:
        print(f"❌ JSON parsing failed: {e}")
        print(" Attempting repair...")
        # Try to repair. NOTE(review): repairBrokenJson appears to return a
        # parsed object (not a string) - confirm against its implementation.
        repairedJson = repairBrokenJson(cleanedJson)
        if repairedJson:
            parsedJson = repairedJson
            print("✅ JSON repair succeeded")
        else:
            print("❌ JSON repair failed")
            # Find the problematic line from the decoder error for diagnostics.
            errorLine = getattr(e, 'lineno', None)
            if errorLine:
                lines = cleanedJson.split('\n')
                if errorLine <= len(lines):
                    print(f" Error at line {errorLine}: {lines[errorLine-1][:100]}")
            assert False, f"Failed to parse or repair JSON: {e}"
    # Step 4: Check completeness
    isComplete = JsonResponseHandler.isJsonComplete(parsedJson)
    print(f"JSON completeness check: {isComplete}")
    # Step 5: Finalize JSON
    finalizedJson = JsonResponseHandler.finalizeJson(parsedJson)
    print("✅ JSON finalized")
    # Step 6: Extract sections
    sections = extractSectionsFromDocument(finalizedJson)
    print(f"✅ Extracted {len(sections)} sections")
    # Step 7: Verify sections
    if sections:
        for i, section in enumerate(sections):
            sectionId = section.get('id', f'unknown_{i}')
            contentType = section.get('content_type', 'unknown')
            print(f" Section {i+1}: id={sectionId}, type={contentType}")
            # Check for the prime numbers table section
            if sectionId == 'section_prime_numbers_table':
                elements = section.get('elements', [])
                if isinstance(elements, list) and len(elements) > 0:
                    element = elements[0]
                    rows = element.get('rows', [])
                    print(f" Found {len(rows)} rows in prime numbers table")
                    # Check for corruption in rows (known issue with markdown code fences).
                    # NOTE(review): the '```json' test is subsumed by the '```' test.
                    corruptionFound = False
                    for rowIdx in range(min(373, len(rows))): # Check up to row 373
                        row = rows[rowIdx]
                        rowStr = json.dumps(row)
                        if '```json' in rowStr or '```' in rowStr:
                            corruptionFound = True
                            print(f" ⚠️ WARNING: Row {rowIdx+1} contains markdown code fences")
                            # Show the problematic value(s) in this row.
                            for valIdx, val in enumerate(row):
                                valStr = str(val)
                                if '```' in valStr:
                                    print(f" Value {valIdx}: {valStr[:80]}")
                                    # Try to clean it (diagnostic only - not written back).
                                    cleanedVal = valStr.replace('```json', '').replace('```', '').replace('\n', '').strip()
                                    print(f" Cleaned: {cleanedVal}")
                            # Only report the first corrupted row.
                            break
                    if not corruptionFound:
                        print(f" ✅ No markdown code fence corruption detected in first 373 rows")
                    # Verify row 373 specifically (the historically corrupted row).
                    if len(rows) >= 373:
                        row373 = rows[372] # Index 372 = row 373
                        print(f" Row 373: {row373[:5]}... (first 5 values)")
                    # Verify we have 400 rows
                    assert len(rows) == 400, f"Expected 400 rows, got {len(rows)}"
                    print(f" ✅ All 400 rows present")
                    # Verify last row is row 400 (row index is stored as a string).
                    lastRow = rows[-1]
                    assert lastRow[0] == "400", f"Expected last row index to be 400, got {lastRow[0]}"
                    print(f" ✅ Last row is row 400")
    else:
        print("❌ No sections extracted")
        assert False, "Should have extracted at least one section"
    # Step 8: Verify final JSON structure
    assert 'documents' in finalizedJson, "Finalized JSON should have 'documents' key"
    assert isinstance(finalizedJson['documents'], list), "documents should be a list"
    assert len(finalizedJson['documents']) > 0, "documents list should not be empty"
    print("✅ Final JSON structure is valid")
    print("\n✅ Finalization test completed successfully")
def testCleaningMarkdownCodeFences():
    """Test cleaning of markdown code fences that got embedded in JSON values.

    Builds a document whose row contains the real-world corruption
    ("349```json\\n19" instead of "34919"), runs it through
    cleanEncodingIssues, and verifies the value can be manually cleaned
    back to "34919".

    Fix: the value-cleaning loop had a try/except whose ``if``/``else``
    branches were byte-identical and a bare ``except:`` that could never
    fire meaningfully (``str.isdigit`` does not raise) - collapsed to a
    single append.
    """
    print("\n" + "="*60)
    print("TEST: Cleaning Markdown Code Fences from JSON")
    print("="*60)
    # Simulate the corruption found in the real-world JSON
    # Row 373 had: "349```json\n19" instead of "34919"
    corruptedJson = {
        "documents": [{
            "sections": [{
                "id": "section_test",
                "content_type": "table",
                "elements": [{
                    "rows": [
                        ["373", "34883", "34897", "34913", "34919", "349```json\n19", "34939"]
                    ]
                }]
            }]
        }]
    }
    jsonStr = json.dumps(corruptedJson, ensure_ascii=False)
    print(f"Original JSON string length: {len(jsonStr)} chars")
    # Test cleaning (encoding-level only; value-level corruption survives this).
    cleaned = JsonResponseHandler.cleanEncodingIssues(jsonStr)
    print(f"After cleanEncodingIssues: {len(cleaned)} chars")
    # Try to parse
    try:
        parsed = json.loads(cleaned)
        print("✅ Parsed successfully (but corruption may still be in values)")
        # Check if corruption is still present in values
        rows = parsed['documents'][0]['sections'][0]['elements'][0]['rows']
        row373 = rows[0]
        hasCorruption = any('```' in str(val) for val in row373)
        if hasCorruption:
            print("⚠️ Corruption still present in values (expected - cleanEncodingIssues only handles encoding)")
            print(f" Row 373: {row373}")
            # Manual cleaning of values: strip fences and newlines per value.
            cleanedRow373 = []
            for val in row373:
                cleanedVal = str(val).replace('```json', '').replace('```', '').replace('\n', '').strip()
                cleanedRow373.append(cleanedVal)
            print(f" Cleaned row 373: {cleanedRow373}")
            # Verify "34919" is reconstructed
            assert "34919" in cleanedRow373, "Should have reconstructed 34919"
            print("✅ Successfully reconstructed corrupted value")
        else:
            print("✅ No corruption found in values")
    except json.JSONDecodeError as e:
        print(f"❌ Parsing failed: {e}")
        assert False, f"Failed to parse cleaned JSON: {e}"
def testFinalizationWithCompleteJson():
    """Finalizing an already-complete, valid JSON document must be a no-op:
    completeness check passes, finalization returns the input unchanged,
    and section extraction yields exactly the one defined section."""
    banner = "="*60
    print("\n" + banner)
    print("TEST: Finalization with Complete JSON")
    print(banner)
    # Build a minimal but fully-formed document structure.
    tableElement = {
        "headers": ["Col1", "Col2", "Col3"],
        "rows": [
            ["1", "2", "3"],
            ["4", "5", "6"]
        ]
    }
    completeJson = {
        "metadata": {
            "split_strategy": "single_document",
            "source_documents": [],
            "extraction_method": "ai_generation"
        },
        "documents": [{
            "id": "doc_1",
            "title": "Test Document",
            "sections": [{
                "id": "section_test",
                "content_type": "table",
                "elements": [tableElement]
            }]
        }]
    }
    # Round-trip through a JSON string, as the real pipeline would.
    parsedJson = json.loads(json.dumps(completeJson, ensure_ascii=False))
    # A complete document must pass the completeness check.
    isComplete = JsonResponseHandler.isJsonComplete(parsedJson)
    assert isComplete, "Complete JSON should pass completeness check"
    print("✅ Completeness check passed")
    # Finalization must leave a complete document untouched.
    finalizedJson = JsonResponseHandler.finalizeJson(parsedJson)
    assert finalizedJson == parsedJson, "Finalized JSON should be same as input for complete JSON"
    print("✅ Finalization completed")
    # Section extraction must yield exactly our one section.
    sections = extractSectionsFromDocument(finalizedJson)
    assert len(sections) == 1, f"Expected 1 section, got {len(sections)}"
    assert sections[0]['id'] == 'section_test', "Section ID should match"
    print("✅ Section extraction successful")
    print("✅ Complete JSON finalization test passed")
def testBuildingFinalResultFromSections():
    """Test building final result from sections (simulating _buildFinalResultFromSections).

    Assembles a result document around sections containing the known row-373
    corruption, serializes it, detects the corruption, cleans it with
    cleanCorruptionFromSections(), and verifies the cleaned result.

    Fix: the serialization guard caught ``json.JSONEncodeError``, which does
    not exist in the stdlib ``json`` module - merely evaluating that except
    clause would raise ``AttributeError`` and mask the real failure.
    ``json.dumps`` raises ``TypeError`` for unserializable objects, so that
    is what we catch now.
    """
    print("\n" + "="*60)
    print("TEST: Building Final Result from Sections")
    print("="*60)
    # Create sections (as would be extracted from accumulated JSON)
    sections = [{
        "id": "section_prime_numbers_table",
        "content_type": "table",
        "elements": [{
            "headers": ["Index", "Prime 1", "Prime 2", "Prime 3"],
            "rows": [
                ["1", "2", "3", "5"],
                ["2", "7", "11", "13"],
                # Simulate corruption in row 373
                ["373", "34883", "34897", "34913", "34919", "349```json\n19", "34939"]
            ]
        }]
    }]
    # Build final result structure (simulating _buildFinalResultFromSections)
    documentMetadata = {
        "title": "Prime Numbers Table",
        "filename": "prime_numbers_table.json"
    }
    title = documentMetadata.get("title", "Generated Document")
    filename = documentMetadata.get("filename", "document.json")
    documents = [{
        "id": "doc_1",
        "title": title,
        "filename": filename,
        "sections": sections
    }]
    result = {
        "metadata": {
            "split_strategy": "single_document",
            "source_documents": [],
            "extraction_method": "ai_generation"
        },
        "documents": documents
    }
    # Try to serialize to JSON string
    try:
        finalResultStr = json.dumps(result, indent=2, ensure_ascii=False)
        print(f"✅ Final result JSON string created: {len(finalResultStr)} chars")
        # Verify it can be parsed back
        parsedBack = json.loads(finalResultStr)
        assert parsedBack['documents'][0]['title'] == title
        assert len(parsedBack['documents'][0]['sections']) == 1
        print("✅ Final result can be parsed back successfully")
        # Check if corruption is still present
        rows = parsedBack['documents'][0]['sections'][0]['elements'][0]['rows']
        row373 = rows[2] # Third row (index 2)
        hasCorruption = any('```' in str(val) for val in row373)
        if hasCorruption:
            print("⚠️ Corruption still present in final result (expected)")
            print(f" Row 373: {row373}")
            # Clean the corruption using helper function
            cleanedSections = cleanCorruptionFromSections(sections)
            # Rebuild final result with cleaned sections
            documents[0]['sections'] = cleanedSections
            result['documents'] = documents
            cleanedFinalResultStr = json.dumps(result, indent=2, ensure_ascii=False)
            # Verify cleaned result
            cleanedParsed = json.loads(cleanedFinalResultStr)
            cleanedRows = cleanedParsed['documents'][0]['sections'][0]['elements'][0]['rows']
            cleanedRow373 = cleanedRows[2]
            assert not any('```' in str(val) for val in cleanedRow373), "Cleaned row should not have corruption"
            assert "34919" in cleanedRow373, "Should have reconstructed 34919"
            print("✅ Corruption cleaned successfully")
            print(f" Cleaned row 373: {cleanedRow373}")
        else:
            print("✅ No corruption found in final result")
    except TypeError as e:
        # json.dumps raises TypeError for unserializable objects.
        print(f"❌ Failed to serialize final result: {e}")
        assert False, f"Failed to serialize final result: {e}"
    except json.JSONDecodeError as e:
        print(f"❌ Failed to parse final result back: {e}")
        assert False, f"Failed to parse final result back: {e}"
    print("✅ Final result building test completed")
def testEndToEndFinalizationWithCorruption():
    """Test end-to-end finalization process simulating the exact failure scenario.

    Loads the real corrupted debug file, runs extract -> clean -> parse
    (with repair fallback) -> section extraction -> structure completion ->
    corruption cleanup -> final-result build, and verifies the serialized
    result is parseable and corruption-free. Skips if the debug file is
    absent in this environment.

    Fix: the serialization guard caught ``json.JSONEncodeError``, which does
    not exist in the stdlib ``json`` module (evaluating the clause would
    raise ``AttributeError``); ``json.dumps`` raises ``TypeError``, so that
    is caught instead.
    """
    print("\n" + "="*60)
    print("TEST: End-to-End Finalization with Corruption (Failure Scenario)")
    print("="*60)
    # Load the real accumulated JSON (with corruption)
    debugFile = os.path.join(
        os.path.dirname(__file__),
        "..", "..", "..", "local", "debug", "prompts",
        "20251130-205629-015-document_generation_accumulated_json_iteration_2.json"
    )
    if not os.path.exists(debugFile):
        print(f"⚠️ Debug file not found: {debugFile}")
        print(" Skipping test - file may not exist in this environment")
        return
    # Step 1: Load and parse accumulated JSON
    with open(debugFile, 'r', encoding='utf-8') as f:
        jsonContent = f.read()
    extractedJson = extractJsonString(jsonContent)
    cleanedJson = JsonResponseHandler.cleanEncodingIssues(extractedJson)
    try:
        parsedJson = json.loads(cleanedJson)
    except json.JSONDecodeError as e:
        # Fall back to repair; fail the test if that also produces nothing.
        repairedJson = repairBrokenJson(cleanedJson)
        if not repairedJson:
            print(f"❌ Failed to parse or repair JSON: {e}")
            assert False, f"Failed to parse or repair JSON: {e}"
        parsedJson = repairedJson
    # Step 2: Extract sections (as done in mainServiceAi)
    sections = extractSectionsFromDocument(parsedJson)
    print(f"✅ Extracted {len(sections)} sections")
    # Step 3: Complete incomplete structures (as done in mainServiceAi)
    completedSections = JsonResponseHandler.completeIncompleteStructures(sections)
    print(f"✅ Completed structures for {len(completedSections)} sections")
    # Step 4: Check for corruption BEFORE building final result
    corruptionFound = False
    for section in completedSections:
        sectionStr = json.dumps(section)
        if '```json' in sectionStr or '```' in sectionStr:
            corruptionFound = True
            print(f"⚠️ Corruption detected in section {section.get('id', 'unknown')}")
            break
    # Step 5: Clean corruption if found (this should be done before building final result)
    if corruptionFound:
        print(" Cleaning corruption from sections...")
        cleanedSections = cleanCorruptionFromSections(completedSections)
        print("✅ Corruption cleaned from sections")
    else:
        cleanedSections = completedSections
        print("✅ No corruption found")
    # Step 6: Build final result (simulating _buildFinalResultFromSections)
    documentMetadata = {
        "title": "Prime Numbers Table",
        "filename": "prime_numbers_table.json"
    }
    title = documentMetadata.get("title", "Generated Document")
    filename = documentMetadata.get("filename", "document.json")
    documents = [{
        "id": "doc_1",
        "title": title,
        "filename": filename,
        "sections": cleanedSections
    }]
    result = {
        "metadata": {
            "split_strategy": "single_document",
            "source_documents": [],
            "extraction_method": "ai_generation"
        },
        "documents": documents
    }
    # Step 7: Serialize final result (this is where it might have failed)
    try:
        finalResultStr = json.dumps(result, indent=2, ensure_ascii=False)
        print(f"✅ Final result serialized successfully: {len(finalResultStr)} chars")
        # Step 8: Verify it can be parsed back
        parsedBack = json.loads(finalResultStr)
        assert parsedBack['documents'][0]['title'] == title
        assert len(parsedBack['documents'][0]['sections']) == len(cleanedSections)
        print("✅ Final result can be parsed back successfully")
        # Step 9: Verify no corruption in final result
        finalResultStr_check = json.dumps(parsedBack)
        if '```json' in finalResultStr_check or '```' in finalResultStr_check:
            print("⚠️ WARNING: Corruption still present in final result")
        else:
            print("✅ Final result is clean (no corruption)")
        # Step 10: Verify section content
        if parsedBack['documents'][0]['sections']:
            section = parsedBack['documents'][0]['sections'][0]
            if section.get('id') == 'section_prime_numbers_table':
                elements = section.get('elements', [])
                if elements and 'rows' in elements[0]:
                    rows = elements[0]['rows']
                    print(f"✅ Final result contains {len(rows)} rows")
                    assert len(rows) == 400, f"Expected 400 rows, got {len(rows)}"
                    # Verify row 373 is clean
                    if len(rows) >= 373:
                        row373 = rows[372]
                        row373Str = json.dumps(row373)
                        if '```' in row373Str:
                            print(f"⚠️ WARNING: Row 373 still has corruption: {row373Str[:100]}")
                        else:
                            print(f"✅ Row 373 is clean: {row373[:5]}...")
        print("\n✅ End-to-end finalization test completed successfully")
        print(f" Final result ready to write to debug file ({len(finalResultStr)} chars)")
    except TypeError as e:
        # json.dumps raises TypeError for unserializable objects.
        print(f"❌ Failed to serialize final result: {e}")
        print(" This is likely why the final_result.txt file was empty")
        assert False, f"Failed to serialize final result: {e}"
    except Exception as e:
        print(f"❌ Unexpected error: {e}")
        import traceback
        traceback.print_exc()
        assert False, f"Unexpected error: {e}"
if __name__ == "__main__":
    banner = "="*60
    print("\n" + banner)
    print("JSON FINALIZATION TEST SUITE")
    print(banner)
    print("Testing finalization process after accumulation is complete")
    print(banner)
    try:
        # Run every finalization test in order: real-world accumulated JSON,
        # fence cleaning, complete-JSON no-op, final-result build, and the
        # end-to-end corruption failure scenario.
        for testFunc in (
            testFinalizationWithRealWorldAccumulatedJson,
            testCleaningMarkdownCodeFences,
            testFinalizationWithCompleteJson,
            testBuildingFinalResultFromSections,
            testEndToEndFinalizationWithCorruption,
        ):
            testFunc()
        print("\n" + banner)
        print("✅ ALL TESTS COMPLETED")
        print(banner)
    except AssertionError as e:
        print(f"\n❌ TEST FAILED: {e}")
        sys.exit(1)
    except Exception as e:
        print(f"\n❌ ERROR: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)