# Copyright (c) 2025 Patrick Motsch
|
|
# All rights reserved.
|
|
"""
|
|
Test cases for JSON merger with different use cases and random cuts.
|
|
|
|
Tests the robustness of the JSON merger by:
|
|
1. Creating test JSON for different use cases
|
|
2. Cutting it randomly at various points
|
|
3. Running the merger for each piece
|
|
4. Checking completeness against original
|
|
"""
|
|
|
|
import json
|
|
import random
|
|
import logging
|
|
import sys
|
|
import os
|
|
from typing import Dict, Any, List, Tuple
|
|
|
|
# Add project root to Python path
# Find project root by looking for gateway/modules structure
currentFile = os.path.abspath(__file__)
currentDir = os.path.dirname(currentFile)

# Navigate up from: gateway/modules/services/serviceAi/test_json_merger.py
# To project root: D:\Athi\Local\Web\poweron
# Try different levels up
candidates = [
    os.path.abspath(os.path.join(currentDir, '../../../../')),  # From gateway/modules/services/serviceAi
    os.path.abspath(os.path.join(currentDir, '../../..')),  # Alternative
    os.path.abspath(os.path.join(currentDir, '../..')),  # Another alternative
]

# Pick the first candidate that actually contains a gateway/modules directory.
projectRoot = None
for candidate in candidates:
    gatewayModulesPath = os.path.join(candidate, 'gateway', 'modules')
    if os.path.exists(gatewayModulesPath):
        projectRoot = candidate
        break

# If still not found, try to find by looking for gateway directory
# (walk upward from this file, at most 10 levels).
if projectRoot is None:
    searchDir = currentDir
    for _ in range(10):  # Max 10 levels up
        gatewayPath = os.path.join(searchDir, 'gateway')
        if os.path.exists(gatewayPath) and os.path.exists(os.path.join(gatewayPath, 'modules')):
            projectRoot = searchDir
            break
        parent = os.path.dirname(searchDir)
        if parent == searchDir:  # Reached root
            break
        searchDir = parent

# Fail fast with a diagnostic path if discovery did not succeed.
if projectRoot is None:
    raise RuntimeError(f"Could not find project root. Current file: {currentFile}")

# Add gateway directory to Python path (not project root)
gatewayPath = os.path.join(projectRoot, 'gateway')
if gatewayPath not in sys.path:
    sys.path.insert(0, gatewayPath)

# Verify the path is correct
modulesPath = os.path.join(projectRoot, 'gateway', 'modules')
if not os.path.exists(modulesPath):
    raise RuntimeError(f"Project root verification failed. Expected gateway/modules at: {modulesPath}")
|
|
|
|
# Import the merger machinery; on ImportError, print diagnostics about the
# computed paths before re-raising so path problems are easy to debug.
try:
    from modules.services.serviceAi.subJsonResponseHandling import JsonResponseHandler
    from modules.services.serviceAi.subJsonMerger import JsonMergeLogger
    from modules.shared.jsonUtils import (
        normalizeJsonText, stripCodeFences, closeJsonStructures, tryParseJson,
        extractJsonStructureContext
    )
except ImportError as e:
    # Try to help debug
    print(f"Import error: {e}")
    print(f"Project root: {projectRoot}")
    print(f"Gateway path: {gatewayPath}")
    print(f"Python path (first 3): {sys.path[:3]}")
    print(f"Looking for modules at: {modulesPath}")
    print(f"Exists: {os.path.exists(modulesPath)}")
    if os.path.exists(modulesPath):
        print(f"Contents: {os.listdir(modulesPath)[:5]}")
    raise

# Module-level logger; the tests themselves mostly use print().
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def createTestJsonForUseCase(useCaseId: str, size: int = 100) -> Dict[str, Any]:
    """
    Build a synthetic JSON payload for one of the known use cases.

    Args:
        useCaseId: Use case ID (section_content, chapter_structure, etc.)
        size: Size of test data (number of elements/rows/items)

    Returns:
        Test JSON dictionary

    Raises:
        ValueError: If useCaseId is not one of the known use cases.
    """
    if useCaseId == "section_content":
        # Single table element whose row count scales with `size`.
        rows = []
        for i in range(size):
            rows.append([str(1947 + i), str(10000 + i * 100)])
        tableElement = {
            "type": "table",
            "content": {
                "headers": ["Year", "Value"],
                "rows": rows,
            },
        }
        return {"elements": [tableElement]}

    if useCaseId == "chapter_structure":
        # Flat list of chapters nested under a single document.
        chapterList = []
        for i in range(size):
            chapterList.append({
                "id": f"chapter_{i}",
                "title": f"Chapter {i}",
                "level": 1,
            })
        return {"documents": [{"chapters": chapterList}]}

    if useCaseId == "code_structure":
        # File metadata only (names + function identifiers).
        fileEntries = []
        for i in range(size):
            fileEntries.append({
                "id": f"file_{i}",
                "filename": f"file_{i}.py",
                "fileType": "python",
                "functions": [f"function_{i}_{j}" for j in range(5)],
            })
        return {"files": fileEntries}

    if useCaseId == "code_content":
        # File metadata plus repeated source text to bulk up the payload.
        fileEntries = []
        for i in range(size):
            fileEntries.append({
                "id": f"file_{i}",
                "content": f"# File {i}\ndef function_{i}():\n    pass\n" * 10,
                "functions": [{"name": f"function_{i}_{j}", "line": j * 3} for j in range(5)],
            })
        return {"files": fileEntries}

    raise ValueError(f"Unknown use case: {useCaseId}")
|
|
|
|
|
|
def cutJsonRandomly(jsonString: str, numCuts: int = 5, overlapSize: int = 100) -> List[str]:
    """
    Cut JSON string RANDOMLY at different points WITH OVERLAP between fragments.
    Each fragment overlaps with the previous one to help merging.

    Args:
        jsonString: JSON string to cut
        numCuts: Number of cuts to make
        overlapSize: Size of overlap between fragments (in characters)

    Returns:
        List of JSON fragments with overlap. Empty input yields an empty list;
        numCuts <= 0 yields the whole string as a single fragment.
    """
    fragments: List[str] = []
    totalLength = len(jsonString)

    if totalLength == 0:
        return []

    if numCuts <= 0:
        # No cuts - return whole string
        return [jsonString]

    # First fragment: from the start to a random cut point at 20-40% of total.
    currentPos = random.randint(int(totalLength * 0.2), int(totalLength * 0.4))
    fragments.append(jsonString[:currentPos])

    # Subsequent fragments: each starts overlapSize chars before the previous
    # cut so consecutive fragments share an overlap region.
    for _ in range(numCuts - 1):
        if currentPos >= totalLength:
            break

        overlapStart = max(0, currentPos - overlapSize)
        remaining = totalLength - currentPos

        if remaining < overlapSize * 2:
            # Not enough remaining for another cut - add the rest as the last
            # fragment and stop.
            # BUGFIX: advance currentPos to the end here; previously it was
            # left unchanged, so the trailing tail check below appended this
            # exact same fragment a second time (duplicated final fragment).
            fragments.append(jsonString[overlapStart:])
            currentPos = totalLength
            break

        # Next cut point: 20-40% of the remaining text past the current position.
        nextCutPoint = currentPos + random.randint(int(remaining * 0.2), int(remaining * 0.4))
        nextCutPoint = min(nextCutPoint, totalLength)

        # Fragment: from overlap start to next cut point.
        fragments.append(jsonString[overlapStart:nextCutPoint])
        currentPos = nextCutPoint

    # Add whatever remains as the last fragment (with overlap).
    if currentPos < totalLength:
        overlapStart = max(0, currentPos - overlapSize)
        fragments.append(jsonString[overlapStart:])

    return fragments
|
|
|
|
|
|
def testMergerWithFragments(
    originalJson: Dict[str, Any],
    fragments: List[str],
    useCaseId: str
) -> Tuple[bool, Dict[str, Any], str]:
    """
    Test merger by merging fragments sequentially.

    First prints a per-fragment diagnostic analysis (structure context and
    completeness), then folds all fragments into one accumulated string via
    the overlap merger, and finally normalizes and parses the merged result.
    Note: originalJson is not read here; comparison against it is done by
    compareJsonCompleteness in the caller.

    Args:
        originalJson: Original complete JSON
        fragments: List of JSON fragments to merge
        useCaseId: Use case ID

    Returns:
        Tuple of (success, merged_json, error_message)
    """
    if not fragments:
        return False, {}, "No fragments provided"

    # Log structure context for each fragment (especially incomplete ones)
    print(f"\n{'='*60}")
    print(f"FRAGMENT ANALYSIS (use case: {useCaseId})")
    print(f"{'='*60}")

    for fragIdx, fragment in enumerate(fragments):
        print(f"\nFragment {fragIdx + 1}/{len(fragments)}:")
        print(f" Length: {len(fragment)} chars")

        # Extract structure context for this fragment
        try:
            structureContext = extractJsonStructureContext(fragment, useCaseId)

            templateStructure = structureContext.get("template_structure", "")
            lastCompletePart = structureContext.get("last_complete_part", "")
            incompletePart = structureContext.get("incomplete_part", "")
            structureContextJson = structureContext.get("structure_context", "")

            # Check if fragment is incomplete: a fragment that fails to parse
            # after normalization was cut mid-structure.
            normalized = stripCodeFences(normalizeJsonText(fragment)).strip()
            parsed, parseErr, _ = tryParseJson(normalized)
            isIncomplete = parseErr is not None or (parsed is None)

            if isIncomplete:
                print(f" Status: INCOMPLETE (cut off)")
                print(f" Template Structure:")
                if templateStructure:
                    # Show first few lines of template
                    templateLines = templateStructure.split('\n')
                    templateLinesToShow = templateLines[:5]
                    for line in templateLinesToShow:
                        print(f" {line}")
                    if len(templateLines) > 5:
                        remainingLines = len(templateLines) - 5
                        print(f" ... ({remainingLines} more lines)")
                else:
                    print(f" (not available)")

                print(f" Structure Context:")
                if structureContextJson:
                    # Show structure context
                    contextLines = structureContextJson.split('\n')
                    contextLinesToShow = contextLines[:5]
                    for line in contextLinesToShow:
                        print(f" {line}")
                    if len(contextLines) > 5:
                        remainingContextLines = len(contextLines) - 5
                        print(f" ... ({remainingContextLines} more lines)")
                else:
                    print(f" (not available)")

                print(f" Last Complete Part:")
                if lastCompletePart:
                    # Show last complete part (truncated if too long)
                    if len(lastCompletePart) > 200:
                        print(f" {lastCompletePart[:200]}... ({len(lastCompletePart)} chars total)")
                    else:
                        print(f" {lastCompletePart}")
                else:
                    print(f" (not available)")

                print(f" Incomplete Part:")
                if incompletePart:
                    # Show incomplete part (truncated if too long)
                    if len(incompletePart) > 200:
                        print(f" {incompletePart[:200]}... ({len(incompletePart)} chars total)")
                    else:
                        print(f" {incompletePart}")
                else:
                    print(f" (not available)")
            else:
                # Complete fragments get a shorter report (3 context lines).
                print(f" Status: COMPLETE")
                if structureContextJson:
                    print(f" Structure Context:")
                    contextLines = structureContextJson.split('\n')
                    contextLinesToShow = contextLines[:3]
                    for line in contextLinesToShow:
                        print(f" {line}")
                    if len(contextLines) > 3:
                        remainingContextLines = len(contextLines) - 3
                        print(f" ... ({remainingContextLines} more lines)")
        except Exception as e:
            # Analysis is best-effort diagnostics only; never fail the test here.
            print(f" Error extracting structure context: {e}")

    print(f"\n{'='*60}\n")

    # Start with first fragment
    accumulated = fragments[0]

    # Merge each subsequent fragment
    for i, fragment in enumerate(fragments[1:], 1):
        try:
            accumulated, hasOverlap = JsonResponseHandler.mergeJsonStringsWithOverlap(
                accumulated, fragment
            )
            # Log if no overlap was found (iterations would stop in real scenario)
            if not hasOverlap:
                print(f" ⚠️ Fragment {i}: No overlap found - iterations would stop here")

            # Check if result is empty (should never happen)
            if not accumulated or accumulated.strip() in ['{"elements": []}', '{}', '']:
                return False, {}, f"Merge {i} returned empty JSON"

        except Exception as e:
            return False, {}, f"Merge {i} failed with error: {str(e)}"

    # Parse merged result
    try:
        # Normalize and try to parse
        normalized = stripCodeFences(normalizeJsonText(accumulated)).strip()

        # Try to parse directly
        parsed, parseErr, _ = tryParseJson(normalized)

        if parseErr is not None:
            # Try closing structures if incomplete
            try:
                closed = closeJsonStructures(normalized)
                parsed, parseErr2, _ = tryParseJson(closed)
                if parseErr2 is not None:
                    # Try to extract valid JSON prefix
                    # JsonResponseHandler is already imported at module level
                    validPrefix = JsonResponseHandler._extractValidJsonPrefix(normalized)
                    if validPrefix:
                        parsed, parseErr3, _ = tryParseJson(validPrefix)
                        if parseErr3 is not None:
                            return False, {}, f"Final parse error: {str(parseErr3)}"
                    else:
                        return False, {}, f"Final parse error: {str(parseErr2)}"
            except Exception as parseErr:
                # NOTE(review): this rebinds `parseErr` from the enclosing
                # scope; harmless because it is only used in this message,
                # but a distinct name would be clearer.
                return False, {}, f"Final parse error: {str(parseErr)}"

        if not parsed:
            return False, {}, "Final parse returned None"

        # CRITICAL: Ensure parsed is a dict, not a list
        # If it's a list, wrap it in the expected structure based on use case
        if isinstance(parsed, list):
            # Try to normalize list to expected structure
            if useCaseId == "section_content":
                # List of elements - wrap in elements structure
                parsed = {"elements": parsed}
            elif useCaseId == "chapter_structure":
                # List of chapters - wrap in documents structure
                parsed = {"documents": [{"chapters": parsed}]}
            elif useCaseId == "code_structure":
                # List of files - wrap in files structure
                parsed = {"files": parsed}
            elif useCaseId == "code_content":
                # List of files - wrap in files structure
                parsed = {"files": parsed}
            else:
                # Unknown use case - try to wrap as elements
                parsed = {"elements": parsed}

        # Ensure it's a dict now
        if not isinstance(parsed, dict):
            return False, {}, f"Final parse returned unexpected type: {type(parsed).__name__}"

        return True, parsed, ""

    except Exception as e:
        return False, {}, f"Final parse failed: {str(e)}"
|
|
|
|
|
|
def compareJsonCompleteness(
    original: Dict[str, Any],
    merged: Dict[str, Any],
    useCaseId: str
) -> Tuple[bool, str]:
    """
    Compare merged JSON with original to check completeness.

    Args:
        original: Original JSON
        merged: Merged JSON (must be a dict)
        useCaseId: Use case ID

    Returns:
        Tuple of (is_complete, message)
    """
    # CRITICAL: Ensure merged is a dict
    if not isinstance(merged, dict):
        return False, f"Merged JSON is not a dict, got {type(merged).__name__}"

    if useCaseId == "section_content":
        origElements = original.get("elements", [])
        mergedElements = merged.get("elements", [])

        if not isinstance(origElements, list):
            return False, f"Original elements is not a list: {type(origElements).__name__}"
        if not isinstance(mergedElements, list):
            return False, f"Merged elements is not a list: {type(mergedElements).__name__}"

        if len(mergedElements) < len(origElements):
            return False, f"Missing elements: {len(origElements)} expected, {len(mergedElements)} found"

        # Check table rows of the first element (the test data has one table).
        if origElements and mergedElements:
            origTable = origElements[0] if isinstance(origElements[0], dict) else {}
            mergedTable = mergedElements[0] if isinstance(mergedElements[0], dict) else {}

            if not origTable or not mergedTable:
                return False, f"Table structure missing: origTable={bool(origTable)}, mergedTable={bool(mergedTable)}"

            origRows = _extractTableRows(origTable)
            mergedRows = _extractTableRows(mergedTable)

            if not isinstance(origRows, list):
                return False, f"Original rows is not a list: {type(origRows).__name__}"
            if not isinstance(mergedRows, list):
                return False, f"Merged rows is not a list: {type(mergedRows).__name__}"

            if len(mergedRows) < len(origRows):
                return False, f"Missing rows: {len(origRows)} expected, {len(mergedRows)} found"

        return True, "Complete"

    elif useCaseId == "chapter_structure":
        # BUGFIX: the previous code indexed original["documents"][0] directly,
        # which raised IndexError when "documents" was present but empty.
        origChapters = _extractChapters(original)
        mergedChapters = _extractChapters(merged)

        if len(mergedChapters) < len(origChapters):
            return False, f"Missing chapters: {len(origChapters)} expected, {len(mergedChapters)} found"

        return True, "Complete"

    elif useCaseId in ("code_structure", "code_content"):
        # Both code use cases share the same top-level {"files": [...]} shape,
        # so the previously duplicated branches are merged into one.
        origFiles = original.get("files", [])
        mergedFiles = merged.get("files", [])

        if len(mergedFiles) < len(origFiles):
            return False, f"Missing files: {len(origFiles)} expected, {len(mergedFiles)} found"

        return True, "Complete"

    else:
        return False, f"Unknown use case: {useCaseId}"


def _extractTableRows(table: Dict[str, Any]) -> Any:
    """Return the rows of a table element, supporting both the nested
    {"content": {"rows": ...}} shape and the flat {"rows": ...} shape."""
    if isinstance(table.get("content"), dict):
        return table.get("content", {}).get("rows", [])
    return table.get("rows", [])


def _extractChapters(payload: Dict[str, Any]) -> List[Any]:
    """Return the chapter list of the first document, or [] when the
    documents list is missing, empty, or malformed."""
    documents = payload.get("documents") or [{}]
    first = documents[0]
    if not isinstance(first, dict):
        return []
    chapters = first.get("chapters", [])
    return chapters if isinstance(chapters, list) else []
|
|
|
|
|
|
def runTestForUseCase(useCaseId: str, size: int = 50, numTests: int = 10) -> Dict[str, Any]:
    """
    Run multiple randomized cut-and-merge tests for a single use case.

    Args:
        useCaseId: Use case ID
        size: Size of test data
        numTests: Number of test runs

    Returns:
        Results dictionary with pass/fail counters and error messages
    """
    summary: Dict[str, Any] = {
        "useCaseId": useCaseId,
        "size": size,
        "numTests": numTests,
        "passed": 0,
        "failed": 0,
        "errors": [],
    }

    for testNum in range(numTests):
        try:
            # Build the reference payload and serialize it.
            originalJson = createTestJsonForUseCase(useCaseId, size)
            originalString = json.dumps(originalJson, indent=2, ensure_ascii=False)

            # Slice it into 3-7 overlapping fragments at random positions.
            fragments = cutJsonRandomly(originalString, numCuts=random.randint(3, 7))

            # Merge the fragments back together.
            success, mergedJson, errorMsg = testMergerWithFragments(
                originalJson, fragments, useCaseId
            )

            if not success:
                summary["failed"] += 1
                summary["errors"].append(f"Test {testNum + 1}: {errorMsg}")
                continue

            # Verify nothing was lost during the merge.
            isComplete, completenessMsg = compareJsonCompleteness(
                originalJson, mergedJson, useCaseId
            )

            if isComplete:
                summary["passed"] += 1
            else:
                summary["failed"] += 1
                summary["errors"].append(f"Test {testNum + 1}: {completenessMsg}")

        except Exception as e:
            # An unexpected crash counts as a failed test, not a suite abort.
            summary["failed"] += 1
            summary["errors"].append(f"Test {testNum + 1}: Exception - {str(e)}")

    return summary
|
|
|
|
|
|
def runAllTests():
    """Run the merge test suite for every known use case and print a summary."""
    useCaseIds = [
        "section_content",
        "chapter_structure",
        "code_structure",
        "code_content"
    ]

    collected = []
    separator = "=" * 60

    for useCaseId in useCaseIds:
        print(f"\n{separator}")
        print(f"Testing use case: {useCaseId}")
        print(separator)

        # Fresh log file per use case (overwritten on every run).
        logFileName = f"json_merger_{useCaseId}.txt"
        JsonMergeLogger.initializeLogFile(logFileName)
        print(f"Log file: {logFileName}")

        outcome = runTestForUseCase(useCaseId, size=50, numTests=10)
        collected.append(outcome)

        print(f"Passed: {outcome['passed']}/{outcome['numTests']}")
        print(f"Failed: {outcome['failed']}/{outcome['numTests']}")

        if outcome["errors"]:
            print("\nErrors:")
            for error in outcome["errors"][:5]:  # Show first 5 errors
                print(f" - {error}")

    # Aggregate totals across all use cases.
    print(f"\n{separator}")
    print("SUMMARY")
    print(separator)

    totalPassed = sum(r["passed"] for r in collected)
    totalFailed = sum(r["failed"] for r in collected)
    totalTests = sum(r["numTests"] for r in collected)

    print(f"Total tests: {totalTests}")
    print(f"Passed: {totalPassed}")
    print(f"Failed: {totalFailed}")
    print(f"Success rate: {totalPassed / totalTests * 100:.1f}%")

    return collected
|
|
|
|
|
|
if __name__ == "__main__":
    # Set up logging - use WARNING level to reduce noise from jsonUtils
    logging.basicConfig(level=logging.WARNING)

    # Run tests; `results` is the list of per-use-case summary dicts
    # returned by runAllTests().
    results = runAllTests()

    # Save results to file (in project root, discovered at import time above)
    resultsFile = os.path.join(projectRoot, "test_json_merger_results.json")
    with open(resultsFile, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    print(f"\nResults saved to {resultsFile}")
|