#!/usr/bin/env python3
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
JSON Split and Merge Test 12 - Tests JSON splitting and merging using workflow tools
Tests random splitting of JSON files into 3 parts and merging them back using ModularJsonMerger.
"""
import asyncio
import json
import sys
import os
import random
from typing import Dict, Any, List, Optional
# Add the gateway to path (go up 2 levels from tests/functional/)
_gateway_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
if _gateway_path not in sys.path:
sys.path.insert(0, _gateway_path)
# Import JSON merger from workflow tools
from modules.services.serviceAi.subJsonMerger import ModularJsonMerger, JsonMergeLogger
from modules.shared.jsonContinuation import getContexts
class JsonSplitMergeTester12:
def __init__(self):
self.testResults = {}
self.testJsonFiles = []
self.logBuffer = []
self.logFile = None
def createTestJsonFiles(self) -> List[Dict[str, Any]]:
"""Create various test JSON files with different structures."""
testFiles = [
{
"name": "config.json",
"data": {
"application": "Customer Manager",
"version": "1.0.0",
"database": {
"host": "localhost",
"port": 5432,
"name": "customers_db"
},
"api": {
"baseUrl": "https://api.example.com",
"timeout": 30
}
}
},
{
"name": "customers.json",
"data": {
"customers": [
{"id": 1, "name": "John Doe", "email": "john@example.com", "phone": "+1234567890", "address": "123 Main St"},
{"id": 2, "name": "Jane Smith", "email": "jane@example.com", "phone": "+0987654321", "address": "456 Oak Ave"},
{"id": 3, "name": "Bob Johnson", "email": "bob@example.com", "phone": "+1122334455", "address": "789 Pine Rd"},
{"id": 4, "name": "Alice Williams", "email": "alice@example.com", "phone": "+5566778899", "address": "321 Elm St"},
{"id": 5, "name": "Charlie Brown", "email": "charlie@example.com", "phone": "+9988776655", "address": "654 Maple Dr"}
]
}
},
{
"name": "settings.json",
"data": {
"theme": {
"darkMode": True,
"fontSize": 14,
"language": "en"
},
"notifications": {
"email": True,
"sms": False,
"push": True
},
"features": {
"enableAnalytics": True,
"enableReports": False
}
}
},
{
"name": "products.json",
"data": {
"products": [
{"id": "P001", "name": "Product A", "price": 29.99, "category": "Electronics", "inStock": True},
{"id": "P002", "name": "Product B", "price": 49.99, "category": "Clothing", "inStock": True},
{"id": "P003", "name": "Product C", "price": 19.99, "category": "Books", "inStock": False},
{"id": "P004", "name": "Product D", "price": 99.99, "category": "Electronics", "inStock": True},
{"id": "P005", "name": "Product E", "price": 14.99, "category": "Books", "inStock": True},
{"id": "P006", "name": "Product F", "price": 79.99, "category": "Clothing", "inStock": True}
]
}
},
{
"name": "document_structure.json",
"data": {
"metadata": {
"title": "Test Document",
"author": "Test Author",
"date": "2025-01-05"
},
"documents": [
{
"id": "doc1",
"title": "Document 1",
"sections": [
{
"id": "sec1",
"content_type": "heading",
"elements": [
{"type": "heading", "content": {"text": "Introduction", "level": 1}}
]
},
{
"id": "sec2",
"content_type": "paragraph",
"elements": [
{"type": "paragraph", "content": {"text": "This is a test paragraph."}}
]
}
]
}
]
}
},
{
"name": "large_customers.json",
"data": self._createLargeCustomersData()
},
{
"name": "large_products.json",
"data": self._createLargeProductsData()
},
{
"name": "large_documents.json",
"data": self._createLargeDocumentsData()
}
]
return testFiles
def _createLargeCustomersData(self) -> Dict[str, Any]:
"""Create a large customers dataset for budget testing."""
customers = []
# Create 100 customers with long descriptions
for i in range(100):
customers.append({
"id": i + 1,
"name": f"Customer {i + 1}",
"email": f"customer{i+1}@example.com",
"phone": f"+1{5550000000 + i}",
"address": f"{100 + i} Main Street, City {i % 10}, State {i % 5}, ZIP {10000 + i}",
"description": f"This is a detailed description for customer {i + 1}. " * 10 +
f"They have been a loyal customer since {2000 + (i % 25)}. " +
f"Their preferences include various products and services. " * 5,
"orders": [
{
"orderId": f"ORD-{i+1}-{j+1}",
"date": f"2024-{(j % 12) + 1:02d}-{(j % 28) + 1:02d}",
"total": round(100.0 + (i * 10) + (j * 5), 2),
"items": [
{
"productId": f"PROD-{k+1}",
"quantity": (k % 5) + 1,
"price": round(10.0 + k * 2, 2)
}
for k in range(3)
]
}
for j in range(5)
],
"metadata": {
"created": f"2020-{(i % 12) + 1:02d}-{(i % 28) + 1:02d}",
"lastLogin": f"2024-{(i % 12) + 1:02d}-{(i % 28) + 1:02d}",
"tags": [f"tag-{i % 10}", f"category-{i % 5}", f"segment-{i % 3}"]
}
})
return {"customers": customers}
def _createLargeProductsData(self) -> Dict[str, Any]:
"""Create a large products dataset for budget testing."""
products = []
# Create 200 products with detailed information
categories = ["Electronics", "Clothing", "Books", "Home & Garden", "Sports", "Toys", "Automotive", "Health"]
for i in range(200):
category = categories[i % len(categories)]
products.append({
"id": f"PROD-{i+1:04d}",
"name": f"Product {i+1} - {category}",
"category": category,
"price": round(10.0 + (i * 2.5), 2),
"cost": round(5.0 + (i * 1.5), 2),
"inStock": i % 3 != 0,
"stockQuantity": (i % 100) * 10,
"description": f"This is a comprehensive product description for Product {i+1}. " * 15 +
f"It belongs to the {category} category and offers excellent value. " * 10 +
f"Features include: feature-1, feature-2, feature-3, and many more. " * 5,
"specifications": {
"weight": f"{1.0 + (i % 10)} kg",
"dimensions": f"{10 + (i % 20)}x{5 + (i % 15)}x{3 + (i % 10)} cm",
"color": ["red", "blue", "green", "black", "white"][i % 5],
"material": ["plastic", "metal", "wood", "fabric"][i % 4],
"warranty": f"{1 + (i % 5)} years"
},
"reviews": [
{
"userId": f"USER-{j+1}",
"rating": (j % 5) + 1,
"comment": f"Review {j+1} for product {i+1}: " + "This is a detailed review comment. " * 10,
"date": f"2024-{(j % 12) + 1:02d}-{(j % 28) + 1:02d}"
}
for j in range(3)
],
"relatedProducts": [f"PROD-{k+1:04d}" for k in range(max(0, i-2), min(200, i+3)) if k != i]
})
return {"products": products}
def _createLargeDocumentsData(self) -> Dict[str, Any]:
"""Create a large documents dataset for budget testing."""
documents = []
# Create 50 documents with nested structures
for i in range(50):
sections = []
for j in range(10):
elements = []
for k in range(5):
if k % 2 == 0:
elements.append({
"type": "heading",
"level": (k % 3) + 1,
"content": {
"text": f"Section {j+1} Heading {k+1} for Document {i+1}",
"style": "bold"
}
})
else:
elements.append({
"type": "paragraph",
"content": {
"text": f"This is paragraph {k+1} in section {j+1} of document {i+1}. " * 20 +
f"It contains detailed information about various topics. " * 15 +
f"The content is structured and well-organized. " * 10
}
})
sections.append({
"id": f"sec-{i+1}-{j+1}",
"title": f"Section {j+1}",
"content_type": "mixed",
"elements": elements,
"metadata": {
"created": f"2024-{(j % 12) + 1:02d}-{(j % 28) + 1:02d}",
"modified": f"2024-{(j % 12) + 1:02d}-{(j % 28) + 1:02d}",
"author": f"Author-{(i % 10) + 1}",
"tags": [f"tag-{j % 10}", f"category-{i % 5}"]
}
})
documents.append({
"id": f"doc-{i+1:03d}",
"title": f"Document {i+1} - Comprehensive Report",
"description": f"This is a comprehensive document with detailed information. " * 30 +
f"It covers multiple topics and sections. " * 20 +
f"The content is extensive and well-structured. " * 15,
"sections": sections,
"metadata": {
"created": f"2024-{(i % 12) + 1:02d}-{(i % 28) + 1:02d}",
"modified": f"2024-{(i % 12) + 1:02d}-{(i % 28) + 1:02d}",
"author": f"Author-{(i % 10) + 1}",
"version": f"1.{(i % 10)}",
"status": ["draft", "review", "published"][i % 3],
"tags": [f"tag-{i % 20}" for _ in range(5)]
}
})
return {
"metadata": {
"title": "Large Document Collection",
"description": "A collection of 50 comprehensive documents for testing budget limits",
"totalDocuments": 50,
"created": "2024-01-01"
},
"documents": documents
}
def splitJsonRandomly(self, jsonString: str, numParts: int = 3) -> List[str]:
"""
Split JSON string randomly into specified number of parts.
Simulates real AI response cuts - can split anywhere, even in the middle of strings/numbers/structures.
This is the REAL scenario: AI response gets cut off randomly, not at convenient points.
"""
if numParts < 2:
return [jsonString]
jsonLength = len(jsonString)
# Generate truly random split points - can be anywhere!
# Only ensure minimum part size to avoid empty parts
minPartSize = max(10, jsonLength // (numParts * 3)) # Smaller minimum to allow more randomness
        splitPoints = []
        previousPoint = 0
        for i in range(numParts - 1):
            # Lower bound: stay at least minPartSize past the previous split point
            # so no part can end up smaller than the minimum.
            minPoint = previousPoint + minPartSize
            # Upper bound: leave room for the remaining parts (each at least minPartSize).
            maxPoint = jsonLength - (numParts - i - 1) * minPartSize
            if maxPoint <= minPoint:
                # No room left for randomness - use the tightest valid boundary
                splitPoint = minPoint
            else:
                # Truly random point - can be in the middle of anything!
                splitPoint = random.randint(minPoint, maxPoint)
            splitPoints.append(splitPoint)
            previousPoint = splitPoint
        # Points are generated in increasing order, so no sort is needed.
# Create parts - these can be cut anywhere, even mid-string, mid-number, etc.
parts = []
start = 0
for splitPoint in splitPoints:
parts.append(jsonString[start:splitPoint])
start = splitPoint
parts.append(jsonString[start:]) # Last part
return parts
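    # Illustrative usage - a minimal sketch with hypothetical inline values:
    #
    #   tester = JsonSplitMergeTester12()
    #   source = '{"a": 1, "b": [2, 3], "c": "hello"}'
    #   parts = tester.splitJsonRandomly(source, numParts=3)
    #   assert len(parts) == 3
    #   # Splitting never loses or alters characters, so concatenating the
    #   # parts always reproduces the original string exactly:
    #   assert "".join(parts) == source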
def _log(self, message: str):
"""Add message to log buffer."""
self.logBuffer.append(message)
print(message)
def normalizeJson(self, jsonString: str) -> Optional[Dict[str, Any]]:
"""Normalize JSON string by parsing and re-serializing. Returns None if parsing fails."""
try:
parsed = json.loads(jsonString)
return parsed
except json.JSONDecodeError:
# Try to close incomplete JSON structures
try:
from modules.shared.jsonUtils import closeJsonStructures, tryParseJson
closed = closeJsonStructures(jsonString)
parsed, error, _ = tryParseJson(closed)
if error is None and parsed is not None:
return parsed
except Exception:
pass
# Return None if all parsing attempts fail
return None
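    # Illustrative behavior - a sketch; the exact repaired value for truncated
    # input depends on modules.shared.jsonUtils.closeJsonStructures, which is
    # not shown in this file:
    #
    #   tester.normalizeJson('{"a": 1}')           # -> {"a": 1}
    #   tester.normalizeJson('{"a": 1, "b": [2')   # truncated: returns the repaired
    #                                              # dict if the utils can close it,
    #                                              # otherwise None
    #   tester.normalizeJson('not json at all')    # -> None (cannot be repaired)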
def compareJson(self, original: Dict[str, Any], merged: Dict[str, Any]) -> Dict[str, Any]:
"""Compare original and merged JSON structures."""
originalStr = json.dumps(original, sort_keys=True, indent=2)
mergedStr = json.dumps(merged, sort_keys=True, indent=2)
exactMatch = originalStr == mergedStr
# Deep comparison
differences = []
self._findDifferences(original, merged, "", differences)
return {
"exactMatch": exactMatch,
"differences": differences,
"originalSize": len(originalStr),
"mergedSize": len(mergedStr),
"sizeMatch": len(originalStr) == len(mergedStr)
}
def _findDifferences(self, obj1: Any, obj2: Any, path: str, differences: List[str]):
"""Recursively find differences between two JSON objects."""
        if type(obj1) is not type(obj2):
differences.append(f"{path}: Type mismatch - {type(obj1).__name__} vs {type(obj2).__name__}")
return
if isinstance(obj1, dict):
allKeys = set(obj1.keys()) | set(obj2.keys())
for key in allKeys:
newPath = f"{path}.{key}" if path else key
if key not in obj1:
differences.append(f"{newPath}: Missing in original")
elif key not in obj2:
differences.append(f"{newPath}: Missing in merged")
else:
self._findDifferences(obj1[key], obj2[key], newPath, differences)
elif isinstance(obj1, list):
if len(obj1) != len(obj2):
differences.append(f"{path}: Length mismatch - {len(obj1)} vs {len(obj2)}")
else:
for i, (item1, item2) in enumerate(zip(obj1, obj2)):
newPath = f"{path}[{i}]"
self._findDifferences(item1, item2, newPath, differences)
else:
if obj1 != obj2:
differences.append(f"{path}: Value mismatch - {obj1} vs {obj2}")
async def testJsonSplitMerge(self, jsonFile: Dict[str, Any]) -> Dict[str, Any]:
"""Test splitting and merging a single JSON file."""
fileName = jsonFile["name"]
originalData = jsonFile["data"]
self._log("")
self._log("="*80)
self._log(f"TESTING JSON SPLIT AND MERGE: {fileName}")
self._log("="*80)
# Convert to JSON string
originalJsonString = json.dumps(originalData, indent=2, ensure_ascii=False)
originalSize = len(originalJsonString)
# Log original JSON
self._log("")
self._log("="*80)
self._log("ORIGINAL JSON")
self._log("="*80)
self._log(f"JSON length: {originalSize} characters")
self._log("")
self._log("Full JSON content:")
self._log("-"*80)
jsonLines = originalJsonString.split('\n')
if len(jsonLines) > 50:
for line in jsonLines[:25]:
self._log(line)
self._log(f"... ({len(jsonLines) - 50} lines omitted) ...")
for line in jsonLines[-25:]:
self._log(line)
else:
for line in jsonLines:
self._log(line)
# Split JSON at random position (simulating AI response cut)
self._log("")
self._log("="*80)
self._log("SPLITTING JSON AT RANDOM POSITION (SIMULATING AI RESPONSE CUT)")
self._log("="*80)
        # Find a random cut position (not too close to the start or end);
        # random is already imported at module level.
        minCutPos = max(100, originalSize // 10)  # At least 10% (or 100 chars) from the start
        maxCutPos = min(originalSize - 100, originalSize * 9 // 10)  # At least 10% (or 100 chars) from the end
        cutPosition = random.randint(minCutPos, max(minCutPos, maxCutPos))  # Guard against very small inputs
# Get part from start to cut
partContent = originalJsonString[:cutPosition]
self._log("")
self._log("="*80)
self._log("PART (from start to cut):")
self._log("="*80)
self._log(f"Cut position: {cutPosition} characters")
self._log(f"Part length: {len(partContent)} characters")
self._log("")
self._log("Part content:")
partLines = partContent.split('\n')
if len(partLines) > 30:
for line in partLines[:15]:
self._log(f" {line}")
self._log(f" ... ({len(partLines) - 30} lines omitted) ...")
for line in partLines[-15:]:
self._log(f" {line}")
else:
for line in partLines:
self._log(f" {line}")
# Generate contexts using getContexts()
self._log("")
self._log("="*80)
self._log("GENERATING CONTINUATION CONTEXTS")
self._log("="*80)
contexts = getContexts(partContent)
# Log overlap context
self._log("")
self._log("="*80)
self._log("OVERLAP CONTEXT (for merging):")
self._log("="*80)
overlapLines = contexts.overlapContext.split('\n')
if len(overlapLines) > 30:
for line in overlapLines[:15]:
self._log(f" {line}")
self._log(f" ... ({len(overlapLines) - 30} lines omitted) ...")
for line in overlapLines[-15:]:
self._log(f" {line}")
else:
for line in overlapLines:
self._log(f" {line}")
# Log hierarchy context
self._log("")
self._log("="*80)
self._log("HIERARCHY CONTEXT (with budget logic):")
self._log("="*80)
hierarchyLines = contexts.hierarchyContext.split('\n')
if len(hierarchyLines) > 30:
for line in hierarchyLines[:15]:
self._log(f" {line}")
self._log(f" ... ({len(hierarchyLines) - 30} lines omitted) ...")
for line in hierarchyLines[-15:]:
self._log(f" {line}")
else:
for line in hierarchyLines:
self._log(f" {line}")
# Test completePart as valid JSON
self._log("")
self._log("="*80)
self._log("COMPLETE PART (should be valid JSON):")
self._log("="*80)
completeLines = contexts.completePart.split('\n')
if len(completeLines) > 30:
for line in completeLines[:15]:
self._log(f" {line}")
self._log(f" ... ({len(completeLines) - 30} lines omitted) ...")
for line in completeLines[-15:]:
self._log(f" {line}")
else:
for line in completeLines:
self._log(f" {line}")
# Validate completePart as JSON
self._log("")
self._log("="*80)
self._log("VALIDATING COMPLETE PART AS JSON:")
self._log("="*80)
isValidJson = False
parsedCompletePart = None
jsonError = None
try:
parsedCompletePart = json.loads(contexts.completePart)
isValidJson = True
self._log(" ✅ completePart is valid JSON")
self._log(f" Parsed type: {type(parsedCompletePart).__name__}")
# Compare with truncated JSON (not original) - parse the truncated part to compare
from modules.shared.jsonUtils import closeJsonStructures, tryParseJson
# Try to parse the truncated JSON part (with structures closed)
truncatedClosed = closeJsonStructures(partContent)
truncatedParsed, truncatedError, _ = tryParseJson(truncatedClosed)
if truncatedParsed is not None:
# Compare completePart with the parsed truncated JSON
if isinstance(parsedCompletePart, dict) and isinstance(truncatedParsed, dict):
comparison = self.compareJson(truncatedParsed, parsedCompletePart)
self._log(f" Comparison with truncated JSON (at cut position {cutPosition}):")
self._log(f" Exact match: {comparison['exactMatch']}")
self._log(f" Size match: {comparison['sizeMatch']}")
if comparison['differences']:
self._log(f" Differences found: {len(comparison['differences'])}")
for diff in comparison['differences'][:10]: # Show first 10 differences
self._log(f" - {diff}")
if len(comparison['differences']) > 10:
self._log(f" ... ({len(comparison['differences']) - 10} more differences)")
else:
self._log(" No differences found - completePart matches truncated JSON structure")
elif isinstance(parsedCompletePart, list) and isinstance(truncatedParsed, list):
self._log(f" Both are lists: truncated={len(truncatedParsed)} items, completePart={len(parsedCompletePart)} items")
else:
self._log(f" Different types: truncated={type(truncatedParsed).__name__}, completePart={type(parsedCompletePart).__name__}")
else:
self._log(f" Could not parse truncated JSON for comparison (error: {truncatedError})")
except json.JSONDecodeError as e:
isValidJson = False
jsonError = str(e)
self._log(f" ❌ completePart is NOT valid JSON")
self._log(f" Error: {jsonError}")
self._log(f" Error position: line {e.lineno}, column {e.colno}")
# Return test results
return {
"success": isValidJson,
"fileName": fileName,
"originalSize": originalSize,
"cutPosition": cutPosition,
"partSize": len(partContent),
"overlapContextSize": len(contexts.overlapContext),
"hierarchyContextSize": len(contexts.hierarchyContext),
"completePartSize": len(contexts.completePart),
"isValidJson": isValidJson,
"jsonError": jsonError,
"parsedCompletePart": parsedCompletePart is not None
}
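    # Note on the contexts object: as used above, the value returned by
    # getContexts() exposes three string attributes - overlapContext (the tail
    # of the cut part, used for merging), hierarchyContext (structural path
    # info with budget logic), and completePart (the cut part closed into
    # valid JSON). A minimal consumer sketch (semantics inferred from this
    # test, not from the getContexts() implementation itself):
    #
    #   contexts = getContexts(truncatedJsonString)
    #   repaired = json.loads(contexts.completePart)  # expected not to raise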
async def testAllJsonFiles(self) -> Dict[str, Any]:
"""Test splitting and merging all test JSON files."""
print("\n" + "="*80)
print("TESTING JSON SPLIT AND MERGE")
print("="*80)
testFiles = self.createTestJsonFiles()
results = {}
for jsonFile in testFiles:
try:
result = await self.testJsonSplitMerge(jsonFile)
results[jsonFile["name"]] = result
# Small delay between tests
await asyncio.sleep(0.5)
except Exception as e:
import traceback
print(f"\n❌ Error testing {jsonFile['name']}: {str(e)}")
print(traceback.format_exc())
results[jsonFile["name"]] = {
"success": False,
"error": str(e),
"traceback": traceback.format_exc()
}
return results
def _writeLogFile(self):
"""Write log buffer to file."""
logDir = os.path.join(os.path.dirname(__file__), "..", "..", "..", "local", "debug")
os.makedirs(logDir, exist_ok=True)
logFilePath = os.path.join(logDir, "test12_json_split_merge_results.txt")
with open(logFilePath, 'w', encoding='utf-8') as f:
f.write('\n'.join(self.logBuffer))
self.logFile = logFilePath
print(f"\n📝 Detailed log written to: {logFilePath}")
async def runTest(self):
"""Run the complete test."""
self._log("="*80)
self._log("JSON SPLIT AND MERGE TEST 12")
self._log("="*80)
try:
# Test all JSON files
results = await self.testAllJsonFiles()
# Write log file
self._writeLogFile()
# Summary
print("\n" + "="*80)
print("TEST SUMMARY")
print("="*80)
successCount = 0
for fileName, result in results.items():
if result.get("success"):
successCount += 1
isValidJson = result.get("isValidJson", False)
if isValidJson:
print(f"{fileName:30s}: Valid JSON - completePart parsed successfully")
else:
jsonError = result.get("jsonError", "Unknown error")
print(f"⚠️ {fileName:30s}: Contexts generated but completePart is not valid JSON - {jsonError}")
else:
error = result.get("error", "Unknown error")
print(f"{fileName:30s}: FAILED - {error}")
print(f"\nResults: {successCount}/{len(results)} successful")
self.testResults = {
"success": successCount == len(results),
"totalFiles": len(results),
"successCount": successCount,
"results": results
}
return self.testResults
except Exception as e:
import traceback
print(f"\n❌ Test failed with error: {type(e).__name__}: {str(e)}")
print(f"Traceback:\n{traceback.format_exc()}")
self.testResults = {
"success": False,
"error": str(e),
"traceback": traceback.format_exc()
}
return self.testResults
async def main():
"""Run JSON split and merge test 12."""
tester = JsonSplitMergeTester12()
results = await tester.runTest()
# Print final results as JSON for easy parsing
print("\n" + "="*80)
print("FINAL RESULTS (JSON)")
print("="*80)
print(json.dumps(results, indent=2, default=str))
if __name__ == "__main__":
asyncio.run(main())