#!/usr/bin/env python3 # Copyright (c) 2025 Patrick Motsch # All rights reserved. """ JSON Split and Merge Test 12 - Tests JSON splitting and merging using workflow tools Tests random splitting of JSON files into 3 parts and merging them back using ModularJsonMerger. """ import asyncio import json import sys import os import time import random from typing import Dict, Any, List, Optional, Tuple # Add the gateway to path (go up 2 levels from tests/functional/) _gateway_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) if _gateway_path not in sys.path: sys.path.insert(0, _gateway_path) # Import JSON merger from workflow tools from modules.services.serviceAi.subJsonMerger import ModularJsonMerger, JsonMergeLogger from modules.shared.jsonContinuation import getContexts class JsonSplitMergeTester12: def __init__(self): self.testResults = {} self.testJsonFiles = [] self.logBuffer = [] self.logFile = None def createTestJsonFiles(self) -> List[Dict[str, Any]]: """Create various test JSON files with different structures.""" testFiles = [ { "name": "config.json", "data": { "application": "Customer Manager", "version": "1.0.0", "database": { "host": "localhost", "port": 5432, "name": "customers_db" }, "api": { "baseUrl": "https://api.example.com", "timeout": 30 } } }, { "name": "customers.json", "data": { "customers": [ {"id": 1, "name": "John Doe", "email": "john@example.com", "phone": "+1234567890", "address": "123 Main St"}, {"id": 2, "name": "Jane Smith", "email": "jane@example.com", "phone": "+0987654321", "address": "456 Oak Ave"}, {"id": 3, "name": "Bob Johnson", "email": "bob@example.com", "phone": "+1122334455", "address": "789 Pine Rd"}, {"id": 4, "name": "Alice Williams", "email": "alice@example.com", "phone": "+5566778899", "address": "321 Elm St"}, {"id": 5, "name": "Charlie Brown", "email": "charlie@example.com", "phone": "+9988776655", "address": "654 Maple Dr"} ] } }, { "name": "settings.json", "data": { "theme": { "darkMode": True, "fontSize": 14, "language": "en" }, "notifications": { "email": True, "sms": False, "push": True }, "features": { "enableAnalytics": True, "enableReports": False } } }, { "name": "products.json", "data": { "products": [ {"id": "P001", "name": "Product A", "price": 29.99, "category": "Electronics", "inStock": True}, {"id": "P002", "name": "Product B", "price": 49.99, "category": "Clothing", "inStock": True}, {"id": "P003", "name": "Product C", "price": 19.99, "category": "Books", "inStock": False}, {"id": "P004", "name": "Product D", "price": 99.99, "category": "Electronics", "inStock": True}, {"id": "P005", "name": "Product E", "price": 14.99, "category": "Books", "inStock": True}, {"id": "P006", "name": "Product F", "price": 79.99, "category": "Clothing", "inStock": True} ] } }, { "name": "document_structure.json", "data": { "metadata": { "title": "Test Document", "author": "Test Author", "date": "2025-01-05" }, "documents": [ { "id": "doc1", "title": "Document 1", "sections": [ { "id": "sec1", "content_type": "heading", "elements": [ {"type": "heading", "content": {"text": "Introduction", "level": 1}} ] }, { "id": "sec2", "content_type": "paragraph", "elements": [ {"type": "paragraph", "content": {"text": "This is a test paragraph."}} ] } ] } ] } }, { "name": "large_customers.json", "data": self._createLargeCustomersData() }, { "name": "large_products.json", "data": self._createLargeProductsData() }, { "name": "large_documents.json", "data": self._createLargeDocumentsData() }, { "name": "table_example.json", "data": self._loadTableJsonExample() } ] return testFiles def _createLargeCustomersData(self) -> Dict[str, Any]: """Create a large customers dataset for budget testing.""" customers = [] # Create 100 customers with long descriptions for i in range(100): customers.append({ "id": i + 1, "name": f"Customer {i + 1}", "email": f"customer{i+1}@example.com", "phone": f"+1{5550000000 + i}", "address": f"{100 + i} Main Street, City {i % 10}, State {i % 5}, ZIP {10000 + i}", "description": f"This is a detailed description for customer {i + 1}. " * 10 + f"They have been a loyal customer since {2000 + (i % 25)}. " + f"Their preferences include various products and services. " * 5, "orders": [ { "orderId": f"ORD-{i+1}-{j+1}", "date": f"2024-{(j % 12) + 1:02d}-{(j % 28) + 1:02d}", "total": round(100.0 + (i * 10) + (j * 5), 2), "items": [ { "productId": f"PROD-{k+1}", "quantity": (k % 5) + 1, "price": round(10.0 + k * 2, 2) } for k in range(3) ] } for j in range(5) ], "metadata": { "created": f"2020-{(i % 12) + 1:02d}-{(i % 28) + 1:02d}", "lastLogin": f"2024-{(i % 12) + 1:02d}-{(i % 28) + 1:02d}", "tags": [f"tag-{i % 10}", f"category-{i % 5}", f"segment-{i % 3}"] } }) return {"customers": customers} def _createLargeProductsData(self) -> Dict[str, Any]: """Create a large products dataset for budget testing.""" products = [] # Create 200 products with detailed information categories = ["Electronics", "Clothing", "Books", "Home & Garden", "Sports", "Toys", "Automotive", "Health"] for i in range(200): category = categories[i % len(categories)] products.append({ "id": f"PROD-{i+1:04d}", "name": f"Product {i+1} - {category}", "category": category, "price": round(10.0 + (i * 2.5), 2), "cost": round(5.0 + (i * 1.5), 2), "inStock": i % 3 != 0, "stockQuantity": (i % 100) * 10, "description": f"This is a comprehensive product description for Product {i+1}. " * 15 + f"It belongs to the {category} category and offers excellent value. " * 10 + f"Features include: feature-1, feature-2, feature-3, and many more. " * 5, "specifications": { "weight": f"{1.0 + (i % 10)} kg", "dimensions": f"{10 + (i % 20)}x{5 + (i % 15)}x{3 + (i % 10)} cm", "color": ["red", "blue", "green", "black", "white"][i % 5], "material": ["plastic", "metal", "wood", "fabric"][i % 4], "warranty": f"{1 + (i % 5)} years" }, "reviews": [ { "userId": f"USER-{j+1}", "rating": (j % 5) + 1, "comment": f"Review {j+1} for product {i+1}: " + "This is a detailed review comment. " * 10, "date": f"2024-{(j % 12) + 1:02d}-{(j % 28) + 1:02d}" } for j in range(3) ], "relatedProducts": [f"PROD-{k+1:04d}" for k in range(max(0, i-2), min(200, i+3)) if k != i] }) return {"products": products} def _createLargeDocumentsData(self) -> Dict[str, Any]: """Create a large documents dataset for budget testing.""" documents = [] # Create 50 documents with nested structures for i in range(50): sections = [] for j in range(10): elements = [] for k in range(5): if k % 2 == 0: elements.append({ "type": "heading", "level": (k % 3) + 1, "content": { "text": f"Section {j+1} Heading {k+1} for Document {i+1}", "style": "bold" } }) else: elements.append({ "type": "paragraph", "content": { "text": f"This is paragraph {k+1} in section {j+1} of document {i+1}. " * 20 + f"It contains detailed information about various topics. " * 15 + f"The content is structured and well-organized. " * 10 } }) sections.append({ "id": f"sec-{i+1}-{j+1}", "title": f"Section {j+1}", "content_type": "mixed", "elements": elements, "metadata": { "created": f"2024-{(j % 12) + 1:02d}-{(j % 28) + 1:02d}", "modified": f"2024-{(j % 12) + 1:02d}-{(j % 28) + 1:02d}", "author": f"Author-{(i % 10) + 1}", "tags": [f"tag-{j % 10}", f"category-{i % 5}"] } }) documents.append({ "id": f"doc-{i+1:03d}", "title": f"Document {i+1} - Comprehensive Report", "description": f"This is a comprehensive document with detailed information. " * 30 + f"It covers multiple topics and sections. " * 20 + f"The content is extensive and well-structured. " * 15, "sections": sections, "metadata": { "created": f"2024-{(i % 12) + 1:02d}-{(i % 28) + 1:02d}", "modified": f"2024-{(i % 12) + 1:02d}-{(i % 28) + 1:02d}", "author": f"Author-{(i % 10) + 1}", "version": f"1.{(i % 10)}", "status": ["draft", "review", "published"][i % 3], "tags": [f"tag-{i % 20}" for _ in range(5)] } }) return { "metadata": { "title": "Large Document Collection", "description": "A collection of 50 comprehensive documents for testing budget limits", "totalDocuments": 50, "created": "2024-01-01" }, "documents": documents } def _loadTableJsonExample(self) -> Dict[str, Any]: """Load the table JSON example from the debug prompts file.""" try: # Import jsonUtils for closing incomplete JSON structures from modules.shared.jsonUtils import closeJsonStructures, tryParseJson # Path to the JSON example file jsonExamplePath = os.path.join( os.path.dirname(__file__), "..", "..", "..", "local", "debug", "prompts", "20260105-214826-020-chapter_1_section_section_2_response_iteration_2.txt" ) # Read the file content with open(jsonExamplePath, 'r', encoding='utf-8') as f: content = f.read() # Remove markdown code block markers jsonContent = content.strip() if jsonContent.startswith('```json'): jsonContent = jsonContent[7:] # Remove ```json if jsonContent.startswith('```'): jsonContent = jsonContent[3:] # Remove ``` jsonContent = jsonContent.strip() if jsonContent.endswith('```'): jsonContent = jsonContent[:-3] # Remove trailing ``` jsonContent = jsonContent.strip() # The JSON is incomplete - use closeJsonStructures to complete it closedJson = closeJsonStructures(jsonContent) # Parse the closed JSON parsedJson, error, _ = tryParseJson(closedJson) if error is None and parsedJson is not None: return parsedJson else: raise Exception(f"Failed to parse JSON after closing structures: {error}") except Exception as e: # If loading fails, return a minimal valid structure print(f"Warning: Could not load table JSON example: {e}") return { "elements": [ { "type": "table", "content": { "headers": ["Spalte1", "Spalte2", "Spalte3"], "rows": [ [36761, 36767, 36779] ] } } ] } def splitJsonRandomly(self, jsonString: str, numParts: int = 3) -> List[str]: """ Split JSON string randomly into specified number of parts. Simulates real AI response cuts - can split anywhere, even in the middle of strings/numbers/structures. This is the REAL scenario: AI response gets cut off randomly, not at convenient points. """ if numParts < 2: return [jsonString] jsonLength = len(jsonString) # Generate truly random split points - can be anywhere! # Only ensure minimum part size to avoid empty parts minPartSize = max(10, jsonLength // (numParts * 3)) # Smaller minimum to allow more randomness splitPoints = [] for _ in range(numParts - 1): # Generate random point - can be anywhere in the string # Only ensure we don't create parts smaller than minimum minPoint = len(splitPoints) * minPartSize if splitPoints else minPartSize maxPoint = jsonLength - (numParts - len(splitPoints) - 1) * minPartSize if maxPoint <= minPoint: # If we can't avoid minimum size, just use the boundary splitPoint = minPoint else: # Truly random point - can be in the middle of anything! splitPoint = random.randint(minPoint, maxPoint) splitPoints.append(splitPoint) splitPoints.sort() # Create parts - these can be cut anywhere, even mid-string, mid-number, etc. parts = [] start = 0 for splitPoint in splitPoints: parts.append(jsonString[start:splitPoint]) start = splitPoint parts.append(jsonString[start:]) # Last part return parts def _log(self, message: str): """Add message to log buffer.""" self.logBuffer.append(message) print(message) def normalizeJson(self, jsonString: str) -> Optional[Dict[str, Any]]: """Normalize JSON string by parsing and re-serializing. Returns None if parsing fails.""" try: parsed = json.loads(jsonString) return parsed except json.JSONDecodeError: # Try to close incomplete JSON structures try: from modules.shared.jsonUtils import closeJsonStructures, tryParseJson closed = closeJsonStructures(jsonString) parsed, error, _ = tryParseJson(closed) if error is None and parsed is not None: return parsed except Exception: pass # Return None if all parsing attempts fail return None def compareJson(self, original: Dict[str, Any], merged: Dict[str, Any]) -> Dict[str, Any]: """Compare original and merged JSON structures.""" originalStr = json.dumps(original, sort_keys=True, indent=2) mergedStr = json.dumps(merged, sort_keys=True, indent=2) exactMatch = originalStr == mergedStr # Deep comparison differences = [] self._findDifferences(original, merged, "", differences) return { "exactMatch": exactMatch, "differences": differences, "originalSize": len(originalStr), "mergedSize": len(mergedStr), "sizeMatch": len(originalStr) == len(mergedStr) } def _findDifferences(self, obj1: Any, obj2: Any, path: str, differences: List[str]): """Recursively find differences between two JSON objects.""" if type(obj1) != type(obj2): differences.append(f"{path}: Type mismatch - {type(obj1).__name__} vs {type(obj2).__name__}") return if isinstance(obj1, dict): allKeys = set(obj1.keys()) | set(obj2.keys()) for key in allKeys: newPath = f"{path}.{key}" if path else key if key not in obj1: differences.append(f"{newPath}: Missing in original") elif key not in obj2: differences.append(f"{newPath}: Missing in merged") else: self._findDifferences(obj1[key], obj2[key], newPath, differences) elif isinstance(obj1, list): if len(obj1) != len(obj2): differences.append(f"{path}: Length mismatch - {len(obj1)} vs {len(obj2)}") else: for i, (item1, item2) in enumerate(zip(obj1, obj2)): newPath = f"{path}[{i}]" self._findDifferences(item1, item2, newPath, differences) else: if obj1 != obj2: differences.append(f"{path}: Value mismatch - {obj1} vs {obj2}") async def testJsonSplitMerge(self, jsonFile: Dict[str, Any]) -> Dict[str, Any]: """Test splitting and merging a single JSON file.""" fileName = jsonFile["name"] originalData = jsonFile["data"] self._log("") self._log("="*80) self._log(f"TESTING JSON SPLIT AND MERGE: {fileName}") self._log("="*80) # Convert to JSON string originalJsonString = json.dumps(originalData, indent=2, ensure_ascii=False) originalSize = len(originalJsonString) # Log original JSON self._log("") self._log("="*80) self._log("ORIGINAL JSON") self._log("="*80) self._log(f"JSON length: {originalSize} characters") self._log("") self._log("Full JSON content:") self._log("-"*80) jsonLines = originalJsonString.split('\n') if len(jsonLines) > 50: for line in jsonLines[:25]: self._log(line) self._log(f"... ({len(jsonLines) - 50} lines omitted) ...") for line in jsonLines[-25:]: self._log(line) else: for line in jsonLines: self._log(line) # Split JSON at random position (simulating AI response cut) self._log("") self._log("="*80) self._log("SPLITTING JSON AT RANDOM POSITION (SIMULATING AI RESPONSE CUT)") self._log("="*80) # Find random cut position (not at start or end) import random minCutPos = max(100, originalSize // 10) # At least 10% from start maxCutPos = min(originalSize - 100, originalSize * 9 // 10) # At least 10% from end cutPosition = random.randint(minCutPos, maxCutPos) # Get part from start to cut partContent = originalJsonString[:cutPosition] self._log("") self._log("="*80) self._log("PART (from start to cut):") self._log("="*80) self._log(f"Cut position: {cutPosition} characters") self._log(f"Part length: {len(partContent)} characters") self._log("") self._log("Part content:") partLines = partContent.split('\n') if len(partLines) > 30: for line in partLines[:15]: self._log(f" {line}") self._log(f" ... ({len(partLines) - 30} lines omitted) ...") for line in partLines[-15:]: self._log(f" {line}") else: for line in partLines: self._log(f" {line}") # Generate contexts using getContexts() self._log("") self._log("="*80) self._log("GENERATING CONTINUATION CONTEXTS") self._log("="*80) contexts = getContexts(partContent) # Log overlap context self._log("") self._log("="*80) self._log("OVERLAP CONTEXT (for merging):") self._log("="*80) overlapLines = contexts.overlapContext.split('\n') if len(overlapLines) > 30: for line in overlapLines[:15]: self._log(f" {line}") self._log(f" ... ({len(overlapLines) - 30} lines omitted) ...") for line in overlapLines[-15:]: self._log(f" {line}") else: for line in overlapLines: self._log(f" {line}") # Log hierarchy context (full, without budget) self._log("") self._log("="*80) self._log("HIERARCHY CONTEXT (full structure, no budget):") self._log("="*80) hierarchyLines = contexts.hierarchyContext.split('\n') if len(hierarchyLines) > 30: for line in hierarchyLines[:15]: self._log(f" {line}") self._log(f" ... ({len(hierarchyLines) - 30} lines omitted) ...") for line in hierarchyLines[-15:]: self._log(f" {line}") else: for line in hierarchyLines: self._log(f" {line}") # Log hierarchy context for prompt (with budget) self._log("") self._log("="*80) self._log("HIERARCHY CONTEXT FOR PROMPT (with budget logic):") self._log("="*80) hierarchyPromptLines = contexts.hierarchyContextForPrompt.split('\n') for line in hierarchyPromptLines: self._log(f" {line}") # Test completePart as valid JSON self._log("") self._log("="*80) self._log("COMPLETE PART (should be valid JSON):") self._log("="*80) completeLines = contexts.completePart.split('\n') if len(completeLines) > 30: for line in completeLines[:15]: self._log(f" {line}") self._log(f" ... ({len(completeLines) - 30} lines omitted) ...") for line in completeLines[-15:]: self._log(f" {line}") else: for line in completeLines: self._log(f" {line}") # Validate completePart as JSON self._log("") self._log("="*80) self._log("VALIDATING COMPLETE PART AS JSON:") self._log("="*80) isValidJson = False parsedCompletePart = None jsonError = None try: parsedCompletePart = json.loads(contexts.completePart) isValidJson = True self._log(" āœ… completePart is valid JSON") self._log(f" Parsed type: {type(parsedCompletePart).__name__}") # Compare with truncated JSON (not original) - parse the truncated part to compare from modules.shared.jsonUtils import closeJsonStructures, tryParseJson # Try to parse the truncated JSON part (with structures closed) truncatedClosed = closeJsonStructures(partContent) truncatedParsed, truncatedError, _ = tryParseJson(truncatedClosed) if truncatedParsed is not None: # Compare completePart with the parsed truncated JSON if isinstance(parsedCompletePart, dict) and isinstance(truncatedParsed, dict): comparison = self.compareJson(truncatedParsed, parsedCompletePart) self._log(f" Comparison with truncated JSON (at cut position {cutPosition}):") self._log(f" Exact match: {comparison['exactMatch']}") self._log(f" Size match: {comparison['sizeMatch']}") if comparison['differences']: self._log(f" Differences found: {len(comparison['differences'])}") for diff in comparison['differences'][:10]: # Show first 10 differences self._log(f" - {diff}") if len(comparison['differences']) > 10: self._log(f" ... ({len(comparison['differences']) - 10} more differences)") else: self._log(" No differences found - completePart matches truncated JSON structure") elif isinstance(parsedCompletePart, list) and isinstance(truncatedParsed, list): self._log(f" Both are lists: truncated={len(truncatedParsed)} items, completePart={len(parsedCompletePart)} items") else: self._log(f" Different types: truncated={type(truncatedParsed).__name__}, completePart={type(parsedCompletePart).__name__}") else: self._log(f" Could not parse truncated JSON for comparison (error: {truncatedError})") except json.JSONDecodeError as e: isValidJson = False jsonError = str(e) self._log(f" āŒ completePart is NOT valid JSON") self._log(f" Error: {jsonError}") self._log(f" Error position: line {e.lineno}, column {e.colno}") # Return test results return { "success": isValidJson, "fileName": fileName, "originalSize": originalSize, "cutPosition": cutPosition, "partSize": len(partContent), "overlapContextSize": len(contexts.overlapContext), "hierarchyContextSize": len(contexts.hierarchyContext), "hierarchyContextForPromptSize": len(contexts.hierarchyContextForPrompt), "completePartSize": len(contexts.completePart), "isValidJson": isValidJson, "jsonError": jsonError, "parsedCompletePart": parsedCompletePart is not None } async def testAllJsonFiles(self) -> Dict[str, Any]: """Test splitting and merging all test JSON files.""" print("\n" + "="*80) print("TESTING JSON SPLIT AND MERGE") print("="*80) testFiles = self.createTestJsonFiles() results = {} for jsonFile in testFiles: try: result = await self.testJsonSplitMerge(jsonFile) results[jsonFile["name"]] = result # Small delay between tests await asyncio.sleep(0.5) except Exception as e: import traceback print(f"\nāŒ Error testing {jsonFile['name']}: {str(e)}") print(traceback.format_exc()) results[jsonFile["name"]] = { "success": False, "error": str(e), "traceback": traceback.format_exc() } return results def _writeLogFile(self): """Write log buffer to file.""" logDir = os.path.join(os.path.dirname(__file__), "..", "..", "..", "local", "debug") os.makedirs(logDir, exist_ok=True) logFilePath = os.path.join(logDir, "test12_json_split_merge_results.txt") with open(logFilePath, 'w', encoding='utf-8') as f: f.write('\n'.join(self.logBuffer)) self.logFile = logFilePath print(f"\nšŸ“ Detailed log written to: {logFilePath}") async def runTest(self): """Run the complete test.""" self._log("="*80) self._log("JSON SPLIT AND MERGE TEST 12") self._log("="*80) try: # Test all JSON files results = await self.testAllJsonFiles() # Write log file self._writeLogFile() # Summary print("\n" + "="*80) print("TEST SUMMARY") print("="*80) successCount = 0 for fileName, result in results.items(): if result.get("success"): successCount += 1 isValidJson = result.get("isValidJson", False) if isValidJson: print(f"āœ… {fileName:30s}: Valid JSON - completePart parsed successfully") else: jsonError = result.get("jsonError", "Unknown error") print(f"āš ļø {fileName:30s}: Contexts generated but completePart is not valid JSON - {jsonError}") else: error = result.get("error", "Unknown error") print(f"āŒ {fileName:30s}: FAILED - {error}") print(f"\nResults: {successCount}/{len(results)} successful") self.testResults = { "success": successCount == len(results), "totalFiles": len(results), "successCount": successCount, "results": results } return self.testResults except Exception as e: import traceback print(f"\nāŒ Test failed with error: {type(e).__name__}: {str(e)}") print(f"Traceback:\n{traceback.format_exc()}") self.testResults = { "success": False, "error": str(e), "traceback": traceback.format_exc() } return self.testResults async def main(): """Run JSON split and merge test 12.""" tester = JsonSplitMergeTester12() results = await tester.runTest() # Print final results as JSON for easy parsing print("\n" + "="*80) print("FINAL RESULTS (JSON)") print("="*80) print(json.dumps(results, indent=2, default=str)) if __name__ == "__main__": asyncio.run(main())