# gateway/modules/services/serviceAi/subJsonMerger.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Modular JSON Merger - Intelligent JSON Fragment Merging
A clean, modular approach to merging JSON fragments that may be cut randomly.
Designed to be simple, robust, and always return valid data.
Architecture:
1. Data Extractor: Extracts all possible data from fragments (even incomplete)
2. Structure Detector: Detects JSON structure type (elements, documents, files, etc.)
3. Data Merger: Intelligently merges data with overlap detection
4. Result Builder: Always returns valid JSON structure
"""
import json
import re
import logging
import os
from datetime import datetime
from typing import Dict, Any, List, Optional, Tuple, Union
from modules.shared.jsonUtils import (
normalizeJsonText, stripCodeFences, closeJsonStructures, tryParseJson
)
logger = logging.getLogger(__name__)
class JsonMergeLogger:
    """Consolidated logger for JSON merging process.

    Log lines are buffered in memory per merge operation and flushed to a
    file in finishMerge(). All state is class-level: this logger is a
    process-wide singleton by design.
    """
    # Buffered log lines for the current merge operation.
    _logBuffer: List[str] = []
    # Monotonically increasing merge counter (merge IDs / fallback file names).
    _mergeId: int = 0
    # Target log file name (relative to this module's directory), or None.
    _currentLogFile: Optional[str] = None
    # False until the first flush; later flushes append instead of overwrite.
    _appendMode: bool = False

    @staticmethod
    def initializeLogFile(logFileName: Optional[str] = None):
        """Initialize a new log file for a test run."""
        JsonMergeLogger._logBuffer = []
        JsonMergeLogger._mergeId = 0
        if logFileName:
            JsonMergeLogger._currentLogFile = logFileName
            JsonMergeLogger._appendMode = False
            # Clear existing file (best-effort: filesystem errors are ignored)
            try:
                currentFileDir = os.path.dirname(os.path.abspath(__file__))
                logFilePath = os.path.join(currentFileDir, logFileName)
                with open(logFilePath, 'w', encoding='utf-8') as f:
                    f.write("")  # Clear file
            except Exception:
                pass
        else:
            JsonMergeLogger._currentLogFile = None
            JsonMergeLogger._appendMode = False

    @staticmethod
    def _logLineSummary(lines: List[str], indent: str = " "):
        """Log the first 5 and last 5 of *lines* when there are more than 10,
        otherwise log all of them. Shared by every summarized dump below."""
        if len(lines) > 10:
            for line in lines[:5]:
                JsonMergeLogger._log(f"{indent}{line}")
            JsonMergeLogger._log(f"{indent}... ({len(lines) - 10} lines omitted) ...")
            for line in lines[-5:]:
                JsonMergeLogger._log(f"{indent}{line}")
        else:
            for line in lines:
                JsonMergeLogger._log(f"{indent}{line}")

    @staticmethod
    def _logFencedBlock(text: str, indent: str = " "):
        """Log *text* line by line between '=' ruler lines (complete dump)."""
        JsonMergeLogger._log(indent + "="*76)
        for line in text.split('\n'):
            JsonMergeLogger._log(f"{indent}{line}")
        JsonMergeLogger._log(indent + "="*76)

    @staticmethod
    def _logStrFallback(data: Any):
        """Log a summarized str() representation when JSON serialization fails."""
        strRepr = str(data)
        strLines = strRepr.split('\n')
        JsonMergeLogger._log(f" String representation ({len(strRepr)} chars, {len(strLines)} lines)")
        if len(strLines) > 10:
            JsonMergeLogger._log(f" (showing first 5 and last 5 lines)")
        JsonMergeLogger._logLineSummary(strLines)

    @staticmethod
    def startMerge(accumulated: str, newFragment: str) -> str:
        """Start a new merge operation and return merge ID."""
        JsonMergeLogger._mergeId += 1
        mergeId = f"merge_{JsonMergeLogger._mergeId}"
        JsonMergeLogger._log(f"{'='*80}")
        JsonMergeLogger._log(f"JSON MERGE OPERATION #{JsonMergeLogger._mergeId}")
        JsonMergeLogger._log(f"{'='*80}")
        JsonMergeLogger._log(f"Timestamp: {datetime.now().isoformat()}")
        JsonMergeLogger._log("")
        JsonMergeLogger._log("INPUT:")
        JsonMergeLogger._log(f" Accumulated length: {len(accumulated)} chars")
        JsonMergeLogger._log(f" New Fragment length: {len(newFragment)} chars")
        # Log only summary (first 5 and last 5 lines) to avoid log spam
        accLines = accumulated.split('\n')
        fragLines = newFragment.split('\n')
        JsonMergeLogger._log(f" Accumulated: {len(accLines)} lines (showing first 5 and last 5)")
        JsonMergeLogger._logLineSummary(accLines)
        JsonMergeLogger._log(f" New Fragment: {len(fragLines)} lines (showing first 5 and last 5)")
        JsonMergeLogger._logLineSummary(fragLines)
        JsonMergeLogger._log("")
        return mergeId

    @staticmethod
    def logStep(stepName: str, description: str, result: Any = None, error: Optional[str] = None):
        """Log a step with its result (string, dict, list, or scalar)."""
        JsonMergeLogger._log(f"STEP: {stepName}")
        JsonMergeLogger._log(f" Description: {description}")
        if error:
            JsonMergeLogger._log(f" ❌ ERROR: {error}")
        elif result is not None:
            if isinstance(result, str):
                resultLines = result.split('\n')
                JsonMergeLogger._log(f" ✅ Result (string, {len(result)} chars, {len(resultLines)} lines)")
                if len(resultLines) > 10:
                    JsonMergeLogger._log(f" (showing first 5 and last 5 lines)")
                JsonMergeLogger._logLineSummary(resultLines)
            elif isinstance(result, dict):
                keys = list(result.keys())
                JsonMergeLogger._log(f" ✅ Result (dict): keys={keys}, size={len(str(result))} chars")
                # Log full structure with JSON formatting - NO TRUNCATION
                try:
                    jsonStr = json.dumps(result, indent=2, ensure_ascii=False)
                    JsonMergeLogger._log(f" Full data (COMPLETE, {len(jsonStr)} chars):")
                    JsonMergeLogger._logFencedBlock(jsonStr)
                except Exception as e:
                    JsonMergeLogger._log(f" Could not serialize: {e}")
                    JsonMergeLogger._logStrFallback(result)
                # Log structure details
                if "elements" in result:
                    elemCount = len(result["elements"]) if isinstance(result["elements"], list) else 0
                    JsonMergeLogger._log(f" - elements: {elemCount} items")
                    if isinstance(result["elements"], list) and elemCount > 0:
                        JsonMergeLogger._log(f" First element type: {result['elements'][0].get('type', 'unknown') if isinstance(result['elements'][0], dict) else 'not a dict'}")
                if "documents" in result:
                    docCount = len(result["documents"]) if isinstance(result["documents"], list) else 0
                    JsonMergeLogger._log(f" - documents: {docCount} items")
            elif isinstance(result, list):
                JsonMergeLogger._log(f" ✅ Result (list): {len(result)} items (COMPLETE)")
                if len(result) > 0:
                    JsonMergeLogger._log(f" First item type: {type(result[0]).__name__}")
                    try:
                        jsonStr = json.dumps(result, indent=2, ensure_ascii=False)  # ALL items
                        JsonMergeLogger._log(f" All items (COMPLETE, {len(jsonStr)} chars):")
                        JsonMergeLogger._logFencedBlock(jsonStr)
                    except Exception:
                        JsonMergeLogger._logStrFallback(result)
            else:
                JsonMergeLogger._log(f" ✅ Result: {type(result).__name__} = {str(result)[:200]}")
        else:
            JsonMergeLogger._log(f" ⏳ In progress...")
        JsonMergeLogger._log("")

    @staticmethod
    def logExtraction(strategy: str, success: bool, data: Any = None, error: Optional[str] = None):
        """Log extraction strategy result."""
        status = "✅ SUCCESS" if success else "❌ FAILED"
        JsonMergeLogger._log(f" Extraction Strategy: {strategy} - {status}")
        if error:
            JsonMergeLogger._log(f" Error: {error}")
        elif data is not None:
            if isinstance(data, dict):
                keys = list(data.keys())
                JsonMergeLogger._log(f" Extracted keys: {keys}")
                # Log full extracted data - NO TRUNCATION
                try:
                    jsonStr = json.dumps(data, indent=2, ensure_ascii=False)
                    JsonMergeLogger._log(f" Extracted data (COMPLETE, {len(jsonStr)} chars):")
                    JsonMergeLogger._logFencedBlock(jsonStr)
                except Exception as e:
                    JsonMergeLogger._log(f" Could not serialize extracted data: {e}")
                    JsonMergeLogger._logStrFallback(data)
            elif isinstance(data, list):
                JsonMergeLogger._log(f" Extracted {len(data)} items (COMPLETE)")
                if len(data) > 0:
                    try:
                        jsonStr = json.dumps(data, indent=2, ensure_ascii=False)  # ALL items
                        JsonMergeLogger._log(f" All items (COMPLETE, {len(jsonStr)} chars):")
                        JsonMergeLogger._logFencedBlock(jsonStr)
                    except Exception as e:
                        JsonMergeLogger._log(f" Could not serialize list: {e}")
                        JsonMergeLogger._logStrFallback(data)

    @staticmethod
    def logOverlap(overlapType: str, overlapLen: int, accSuffix: Any = None, fragPrefix: Any = None):
        """Log overlap detection result."""
        JsonMergeLogger._log(f" Overlap Detection ({overlapType}):")
        JsonMergeLogger._log(f" Overlap length: {overlapLen}")
        if overlapLen > 0:
            JsonMergeLogger._log(f" ✅ Found overlap of {overlapLen} chars")
            if accSuffix is not None:
                if isinstance(accSuffix, str):
                    JsonMergeLogger._log(f" Accumulated suffix (COMPLETE, {len(accSuffix)} chars):")
                    JsonMergeLogger._logFencedBlock(accSuffix)
                elif isinstance(accSuffix, list):
                    # For lists/arrays, only log summary to avoid log flooding
                    JsonMergeLogger._log(f" Accumulated suffix: list with {len(accSuffix)} items")
                else:
                    JsonMergeLogger._log(f" Accumulated suffix: {type(accSuffix).__name__}")
            if fragPrefix is not None:
                if isinstance(fragPrefix, str):
                    prefixLines = fragPrefix.split('\n')
                    JsonMergeLogger._log(f" Fragment prefix ({len(fragPrefix)} chars, {len(prefixLines)} lines)")
                    if len(prefixLines) > 10:
                        JsonMergeLogger._log(f" (showing first 5 and last 5 lines)")
                    JsonMergeLogger._logLineSummary(prefixLines)
                elif isinstance(fragPrefix, list):
                    # For lists/arrays, only log summary to avoid log flooding
                    JsonMergeLogger._log(f" Fragment prefix: list with {len(fragPrefix)} items")
                else:
                    JsonMergeLogger._log(f" Fragment prefix: {type(fragPrefix).__name__}")
        else:
            JsonMergeLogger._log(f" ⚠️ No overlap detected - appending all")

    @staticmethod
    def logValidation(validationType: str, success: bool, error: Optional[str] = None):
        """Log validation result."""
        status = "✅ VALID" if success else "❌ INVALID"
        JsonMergeLogger._log(f" Validation ({validationType}): {status}")
        if error:
            JsonMergeLogger._log(f" Error: {error}")

    @staticmethod
    def finishMerge(mergeId: str, finalResult: str, success: bool):
        """Finish merge operation and write log file."""
        JsonMergeLogger._log("")
        JsonMergeLogger._log(f"{'='*80}")
        JsonMergeLogger._log(f"MERGE RESULT: {'✅ SUCCESS' if success else '❌ FAILED'}")
        JsonMergeLogger._log(f"{'='*80}")
        JsonMergeLogger._log(f"Final result length: {len(finalResult)} chars")
        JsonMergeLogger._log("Final result (COMPLETE):")
        JsonMergeLogger._log("="*80)
        for line in finalResult.split('\n'):
            JsonMergeLogger._log(line)
        JsonMergeLogger._log("="*80)
        JsonMergeLogger._log("")
        # Write log content to buffer (will be written at end of test run)
        logContent = "\n".join(JsonMergeLogger._logBuffer)
        # If we have a current log file, append to it
        if JsonMergeLogger._currentLogFile:
            try:
                currentFileDir = os.path.dirname(os.path.abspath(__file__))
                logFilePath = os.path.join(currentFileDir, JsonMergeLogger._currentLogFile)
                mode = 'a' if JsonMergeLogger._appendMode else 'w'
                with open(logFilePath, mode, encoding='utf-8') as f:
                    f.write(logContent)
                    f.write("\n\n")  # Add separator between merges
                JsonMergeLogger._appendMode = True  # Next writes will append
                logger.debug(f"JSON merge log appended to: {logFilePath}")
            except Exception as e:
                logger.error(f"Failed to write merge log file: {e}")
        else:
            # No log file set - write individual file (fallback)
            currentFileDir = os.path.dirname(os.path.abspath(__file__))
            logDir = currentFileDir
            os.makedirs(logDir, exist_ok=True)
            logFilePath = os.path.join(logDir, f"{mergeId}.txt")
            try:
                with open(logFilePath, 'w', encoding='utf-8') as f:
                    f.write(logContent)
                logger.info(f"JSON merge log written to: {logFilePath}")
            except Exception as e:
                logger.error(f"Failed to write merge log file: {e}")
        # Clear buffer for next merge
        JsonMergeLogger._logBuffer = []

    @staticmethod
    def _log(message: str):
        """Append to the in-memory buffer and mirror to the module logger."""
        JsonMergeLogger._logBuffer.append(message)
        logger.debug(message)
class JsonDataExtractor:
"""Extracts data from JSON fragments, even if incomplete."""
@staticmethod
def extract(jsonString: str, mergeId: Optional[str] = None, removeFromEnd: bool = True) -> Dict[str, Any]:
    """
    Extract complete data from JSON fragment.
    For merging: We know exactly where to clean:
    - accumulated: remove incomplete parts at the END
    - newFragment: remove incomplete parts at the BEGINNING
    Simple approach: Remove incomplete parts at specified position, then parse.
    Always returns a dict (possibly empty); lists and scalars are wrapped
    under an "elements" key.
    """
    def wrapParsed(parsed: Any) -> Dict[str, Any]:
        # Normalize any successfully parsed value into the dict shape callers expect.
        if isinstance(parsed, dict):
            return parsed
        if isinstance(parsed, list):
            return {"elements": parsed}
        return {"elements": [parsed]} if parsed else {}
    if mergeId:
        position = "END" if removeFromEnd else "BEGINNING"
        JsonMergeLogger.logStep("EXTRACTION", f"Extracting data from JSON fragment ({len(jsonString)} chars) - cleaning from {position}")
    if not jsonString or not jsonString.strip():
        if mergeId:
            JsonMergeLogger.logExtraction("Empty input", False, error="Input is empty")
        return {}
    normalized = stripCodeFences(normalizeJsonText(jsonString)).strip()
    if not normalized:
        if mergeId:
            JsonMergeLogger.logExtraction("Normalization", False, error="Normalized string is empty")
        return {}
    # Try to parse as complete JSON first
    parsed, parseErr, _ = tryParseJson(normalized)
    if parseErr is None and parsed is not None:
        finalResult = wrapParsed(parsed)
        if mergeId:
            JsonMergeLogger.logExtraction("Direct parsing", True, finalResult)
            JsonMergeLogger.logStep("EXTRACTION", "Direct parsing successful", finalResult)
        return finalResult if finalResult else {}
    # Remove incomplete parts from specified position
    if removeFromEnd:
        cleaned = JsonDataExtractor._removeIncompleteFromEnd(normalized)
    else:
        cleaned = JsonDataExtractor._removeIncompleteFromBeginning(normalized)
    if cleaned:
        # Close structures and try to parse
        closed = closeJsonStructures(cleaned)
        parsed, parseErr2, _ = tryParseJson(closed)
        if parseErr2 is None and parsed is not None:
            finalResult = wrapParsed(parsed)
            if mergeId:
                JsonMergeLogger.logExtraction("Remove incomplete + close", True, finalResult)
                JsonMergeLogger.logStep("EXTRACTION", "Remove incomplete + close successful", finalResult)
            return finalResult if finalResult else {}
    # Return empty dict if nothing worked
    if mergeId:
        JsonMergeLogger.logStep("EXTRACTION", "No data extracted", {}, error="All strategies failed")
    return {}
@staticmethod
def _removeIncompleteFromEnd(jsonString: str) -> str:
"""
Remove incomplete parts from the END of JSON string.
Goes through structure level by level, keeps complete elements, removes incomplete ones at the end.
"""
# Find first '{' or '[' to start
startIdx = -1
for i, char in enumerate(jsonString):
if char in '{[':
startIdx = i
break
if startIdx == -1:
return ""
# Remove incomplete parts from end recursively
cleaned = JsonDataExtractor._cleanJsonFromEnd(jsonString[startIdx:])
return cleaned
@staticmethod
def _removeIncompleteFromBeginning(jsonString: str) -> str:
"""
Remove incomplete parts from the BEGINNING of JSON string.
Finds where valid JSON starts and removes everything before it.
"""
# Find first '{' or '[' to start
startIdx = -1
for i, char in enumerate(jsonString):
if char in '{[':
startIdx = i
break
if startIdx == -1:
return ""
# Return from start position - beginning cleanup is just finding the start
return jsonString[startIdx:]
@staticmethod
def _cleanJsonFromEnd(jsonStr: str) -> str:
"""
Recursively clean JSON from the END: keep complete elements, remove incomplete ones at the end.
Goes through structure level by level.
"""
# Try to parse as-is first
try:
parsed = json.loads(jsonStr)
return jsonStr
except Exception:
pass
# If dict: go through each key-value pair, remove incomplete ones at the end
if jsonStr.strip().startswith('{'):
return JsonDataExtractor._cleanDictFromEnd(jsonStr)
# If array: go through each element, remove incomplete ones at the end
if jsonStr.strip().startswith('['):
return JsonDataExtractor._cleanArrayFromEnd(jsonStr)
return ""
@staticmethod
def _cleanDictFromEnd(jsonStr: str) -> str:
    """Clean dict from END: keep complete key-value pairs, remove incomplete ones at the end.

    Walks the object pair by pair; as soon as a key or value turns out to be
    truncated or invalid, it and everything after it is dropped and the
    object is re-closed with '}'.
    """
    if not jsonStr.strip().startswith('{'):
        return ""
    result = ['{']
    i = 1  # Skip opening '{'
    first = True  # Tracks whether a comma separator is needed before the next pair
    while i < len(jsonStr):
        # Skip whitespace
        while i < len(jsonStr) and jsonStr[i] in ' \n\r\t':
            i += 1
        if i >= len(jsonStr):
            break
        # Check if we hit closing brace
        if jsonStr[i] == '}':
            break
        # Skip comma
        if jsonStr[i] == ',':
            i += 1
            continue
        # Try to extract key-value pair
        keyStart = i
        # Find key (string); honor backslash escapes inside the key
        if jsonStr[i] == '"':
            i += 1
            while i < len(jsonStr) and jsonStr[i] != '"':
                if jsonStr[i] == '\\':
                    i += 2
                else:
                    i += 1
            if i < len(jsonStr):
                i += 1  # Skip closing quote
            else:
                # Invalid key - stop here (incomplete at end)
                break
        # Skip whitespace and colon
        # NOTE(review): this also tolerates repeated colons - acceptable for salvage parsing
        while i < len(jsonStr) and jsonStr[i] in ' \n\r\t:':
            i += 1
        if i >= len(jsonStr):
            break
        # Try to extract value
        valueStart = i
        valueEnd = JsonDataExtractor._findCompleteValue(jsonStr, i)
        if valueEnd > valueStart:
            # Try to parse this key-value pair
            pairStr = jsonStr[keyStart:valueEnd]
            try:
                # Test if it's valid JSON (wrap the pair in braces to make it parseable)
                testStr = '{' + pairStr + '}'
                json.loads(testStr)
                # Valid pair - add it
                if not first:
                    result.append(',')
                result.append(pairStr)
                first = False
                i = valueEnd
            except Exception:
                # Invalid pair - stop here (incomplete at end)
                break
        else:
            # Incomplete value - stop here (incomplete at end)
            break
    result.append('}')
    return ''.join(result)
@staticmethod
def _cleanArrayFromEnd(jsonStr: str) -> str:
    """Clean array from END: keep complete elements, remove incomplete ones at the end.

    Walks the array element by element; the first truncated or invalid
    element and everything after it is dropped, then the array is
    re-closed with ']'.
    """
    if not jsonStr.strip().startswith('['):
        return ""
    result = ['[']
    i = 1  # Skip opening '['
    first = True  # Tracks whether a comma separator is needed before the next element
    while i < len(jsonStr):
        # Skip whitespace
        while i < len(jsonStr) and jsonStr[i] in ' \n\r\t':
            i += 1
        if i >= len(jsonStr):
            break
        # Check if we hit closing bracket
        if jsonStr[i] == ']':
            break
        # Skip comma
        if jsonStr[i] == ',':
            i += 1
            continue
        # Try to extract element
        elemStart = i
        elemEnd = JsonDataExtractor._findCompleteValue(jsonStr, i)
        if elemEnd > elemStart:
            # Try to parse this element
            elemStr = jsonStr[elemStart:elemEnd]
            try:
                # Test if it's valid JSON
                json.loads(elemStr)
                # Valid element - add it
                if not first:
                    result.append(',')
                result.append(elemStr)
                first = False
                i = elemEnd
            except Exception:
                # Invalid element - stop here (incomplete at end)
                break
        else:
            # Incomplete element - stop here (incomplete at end)
            break
    result.append(']')
    return ''.join(result)
@staticmethod
def _findCompleteValue(jsonStr: str, start: int) -> int:
    """Find the end of a complete JSON value starting at start position.

    Returns the index just past the value when it is complete, or *start*
    (i.e. no progress) when the value is truncated or unrecognized.
    """
    if start >= len(jsonStr):
        return start
    i = start
    # Skip whitespace
    while i < len(jsonStr) and jsonStr[i] in ' \n\r\t':
        i += 1
    if i >= len(jsonStr):
        return start
    char = jsonStr[i]
    # String: scan to the unescaped closing quote
    if char == '"':
        i += 1
        while i < len(jsonStr):
            if jsonStr[i] == '\\':
                i += 2
            elif jsonStr[i] == '"':
                return i + 1
            else:
                i += 1
        return start  # Incomplete string
    # Number, boolean, null: scan until a delimiter (',', '}', ']')
    # NOTE(review): a literal truncated at end-of-string (e.g. "12" cut from
    # "123", or "tru") is indistinguishable from a complete one here and is
    # treated as complete; callers re-validate with json.loads.
    if char in '-0123456789tfn':
        while i < len(jsonStr) and jsonStr[i] not in ',}]':
            i += 1
        return i
    # Object: balanced-brace scan, skipping over string literals
    if char == '{':
        braceCount = 1
        i += 1
        while i < len(jsonStr) and braceCount > 0:
            if jsonStr[i] == '\\':
                # Defensive: skip escape pairs even outside strings
                i += 2
            elif jsonStr[i] == '"':
                # Skip string (braces inside it are data, not structure)
                i += 1
                while i < len(jsonStr):
                    if jsonStr[i] == '\\':
                        i += 2
                    elif jsonStr[i] == '"':
                        i += 1
                        break
                    else:
                        i += 1
            elif jsonStr[i] == '{':
                braceCount += 1
                i += 1
            elif jsonStr[i] == '}':
                braceCount -= 1
                i += 1
            else:
                i += 1
        if braceCount == 0:
            return i
        return start  # Incomplete object
    # Array: balanced-bracket scan, skipping over string literals
    if char == '[':
        bracketCount = 1
        i += 1
        while i < len(jsonStr) and bracketCount > 0:
            if jsonStr[i] == '\\':
                # Defensive: skip escape pairs even outside strings
                i += 2
            elif jsonStr[i] == '"':
                # Skip string (brackets inside it are data, not structure)
                i += 1
                while i < len(jsonStr):
                    if jsonStr[i] == '\\':
                        i += 2
                    elif jsonStr[i] == '"':
                        i += 1
                        break
                    else:
                        i += 1
            elif jsonStr[i] == '[':
                bracketCount += 1
                i += 1
            elif jsonStr[i] == ']':
                bracketCount -= 1
                i += 1
            else:
                i += 1
        if bracketCount == 0:
            return i
        return start  # Incomplete array
    # Unrecognized leading character - treat as incomplete
    return start
@staticmethod
def _extractAllCompleteObjects(jsonString: str) -> List[Dict[str, Any]]:
"""
Extract ALL complete objects from JSON string using balanced brace matching.
Ignores incomplete objects at the end.
Core principle: Every fragment can be cut anywhere - extract only complete objects.
"""
foundObjs = []
braceCount = 0
startPos = -1
for i, char in enumerate(jsonString):
if char == '{':
if braceCount == 0:
startPos = i
braceCount += 1
elif char == '}':
braceCount -= 1
if braceCount == 0 and startPos >= 0:
# Found a complete object
objStr = jsonString[startPos:i+1]
try:
obj = json.loads(objStr)
if isinstance(obj, dict) and obj:
foundObjs.append(obj)
except Exception:
# Not valid JSON - skip it
pass
startPos = -1
elif braceCount < 0:
# Unbalanced - reset
braceCount = 0
startPos = -1
# If we end with an incomplete object (startPos >= 0 and braceCount > 0), ignore it
# It will be in the next fragment
return foundObjs
@staticmethod
def _extractElements(jsonString: str) -> List[Dict[str, Any]]:
    """Extract elements array from JSON string - extracts ALL complete elements.

    Tries three patterns in order:
    1. an "elements": [...] array (balanced-brace scan of each element object),
    2. a table element ("type": "table" with a "rows" array),
    3. bare table rows with no surrounding structure.
    """
    elements = []
    # Pattern 1: Look for "elements": [...] (including incomplete at end)
    elementsPattern = r'"elements"\s*:\s*\[(.*)'
    match = re.search(elementsPattern, jsonString, re.DOTALL)
    if match:
        elementsContent = match.group(1)
        # Extract ALL complete element objects using balanced brace matching
        # NOTE(review): braces inside string values are not skipped here; an
        # element containing "{" or "}" in a string may be mis-delimited, but
        # json.loads below rejects any mis-captured slice.
        braceCount = 0
        startPos = -1
        for i, char in enumerate(elementsContent):
            if char == '{':
                if braceCount == 0:
                    startPos = i
                braceCount += 1
            elif char == '}':
                braceCount -= 1
                if braceCount == 0 and startPos >= 0:
                    elementStr = elementsContent[startPos:i+1]
                    try:
                        element = json.loads(elementStr)
                        if isinstance(element, dict):
                            elements.append(element)
                    except Exception:
                        # Try to extract table rows from incomplete element
                        rows = JsonDataExtractor._extractTableRowsFromElement(elementStr)
                        if rows:
                            elements.append({
                                "type": "table",
                                "content": {
                                    "rows": rows
                                }
                            })
                    startPos = -1
                elif braceCount < 0:
                    break  # Unbalanced - stop
    # Pattern 2: Look for table structure directly (even if incomplete)
    if not elements:
        # Look for "type": "table" pattern
        tablePattern = r'"type"\s*:\s*"table"[^}]*"rows"\s*:\s*\[(.*?)(?:\]|$)'
        tableMatch = re.search(tablePattern, jsonString, re.DOTALL)
        if tableMatch:
            rowsContent = tableMatch.group(1)
            rows = JsonDataExtractor._extractRowsFromContent(rowsContent)
            if rows:
                elements.append({
                    "type": "table",
                    "content": {
                        "rows": rows
                    }
                })
    # Pattern 3: Look for table rows directly (without structure)
    if not elements:
        rows = JsonDataExtractor._extractTableRows(jsonString)
        if rows:
            elements.append({
                "type": "table",
                "content": {
                    "rows": rows
                }
            })
    return elements
@staticmethod
def _extractTableRowsFromElement(elementStr: str) -> List[List[str]]:
"""Extract table rows from incomplete element string."""
# Look for rows array in element
rowsPattern = r'"rows"\s*:\s*\[(.*?)(?:\]|$)'
match = re.search(rowsPattern, elementStr, re.DOTALL)
if match:
return JsonDataExtractor._extractRowsFromContent(match.group(1))
return []
@staticmethod
def _extractRowsFromContent(rowsContent: str) -> List[List[str]]:
"""Extract rows from rows content string."""
rows = []
# Extract all array patterns: ["value1", "value2"]
# Use non-greedy matching but ensure we get complete arrays
arrayPattern = r'\[(.*?)\]'
arrayMatches = re.findall(arrayPattern, rowsContent)
for arrayContent in arrayMatches:
# Extract cells - handle both quoted strings and numbers
# First try to find quoted strings
cellPattern = r'"([^"]*)"'
cells = re.findall(cellPattern, arrayContent)
# If no quoted strings, try numbers or other values
if not cells:
# Try to find any values (numbers, booleans, etc.)
valuePattern = r'(-?\d+\.?\d*|true|false|null)'
cells = re.findall(valuePattern, arrayContent)
# Only add rows with at least 1 cell (allow single-column tables)
if len(cells) >= 1:
rows.append(cells)
return rows
@staticmethod
def _extractTableRows(jsonString: str) -> List[List[str]]:
"""Extract table rows from JSON string using multiple strategies."""
rows = []
# Strategy 1: Look for "rows": [[...], [...]]
rowsPattern = r'"rows"\s*:\s*\[(.*?)(?:\]|$)'
match = re.search(rowsPattern, jsonString, re.DOTALL)
if match:
rowsContent = match.group(1)
rows = JsonDataExtractor._extractRowsFromContent(rowsContent)
if rows:
return rows
# Strategy 2: Look for standalone array patterns ["value1", "value2"]
# Pattern for complete arrays with 2 columns
completeArrayPattern = r'\["([^"]*)",\s*"([^"]*)"\]'
matches = re.findall(completeArrayPattern, jsonString)
if len(matches) >= 2: # Need at least 2 rows to be confident
return [[m[0], m[1]] for m in matches]
# Strategy 3: Extract any array patterns (more lenient)
# Find all [ ... ] patterns that contain quoted strings
allArrays = re.findall(r'\[([^\]]*)\]', jsonString)
for arrayContent in allArrays:
# Extract quoted strings
cells = re.findall(r'"([^"]*)"', arrayContent)
if len(cells) >= 2: # At least 2 columns
rows.append(cells)
# Only return if we have multiple rows (likely a table)
if len(rows) >= 2:
return rows
return []
@staticmethod
def _extractDocuments(jsonString: str) -> List[Dict[str, Any]]:
    """
    Extract documents structure from JSON string - extracts ALL complete documents/chapters/sections.
    Ignores incomplete ones at the end.
    Core principle: Fragment can be cut anywhere - extract only complete objects.
    """
    documents = []
    # Pattern 1: Look for "documents": [...] structure (including incomplete at end)
    documentsPattern = r'"documents"\s*:\s*\[(.*)'
    match = re.search(documentsPattern, jsonString, re.DOTALL)
    if match:
        documentsContent = match.group(1)
        # Extract ALL complete document objects using balanced brace matching
        # NOTE(review): braces inside string values are not skipped here;
        # json.loads below rejects any mis-captured slice.
        braceCount = 0
        startPos = -1
        for i, char in enumerate(documentsContent):
            if char == '{':
                if braceCount == 0:
                    startPos = i
                braceCount += 1
            elif char == '}':
                braceCount -= 1
                if braceCount == 0 and startPos >= 0:
                    # Found a complete document object
                    docStr = documentsContent[startPos:i+1]
                    try:
                        doc = json.loads(docStr)
                        if isinstance(doc, dict):
                            # Extract chapters/sections from document and
                            # overwrite the raw arrays with the salvaged ones
                            chapters = JsonDataExtractor._extractChaptersFromDocument(docStr)
                            sections = JsonDataExtractor._extractSectionsFromDocument(docStr)
                            if chapters:
                                doc["chapters"] = chapters
                            if sections:
                                doc["sections"] = sections
                            if doc:
                                documents.append(doc)
                    except Exception:
                        # Not valid JSON - try to extract chapters/sections directly
                        chapters = JsonDataExtractor._extractChaptersFromDocument(docStr)
                        sections = JsonDataExtractor._extractSectionsFromDocument(docStr)
                        if chapters or sections:
                            doc = {}
                            if chapters:
                                doc["chapters"] = chapters
                            if sections:
                                doc["sections"] = sections
                            if doc:
                                documents.append(doc)
                    startPos = -1
                elif braceCount < 0:
                    # Unbalanced - stop scanning
                    break
    # If we end with an incomplete document (startPos >= 0 and braceCount > 0), ignore it
    # It will be in the next fragment
    if documents:
        return documents
    # Pattern 2: Look for "chapters": [...] pattern directly (fragment might start mid-document)
    chapters = JsonDataExtractor._extractChaptersFromString(jsonString)
    if chapters:
        documents.append({"chapters": chapters})
    # Pattern 3: Look for "sections": [...] pattern directly
    sections = JsonDataExtractor._extractSectionsFromString(jsonString)
    if sections:
        documents.append({"sections": sections})
    return documents
@staticmethod
def _extractChaptersFromDocument(docStr: str) -> List[Dict[str, Any]]:
"""Extract chapters array from document string."""
return JsonDataExtractor._extractChaptersFromString(docStr)
@staticmethod
def _extractChaptersFromString(jsonString: str) -> List[Dict[str, Any]]:
    """
    Extract chapters array from JSON string - extracts ALL complete chapters.
    Ignores incomplete chapters at the end.
    Core principle: Fragment can be cut anywhere - extract only complete objects.
    """
    chapters = []
    # Look for "chapters": [...] pattern (including incomplete at end)
    chaptersPattern = r'"chapters"\s*:\s*\[(.*)'
    match = re.search(chaptersPattern, jsonString, re.DOTALL)
    if match:
        chaptersContent = match.group(1)
        # Extract ALL complete chapter objects using balanced brace matching
        # NOTE(review): braces inside string values are not skipped here;
        # json.loads below rejects any mis-captured slice.
        braceCount = 0
        startPos = -1
        for i, char in enumerate(chaptersContent):
            if char == '{':
                if braceCount == 0:
                    startPos = i
                braceCount += 1
            elif char == '}':
                braceCount -= 1
                if braceCount == 0 and startPos >= 0:
                    # Found a complete chapter object
                    chapterStr = chaptersContent[startPos:i+1]
                    try:
                        chapter = json.loads(chapterStr)
                        if isinstance(chapter, dict):
                            chapters.append(chapter)
                    except Exception:
                        # Not valid JSON - skip it (incomplete chapter)
                        pass
                    startPos = -1
                elif braceCount < 0:
                    # Unbalanced - stop here
                    break
    # If we end with an incomplete chapter (startPos >= 0 and braceCount > 0), ignore it
    # It will be in the next fragment
    # Also try to extract chapters that might be standalone (fragment starts mid-array)
    # Look for complete chapter objects anywhere in the string
    if not chapters:
        # Try to find complete chapter objects using balanced brace matching
        allObjs = JsonDataExtractor._extractAllCompleteObjects(jsonString)
        # Filter for objects that look like chapters (have id and title)
        for obj in allObjs:
            if isinstance(obj, dict) and "id" in obj and "title" in obj:
                chapters.append(obj)
    return chapters
@staticmethod
def _extractSectionsFromDocument(docStr: str) -> List[Dict[str, Any]]:
"""Extract sections array from document string."""
return JsonDataExtractor._extractSectionsFromString(docStr)
@staticmethod
def _extractSectionsFromString(jsonString: str) -> List[Dict[str, Any]]:
"""Extract sections array from JSON string, even if incomplete."""
sections = []
# Look for "sections": [...]
sectionsPattern = r'"sections"\s*:\s*\[(.*?)(?:\]|$)'
match = re.search(sectionsPattern, jsonString, re.DOTALL)
if match:
sectionsContent = match.group(1)
# Extract section objects using balanced brace matching
braceCount = 0
startPos = -1
for i, char in enumerate(sectionsContent):
if char == '{':
if braceCount == 0:
startPos = i
braceCount += 1
elif char == '}':
braceCount -= 1
if braceCount == 0 and startPos >= 0:
sectionStr = sectionsContent[startPos:i+1]
try:
section = json.loads(sectionStr)
if isinstance(section, dict):
sections.append(section)
except Exception:
# Incomplete section - try to extract what we can
idMatch = re.search(r'"id"\s*:\s*"([^"]*)"', sectionStr)
contentTypeMatch = re.search(r'"content_type"\s*:\s*"([^"]*)"', sectionStr)
if idMatch or contentTypeMatch:
section = {}
if idMatch:
section["id"] = idMatch.group(1)
if contentTypeMatch:
section["content_type"] = contentTypeMatch.group(1)
if section:
sections.append(section)
startPos = -1
return sections
@staticmethod
def _extractFiles(jsonString: str) -> List[Dict[str, Any]]:
"""Extract files array from JSON string, even if incomplete."""
files = []
# Look for "files": [...]
filesPattern = r'"files"\s*:\s*\[(.*?)(?:\]|$)'
match = re.search(filesPattern, jsonString, re.DOTALL)
if match:
filesContent = match.group(1)
# Extract file objects using balanced brace matching
braceCount = 0
startPos = -1
for i, char in enumerate(filesContent):
if char == '{':
if braceCount == 0:
startPos = i
braceCount += 1
elif char == '}':
braceCount -= 1
if braceCount == 0 and startPos >= 0:
fileStr = filesContent[startPos:i+1]
try:
fileObj = json.loads(fileStr)
if isinstance(fileObj, dict):
files.append(fileObj)
except Exception:
# Incomplete file - try to extract what we can
idMatch = re.search(r'"id"\s*:\s*"([^"]*)"', fileStr)
filenameMatch = re.search(r'"filename"\s*:\s*"([^"]*)"', fileStr)
if idMatch or filenameMatch:
fileObj = {}
if idMatch:
fileObj["id"] = idMatch.group(1)
if filenameMatch:
fileObj["filename"] = filenameMatch.group(1)
if fileObj:
files.append(fileObj)
startPos = -1
return files
@staticmethod
def _extractImages(jsonString: str) -> List[Dict[str, Any]]:
"""Extract images array from JSON string, even if incomplete."""
images = []
# Look for "images": [...]
imagesPattern = r'"images"\s*:\s*\[(.*?)(?:\]|$)'
match = re.search(imagesPattern, jsonString, re.DOTALL)
if match:
imagesContent = match.group(1)
# Extract image objects using balanced brace matching
braceCount = 0
startPos = -1
for i, char in enumerate(imagesContent):
if char == '{':
if braceCount == 0:
startPos = i
braceCount += 1
elif char == '}':
braceCount -= 1
if braceCount == 0 and startPos >= 0:
imageStr = imagesContent[startPos:i+1]
try:
image = json.loads(imageStr)
if isinstance(image, dict):
images.append(image)
except Exception:
# Incomplete image - try to extract what we can
idMatch = re.search(r'"id"\s*:\s*"([^"]*)"', imageStr)
urlMatch = re.search(r'"url"\s*:\s*"([^"]*)"', imageStr)
if idMatch or urlMatch:
image = {}
if idMatch:
image["id"] = idMatch.group(1)
if urlMatch:
image["url"] = urlMatch.group(1)
if image:
images.append(image)
startPos = -1
return images
class JsonStructureDetector:
    """Detects JSON structure type from extracted data."""
    @staticmethod
    def detect(data: Dict[str, Any], mergeId: Optional[str] = None) -> str:
        """
        Detect structure type from data - GENERIC approach.
        Only checks for top-level keys, no content analysis.

        Args:
            data: Extracted data dict whose top-level keys are inspected.
            mergeId: Optional merge ID; when set, the detection is logged.
        Returns:
            Structure type: "elements", "documents", "files", "images", or "unknown"
        """
        # Priority order matters: the first known key present wins.
        structureType = "unknown"
        for candidate in ("elements", "documents", "files", "images"):
            if candidate in data:
                structureType = candidate
                break
        if mergeId:
            JsonMergeLogger.logStep("DETECTION", f"Detected structure type: {structureType}", structureType)
        return structureType
class JsonDataMerger:
    """Merges JSON data intelligently with overlap detection.

    All merge helpers return new structures; the input dicts (including
    nested dicts such as a table's "content") are never mutated.
    """
    @staticmethod
    def merge(
        accumulated: Dict[str, Any],
        newFragment: Dict[str, Any],
        structureType: str,
        mergeId: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Merge two JSON data structures.
        Args:
            accumulated: Previously accumulated data
            newFragment: New fragment data
            structureType: Detected structure type ("elements", "documents",
                "files", "images"; anything else merges generically)
            mergeId: Optional merge ID for logging
        Returns:
            Merged data structure
        """
        if mergeId:
            JsonMergeLogger.logStep("MERGING", f"Merging {structureType} structures", {
                "acc_keys": list(accumulated.keys()) if accumulated else [],
                "frag_keys": list(newFragment.keys()) if newFragment else []
            })
        # Trivial cases: if either side is empty, the other side wins.
        if not accumulated:
            if mergeId:
                JsonMergeLogger.logStep("MERGING", "No accumulated data, returning fragment", newFragment)
            return newFragment if newFragment else {}
        if not newFragment:
            if mergeId:
                JsonMergeLogger.logStep("MERGING", "No fragment data, returning accumulated", accumulated)
            return accumulated
        # Merge based on structure type
        if structureType == "elements":
            result = JsonDataMerger._mergeElements(accumulated, newFragment)
        elif structureType == "documents":
            result = JsonDataMerger._mergeDocuments(accumulated, newFragment)
        elif structureType == "files":
            result = JsonDataMerger._mergeFiles(accumulated, newFragment)
        elif structureType == "images":
            result = JsonDataMerger._mergeImages(accumulated, newFragment)
        else:
            # Unknown structure - try to merge generically
            result = JsonDataMerger._mergeGeneric(accumulated, newFragment)
        if mergeId:
            JsonMergeLogger.logStep("MERGING", f"Merged {structureType} structures", result)
        return result
    @staticmethod
    def _mergeElements(accumulated: Dict[str, Any], newFragment: Dict[str, Any]) -> Dict[str, Any]:
        """Merge structures keyed by top-level "elements"."""
        accElements = accumulated.get("elements", [])
        fragElements = newFragment.get("elements", [])
        if not accElements:
            return {"elements": fragElements} if fragElements else accumulated
        if not fragElements:
            return {"elements": accElements}
        # Merge elements with overlap detection
        mergedElements = JsonDataMerger._mergeElementList(accElements, fragElements)
        return {"elements": mergedElements}
    @staticmethod
    def _mergeElementList(accElements: List[Dict[str, Any]], fragElements: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Merge two element lists with overlap detection."""
        if not accElements:
            return fragElements
        if not fragElements:
            return accElements
        # Special handling: a table split across fragments should be re-joined.
        accTables = [e for e in accElements if isinstance(e, dict) and e.get("type") == "table"]
        fragTables = [e for e in fragElements if isinstance(e, dict) and e.get("type") == "table"]
        if accTables and fragTables:
            # NOTE: only the FIRST table on each side is merged; additional
            # tables on either side are dropped by this heuristic.
            mergedTable = JsonDataMerger._mergeTableElements(accTables[0], fragTables[0])
            if mergedTable:
                # Replace the tables with the single merged table
                otherAccElements = [e for e in accElements if not (isinstance(e, dict) and e.get("type") == "table")]
                otherFragElements = [e for e in fragElements if not (isinstance(e, dict) and e.get("type") == "table")]
                return otherAccElements + [mergedTable] + otherFragElements
        # Find overlap by comparing elements
        overlapStart = JsonDataMerger._findOverlap(accElements, fragElements, None, "elements")
        if overlapStart > 0:
            # Found overlap - drop the duplicated prefix of the fragment
            return accElements + fragElements[overlapStart:]
        # No overlap - append all
        return accElements + fragElements
    @staticmethod
    def _mergeTableElements(accTable: Dict[str, Any], fragTable: Dict[str, Any]) -> Dict[str, Any]:
        """Merge two table elements by merging their rows.

        Returns a new dict; neither input table nor its nested "content"
        dict is modified.
        """
        accRows = JsonDataMerger._getTableRows(accTable)
        fragRows = JsonDataMerger._getTableRows(fragTable)
        if not accRows:
            return fragTable
        if not fragRows:
            return accTable
        # Find overlap in rows
        overlapStart = JsonDataMerger._findOverlap(accRows, fragRows, None, "table_rows")
        # Merge rows, dropping the duplicated prefix of the fragment rows
        mergedRows = accRows + (fragRows[overlapStart:] if overlapStart > 0 else fragRows)
        # Build merged table.
        # BUGFIX: the nested "content" dict must be copied too - a shallow
        # copy of accTable shares it, and assigning content["rows"] below
        # would mutate the caller's accumulated table in place.
        mergedTable = accTable.copy()
        content = mergedTable.get("content", {})
        content = dict(content) if isinstance(content, dict) else {}
        content["rows"] = mergedRows
        # Preserve headers from the fragment if the accumulated table lacks them
        if "headers" not in content:
            fragContent = fragTable.get("content", {})
            if isinstance(fragContent, dict) and "headers" in fragContent:
                content["headers"] = fragContent["headers"]
        mergedTable["content"] = content
        return mergedTable
    @staticmethod
    def _findOverlap(accList: List[Any], fragList: List[Any], mergeId: Optional[str] = None, overlapType: str = "generic") -> int:
        """Find overlap between two lists. Returns index where overlap starts in fragList."""
        if not accList or not fragList:
            if mergeId:
                JsonMergeLogger.logOverlap(overlapType, 0)
            return 0
        # Try to find the longest suffix of accList equal to a prefix of fragList
        maxOverlap = min(len(accList), len(fragList))
        for overlapLen in range(maxOverlap, 0, -1):
            accSuffix = accList[-overlapLen:]
            fragPrefix = fragList[:overlapLen]
            # Compare elements
            if JsonDataMerger._listsEqual(accSuffix, fragPrefix):
                if mergeId:
                    JsonMergeLogger.logOverlap(overlapType, overlapLen, accSuffix, fragPrefix)
                return overlapLen
        if mergeId:
            JsonMergeLogger.logOverlap(overlapType, 0)
        return 0
    @staticmethod
    def _listsEqual(list1: List[Any], list2: List[Any]) -> bool:
        """Check if two lists are equal (deep comparison for dicts)."""
        if len(list1) != len(list2):
            return False
        for i in range(len(list1)):
            if isinstance(list1[i], dict) and isinstance(list2[i], dict):
                # Compare dicts by comparing their content
                if not JsonDataMerger._dictsEqual(list1[i], list2[i]):
                    return False
            elif list1[i] != list2[i]:
                return False
        return True
    @staticmethod
    def _dictsEqual(dict1: Dict[str, Any], dict2: Dict[str, Any]) -> bool:
        """Check if two dicts are equal (comparing key content)."""
        # For table elements, equality is defined by their rows only
        if dict1.get("type") == "table" and dict2.get("type") == "table":
            rows1 = JsonDataMerger._getTableRows(dict1)
            rows2 = JsonDataMerger._getTableRows(dict2)
            return rows1 == rows2
        # For other elements, compare type and key content
        if dict1.get("type") != dict2.get("type"):
            return False
        # Compare content
        content1 = dict1.get("content", {})
        content2 = dict2.get("content", {})
        if isinstance(content1, dict) and isinstance(content2, dict):
            # Compare rows for tables
            if "rows" in content1 and "rows" in content2:
                return content1["rows"] == content2["rows"]
            # Compare items for lists
            if "items" in content1 and "items" in content2:
                return content1["items"] == content2["items"]
        return dict1 == dict2
    @staticmethod
    def _getTableRows(element: Dict[str, Any]) -> List[List[str]]:
        """Extract table rows from element ("content.rows" or top-level "rows")."""
        content = element.get("content", {})
        if isinstance(content, dict):
            return content.get("rows", [])
        return element.get("rows", [])
    @staticmethod
    def _mergeDocuments(accumulated: Dict[str, Any], newFragment: Dict[str, Any]) -> Dict[str, Any]:
        """Merge structures keyed by top-level "documents" (simple concatenation)."""
        accDocs = accumulated.get("documents", [])
        fragDocs = newFragment.get("documents", [])
        if not accDocs:
            return {"documents": fragDocs} if fragDocs else accumulated
        if not fragDocs:
            return {"documents": accDocs}
        # Simplified: no overlap detection for documents yet
        return {"documents": accDocs + fragDocs}
    @staticmethod
    def _mergeFiles(accumulated: Dict[str, Any], newFragment: Dict[str, Any]) -> Dict[str, Any]:
        """Merge structures keyed by top-level "files" (simple concatenation)."""
        accFiles = accumulated.get("files", [])
        fragFiles = newFragment.get("files", [])
        if not accFiles:
            return {"files": fragFiles} if fragFiles else accumulated
        if not fragFiles:
            return {"files": accFiles}
        return {"files": accFiles + fragFiles}
    @staticmethod
    def _mergeImages(accumulated: Dict[str, Any], newFragment: Dict[str, Any]) -> Dict[str, Any]:
        """Merge structures keyed by top-level "images" (simple concatenation)."""
        accImages = accumulated.get("images", [])
        fragImages = newFragment.get("images", [])
        if not accImages:
            return {"images": fragImages} if fragImages else accumulated
        if not fragImages:
            return {"images": accImages}
        return {"images": accImages + fragImages}
    @staticmethod
    def _mergeGeneric(accumulated: Dict[str, Any], newFragment: Dict[str, Any]) -> Dict[str, Any]:
        """Generic merge for unknown structures.

        Lists are concatenated, nested dicts are merged recursively, and
        scalar conflicts are resolved in favor of the new fragment.
        """
        merged = accumulated.copy()
        for key, value in newFragment.items():
            if key in merged:
                # Key exists - try to merge values
                if isinstance(merged[key], list) and isinstance(value, list):
                    merged[key] = merged[key] + value
                elif isinstance(merged[key], dict) and isinstance(value, dict):
                    merged[key] = JsonDataMerger._mergeGeneric(merged[key], value)
                else:
                    # Scalar or mismatched types: newest value wins
                    merged[key] = value
            else:
                merged[key] = value
        return merged
class JsonResultBuilder:
    """Builds final JSON result, ensuring it's always valid.

    Serialization follows a fallback ladder: dump -> validate by re-parsing
    -> repair with closeJsonStructures() -> aggressive clean + retry ->
    minimal empty structure. Callers therefore always receive a parseable
    JSON string, never an exception or empty output.
    """
    @staticmethod
    def build(mergedData: Dict[str, Any], structureType: str, mergeId: Optional[str] = None) -> str:
        """
        Build final JSON string from merged data.
        Args:
            mergedData: Merged data structure
            structureType: Detected structure type
            mergeId: Optional merge ID for logging
        Returns:
            Valid JSON string (never empty)
        """
        if not mergedData:
            # Return empty structure based on type
            if structureType == "elements":
                return json.dumps({"elements": []}, indent=2, ensure_ascii=False)
            elif structureType == "documents":
                # NOTE: documents fallback deliberately carries one empty document object
                return json.dumps({"documents": [{}]}, indent=2, ensure_ascii=False)
            elif structureType == "files":
                return json.dumps({"files": []}, indent=2, ensure_ascii=False)
            elif structureType == "images":
                return json.dumps({"images": []}, indent=2, ensure_ascii=False)
            else:
                return json.dumps({}, indent=2, ensure_ascii=False)
        # Ensure structure is correct - GENERIC approach: when the expected
        # top-level key is missing, wrap the whole dict as a single item
        # under that key.
        if structureType == "elements" and "elements" not in mergedData:
            # Try to wrap data in elements structure
            if isinstance(mergedData, dict):
                # Generic: If it has any data, wrap it as an element
                if mergedData:
                    mergedData = {"elements": [mergedData]}
                    if mergeId:
                        JsonMergeLogger.logStep("BUILDING", "Wrapping single object as element (generic)", mergedData)
                else:
                    # Empty dict - return empty elements
                    mergedData = {"elements": []}
        elif structureType == "documents" and "documents" not in mergedData:
            # Try to wrap data in documents structure
            if isinstance(mergedData, dict):
                if mergedData:
                    # Generic: Wrap single object in documents structure
                    # Try to detect if it should be chapters or sections by checking accumulated data
                    # But for now, use generic approach: wrap in documents with a generic key
                    mergedData = {"documents": [mergedData]}
                    if mergeId:
                        JsonMergeLogger.logStep("BUILDING", "Wrapping single object in documents structure (generic)", mergedData)
                else:
                    mergedData = {"documents": [{}]}
        elif structureType == "files" and "files" not in mergedData:
            # Try to wrap data in files structure
            if isinstance(mergedData, dict):
                if mergedData:
                    mergedData = {"files": [mergedData]}
                    if mergeId:
                        JsonMergeLogger.logStep("BUILDING", "Wrapping single object in files structure (generic)", mergedData)
                else:
                    mergedData = {"files": []}
        elif structureType == "images" and "images" not in mergedData:
            # Try to wrap data in images structure
            if isinstance(mergedData, dict):
                if mergedData:
                    mergedData = {"images": [mergedData]}
                    if mergeId:
                        JsonMergeLogger.logStep("BUILDING", "Wrapping single object in images structure (generic)", mergedData)
                else:
                    mergedData = {"images": []}
        elif structureType == "unknown" and isinstance(mergedData, dict) and mergedData:
            # Unknown structure but has data - wrap generically as elements
            mergedData = {"elements": [mergedData]}
            if mergeId:
                JsonMergeLogger.logStep("BUILDING", "Unknown structure, wrapping as elements (generic)", mergedData)
        # Clean data structure before serialization
        cleanedData = JsonResultBuilder._cleanDataStructure(mergedData)
        # Try to serialize; every failure path below degrades gracefully and
        # ultimately falls back to a minimal {"elements": []} structure.
        try:
            jsonString = json.dumps(cleanedData, indent=2, ensure_ascii=False)
            # Validate the JSON string by trying to parse it
            try:
                parsed, parseErr, _ = tryParseJson(jsonString)
                if parseErr is None:
                    # Valid JSON - return it
                    return jsonString
                else:
                    # Invalid JSON - try to repair
                    logger.warning(f"Generated JSON is invalid: {parseErr}, attempting repair")
                    repaired = closeJsonStructures(jsonString)
                    parsed2, parseErr2, _ = tryParseJson(repaired)
                    if parseErr2 is None:
                        return repaired
                    else:
                        # Repair failed - return minimal valid structure
                        logger.error(f"Repair failed: {parseErr2}, returning minimal structure")
                        return json.dumps({"elements": []}, indent=2, ensure_ascii=False)
            except Exception as parseEx:
                # Parse validation itself raised (not just reported an error) - try repair
                logger.warning(f"Parse validation failed: {parseEx}, attempting repair")
                try:
                    repaired = closeJsonStructures(jsonString)
                    parsed2, parseErr2, _ = tryParseJson(repaired)
                    if parseErr2 is None:
                        return repaired
                except Exception:
                    pass
                # Return minimal valid structure
                return json.dumps({"elements": []}, indent=2, ensure_ascii=False)
        except (TypeError, ValueError) as e:
            # json.dumps rejected the data (non-serializable values)
            logger.error(f"Error serializing JSON: {e}")
            # Try to clean more aggressively and retry
            try:
                cleanedData2 = JsonResultBuilder._cleanDataStructure(cleanedData, aggressive=True)
                jsonString = json.dumps(cleanedData2, indent=2, ensure_ascii=False)
                # Validate
                parsed, parseErr, _ = tryParseJson(jsonString)
                if parseErr is None:
                    return jsonString
            except Exception:
                pass
            # Fallback to empty structure
            return json.dumps({"elements": []}, indent=2, ensure_ascii=False)
        except Exception as e:
            logger.error(f"Unexpected error building JSON: {e}")
            # Fallback to empty structure
            return json.dumps({"elements": []}, indent=2, ensure_ascii=False)
    @staticmethod
    def _cleanDataStructure(data: Any, aggressive: bool = False) -> Any:
        """
        Clean data structure to ensure it's JSON-serializable.
        Removes None values, ensures lists contain only valid items,
        and repairs incomplete structures.

        Args:
            data: Arbitrary (possibly nested) structure to clean.
            aggressive: When True, None values are dropped from dicts and
                lists, and unknown types are stringified instead of kept.
        Returns:
            A cleaned copy of the structure (inputs are not modified).
        """
        if data is None:
            return {} if aggressive else None
        if isinstance(data, dict):
            cleaned = {}
            for key, value in data.items():
                if value is None and aggressive:
                    continue  # Skip None values in aggressive mode
                cleaned[key] = JsonResultBuilder._cleanDataStructure(value, aggressive)
            return cleaned
        elif isinstance(data, list):
            cleaned = []
            for item in data:
                cleanedItem = JsonResultBuilder._cleanDataStructure(item, aggressive)
                if cleanedItem is not None or not aggressive:
                    cleaned.append(cleanedItem)
            return cleaned
        elif isinstance(data, (str, int, float, bool)):
            return data
        else:
            # Unknown type - try to convert to string or skip
            if aggressive:
                return str(data)
            return data
class ModularJsonMerger:
"""
Modular JSON Merger - Main entry point.
Simple pipeline:
1. Find overlap between JSON strings
2. Merge strings together
3. Parse and clean the merged JSON
"""
@staticmethod
def _findStringOverlap(accStr: str, fragStr: str, mergeId: Optional[str] = None) -> int:
"""
Find overlap between two JSON strings - GENERIC solution.
Works for any JSON structure (arrays, objects, nested, minified, formatted).
Uses multiple strategies to find overlap regardless of JSON format.
Strategy:
1. Exact suffix/prefix match (fastest, works for any format)
2. Structure-aware: Find last complete JSON elements in accumulated that match start of fragment
3. Line-based: If JSON is formatted, use line matching (for better performance)
4. Partial match: Handle incomplete elements at cut point
Returns the length of the overlap (number of characters).
"""
if not accStr or not fragStr:
if mergeId:
JsonMergeLogger.logOverlap("string", 0)
return 0
# Strategy 1: Try exact suffix/prefix match (fastest, works for any format)
maxOverlap = min(len(accStr), len(fragStr))
# Start from maximum possible overlap and work backwards
for overlapLen in range(maxOverlap, 0, -1):
accSuffix = accStr[-overlapLen:]
fragPrefix = fragStr[:overlapLen]
if accSuffix == fragPrefix:
if mergeId:
JsonMergeLogger.logOverlap("string (exact)", overlapLen, accSuffix[:200], fragPrefix[:200])
return overlapLen
# Strategy 2: Structure-aware overlap detection (GENERIC - works for any JSON structure)
# Find last complete JSON elements in accumulated and check if they appear at start of fragment
overlapLen = ModularJsonMerger._findStructureBasedOverlap(accStr, fragStr, mergeId)
if overlapLen > 0:
return overlapLen
# Strategy 3: Line-based overlap (works well for formatted JSON)
# Only use if JSON appears to be formatted (has newlines)
if '\n' in accStr and '\n' in fragStr:
overlapLen = ModularJsonMerger._findLineBasedOverlap(accStr, fragStr, mergeId)
if overlapLen > 0:
return overlapLen
# Strategy 4: Partial overlap (incomplete element at cut point)
overlapLen = ModularJsonMerger._findPartialOverlap(accStr, fragStr, mergeId)
if overlapLen > 0:
return overlapLen
if mergeId:
JsonMergeLogger.logOverlap("string", 0)
return 0
    @staticmethod
    def _findStructureBasedOverlap(accStr: str, fragStr: str, mergeId: Optional[str] = None) -> int:
        """
        Find overlap by detecting complete JSON elements (structure-aware, GENERIC).
        Works for ANY JSON structure:
        - Arrays: Finds last complete array elements
        - Objects: Finds last complete object properties
        - Nested structures: Recursively finds complete elements
        - Minified or formatted JSON: Structure-aware, not format-dependent
        - Any use case: section_content, chapter_structure, code_structure, etc.
        Strategy: Find last complete JSON elements in accumulated that match start of fragment.
        Uses balanced bracket/brace matching to identify complete elements regardless of format.

        Returns the overlap length in characters, or 0 if none is found.
        """
        accTrimmed = accStr.rstrip()
        fragTrimmed = fragStr.lstrip()
        if not accTrimmed or not fragTrimmed:
            return 0
        # Find last complete elements in accumulated by parsing backwards
        # Look for complete array elements or object properties
        # Strategy: Find where accumulated has complete elements at the end
        # and check if fragment starts with the same elements
        # Use a sliding window approach: check different suffix lengths from accumulated
        # NOTE(review): window capped at 2000 chars and only suffixes longer
        # than ~50 chars are considered - overlaps outside that band are
        # intentionally ignored by this strategy.
        maxCheckLength = min(2000, len(accTrimmed), len(fragTrimmed))
        # Check in reverse order (largest to smallest) to find longest overlap first.
        # NOTE(review): the -5 step trades exactness for speed; a true overlap
        # whose length falls between steps can be missed here and must be
        # caught by a later strategy.
        for checkLen in range(maxCheckLength, 50, -5):  # Step by 5 for performance
            if checkLen > len(accTrimmed) or checkLen > len(fragTrimmed):
                continue
            accSuffix = accTrimmed[-checkLen:]
            fragPrefix = fragTrimmed[:checkLen]
            # Check if accSuffix ends with complete JSON element(s) and fragPrefix starts with same
            # A complete element ends with proper closing brackets/braces
            # Verify that accSuffix ends with complete structure
            # and fragPrefix starts with the same structure
            if ModularJsonMerger._isCompleteJsonElement(accSuffix) and \
               ModularJsonMerger._startsWithSameElement(accSuffix, fragPrefix):
                # Found overlap! Verify it's meaningful (not just whitespace)
                if len(accSuffix.strip()) > 20:
                    if mergeId:
                        JsonMergeLogger.logOverlap("string (structure-based)", checkLen, accSuffix[:200], fragPrefix[:200])
                    return checkLen
        # Alternative: Try to find common substring that represents complete elements
        # Look for patterns like complete array rows or object properties
        # Check last 500 chars of accumulated against first 500 chars of fragment
        checkWindow = min(500, len(accTrimmed), len(fragTrimmed))
        if checkWindow > 100:
            accWindow = accTrimmed[-checkWindow:]
            fragWindow = fragTrimmed[:checkWindow]
            # Find longest common substring that represents complete elements
            # Look for boundaries like ], [ or }, { or ", "
            for i in range(checkWindow - 50, 50, -5):
                accSub = accWindow[-i:]
                fragSub = fragWindow[:i]
                if accSub == fragSub:
                    # Check if it's a complete element boundary
                    if ModularJsonMerger._isCompleteElementBoundary(accSub):
                        if mergeId:
                            JsonMergeLogger.logOverlap("string (structure-boundary)", i, accSub[:200], fragSub[:200])
                        return i
        return 0
@staticmethod
def _isCompleteJsonElement(jsonStr: str) -> bool:
"""Check if string ends with a complete JSON element (balanced brackets/braces)."""
jsonStr = jsonStr.strip()
if not jsonStr:
return False
# Check if it ends with complete structure markers
# Complete array element: ends with ] or ], or ],
# Complete object element: ends with } or }, or },
if jsonStr[-1] in ']}':
# Check if brackets/braces are balanced
braceCount = jsonStr.count('{') - jsonStr.count('}')
bracketCount = jsonStr.count('[') - jsonStr.count(']')
return braceCount == 0 and bracketCount == 0
return False
@staticmethod
def _startsWithSameElement(accSuffix: str, fragPrefix: str) -> bool:
"""Check if fragment prefix starts with the same element as accumulated suffix."""
# Normalize whitespace for comparison
accNorm = accSuffix.strip()
fragNorm = fragPrefix.strip()
# Check if fragPrefix starts with accSuffix (or vice versa for partial matches)
if fragNorm.startswith(accNorm):
return True
# Check if they have common prefix (for partial element completion)
minLen = min(len(accNorm), len(fragNorm))
if minLen > 20:
# Check if first 80% of accSuffix matches start of fragPrefix
checkLen = int(minLen * 0.8)
return accNorm[:checkLen] == fragNorm[:checkLen]
return False
@staticmethod
def _isCompleteElementBoundary(jsonStr: str) -> bool:
"""Check if string represents a complete element boundary (e.g., ], [ or }, {)."""
jsonStr = jsonStr.strip()
if not jsonStr:
return False
# Check if it contains complete element boundaries
# Pattern: ends with ], or }, or ],\n or },\n
if jsonStr.rstrip().endswith(('],', '},', ']', '}')):
return True
# Check if it's a complete array element or object property
if '],' in jsonStr or '},' in jsonStr:
return True
return False
@staticmethod
def _findLineBasedOverlap(accStr: str, fragStr: str, mergeId: Optional[str] = None) -> int:
"""
Find overlap using line-based matching (for formatted JSON).
"""
accLines = accStr.rstrip().split('\n')
fragLines = fragStr.lstrip().split('\n')
# Try to find matching lines from the end of accumulated at the start of fragment
maxLinesToCheck = min(10, len(accLines), len(fragLines))
for numLines in range(maxLinesToCheck, 0, -1):
# Get last N lines from accumulated (excluding empty lines)
accLastLines = [line.strip() for line in accLines[-numLines:] if line.strip()]
# Get first N lines from fragment (excluding empty lines)
fragFirstLines = [line.strip() for line in fragLines[:numLines] if line.strip()]
# Check if they match
if len(accLastLines) > 0 and len(fragFirstLines) > 0:
# Try to find where accLastLines match fragFirstLines
for i in range(len(accLastLines)):
# Check if accLastLines[i:] matches fragFirstLines[:len(accLastLines)-i]
accSuffixLines = accLastLines[i:]
fragPrefixLines = fragFirstLines[:len(accSuffixLines)]
if accSuffixLines == fragPrefixLines and len(accSuffixLines) > 0:
# Found overlap! Calculate character length
accSuffixText = '\n'.join(accLastLines[i:])
fragPrefixText = '\n'.join(fragPrefixLines)
# Find where this text appears in the original strings
accPos = accStr.rfind(accSuffixText)
fragPos = fragStr.find(fragPrefixText)
if accPos >= 0 and fragPos == 0:
# Found valid overlap
overlapLen = len(accSuffixText)
if mergeId:
JsonMergeLogger.logOverlap("string (line-based)", overlapLen, accSuffixText[:200], fragPrefixText[:200])
return overlapLen
return 0
@staticmethod
def _findPartialOverlap(accStr: str, fragStr: str, mergeId: Optional[str] = None) -> int:
"""
Find partial overlap (incomplete element at cut point).
"""
accLines = accStr.rstrip().split('\n')
fragLines = fragStr.lstrip().split('\n')
if accLines and fragLines:
lastAccLine = accLines[-1].strip()
firstFragLine = fragLines[0].strip()
# Check if lastAccLine is a prefix of firstFragLine (incomplete line completed)
if lastAccLine and firstFragLine.startswith(lastAccLine):
# Also check if there are more matching lines after
overlapLen = len(lastAccLine)
# Try to extend overlap with more lines
for i in range(1, min(len(accLines), len(fragLines))):
if accLines[-1-i].strip() == fragLines[i].strip():
overlapLen += len('\n' + fragLines[i])
else:
break
if overlapLen > 20: # Only if meaningful overlap
if mergeId:
JsonMergeLogger.logOverlap("string (partial line)", overlapLen, lastAccLine[:200], firstFragLine[:200])
return overlapLen
return 0
@staticmethod
def _mergeStrings(accStr: str, fragStr: str, overlapLength: int) -> str:
"""
Merge two JSON strings together, removing the overlap.
Handles whitespace at cut points properly for seamless merging.
"""
if overlapLength > 0:
# Remove overlap from fragment and append
# CRITICAL: Handle whitespace properly - if accumulated ends with whitespace
# and fragment starts with the same content, we need to preserve whitespace structure
merged = accStr + fragStr[overlapLength:]
else:
# No overlap - just concatenate (might need comma or other separator)
# CRITICAL: Preserve whitespace structure when merging
# Get trailing whitespace from accumulated (spaces, tabs, but not newlines)
accTrailingWs = ""
i = len(accStr) - 1
while i >= 0 and accStr[i] in [' ', '\t']:
accTrailingWs = accStr[i] + accTrailingWs
i -= 1
# Get leading whitespace from fragment (spaces, tabs, but not newlines)
fragLeadingWs = ""
i = 0
while i < len(fragStr) and fragStr[i] in [' ', '\t']:
fragLeadingWs += fragStr[i]
i += 1
# Trim for content detection but preserve whitespace structure
accTrimmed = accStr.rstrip().rstrip(',')
fragTrimmed = fragStr.lstrip().lstrip(',')
# Check if we need a separator
if accTrimmed and fragTrimmed:
# If accumulated ends with } or ] and fragment starts with { or [, we might need comma
if (accTrimmed[-1] in '}]' and fragTrimmed[0] in '{['):
# Add comma with appropriate whitespace
merged = accTrimmed + ',' + fragLeadingWs + fragTrimmed
else:
# Merge with preserved whitespace structure
# Use the whitespace from fragment (it knows the proper spacing)
merged = accTrimmed + accTrailingWs + fragLeadingWs + fragTrimmed
else:
# One is empty - just concatenate with preserved whitespace
merged = accStr + fragStr
return merged
@staticmethod
def merge(accumulated: str, newFragment: str) -> Tuple[str, bool]:
"""
Merge two JSON fragments intelligently.
Args:
accumulated: Previously accumulated JSON string
newFragment: New fragment JSON string
Returns:
Tuple of (merged_json_string, has_overlap):
- merged_json_string: Merged JSON string (closed if no overlap, unclosed if overlap found)
- has_overlap: True if overlap was found (iterations should continue), False if no overlap (iterations should stop)
"""
# Start logging
mergeId = JsonMergeLogger.startMerge(accumulated, newFragment)
if not accumulated:
result = newFragment if newFragment else "{}"
JsonMergeLogger.finishMerge(mergeId, result, True)
return (result, False) # No overlap if no accumulated data
if not newFragment:
JsonMergeLogger.finishMerge(mergeId, accumulated, True)
return (accumulated, False) # No overlap if no new fragment
try:
# Normalize both strings
accNormalized = stripCodeFences(normalizeJsonText(accumulated)).strip()
fragNormalized = stripCodeFences(normalizeJsonText(newFragment)).strip()
JsonMergeLogger._log(f"\n Normalized Accumulated ({len(accNormalized)} chars)")
accNormLines = accNormalized.split('\n')
if len(accNormLines) > 10:
JsonMergeLogger._log(f" (showing first 5 and last 5 of {len(accNormLines)} lines)")
for line in accNormLines[:5]:
JsonMergeLogger._log(f" {line}")
JsonMergeLogger._log(f" ... ({len(accNormLines) - 10} lines omitted) ...")
for line in accNormLines[-5:]:
JsonMergeLogger._log(f" {line}")
else:
for line in accNormLines:
JsonMergeLogger._log(f" {line}")
JsonMergeLogger._log(f"\n Normalized New Fragment ({len(fragNormalized)} chars)")
fragNormLines = fragNormalized.split('\n')
if len(fragNormLines) > 10:
JsonMergeLogger._log(f" (showing first 5 and last 5 of {len(fragNormLines)} lines)")
for line in fragNormLines[:5]:
JsonMergeLogger._log(f" {line}")
JsonMergeLogger._log(f" ... ({len(fragNormLines) - 10} lines omitted) ...")
for line in fragNormLines[-5:]:
JsonMergeLogger._log(f" {line}")
else:
for line in fragNormLines:
JsonMergeLogger._log(f" {line}")
# Step 1: Find overlap between JSON strings
JsonMergeLogger.logStep("PHASE 1", "Finding overlap between JSON strings", None)
overlapLength = ModularJsonMerger._findStringOverlap(accNormalized, fragNormalized, mergeId)
if overlapLength > 0:
accSuffix = accNormalized[-overlapLength:]
fragPrefix = fragNormalized[:overlapLength]
JsonMergeLogger._log(f"\n Overlap found ({overlapLength} chars):")
JsonMergeLogger._log(f" Accumulated suffix: {accSuffix}")
JsonMergeLogger._log(f" Fragment prefix: {fragPrefix}")
else:
# CRITICAL: No overlap found - this means iterations should stop
JsonMergeLogger._log(f"\n ⚠️ NO OVERLAP FOUND - This indicates iterations should stop")
JsonMergeLogger._log(f" Closing JSON and returning final result")
# Close the accumulated JSON (it's complete as far as we can tell)
closedJson = closeJsonStructures(accNormalized)
JsonMergeLogger._log(f"\n Closed JSON ({len(closedJson)} chars):")
JsonMergeLogger._log(" " + "="*78)
for line in closedJson.split('\n'):
JsonMergeLogger._log(f" {line}")
JsonMergeLogger._log(" " + "="*78)
JsonMergeLogger.finishMerge(mergeId, closedJson, True)
# Return closed JSON with has_overlap=False to indicate iterations should stop
return (closedJson, False)
# Step 2: Merge strings together (only if overlap was found)
JsonMergeLogger.logStep("PHASE 2", f"Merging strings (overlap: {overlapLength} chars)", None)
mergedString = ModularJsonMerger._mergeStrings(accNormalized, fragNormalized, overlapLength)
JsonMergeLogger._log(f"\n Merged String ({len(mergedString)} chars)")
mergedLines = mergedString.split('\n')
if len(mergedLines) > 10:
JsonMergeLogger._log(f" (showing first 5 and last 5 of {len(mergedLines)} lines)")
for line in mergedLines[:5]:
JsonMergeLogger._log(f" {line}")
JsonMergeLogger._log(f" ... ({len(mergedLines) - 10} lines omitted) ...")
for line in mergedLines[-5:]:
JsonMergeLogger._log(f" {line}")
else:
for line in mergedLines:
JsonMergeLogger._log(f" {line}")
# Step 3: Return merged string (with incomplete element at end for next iteration)
JsonMergeLogger.logStep("PHASE 3", "Returning merged string (may be unclosed)", None)
JsonMergeLogger._log(f"\n Returning merged string (preserving incomplete element at end for next iteration)")
JsonMergeLogger.finishMerge(mergeId, mergedString, True)
# Return merged string with has_overlap=True to indicate iterations should continue
return (mergedString, True)
except Exception as e:
logger.error(f"Error in modular merger: {e}")
JsonMergeLogger.logStep("ERROR", f"Exception occurred: {str(e)}", None, error=str(e))
# Fallback: try to return accumulated if valid
try:
accParsed, accErr, _ = tryParseJson(accumulated)
if accErr is None:
JsonMergeLogger.finishMerge(mergeId, accumulated, False)
return (accumulated, False) # No overlap on error
except Exception:
pass
# Last resort: return empty valid JSON
fallback = json.dumps({"elements": []}, indent=2, ensure_ascii=False)
JsonMergeLogger.finishMerge(mergeId, fallback, False)
return (fallback, False) # No overlap on error