2049 lines
88 KiB
Python
2049 lines
88 KiB
Python
# Copyright (c) 2025 Patrick Motsch
|
|
# All rights reserved.
|
|
"""
|
|
Modular JSON Merger - Intelligent JSON Fragment Merging
|
|
|
|
A clean, modular approach to merging JSON fragments that may be cut randomly.
|
|
Designed to be simple, robust, and always return valid data.
|
|
|
|
Architecture:
|
|
1. Data Extractor: Extracts all possible data from fragments (even incomplete)
|
|
2. Structure Detector: Detects JSON structure type (elements, documents, files, etc.)
|
|
3. Data Merger: Intelligently merges data with overlap detection
|
|
4. Result Builder: Always returns valid JSON structure
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
import logging
|
|
import os
|
|
from datetime import datetime
|
|
from typing import Dict, Any, List, Optional, Tuple, Union
|
|
|
|
from modules.shared.jsonUtils import (
|
|
normalizeJsonText, stripCodeFences, closeJsonStructures, tryParseJson
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class JsonMergeLogger:
    """Consolidated logger for JSON merging process.

    All state is class-level; callers interact only through the static
    methods, so the class acts as a process-wide logging namespace.
    """

    # Lines accumulated for the current merge; flushed and cleared by finishMerge().
    _logBuffer: List[str] = []
    # Monotonically increasing counter used to build "merge_<n>" identifiers.
    _mergeId: int = 0
    # Shared log file name for the whole run, or None for one file per merge.
    _currentLogFile: Optional[str] = None
    # False until the first write to _currentLogFile, then True (append mode).
    _appendMode: bool = False
|
|
|
|
@staticmethod
|
|
def initializeLogFile(logFileName: Optional[str] = None):
|
|
"""Initialize a new log file for a test run."""
|
|
JsonMergeLogger._logBuffer = []
|
|
JsonMergeLogger._mergeId = 0
|
|
|
|
if logFileName:
|
|
JsonMergeLogger._currentLogFile = logFileName
|
|
JsonMergeLogger._appendMode = False
|
|
# Clear existing file
|
|
try:
|
|
currentFileDir = os.path.dirname(os.path.abspath(__file__))
|
|
logFilePath = os.path.join(currentFileDir, logFileName)
|
|
with open(logFilePath, 'w', encoding='utf-8') as f:
|
|
f.write("") # Clear file
|
|
except Exception:
|
|
pass
|
|
else:
|
|
JsonMergeLogger._currentLogFile = None
|
|
JsonMergeLogger._appendMode = False
|
|
|
|
@staticmethod
|
|
def startMerge(accumulated: str, newFragment: str) -> str:
|
|
"""Start a new merge operation and return merge ID."""
|
|
JsonMergeLogger._mergeId += 1
|
|
mergeId = f"merge_{JsonMergeLogger._mergeId}"
|
|
|
|
JsonMergeLogger._log(f"{'='*80}")
|
|
JsonMergeLogger._log(f"JSON MERGE OPERATION #{JsonMergeLogger._mergeId}")
|
|
JsonMergeLogger._log(f"{'='*80}")
|
|
JsonMergeLogger._log(f"Timestamp: {datetime.now().isoformat()}")
|
|
JsonMergeLogger._log("")
|
|
|
|
JsonMergeLogger._log("INPUT:")
|
|
JsonMergeLogger._log(f" Accumulated length: {len(accumulated)} chars")
|
|
JsonMergeLogger._log(f" New Fragment length: {len(newFragment)} chars")
|
|
# Log only summary (first 5 and last 5 lines) to avoid log spam
|
|
accLines = accumulated.split('\n')
|
|
fragLines = newFragment.split('\n')
|
|
JsonMergeLogger._log(f" Accumulated: {len(accLines)} lines (showing first 5 and last 5)")
|
|
if len(accLines) > 10:
|
|
for line in accLines[:5]:
|
|
JsonMergeLogger._log(f" {line}")
|
|
JsonMergeLogger._log(f" ... ({len(accLines) - 10} lines omitted) ...")
|
|
for line in accLines[-5:]:
|
|
JsonMergeLogger._log(f" {line}")
|
|
else:
|
|
for line in accLines:
|
|
JsonMergeLogger._log(f" {line}")
|
|
JsonMergeLogger._log(f" New Fragment: {len(fragLines)} lines (showing first 5 and last 5)")
|
|
if len(fragLines) > 10:
|
|
for line in fragLines[:5]:
|
|
JsonMergeLogger._log(f" {line}")
|
|
JsonMergeLogger._log(f" ... ({len(fragLines) - 10} lines omitted) ...")
|
|
for line in fragLines[-5:]:
|
|
JsonMergeLogger._log(f" {line}")
|
|
else:
|
|
for line in fragLines:
|
|
JsonMergeLogger._log(f" {line}")
|
|
JsonMergeLogger._log("")
|
|
|
|
return mergeId
|
|
|
|
    @staticmethod
    def logStep(stepName: str, description: str, result: Any = None, error: Optional[str] = None):
        """Log a step with its result.

        Rendering depends on the result type: strings get a first/last-5-line
        summary, dicts and lists get a complete JSON dump (falling back to a
        truncated str() view when serialization fails), anything else gets a
        one-line repr capped at 200 chars. A dict result additionally gets
        item counts for its "elements"/"documents" keys.
        """
        JsonMergeLogger._log(f"STEP: {stepName}")
        JsonMergeLogger._log(f" Description: {description}")

        if error:
            JsonMergeLogger._log(f" ❌ ERROR: {error}")
        elif result is not None:
            if isinstance(result, str):
                resultLines = result.split('\n')
                JsonMergeLogger._log(f" ✅ Result (string, {len(result)} chars, {len(resultLines)} lines)")
                if len(resultLines) > 10:
                    JsonMergeLogger._log(f" (showing first 5 and last 5 lines)")
                    for line in resultLines[:5]:
                        JsonMergeLogger._log(f" {line}")
                    JsonMergeLogger._log(f" ... ({len(resultLines) - 10} lines omitted) ...")
                    for line in resultLines[-5:]:
                        JsonMergeLogger._log(f" {line}")
                else:
                    for line in resultLines:
                        JsonMergeLogger._log(f" {line}")
            elif isinstance(result, dict):
                keys = list(result.keys())
                JsonMergeLogger._log(f" ✅ Result (dict): keys={keys}, size={len(str(result))} chars")
                # Log full structure with JSON formatting - NO TRUNCATION
                try:
                    jsonStr = json.dumps(result, indent=2, ensure_ascii=False)
                    JsonMergeLogger._log(f" Full data (COMPLETE, {len(jsonStr)} chars):")
                    JsonMergeLogger._log(" " + "="*76)
                    for line in jsonStr.split('\n'):
                        JsonMergeLogger._log(f" {line}")
                    JsonMergeLogger._log(" " + "="*76)
                except Exception as e:
                    # Dict is not JSON-serializable: log a truncated str() view instead.
                    JsonMergeLogger._log(f" Could not serialize: {e}")
                    strRepr = str(result)
                    strLines = strRepr.split('\n')
                    JsonMergeLogger._log(f" String representation ({len(strRepr)} chars, {len(strLines)} lines)")
                    if len(strLines) > 10:
                        JsonMergeLogger._log(f" (showing first 5 and last 5 lines)")
                        for line in strLines[:5]:
                            JsonMergeLogger._log(f" {line}")
                        JsonMergeLogger._log(f" ... ({len(strLines) - 10} lines omitted) ...")
                        for line in strLines[-5:]:
                            JsonMergeLogger._log(f" {line}")
                    else:
                        for line in strLines:
                            JsonMergeLogger._log(f" {line}")
                # Log structure details
                # NOTE(review): placed after the try/except so counts are logged
                # whether or not the full dump succeeded - confirm against callers.
                if "elements" in result:
                    elemCount = len(result["elements"]) if isinstance(result["elements"], list) else 0
                    JsonMergeLogger._log(f" - elements: {elemCount} items")
                    if isinstance(result["elements"], list) and elemCount > 0:
                        JsonMergeLogger._log(f" First element type: {result['elements'][0].get('type', 'unknown') if isinstance(result['elements'][0], dict) else 'not a dict'}")
                if "documents" in result:
                    docCount = len(result["documents"]) if isinstance(result["documents"], list) else 0
                    JsonMergeLogger._log(f" - documents: {docCount} items")
            elif isinstance(result, list):
                JsonMergeLogger._log(f" ✅ Result (list): {len(result)} items (COMPLETE)")
                if len(result) > 0:
                    JsonMergeLogger._log(f" First item type: {type(result[0]).__name__}")
                    try:
                        jsonStr = json.dumps(result, indent=2, ensure_ascii=False)  # ALL items
                        JsonMergeLogger._log(f" All items (COMPLETE, {len(jsonStr)} chars):")
                        JsonMergeLogger._log(" " + "="*76)
                        for line in jsonStr.split('\n'):
                            JsonMergeLogger._log(f" {line}")
                        JsonMergeLogger._log(" " + "="*76)
                    except Exception:
                        # List is not JSON-serializable: truncated str() fallback.
                        strRepr = str(result)
                        strLines = strRepr.split('\n')
                        JsonMergeLogger._log(f" String representation ({len(strRepr)} chars, {len(strLines)} lines)")
                        if len(strLines) > 10:
                            JsonMergeLogger._log(f" (showing first 5 and last 5 lines)")
                            for line in strLines[:5]:
                                JsonMergeLogger._log(f" {line}")
                            JsonMergeLogger._log(f" ... ({len(strLines) - 10} lines omitted) ...")
                            for line in strLines[-5:]:
                                JsonMergeLogger._log(f" {line}")
                        else:
                            for line in strLines:
                                JsonMergeLogger._log(f" {line}")
            else:
                JsonMergeLogger._log(f" ✅ Result: {type(result).__name__} = {str(result)[:200]}")
        else:
            JsonMergeLogger._log(f" ⏳ In progress...")

        JsonMergeLogger._log("")
|
|
|
|
    @staticmethod
    def logExtraction(strategy: str, success: bool, data: Any = None, error: Optional[str] = None):
        """Log extraction strategy result.

        Dicts and lists are dumped completely as JSON, with a truncated
        str() fallback when serialization fails; other data types are not
        detailed beyond the SUCCESS/FAILED status line.
        """
        status = "✅ SUCCESS" if success else "❌ FAILED"
        JsonMergeLogger._log(f" Extraction Strategy: {strategy} - {status}")
        if error:
            JsonMergeLogger._log(f" Error: {error}")
        elif data is not None:
            if isinstance(data, dict):
                keys = list(data.keys())
                JsonMergeLogger._log(f" Extracted keys: {keys}")
                # Log full extracted data - NO TRUNCATION
                try:
                    jsonStr = json.dumps(data, indent=2, ensure_ascii=False)
                    JsonMergeLogger._log(f" Extracted data (COMPLETE, {len(jsonStr)} chars):")
                    JsonMergeLogger._log(" " + "="*76)
                    for line in jsonStr.split('\n'):
                        JsonMergeLogger._log(f" {line}")
                    JsonMergeLogger._log(" " + "="*76)
                except Exception as e:
                    # Not JSON-serializable: truncated str() fallback.
                    JsonMergeLogger._log(f" Could not serialize extracted data: {e}")
                    strRepr = str(data)
                    strLines = strRepr.split('\n')
                    JsonMergeLogger._log(f" String representation ({len(strRepr)} chars, {len(strLines)} lines)")
                    if len(strLines) > 10:
                        JsonMergeLogger._log(f" (showing first 5 and last 5 lines)")
                        for line in strLines[:5]:
                            JsonMergeLogger._log(f" {line}")
                        JsonMergeLogger._log(f" ... ({len(strLines) - 10} lines omitted) ...")
                        for line in strLines[-5:]:
                            JsonMergeLogger._log(f" {line}")
                    else:
                        for line in strLines:
                            JsonMergeLogger._log(f" {line}")
            elif isinstance(data, list):
                JsonMergeLogger._log(f" Extracted {len(data)} items (COMPLETE)")
                if len(data) > 0:
                    try:
                        jsonStr = json.dumps(data, indent=2, ensure_ascii=False)  # ALL items
                        JsonMergeLogger._log(f" All items (COMPLETE, {len(jsonStr)} chars):")
                        JsonMergeLogger._log(" " + "="*76)
                        for line in jsonStr.split('\n'):
                            JsonMergeLogger._log(f" {line}")
                        JsonMergeLogger._log(" " + "="*76)
                    except Exception as e:
                        JsonMergeLogger._log(f" Could not serialize list: {e}")
                        strRepr = str(data)
                        strLines = strRepr.split('\n')
                        JsonMergeLogger._log(f" String representation ({len(strRepr)} chars, {len(strLines)} lines)")
                        if len(strLines) > 10:
                            JsonMergeLogger._log(f" (showing first 5 and last 5 lines)")
                            for line in strLines[:5]:
                                JsonMergeLogger._log(f" {line}")
                            JsonMergeLogger._log(f" ... ({len(strLines) - 10} lines omitted) ...")
                            for line in strLines[-5:]:
                                JsonMergeLogger._log(f" {line}")
                        else:
                            for line in strLines:
                                JsonMergeLogger._log(f" {line}")
|
|
|
    @staticmethod
    def logOverlap(overlapType: str, overlapLen: int, accSuffix: Any = None, fragPrefix: Any = None):
        """Log overlap detection result.

        When an overlap was found, the accumulated suffix is dumped in full
        while the fragment prefix gets a first/last-5-line summary.
        """
        JsonMergeLogger._log(f" Overlap Detection ({overlapType}):")
        JsonMergeLogger._log(f" Overlap length: {overlapLen}")
        if overlapLen > 0:
            JsonMergeLogger._log(f" ✅ Found overlap of {overlapLen} chars")
            if accSuffix is not None:
                if isinstance(accSuffix, str):
                    JsonMergeLogger._log(f" Accumulated suffix (COMPLETE, {len(accSuffix)} chars):")
                    JsonMergeLogger._log(" " + "="*76)
                    for line in accSuffix.split('\n'):
                        JsonMergeLogger._log(f" {line}")
                    JsonMergeLogger._log(" " + "="*76)
                else:
                    # Non-string suffix (e.g. parsed data): log its repr inline.
                    JsonMergeLogger._log(f" Accumulated suffix (COMPLETE): {accSuffix}")
            if fragPrefix is not None:
                if isinstance(fragPrefix, str):
                    prefixLines = fragPrefix.split('\n')
                    JsonMergeLogger._log(f" Fragment prefix ({len(fragPrefix)} chars, {len(prefixLines)} lines)")
                    if len(prefixLines) > 10:
                        JsonMergeLogger._log(f" (showing first 5 and last 5 lines)")
                        for line in prefixLines[:5]:
                            JsonMergeLogger._log(f" {line}")
                        JsonMergeLogger._log(f" ... ({len(prefixLines) - 10} lines omitted) ...")
                        for line in prefixLines[-5:]:
                            JsonMergeLogger._log(f" {line}")
                    else:
                        for line in prefixLines:
                            JsonMergeLogger._log(f" {line}")
                else:
                    JsonMergeLogger._log(f" Fragment prefix (COMPLETE): {fragPrefix}")
        else:
            JsonMergeLogger._log(f" ⚠️ No overlap detected - appending all")
|
|
|
|
@staticmethod
|
|
def logValidation(validationType: str, success: bool, error: Optional[str] = None):
|
|
"""Log validation result."""
|
|
status = "✅ VALID" if success else "❌ INVALID"
|
|
JsonMergeLogger._log(f" Validation ({validationType}): {status}")
|
|
if error:
|
|
JsonMergeLogger._log(f" Error: {error}")
|
|
|
|
    @staticmethod
    def finishMerge(mergeId: str, finalResult: str, success: bool):
        """Finish merge operation and write log file.

        Logs the final result in full, then flushes the buffered lines:
        appended to the run-wide file when one is configured (first write
        truncates, later writes append), otherwise written to a per-merge
        "<mergeId>.txt" next to this module. The buffer is always cleared.
        """
        JsonMergeLogger._log("")
        JsonMergeLogger._log(f"{'='*80}")
        JsonMergeLogger._log(f"MERGE RESULT: {'✅ SUCCESS' if success else '❌ FAILED'}")
        JsonMergeLogger._log(f"{'='*80}")
        JsonMergeLogger._log(f"Final result length: {len(finalResult)} chars")
        JsonMergeLogger._log("Final result (COMPLETE):")
        JsonMergeLogger._log("="*80)
        for line in finalResult.split('\n'):
            JsonMergeLogger._log(line)
        JsonMergeLogger._log("="*80)
        JsonMergeLogger._log("")

        # Write log content to buffer (will be written at end of test run)
        logContent = "\n".join(JsonMergeLogger._logBuffer)

        # If we have a current log file, append to it
        if JsonMergeLogger._currentLogFile:
            try:
                currentFileDir = os.path.dirname(os.path.abspath(__file__))
                logFilePath = os.path.join(currentFileDir, JsonMergeLogger._currentLogFile)
                mode = 'a' if JsonMergeLogger._appendMode else 'w'
                with open(logFilePath, mode, encoding='utf-8') as f:
                    f.write(logContent)
                    f.write("\n\n")  # Add separator between merges
                JsonMergeLogger._appendMode = True  # Next writes will append
                logger.debug(f"JSON merge log appended to: {logFilePath}")
            except Exception as e:
                logger.error(f"Failed to write merge log file: {e}")
        else:
            # No log file set - write individual file (fallback)
            currentFileDir = os.path.dirname(os.path.abspath(__file__))
            logDir = currentFileDir
            os.makedirs(logDir, exist_ok=True)
            logFilePath = os.path.join(logDir, f"{mergeId}.txt")
            try:
                with open(logFilePath, 'w', encoding='utf-8') as f:
                    f.write(logContent)
                logger.info(f"JSON merge log written to: {logFilePath}")
            except Exception as e:
                logger.error(f"Failed to write merge log file: {e}")

        # Clear buffer for next merge
        JsonMergeLogger._logBuffer = []
|
|
|
|
    @staticmethod
    def _log(message: str):
        """Internal log method."""
        # Buffer for the eventual file write, and mirror to the module logger.
        JsonMergeLogger._logBuffer.append(message)
        logger.debug(message)
|
|
|
|
|
|
class JsonDataExtractor:
    """Extracts data from JSON fragments, even if incomplete.

    Stateless namespace of static helpers: fragments may be cut at arbitrary
    points, so each helper salvages whatever complete JSON it can find.
    """
|
|
|
|
@staticmethod
|
|
def extract(jsonString: str, mergeId: Optional[str] = None, removeFromEnd: bool = True) -> Dict[str, Any]:
|
|
"""
|
|
Extract complete data from JSON fragment.
|
|
|
|
For merging: We know exactly where to clean:
|
|
- accumulated: remove incomplete parts at the END
|
|
- newFragment: remove incomplete parts at the BEGINNING
|
|
|
|
Simple approach: Remove incomplete parts at specified position, then parse.
|
|
"""
|
|
if mergeId:
|
|
position = "END" if removeFromEnd else "BEGINNING"
|
|
JsonMergeLogger.logStep("EXTRACTION", f"Extracting data from JSON fragment ({len(jsonString)} chars) - cleaning from {position}")
|
|
|
|
if not jsonString or not jsonString.strip():
|
|
if mergeId:
|
|
JsonMergeLogger.logExtraction("Empty input", False, error="Input is empty")
|
|
return {}
|
|
|
|
normalized = stripCodeFences(normalizeJsonText(jsonString)).strip()
|
|
if not normalized:
|
|
if mergeId:
|
|
JsonMergeLogger.logExtraction("Normalization", False, error="Normalized string is empty")
|
|
return {}
|
|
|
|
# Try to parse as complete JSON first
|
|
parsed, parseErr, _ = tryParseJson(normalized)
|
|
if parseErr is None and parsed is not None:
|
|
if isinstance(parsed, dict):
|
|
finalResult = parsed
|
|
elif isinstance(parsed, list):
|
|
finalResult = {"elements": parsed}
|
|
else:
|
|
finalResult = {"elements": [parsed]} if parsed else {}
|
|
|
|
if mergeId:
|
|
JsonMergeLogger.logExtraction("Direct parsing", True, finalResult)
|
|
JsonMergeLogger.logStep("EXTRACTION", "Direct parsing successful", finalResult)
|
|
|
|
return finalResult if finalResult else {}
|
|
|
|
# Remove incomplete parts from specified position
|
|
if removeFromEnd:
|
|
cleaned = JsonDataExtractor._removeIncompleteFromEnd(normalized)
|
|
else:
|
|
cleaned = JsonDataExtractor._removeIncompleteFromBeginning(normalized)
|
|
|
|
if cleaned:
|
|
# Close structures and try to parse
|
|
closed = closeJsonStructures(cleaned)
|
|
parsed, parseErr2, _ = tryParseJson(closed)
|
|
if parseErr2 is None and parsed is not None:
|
|
if isinstance(parsed, dict):
|
|
finalResult = parsed
|
|
elif isinstance(parsed, list):
|
|
finalResult = {"elements": parsed}
|
|
else:
|
|
finalResult = {"elements": [parsed]} if parsed else {}
|
|
|
|
if mergeId:
|
|
JsonMergeLogger.logExtraction("Remove incomplete + close", True, finalResult)
|
|
JsonMergeLogger.logStep("EXTRACTION", "Remove incomplete + close successful", finalResult)
|
|
|
|
return finalResult if finalResult else {}
|
|
|
|
# Return empty dict if nothing worked
|
|
if mergeId:
|
|
JsonMergeLogger.logStep("EXTRACTION", "No data extracted", {}, error="All strategies failed")
|
|
return {}
|
|
|
|
@staticmethod
|
|
def _removeIncompleteFromEnd(jsonString: str) -> str:
|
|
"""
|
|
Remove incomplete parts from the END of JSON string.
|
|
Goes through structure level by level, keeps complete elements, removes incomplete ones at the end.
|
|
"""
|
|
# Find first '{' or '[' to start
|
|
startIdx = -1
|
|
for i, char in enumerate(jsonString):
|
|
if char in '{[':
|
|
startIdx = i
|
|
break
|
|
|
|
if startIdx == -1:
|
|
return ""
|
|
|
|
# Remove incomplete parts from end recursively
|
|
cleaned = JsonDataExtractor._cleanJsonFromEnd(jsonString[startIdx:])
|
|
return cleaned
|
|
|
|
@staticmethod
|
|
def _removeIncompleteFromBeginning(jsonString: str) -> str:
|
|
"""
|
|
Remove incomplete parts from the BEGINNING of JSON string.
|
|
Finds where valid JSON starts and removes everything before it.
|
|
"""
|
|
# Find first '{' or '[' to start
|
|
startIdx = -1
|
|
for i, char in enumerate(jsonString):
|
|
if char in '{[':
|
|
startIdx = i
|
|
break
|
|
|
|
if startIdx == -1:
|
|
return ""
|
|
|
|
# Return from start position - beginning cleanup is just finding the start
|
|
return jsonString[startIdx:]
|
|
|
|
@staticmethod
|
|
def _cleanJsonFromEnd(jsonStr: str) -> str:
|
|
"""
|
|
Recursively clean JSON from the END: keep complete elements, remove incomplete ones at the end.
|
|
Goes through structure level by level.
|
|
"""
|
|
# Try to parse as-is first
|
|
try:
|
|
parsed = json.loads(jsonStr)
|
|
return jsonStr
|
|
except Exception:
|
|
pass
|
|
|
|
# If dict: go through each key-value pair, remove incomplete ones at the end
|
|
if jsonStr.strip().startswith('{'):
|
|
return JsonDataExtractor._cleanDictFromEnd(jsonStr)
|
|
|
|
# If array: go through each element, remove incomplete ones at the end
|
|
if jsonStr.strip().startswith('['):
|
|
return JsonDataExtractor._cleanArrayFromEnd(jsonStr)
|
|
|
|
return ""
|
|
|
|
    @staticmethod
    def _cleanDictFromEnd(jsonStr: str) -> str:
        """Clean dict from END: keep complete key-value pairs, remove incomplete ones at the end.

        Scans pairs left-to-right; each candidate pair is validated by
        wrapping it in braces and parsing it. The scan stops at the first
        incomplete/invalid pair and the kept pairs are re-joined into a
        syntactically closed object.
        """
        if not jsonStr.strip().startswith('{'):
            return ""

        result = ['{']
        i = 1  # Skip opening '{'
        first = True  # controls comma insertion between kept pairs

        while i < len(jsonStr):
            # Skip whitespace
            while i < len(jsonStr) and jsonStr[i] in ' \n\r\t':
                i += 1

            if i >= len(jsonStr):
                break

            # Check if we hit closing brace
            if jsonStr[i] == '}':
                break

            # Skip comma
            if jsonStr[i] == ',':
                i += 1
                continue

            # Try to extract key-value pair
            keyStart = i
            # Find key (string)
            if jsonStr[i] == '"':
                i += 1
                while i < len(jsonStr) and jsonStr[i] != '"':
                    if jsonStr[i] == '\\':
                        i += 2  # skip escaped char so an escaped quote doesn't end the key
                    else:
                        i += 1
                if i < len(jsonStr):
                    i += 1  # Skip closing quote
                else:
                    # Invalid key - stop here (incomplete at end)
                    break

            # Skip whitespace and colon
            while i < len(jsonStr) and jsonStr[i] in ' \n\r\t:':
                i += 1

            if i >= len(jsonStr):
                break

            # Try to extract value
            valueStart = i
            valueEnd = JsonDataExtractor._findCompleteValue(jsonStr, i)

            if valueEnd > valueStart:
                # Try to parse this key-value pair
                pairStr = jsonStr[keyStart:valueEnd]
                try:
                    # Test if it's valid JSON by wrapping the pair in braces
                    testStr = '{' + pairStr + '}'
                    json.loads(testStr)
                    # Valid pair - add it
                    if not first:
                        result.append(',')
                    result.append(pairStr)
                    first = False
                    i = valueEnd
                except Exception:
                    # Invalid pair - stop here (incomplete at end)
                    break
            else:
                # Incomplete value - stop here (incomplete at end)
                break

        result.append('}')
        return ''.join(result)
|
|
|
|
    @staticmethod
    def _cleanArrayFromEnd(jsonStr: str) -> str:
        """Clean array from END: keep complete elements, remove incomplete ones at the end.

        Mirrors _cleanDictFromEnd: each candidate element found by
        _findCompleteValue is validated with json.loads; the scan stops at
        the first incomplete/invalid element and the kept ones are re-joined
        into a syntactically closed array.
        """
        if not jsonStr.strip().startswith('['):
            return ""

        result = ['[']
        i = 1  # Skip opening '['
        first = True  # controls comma insertion between kept elements

        while i < len(jsonStr):
            # Skip whitespace
            while i < len(jsonStr) and jsonStr[i] in ' \n\r\t':
                i += 1

            if i >= len(jsonStr):
                break

            # Check if we hit closing bracket
            if jsonStr[i] == ']':
                break

            # Skip comma
            if jsonStr[i] == ',':
                i += 1
                continue

            # Try to extract element
            elemStart = i
            elemEnd = JsonDataExtractor._findCompleteValue(jsonStr, i)

            if elemEnd > elemStart:
                # Try to parse this element
                elemStr = jsonStr[elemStart:elemEnd]
                try:
                    # Test if it's valid JSON
                    json.loads(elemStr)
                    # Valid element - add it
                    if not first:
                        result.append(',')
                    result.append(elemStr)
                    first = False
                    i = elemEnd
                except Exception:
                    # Invalid element - stop here (incomplete at end)
                    break
            else:
                # Incomplete element - stop here (incomplete at end)
                break

        result.append(']')
        return ''.join(result)
|
|
|
|
    @staticmethod
    def _findCompleteValue(jsonStr: str, start: int) -> int:
        """Find the end of a complete JSON value starting at start position.

        Returns the index one past the value's final character, or `start`
        itself when the value is incomplete (unterminated string, unbalanced
        object/array) so callers can detect the truncation.
        """
        if start >= len(jsonStr):
            return start

        i = start

        # Skip whitespace
        while i < len(jsonStr) and jsonStr[i] in ' \n\r\t':
            i += 1

        if i >= len(jsonStr):
            return start

        char = jsonStr[i]

        # String
        if char == '"':
            i += 1
            while i < len(jsonStr):
                if jsonStr[i] == '\\':
                    i += 2  # skip escaped char (so \" doesn't terminate the string)
                elif jsonStr[i] == '"':
                    return i + 1
                else:
                    i += 1
            return start  # Incomplete string

        # Number, boolean, null
        if char in '-0123456789tfn':
            # NOTE(review): scans up to the next delimiter, so a literal cut
            # off at the very end of the fragment (e.g. "12" from "123") is
            # still reported as complete - verify this is acceptable.
            while i < len(jsonStr) and jsonStr[i] not in ',}]':
                i += 1
            return i

        # Object
        if char == '{':
            braceCount = 1
            i += 1
            while i < len(jsonStr) and braceCount > 0:
                if jsonStr[i] == '\\':
                    i += 2
                elif jsonStr[i] == '"':
                    # Skip string so braces inside it are not counted
                    i += 1
                    while i < len(jsonStr):
                        if jsonStr[i] == '\\':
                            i += 2
                        elif jsonStr[i] == '"':
                            i += 1
                            break
                        else:
                            i += 1
                elif jsonStr[i] == '{':
                    braceCount += 1
                    i += 1
                elif jsonStr[i] == '}':
                    braceCount -= 1
                    i += 1
                else:
                    i += 1
            if braceCount == 0:
                return i
            return start  # Incomplete object

        # Array
        if char == '[':
            bracketCount = 1
            i += 1
            while i < len(jsonStr) and bracketCount > 0:
                if jsonStr[i] == '\\':
                    i += 2
                elif jsonStr[i] == '"':
                    # Skip string so brackets inside it are not counted
                    i += 1
                    while i < len(jsonStr):
                        if jsonStr[i] == '\\':
                            i += 2
                        elif jsonStr[i] == '"':
                            i += 1
                            break
                        else:
                            i += 1
                elif jsonStr[i] == '[':
                    bracketCount += 1
                    i += 1
                elif jsonStr[i] == ']':
                    bracketCount -= 1
                    i += 1
                else:
                    i += 1
            if bracketCount == 0:
                return i
            return start  # Incomplete array

        # Unrecognized leading character - no value here.
        return start
|
|
|
|
@staticmethod
|
|
def _extractAllCompleteObjects(jsonString: str) -> List[Dict[str, Any]]:
|
|
"""
|
|
Extract ALL complete objects from JSON string using balanced brace matching.
|
|
Ignores incomplete objects at the end.
|
|
|
|
Core principle: Every fragment can be cut anywhere - extract only complete objects.
|
|
"""
|
|
foundObjs = []
|
|
braceCount = 0
|
|
startPos = -1
|
|
|
|
for i, char in enumerate(jsonString):
|
|
if char == '{':
|
|
if braceCount == 0:
|
|
startPos = i
|
|
braceCount += 1
|
|
elif char == '}':
|
|
braceCount -= 1
|
|
if braceCount == 0 and startPos >= 0:
|
|
# Found a complete object
|
|
objStr = jsonString[startPos:i+1]
|
|
try:
|
|
obj = json.loads(objStr)
|
|
if isinstance(obj, dict) and obj:
|
|
foundObjs.append(obj)
|
|
except Exception:
|
|
# Not valid JSON - skip it
|
|
pass
|
|
startPos = -1
|
|
elif braceCount < 0:
|
|
# Unbalanced - reset
|
|
braceCount = 0
|
|
startPos = -1
|
|
|
|
# If we end with an incomplete object (startPos >= 0 and braceCount > 0), ignore it
|
|
# It will be in the next fragment
|
|
|
|
return foundObjs
|
|
|
|
    @staticmethod
    def _extractElements(jsonString: str) -> List[Dict[str, Any]]:
        """Extract elements array from JSON string - extracts ALL complete elements.

        Three fallback patterns are tried in order: an explicit "elements"
        array, a direct table structure, and finally bare table rows.
        """
        elements = []

        # Pattern 1: Look for "elements": [...] (including incomplete at end)
        elementsPattern = r'"elements"\s*:\s*\[(.*)'
        match = re.search(elementsPattern, jsonString, re.DOTALL)
        if match:
            elementsContent = match.group(1)
            # Extract ALL complete element objects using balanced brace matching
            braceCount = 0
            startPos = -1
            for i, char in enumerate(elementsContent):
                if char == '{':
                    if braceCount == 0:
                        startPos = i
                    braceCount += 1
                elif char == '}':
                    braceCount -= 1
                    if braceCount == 0 and startPos >= 0:
                        elementStr = elementsContent[startPos:i+1]
                        try:
                            element = json.loads(elementStr)
                            if isinstance(element, dict):
                                elements.append(element)
                        except Exception:
                            # Try to extract table rows from incomplete element
                            rows = JsonDataExtractor._extractTableRowsFromElement(elementStr)
                            if rows:
                                elements.append({
                                    "type": "table",
                                    "content": {
                                        "rows": rows
                                    }
                                })
                        startPos = -1
                elif braceCount < 0:
                    break  # Unbalanced - stop

        # Pattern 2: Look for table structure directly (even if incomplete)
        if not elements:
            # Look for "type": "table" pattern
            # NOTE(review): the lazy (.*?)(?:\]|$) capture stops at the FIRST ']',
            # so the captured text cannot contain a complete [...] row array -
            # verify whether this branch can ever produce rows.
            tablePattern = r'"type"\s*:\s*"table"[^}]*"rows"\s*:\s*\[(.*?)(?:\]|$)'
            tableMatch = re.search(tablePattern, jsonString, re.DOTALL)
            if tableMatch:
                rowsContent = tableMatch.group(1)
                rows = JsonDataExtractor._extractRowsFromContent(rowsContent)
                if rows:
                    elements.append({
                        "type": "table",
                        "content": {
                            "rows": rows
                        }
                    })

        # Pattern 3: Look for table rows directly (without structure)
        if not elements:
            rows = JsonDataExtractor._extractTableRows(jsonString)
            if rows:
                elements.append({
                    "type": "table",
                    "content": {
                        "rows": rows
                    }
                })

        return elements
|
|
|
|
@staticmethod
|
|
def _extractTableRowsFromElement(elementStr: str) -> List[List[str]]:
|
|
"""Extract table rows from incomplete element string."""
|
|
# Look for rows array in element
|
|
rowsPattern = r'"rows"\s*:\s*\[(.*?)(?:\]|$)'
|
|
match = re.search(rowsPattern, elementStr, re.DOTALL)
|
|
if match:
|
|
return JsonDataExtractor._extractRowsFromContent(match.group(1))
|
|
return []
|
|
|
|
@staticmethod
|
|
def _extractRowsFromContent(rowsContent: str) -> List[List[str]]:
|
|
"""Extract rows from rows content string."""
|
|
rows = []
|
|
# Extract all array patterns: ["value1", "value2"]
|
|
# Use non-greedy matching but ensure we get complete arrays
|
|
arrayPattern = r'\[(.*?)\]'
|
|
arrayMatches = re.findall(arrayPattern, rowsContent)
|
|
for arrayContent in arrayMatches:
|
|
# Extract cells - handle both quoted strings and numbers
|
|
# First try to find quoted strings
|
|
cellPattern = r'"([^"]*)"'
|
|
cells = re.findall(cellPattern, arrayContent)
|
|
# If no quoted strings, try numbers or other values
|
|
if not cells:
|
|
# Try to find any values (numbers, booleans, etc.)
|
|
valuePattern = r'(-?\d+\.?\d*|true|false|null)'
|
|
cells = re.findall(valuePattern, arrayContent)
|
|
# Only add rows with at least 1 cell (allow single-column tables)
|
|
if len(cells) >= 1:
|
|
rows.append(cells)
|
|
return rows
|
|
|
|
@staticmethod
|
|
def _extractTableRows(jsonString: str) -> List[List[str]]:
|
|
"""Extract table rows from JSON string using multiple strategies."""
|
|
rows = []
|
|
|
|
# Strategy 1: Look for "rows": [[...], [...]]
|
|
rowsPattern = r'"rows"\s*:\s*\[(.*?)(?:\]|$)'
|
|
match = re.search(rowsPattern, jsonString, re.DOTALL)
|
|
if match:
|
|
rowsContent = match.group(1)
|
|
rows = JsonDataExtractor._extractRowsFromContent(rowsContent)
|
|
if rows:
|
|
return rows
|
|
|
|
# Strategy 2: Look for standalone array patterns ["value1", "value2"]
|
|
# Pattern for complete arrays with 2 columns
|
|
completeArrayPattern = r'\["([^"]*)",\s*"([^"]*)"\]'
|
|
matches = re.findall(completeArrayPattern, jsonString)
|
|
if len(matches) >= 2: # Need at least 2 rows to be confident
|
|
return [[m[0], m[1]] for m in matches]
|
|
|
|
# Strategy 3: Extract any array patterns (more lenient)
|
|
# Find all [ ... ] patterns that contain quoted strings
|
|
allArrays = re.findall(r'\[([^\]]*)\]', jsonString)
|
|
for arrayContent in allArrays:
|
|
# Extract quoted strings
|
|
cells = re.findall(r'"([^"]*)"', arrayContent)
|
|
if len(cells) >= 2: # At least 2 columns
|
|
rows.append(cells)
|
|
|
|
# Only return if we have multiple rows (likely a table)
|
|
if len(rows) >= 2:
|
|
return rows
|
|
|
|
return []
|
|
|
|
    @staticmethod
    def _extractDocuments(jsonString: str) -> List[Dict[str, Any]]:
        """
        Extract documents structure from JSON string - extracts ALL complete documents/chapters/sections.
        Ignores incomplete ones at the end.

        Core principle: Fragment can be cut anywhere - extract only complete objects.
        """
        documents = []

        # Pattern 1: Look for "documents": [...] structure (including incomplete at end)
        documentsPattern = r'"documents"\s*:\s*\[(.*)'
        match = re.search(documentsPattern, jsonString, re.DOTALL)
        if match:
            documentsContent = match.group(1)
            # Extract ALL complete document objects using balanced brace matching
            braceCount = 0
            startPos = -1
            for i, char in enumerate(documentsContent):
                if char == '{':
                    if braceCount == 0:
                        startPos = i
                    braceCount += 1
                elif char == '}':
                    braceCount -= 1
                    if braceCount == 0 and startPos >= 0:
                        # Found a complete document object
                        docStr = documentsContent[startPos:i+1]
                        try:
                            doc = json.loads(docStr)
                            if isinstance(doc, dict):
                                # Extract chapters/sections from document and
                                # overwrite the parsed fields with the salvaged lists
                                chapters = JsonDataExtractor._extractChaptersFromDocument(docStr)
                                sections = JsonDataExtractor._extractSectionsFromDocument(docStr)
                                if chapters:
                                    doc["chapters"] = chapters
                                if sections:
                                    doc["sections"] = sections
                                if doc:
                                    documents.append(doc)
                        except Exception:
                            # Not valid JSON - try to extract chapters/sections directly
                            chapters = JsonDataExtractor._extractChaptersFromDocument(docStr)
                            sections = JsonDataExtractor._extractSectionsFromDocument(docStr)
                            if chapters or sections:
                                doc = {}
                                if chapters:
                                    doc["chapters"] = chapters
                                if sections:
                                    doc["sections"] = sections
                                if doc:
                                    documents.append(doc)
                        startPos = -1
                elif braceCount < 0:
                    break

            # If we end with an incomplete document (startPos >= 0 and braceCount > 0), ignore it
            # It will be in the next fragment

        if documents:
            return documents

        # Pattern 2: Look for "chapters": [...] pattern directly (fragment might start mid-document)
        chapters = JsonDataExtractor._extractChaptersFromString(jsonString)
        if chapters:
            documents.append({"chapters": chapters})

        # Pattern 3: Look for "sections": [...] pattern directly
        sections = JsonDataExtractor._extractSectionsFromString(jsonString)
        if sections:
            documents.append({"sections": sections})

        return documents
|
|
|
|
    @staticmethod
    def _extractChaptersFromDocument(docStr: str) -> List[Dict[str, Any]]:
        """Extract chapters array from document string.

        Thin alias for _extractChaptersFromString, kept for call-site clarity.
        """
        return JsonDataExtractor._extractChaptersFromString(docStr)
|
|
|
|
    @staticmethod
    def _extractChaptersFromString(jsonString: str) -> List[Dict[str, Any]]:
        """
        Extract chapters array from JSON string - extracts ALL complete chapters.
        Ignores incomplete chapters at the end.

        Core principle: Fragment can be cut anywhere - extract only complete objects.
        """
        chapters = []

        # Look for "chapters": [...] pattern (including incomplete at end)
        chaptersPattern = r'"chapters"\s*:\s*\[(.*)'
        match = re.search(chaptersPattern, jsonString, re.DOTALL)
        if match:
            chaptersContent = match.group(1)
            # Extract ALL complete chapter objects using balanced brace matching
            braceCount = 0
            startPos = -1
            for i, char in enumerate(chaptersContent):
                if char == '{':
                    if braceCount == 0:
                        startPos = i
                    braceCount += 1
                elif char == '}':
                    braceCount -= 1
                    if braceCount == 0 and startPos >= 0:
                        # Found a complete chapter object
                        chapterStr = chaptersContent[startPos:i+1]
                        try:
                            chapter = json.loads(chapterStr)
                            if isinstance(chapter, dict):
                                chapters.append(chapter)
                        except Exception:
                            # Not valid JSON - skip it (incomplete chapter)
                            pass
                        startPos = -1
                elif braceCount < 0:
                    # Unbalanced - stop here
                    break

            # If we end with an incomplete chapter (startPos >= 0 and braceCount > 0), ignore it
            # It will be in the next fragment

        # Also try to extract chapters that might be standalone (fragment starts mid-array)
        # Look for complete chapter objects anywhere in the string
        if not chapters:
            # Try to find complete chapter objects using balanced brace matching
            allObjs = JsonDataExtractor._extractAllCompleteObjects(jsonString)
            # Filter for objects that look like chapters (have id and title)
            for obj in allObjs:
                if isinstance(obj, dict) and "id" in obj and "title" in obj:
                    chapters.append(obj)

        return chapters
|
|
|
|
@staticmethod
def _extractSectionsFromDocument(docStr: str) -> List[Dict[str, Any]]:
    """Extract sections array from a document string.

    Thin wrapper: delegates to the generic string-based section extractor.
    """
    extracted = JsonDataExtractor._extractSectionsFromString(docStr)
    return extracted
|
|
|
|
@staticmethod
|
|
def _extractSectionsFromString(jsonString: str) -> List[Dict[str, Any]]:
|
|
"""Extract sections array from JSON string, even if incomplete."""
|
|
sections = []
|
|
|
|
# Look for "sections": [...]
|
|
sectionsPattern = r'"sections"\s*:\s*\[(.*?)(?:\]|$)'
|
|
match = re.search(sectionsPattern, jsonString, re.DOTALL)
|
|
if match:
|
|
sectionsContent = match.group(1)
|
|
# Extract section objects using balanced brace matching
|
|
braceCount = 0
|
|
startPos = -1
|
|
for i, char in enumerate(sectionsContent):
|
|
if char == '{':
|
|
if braceCount == 0:
|
|
startPos = i
|
|
braceCount += 1
|
|
elif char == '}':
|
|
braceCount -= 1
|
|
if braceCount == 0 and startPos >= 0:
|
|
sectionStr = sectionsContent[startPos:i+1]
|
|
try:
|
|
section = json.loads(sectionStr)
|
|
if isinstance(section, dict):
|
|
sections.append(section)
|
|
except Exception:
|
|
# Incomplete section - try to extract what we can
|
|
idMatch = re.search(r'"id"\s*:\s*"([^"]*)"', sectionStr)
|
|
contentTypeMatch = re.search(r'"content_type"\s*:\s*"([^"]*)"', sectionStr)
|
|
if idMatch or contentTypeMatch:
|
|
section = {}
|
|
if idMatch:
|
|
section["id"] = idMatch.group(1)
|
|
if contentTypeMatch:
|
|
section["content_type"] = contentTypeMatch.group(1)
|
|
if section:
|
|
sections.append(section)
|
|
startPos = -1
|
|
|
|
return sections
|
|
|
|
@staticmethod
|
|
def _extractFiles(jsonString: str) -> List[Dict[str, Any]]:
|
|
"""Extract files array from JSON string, even if incomplete."""
|
|
files = []
|
|
|
|
# Look for "files": [...]
|
|
filesPattern = r'"files"\s*:\s*\[(.*?)(?:\]|$)'
|
|
match = re.search(filesPattern, jsonString, re.DOTALL)
|
|
if match:
|
|
filesContent = match.group(1)
|
|
# Extract file objects using balanced brace matching
|
|
braceCount = 0
|
|
startPos = -1
|
|
for i, char in enumerate(filesContent):
|
|
if char == '{':
|
|
if braceCount == 0:
|
|
startPos = i
|
|
braceCount += 1
|
|
elif char == '}':
|
|
braceCount -= 1
|
|
if braceCount == 0 and startPos >= 0:
|
|
fileStr = filesContent[startPos:i+1]
|
|
try:
|
|
fileObj = json.loads(fileStr)
|
|
if isinstance(fileObj, dict):
|
|
files.append(fileObj)
|
|
except Exception:
|
|
# Incomplete file - try to extract what we can
|
|
idMatch = re.search(r'"id"\s*:\s*"([^"]*)"', fileStr)
|
|
filenameMatch = re.search(r'"filename"\s*:\s*"([^"]*)"', fileStr)
|
|
if idMatch or filenameMatch:
|
|
fileObj = {}
|
|
if idMatch:
|
|
fileObj["id"] = idMatch.group(1)
|
|
if filenameMatch:
|
|
fileObj["filename"] = filenameMatch.group(1)
|
|
if fileObj:
|
|
files.append(fileObj)
|
|
startPos = -1
|
|
|
|
return files
|
|
|
|
@staticmethod
|
|
def _extractImages(jsonString: str) -> List[Dict[str, Any]]:
|
|
"""Extract images array from JSON string, even if incomplete."""
|
|
images = []
|
|
|
|
# Look for "images": [...]
|
|
imagesPattern = r'"images"\s*:\s*\[(.*?)(?:\]|$)'
|
|
match = re.search(imagesPattern, jsonString, re.DOTALL)
|
|
if match:
|
|
imagesContent = match.group(1)
|
|
# Extract image objects using balanced brace matching
|
|
braceCount = 0
|
|
startPos = -1
|
|
for i, char in enumerate(imagesContent):
|
|
if char == '{':
|
|
if braceCount == 0:
|
|
startPos = i
|
|
braceCount += 1
|
|
elif char == '}':
|
|
braceCount -= 1
|
|
if braceCount == 0 and startPos >= 0:
|
|
imageStr = imagesContent[startPos:i+1]
|
|
try:
|
|
image = json.loads(imageStr)
|
|
if isinstance(image, dict):
|
|
images.append(image)
|
|
except Exception:
|
|
# Incomplete image - try to extract what we can
|
|
idMatch = re.search(r'"id"\s*:\s*"([^"]*)"', imageStr)
|
|
urlMatch = re.search(r'"url"\s*:\s*"([^"]*)"', imageStr)
|
|
if idMatch or urlMatch:
|
|
image = {}
|
|
if idMatch:
|
|
image["id"] = idMatch.group(1)
|
|
if urlMatch:
|
|
image["url"] = urlMatch.group(1)
|
|
if image:
|
|
images.append(image)
|
|
startPos = -1
|
|
|
|
return images
|
|
|
|
|
|
class JsonStructureDetector:
    """Detects JSON structure type from extracted data."""

    # Recognized top-level keys, checked in priority order.
    _KNOWN_TYPES = ("elements", "documents", "files", "images")

    @staticmethod
    def detect(data: Dict[str, Any], mergeId: Optional[str] = None) -> str:
        """
        Detect structure type from data - GENERIC approach.

        Only checks for top-level keys, no content analysis.

        Args:
            data: Extracted data dict.
            mergeId: Optional merge ID; when set the detection is logged.

        Returns:
            Structure type: "elements", "documents", "files", "images", or "unknown"
        """
        structureType = "unknown"
        for candidate in JsonStructureDetector._KNOWN_TYPES:
            if candidate in data:
                structureType = candidate
                break

        if mergeId:
            JsonMergeLogger.logStep("DETECTION", f"Detected structure type: {structureType}", structureType)

        return structureType
|
|
|
|
|
|
class JsonDataMerger:
    """Merges JSON data intelligently with overlap detection."""

    @staticmethod
    def merge(
        accumulated: Dict[str, Any],
        newFragment: Dict[str, Any],
        structureType: str,
        mergeId: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Merge two JSON data structures.

        Args:
            accumulated: Previously accumulated data
            newFragment: New fragment data
            structureType: Detected structure type ("elements", "documents",
                "files", "images"; anything else takes the generic path)
            mergeId: Optional merge ID for logging

        Returns:
            Merged data structure (never None; may be an empty dict)
        """
        if mergeId:
            JsonMergeLogger.logStep("MERGING", f"Merging {structureType} structures", {
                "acc_keys": list(accumulated.keys()) if accumulated else [],
                "frag_keys": list(newFragment.keys()) if newFragment else []
            })

        # Trivial cases: one side empty -> return the other side unchanged.
        if not accumulated:
            if mergeId:
                JsonMergeLogger.logStep("MERGING", "No accumulated data, returning fragment", newFragment)
            return newFragment if newFragment else {}
        if not newFragment:
            if mergeId:
                JsonMergeLogger.logStep("MERGING", "No fragment data, returning accumulated", accumulated)
            return accumulated

        # Dispatch on the detected structure type.
        if structureType == "elements":
            result = JsonDataMerger._mergeElements(accumulated, newFragment)
        elif structureType == "documents":
            result = JsonDataMerger._mergeDocuments(accumulated, newFragment)
        elif structureType == "files":
            result = JsonDataMerger._mergeFiles(accumulated, newFragment)
        elif structureType == "images":
            result = JsonDataMerger._mergeImages(accumulated, newFragment)
        else:
            # Unknown structure - merge generically key by key.
            result = JsonDataMerger._mergeGeneric(accumulated, newFragment)

        if mergeId:
            JsonMergeLogger.logStep("MERGING", f"Merged {structureType} structures", result)

        return result

    @staticmethod
    def _mergeElements(accumulated: Dict[str, Any], newFragment: Dict[str, Any]) -> Dict[str, Any]:
        """Merge two "elements" structures into one."""
        accElements = accumulated.get("elements", [])
        fragElements = newFragment.get("elements", [])

        if not accElements:
            return {"elements": fragElements} if fragElements else accumulated
        if not fragElements:
            return {"elements": accElements}

        # Merge element lists with overlap detection.
        mergedElements = JsonDataMerger._mergeElementList(accElements, fragElements)
        return {"elements": mergedElements}

    @staticmethod
    def _mergeElementList(accElements: List[Dict[str, Any]], fragElements: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Merge two element lists with overlap detection.

        Table elements get special treatment: the first table of each side
        is row-merged; all remaining elements are concatenated around it.
        """
        if not accElements:
            return fragElements
        if not fragElements:
            return accElements

        accTables = [e for e in accElements if isinstance(e, dict) and e.get("type") == "table"]
        fragTables = [e for e in fragElements if isinstance(e, dict) and e.get("type") == "table"]

        if accTables and fragTables:
            # Merge the first table from each side by rows.
            mergedTable = JsonDataMerger._mergeTableElements(accTables[0], fragTables[0])
            if mergedTable:
                otherAccElements = [e for e in accElements if not (isinstance(e, dict) and e.get("type") == "table")]
                otherFragElements = [e for e in fragElements if not (isinstance(e, dict) and e.get("type") == "table")]
                return otherAccElements + [mergedTable] + otherFragElements

        # Drop fragment elements that duplicate the accumulated tail.
        overlapStart = JsonDataMerger._findOverlap(accElements, fragElements, None, "elements")
        if overlapStart > 0:
            return accElements + fragElements[overlapStart:]
        return accElements + fragElements

    @staticmethod
    def _mergeTableElements(accTable: Dict[str, Any], fragTable: Dict[str, Any]) -> Dict[str, Any]:
        """Merge two table elements by merging their rows.

        BUGFIX: the previous implementation shallow-copied accTable and then
        mutated its "content" dict in place, silently modifying the caller's
        accumulated table. The content dict is now copied before mutation.
        """
        accRows = JsonDataMerger._getTableRows(accTable)
        fragRows = JsonDataMerger._getTableRows(fragTable)

        if not accRows:
            return fragTable
        if not fragRows:
            return accTable

        # Drop fragment rows that duplicate the accumulated tail.
        overlapStart = JsonDataMerger._findOverlap(accRows, fragRows, None, "table_rows")
        if overlapStart > 0:
            mergedRows = accRows + fragRows[overlapStart:]
        else:
            mergedRows = accRows + fragRows

        # Build the merged table without mutating accTable's content dict.
        mergedTable = accTable.copy()
        content = mergedTable.get("content", {})
        content = dict(content) if isinstance(content, dict) else {}
        content["rows"] = mergedRows

        # Preserve headers: fall back to the fragment's headers if missing.
        if "headers" not in content:
            fragContent = fragTable.get("content", {})
            if isinstance(fragContent, dict) and "headers" in fragContent:
                content["headers"] = fragContent["headers"]

        mergedTable["content"] = content
        return mergedTable

    @staticmethod
    def _findOverlap(accList: List[Any], fragList: List[Any], mergeId: Optional[str] = None, overlapType: str = "generic") -> int:
        """Find overlap between two lists.

        Returns the number of leading fragList items that duplicate the
        trailing accList items (0 when there is no overlap).
        """
        if not accList or not fragList:
            if mergeId:
                JsonMergeLogger.logOverlap(overlapType, 0)
            return 0

        # Longest suffix-of-acc == prefix-of-frag, checked longest first.
        maxOverlap = min(len(accList), len(fragList))
        for overlapLen in range(maxOverlap, 0, -1):
            accSuffix = accList[-overlapLen:]
            fragPrefix = fragList[:overlapLen]
            if JsonDataMerger._listsEqual(accSuffix, fragPrefix):
                if mergeId:
                    JsonMergeLogger.logOverlap(overlapType, overlapLen, accSuffix, fragPrefix)
                return overlapLen

        if mergeId:
            JsonMergeLogger.logOverlap(overlapType, 0)
        return 0

    @staticmethod
    def _listsEqual(list1: List[Any], list2: List[Any]) -> bool:
        """Check if two lists are equal (deep comparison for dicts)."""
        if len(list1) != len(list2):
            return False
        for i in range(len(list1)):
            if isinstance(list1[i], dict) and isinstance(list2[i], dict):
                # Compare dicts by their relevant content, not identity.
                if not JsonDataMerger._dictsEqual(list1[i], list2[i]):
                    return False
            elif list1[i] != list2[i]:
                return False
        return True

    @staticmethod
    def _dictsEqual(dict1: Dict[str, Any], dict2: Dict[str, Any]) -> bool:
        """Check if two dicts are equal (comparing key content)."""
        # Tables compare by rows only (metadata may legitimately differ).
        if dict1.get("type") == "table" and dict2.get("type") == "table":
            rows1 = JsonDataMerger._getTableRows(dict1)
            rows2 = JsonDataMerger._getTableRows(dict2)
            return rows1 == rows2

        # Other elements must at least share a type.
        if dict1.get("type") != dict2.get("type"):
            return False

        content1 = dict1.get("content", {})
        content2 = dict2.get("content", {})
        if isinstance(content1, dict) and isinstance(content2, dict):
            # Tables embedded as content: compare rows.
            if "rows" in content1 and "rows" in content2:
                return content1["rows"] == content2["rows"]
            # List elements: compare items.
            if "items" in content1 and "items" in content2:
                return content1["items"] == content2["items"]

        return dict1 == dict2

    @staticmethod
    def _getTableRows(element: Dict[str, Any]) -> List[List[str]]:
        """Extract table rows from an element (content.rows or top-level rows)."""
        content = element.get("content", {})
        if isinstance(content, dict):
            return content.get("rows", [])
        return element.get("rows", [])

    @staticmethod
    def _mergeDocuments(accumulated: Dict[str, Any], newFragment: Dict[str, Any]) -> Dict[str, Any]:
        """Merge documents structures.

        NOTE(review): documents are currently concatenated without overlap
        detection (unlike elements) - deliberate simplification per the
        original implementation.
        """
        accDocs = accumulated.get("documents", [])
        fragDocs = newFragment.get("documents", [])

        if not accDocs:
            return {"documents": fragDocs} if fragDocs else accumulated
        if not fragDocs:
            return {"documents": accDocs}

        return {"documents": accDocs + fragDocs}

    @staticmethod
    def _mergeFiles(accumulated: Dict[str, Any], newFragment: Dict[str, Any]) -> Dict[str, Any]:
        """Merge files structures (simple concatenation)."""
        accFiles = accumulated.get("files", [])
        fragFiles = newFragment.get("files", [])

        if not accFiles:
            return {"files": fragFiles} if fragFiles else accumulated
        if not fragFiles:
            return {"files": accFiles}

        return {"files": accFiles + fragFiles}

    @staticmethod
    def _mergeImages(accumulated: Dict[str, Any], newFragment: Dict[str, Any]) -> Dict[str, Any]:
        """Merge images structures (simple concatenation)."""
        accImages = accumulated.get("images", [])
        fragImages = newFragment.get("images", [])

        if not accImages:
            return {"images": fragImages} if fragImages else accumulated
        if not fragImages:
            return {"images": accImages}

        return {"images": accImages + fragImages}

    @staticmethod
    def _mergeGeneric(accumulated: Dict[str, Any], newFragment: Dict[str, Any]) -> Dict[str, Any]:
        """Generic merge for unknown structures.

        Lists are concatenated, dicts are merged recursively, and scalar
        conflicts are resolved in favor of the newer fragment.
        """
        merged = accumulated.copy()
        for key, value in newFragment.items():
            if key in merged:
                if isinstance(merged[key], list) and isinstance(value, list):
                    merged[key] = merged[key] + value
                elif isinstance(merged[key], dict) and isinstance(value, dict):
                    merged[key] = JsonDataMerger._mergeGeneric(merged[key], value)
                else:
                    # Scalar conflict - the fragment wins.
                    merged[key] = value
            else:
                merged[key] = value
        return merged
|
|
|
|
|
|
class JsonResultBuilder:
    """Builds final JSON result, ensuring it's always valid."""

    @staticmethod
    def build(mergedData: Dict[str, Any], structureType: str, mergeId: Optional[str] = None) -> str:
        """
        Build final JSON string from merged data.

        Args:
            mergedData: Merged data structure
            structureType: Detected structure type
            mergeId: Optional merge ID for logging

        Returns:
            Valid JSON string (never empty)
        """
        if not mergedData:
            # No data at all: emit the minimal valid payload for the type.
            return json.dumps(JsonResultBuilder._emptyStructure(structureType), indent=2, ensure_ascii=False)

        # Ensure the expected top-level key exists - GENERIC approach:
        # wrap loose data into the detected container type.
        if structureType == "elements" and "elements" not in mergedData:
            if isinstance(mergedData, dict):
                if mergedData:
                    mergedData = {"elements": [mergedData]}
                    if mergeId:
                        JsonMergeLogger.logStep("BUILDING", "Wrapping single object as element (generic)", mergedData)
                else:
                    mergedData = {"elements": []}

        elif structureType == "documents" and "documents" not in mergedData:
            if isinstance(mergedData, dict):
                if mergedData:
                    # Generic: wrap single object in documents structure.
                    mergedData = {"documents": [mergedData]}
                    if mergeId:
                        JsonMergeLogger.logStep("BUILDING", "Wrapping single object in documents structure (generic)", mergedData)
                else:
                    mergedData = {"documents": [{}]}

        elif structureType == "files" and "files" not in mergedData:
            if isinstance(mergedData, dict):
                if mergedData:
                    mergedData = {"files": [mergedData]}
                    if mergeId:
                        JsonMergeLogger.logStep("BUILDING", "Wrapping single object in files structure (generic)", mergedData)
                else:
                    mergedData = {"files": []}

        elif structureType == "images" and "images" not in mergedData:
            if isinstance(mergedData, dict):
                if mergedData:
                    mergedData = {"images": [mergedData]}
                    if mergeId:
                        JsonMergeLogger.logStep("BUILDING", "Wrapping single object in images structure (generic)", mergedData)
                else:
                    mergedData = {"images": []}

        elif structureType == "unknown" and isinstance(mergedData, dict) and mergedData:
            # Unknown structure but has data - wrap generically as elements.
            mergedData = {"elements": [mergedData]}
            if mergeId:
                JsonMergeLogger.logStep("BUILDING", "Unknown structure, wrapping as elements (generic)", mergedData)

        # Clean data structure before serialization.
        cleanedData = JsonResultBuilder._cleanDataStructure(mergedData)

        try:
            jsonString = json.dumps(cleanedData, indent=2, ensure_ascii=False)

            # Validate the generated string by round-tripping it.
            try:
                parsed, parseErr, _ = tryParseJson(jsonString)
                if parseErr is None:
                    return jsonString
                # Invalid JSON - attempt a structural repair.
                logger.warning(f"Generated JSON is invalid: {parseErr}, attempting repair")
                repaired = closeJsonStructures(jsonString)
                parsed2, parseErr2, _ = tryParseJson(repaired)
                if parseErr2 is None:
                    return repaired
                # CONSISTENCY FIX: fall back to the detected structure type
                # instead of always returning an "elements" payload.
                logger.error(f"Repair failed: {parseErr2}, returning minimal structure")
                return JsonResultBuilder._fallbackResult(structureType)
            except Exception as parseEx:
                # Validation itself blew up - best-effort repair, then fallback.
                logger.warning(f"Parse validation failed: {parseEx}, attempting repair")
                try:
                    repaired = closeJsonStructures(jsonString)
                    parsed2, parseErr2, _ = tryParseJson(repaired)
                    if parseErr2 is None:
                        return repaired
                except Exception:
                    pass
                return JsonResultBuilder._fallbackResult(structureType)

        except (TypeError, ValueError) as e:
            # Serialization failed - clean aggressively and retry once.
            logger.error(f"Error serializing JSON: {e}")
            try:
                cleanedData2 = JsonResultBuilder._cleanDataStructure(cleanedData, aggressive=True)
                jsonString = json.dumps(cleanedData2, indent=2, ensure_ascii=False)
                parsed, parseErr, _ = tryParseJson(jsonString)
                if parseErr is None:
                    return jsonString
            except Exception:
                pass
            return JsonResultBuilder._fallbackResult(structureType)
        except Exception as e:
            logger.error(f"Unexpected error building JSON: {e}")
            return JsonResultBuilder._fallbackResult(structureType)

    @staticmethod
    def _emptyStructure(structureType: str) -> Dict[str, Any]:
        """Return the minimal valid payload for a structure type.

        "documents" gets a single empty document (matching the original
        contract); unknown types get an empty dict.
        """
        if structureType == "elements":
            return {"elements": []}
        if structureType == "documents":
            return {"documents": [{}]}
        if structureType == "files":
            return {"files": []}
        if structureType == "images":
            return {"images": []}
        return {}

    @staticmethod
    def _fallbackResult(structureType: str) -> str:
        """Serialize the last-resort valid JSON for a failed build.

        CONSISTENCY FIX: previously every failure path returned
        {"elements": []} regardless of the detected type, mislabeling
        documents/files/images results. Unknown types keep the historical
        "elements" fallback.
        """
        payload = JsonResultBuilder._emptyStructure(structureType)
        if not payload:
            payload = {"elements": []}
        return json.dumps(payload, indent=2, ensure_ascii=False)

    @staticmethod
    def _cleanDataStructure(data: Any, aggressive: bool = False) -> Any:
        """
        Clean data structure to ensure it's JSON-serializable.

        Recurses into dicts and lists. In aggressive mode, None values are
        dropped and unknown types are stringified; otherwise unknown types
        pass through so json.dumps can raise and trigger the retry path.
        """
        if data is None:
            return {} if aggressive else None

        if isinstance(data, dict):
            cleanedDict = {}
            for key, value in data.items():
                if value is None and aggressive:
                    continue  # Skip None values in aggressive mode
                cleanedDict[key] = JsonResultBuilder._cleanDataStructure(value, aggressive)
            return cleanedDict

        if isinstance(data, list):
            cleanedList = []
            for item in data:
                cleanedItem = JsonResultBuilder._cleanDataStructure(item, aggressive)
                if cleanedItem is not None or not aggressive:
                    cleanedList.append(cleanedItem)
            return cleanedList

        if isinstance(data, (str, int, float, bool)):
            return data

        # Unknown type - stringify in aggressive mode, else pass through.
        if aggressive:
            return str(data)
        return data
|
|
|
|
|
|
class ModularJsonMerger:
|
|
"""
|
|
Modular JSON Merger - Main entry point.
|
|
|
|
Simple pipeline:
|
|
1. Find overlap between JSON strings
|
|
2. Merge strings together
|
|
3. Parse and clean the merged JSON
|
|
"""
|
|
|
|
@staticmethod
|
|
def _findStringOverlap(accStr: str, fragStr: str, mergeId: Optional[str] = None) -> int:
|
|
"""
|
|
Find overlap between two JSON strings - GENERIC solution.
|
|
|
|
Works for any JSON structure (arrays, objects, nested, minified, formatted).
|
|
Uses multiple strategies to find overlap regardless of JSON format.
|
|
|
|
Strategy:
|
|
1. Exact suffix/prefix match (fastest, works for any format)
|
|
2. Structure-aware: Find last complete JSON elements in accumulated that match start of fragment
|
|
3. Line-based: If JSON is formatted, use line matching (for better performance)
|
|
4. Partial match: Handle incomplete elements at cut point
|
|
|
|
Returns the length of the overlap (number of characters).
|
|
"""
|
|
if not accStr or not fragStr:
|
|
if mergeId:
|
|
JsonMergeLogger.logOverlap("string", 0)
|
|
return 0
|
|
|
|
# Strategy 1: Try exact suffix/prefix match (fastest, works for any format)
|
|
maxOverlap = min(len(accStr), len(fragStr))
|
|
|
|
# Start from maximum possible overlap and work backwards
|
|
for overlapLen in range(maxOverlap, 0, -1):
|
|
accSuffix = accStr[-overlapLen:]
|
|
fragPrefix = fragStr[:overlapLen]
|
|
|
|
if accSuffix == fragPrefix:
|
|
if mergeId:
|
|
JsonMergeLogger.logOverlap("string (exact)", overlapLen, accSuffix[:200], fragPrefix[:200])
|
|
return overlapLen
|
|
|
|
# Strategy 2: Structure-aware overlap detection (GENERIC - works for any JSON structure)
|
|
# Find last complete JSON elements in accumulated and check if they appear at start of fragment
|
|
overlapLen = ModularJsonMerger._findStructureBasedOverlap(accStr, fragStr, mergeId)
|
|
if overlapLen > 0:
|
|
return overlapLen
|
|
|
|
# Strategy 3: Line-based overlap (works well for formatted JSON)
|
|
# Only use if JSON appears to be formatted (has newlines)
|
|
if '\n' in accStr and '\n' in fragStr:
|
|
overlapLen = ModularJsonMerger._findLineBasedOverlap(accStr, fragStr, mergeId)
|
|
if overlapLen > 0:
|
|
return overlapLen
|
|
|
|
# Strategy 4: Partial overlap (incomplete element at cut point)
|
|
overlapLen = ModularJsonMerger._findPartialOverlap(accStr, fragStr, mergeId)
|
|
if overlapLen > 0:
|
|
return overlapLen
|
|
|
|
if mergeId:
|
|
JsonMergeLogger.logOverlap("string", 0)
|
|
return 0
|
|
|
|
@staticmethod
|
|
def _findStructureBasedOverlap(accStr: str, fragStr: str, mergeId: Optional[str] = None) -> int:
|
|
"""
|
|
Find overlap by detecting complete JSON elements (structure-aware, GENERIC).
|
|
|
|
Works for ANY JSON structure:
|
|
- Arrays: Finds last complete array elements
|
|
- Objects: Finds last complete object properties
|
|
- Nested structures: Recursively finds complete elements
|
|
- Minified or formatted JSON: Structure-aware, not format-dependent
|
|
- Any use case: section_content, chapter_structure, code_structure, etc.
|
|
|
|
Strategy: Find last complete JSON elements in accumulated that match start of fragment.
|
|
Uses balanced bracket/brace matching to identify complete elements regardless of format.
|
|
"""
|
|
accTrimmed = accStr.rstrip()
|
|
fragTrimmed = fragStr.lstrip()
|
|
|
|
if not accTrimmed or not fragTrimmed:
|
|
return 0
|
|
|
|
# Find last complete elements in accumulated by parsing backwards
|
|
# Look for complete array elements or object properties
|
|
|
|
# Strategy: Find where accumulated has complete elements at the end
|
|
# and check if fragment starts with the same elements
|
|
|
|
# Use a sliding window approach: check different suffix lengths from accumulated
|
|
maxCheckLength = min(2000, len(accTrimmed), len(fragTrimmed))
|
|
|
|
# Check in reverse order (largest to smallest) to find longest overlap first
|
|
for checkLen in range(maxCheckLength, 50, -5): # Step by 5 for performance
|
|
if checkLen > len(accTrimmed) or checkLen > len(fragTrimmed):
|
|
continue
|
|
|
|
accSuffix = accTrimmed[-checkLen:]
|
|
fragPrefix = fragTrimmed[:checkLen]
|
|
|
|
# Check if accSuffix ends with complete JSON element(s) and fragPrefix starts with same
|
|
# A complete element ends with proper closing brackets/braces
|
|
|
|
# Verify that accSuffix ends with complete structure
|
|
# and fragPrefix starts with the same structure
|
|
if ModularJsonMerger._isCompleteJsonElement(accSuffix) and \
|
|
ModularJsonMerger._startsWithSameElement(accSuffix, fragPrefix):
|
|
# Found overlap! Verify it's meaningful (not just whitespace)
|
|
if len(accSuffix.strip()) > 20:
|
|
if mergeId:
|
|
JsonMergeLogger.logOverlap("string (structure-based)", checkLen, accSuffix[:200], fragPrefix[:200])
|
|
return checkLen
|
|
|
|
# Alternative: Try to find common substring that represents complete elements
|
|
# Look for patterns like complete array rows or object properties
|
|
# Check last 500 chars of accumulated against first 500 chars of fragment
|
|
checkWindow = min(500, len(accTrimmed), len(fragTrimmed))
|
|
if checkWindow > 100:
|
|
accWindow = accTrimmed[-checkWindow:]
|
|
fragWindow = fragTrimmed[:checkWindow]
|
|
|
|
# Find longest common substring that represents complete elements
|
|
# Look for boundaries like ], [ or }, { or ", "
|
|
for i in range(checkWindow - 50, 50, -5):
|
|
accSub = accWindow[-i:]
|
|
fragSub = fragWindow[:i]
|
|
|
|
if accSub == fragSub:
|
|
# Check if it's a complete element boundary
|
|
if ModularJsonMerger._isCompleteElementBoundary(accSub):
|
|
if mergeId:
|
|
JsonMergeLogger.logOverlap("string (structure-boundary)", i, accSub[:200], fragSub[:200])
|
|
return i
|
|
|
|
return 0
|
|
|
|
@staticmethod
|
|
def _isCompleteJsonElement(jsonStr: str) -> bool:
|
|
"""Check if string ends with a complete JSON element (balanced brackets/braces)."""
|
|
jsonStr = jsonStr.strip()
|
|
if not jsonStr:
|
|
return False
|
|
|
|
# Check if it ends with complete structure markers
|
|
# Complete array element: ends with ] or ], or ],
|
|
# Complete object element: ends with } or }, or },
|
|
if jsonStr[-1] in ']}':
|
|
# Check if brackets/braces are balanced
|
|
braceCount = jsonStr.count('{') - jsonStr.count('}')
|
|
bracketCount = jsonStr.count('[') - jsonStr.count(']')
|
|
return braceCount == 0 and bracketCount == 0
|
|
|
|
return False
|
|
|
|
@staticmethod
|
|
def _startsWithSameElement(accSuffix: str, fragPrefix: str) -> bool:
|
|
"""Check if fragment prefix starts with the same element as accumulated suffix."""
|
|
# Normalize whitespace for comparison
|
|
accNorm = accSuffix.strip()
|
|
fragNorm = fragPrefix.strip()
|
|
|
|
# Check if fragPrefix starts with accSuffix (or vice versa for partial matches)
|
|
if fragNorm.startswith(accNorm):
|
|
return True
|
|
|
|
# Check if they have common prefix (for partial element completion)
|
|
minLen = min(len(accNorm), len(fragNorm))
|
|
if minLen > 20:
|
|
# Check if first 80% of accSuffix matches start of fragPrefix
|
|
checkLen = int(minLen * 0.8)
|
|
return accNorm[:checkLen] == fragNorm[:checkLen]
|
|
|
|
return False
|
|
|
|
@staticmethod
|
|
def _isCompleteElementBoundary(jsonStr: str) -> bool:
|
|
"""Check if string represents a complete element boundary (e.g., ], [ or }, {)."""
|
|
jsonStr = jsonStr.strip()
|
|
if not jsonStr:
|
|
return False
|
|
|
|
# Check if it contains complete element boundaries
|
|
# Pattern: ends with ], or }, or ],\n or },\n
|
|
if jsonStr.rstrip().endswith(('],', '},', ']', '}')):
|
|
return True
|
|
|
|
# Check if it's a complete array element or object property
|
|
if '],' in jsonStr or '},' in jsonStr:
|
|
return True
|
|
|
|
return False
|
|
|
|
@staticmethod
def _findLineBasedOverlap(accStr: str, fragStr: str, mergeId: Optional[str] = None) -> int:
    """
    Find overlap using line-based matching (for formatted JSON).

    Compares up to the last 10 non-empty (stripped) lines of the
    accumulated string against the first lines of the fragment. When a
    matching run of stripped lines is found, the match is validated
    against the ORIGINAL strings: the joined suffix text must occur in
    accStr and start fragStr at position 0.

    NOTE(review): the joined candidate is built from STRIPPED lines, so
    for indented JSON the rfind/find validation will usually fail to
    locate it verbatim in the originals and the match is rejected -
    confirm whether this strategy is effectively limited to
    non-indented, newline-separated JSON.

    Returns:
        Overlap length in characters (length of the matched suffix text),
        or 0 when no validated line overlap exists.
    """
    accLines = accStr.rstrip().split('\n')
    fragLines = fragStr.lstrip().split('\n')

    # Try to find matching lines from the end of accumulated at the start of fragment
    maxLinesToCheck = min(10, len(accLines), len(fragLines))

    for numLines in range(maxLinesToCheck, 0, -1):
        # Get last N lines from accumulated (excluding empty lines)
        accLastLines = [line.strip() for line in accLines[-numLines:] if line.strip()]
        # Get first N lines from fragment (excluding empty lines)
        fragFirstLines = [line.strip() for line in fragLines[:numLines] if line.strip()]

        # Check if they match
        if len(accLastLines) > 0 and len(fragFirstLines) > 0:
            # Try to find where accLastLines match fragFirstLines
            for i in range(len(accLastLines)):
                # Check if accLastLines[i:] matches fragFirstLines[:len(accLastLines)-i]
                accSuffixLines = accLastLines[i:]
                fragPrefixLines = fragFirstLines[:len(accSuffixLines)]

                if accSuffixLines == fragPrefixLines and len(accSuffixLines) > 0:
                    # Found overlap! Calculate character length
                    accSuffixText = '\n'.join(accLastLines[i:])
                    fragPrefixText = '\n'.join(fragPrefixLines)

                    # Find where this text appears in the original strings
                    # (guards against false positives from stripped-line matching)
                    accPos = accStr.rfind(accSuffixText)
                    fragPos = fragStr.find(fragPrefixText)

                    if accPos >= 0 and fragPos == 0:
                        # Found valid overlap
                        overlapLen = len(accSuffixText)
                        if mergeId:
                            JsonMergeLogger.logOverlap("string (line-based)", overlapLen, accSuffixText[:200], fragPrefixText[:200])
                        return overlapLen

    return 0
|
|
|
|
@staticmethod
|
|
def _findPartialOverlap(accStr: str, fragStr: str, mergeId: Optional[str] = None) -> int:
|
|
"""
|
|
Find partial overlap (incomplete element at cut point).
|
|
"""
|
|
accLines = accStr.rstrip().split('\n')
|
|
fragLines = fragStr.lstrip().split('\n')
|
|
|
|
if accLines and fragLines:
|
|
lastAccLine = accLines[-1].strip()
|
|
firstFragLine = fragLines[0].strip()
|
|
|
|
# Check if lastAccLine is a prefix of firstFragLine (incomplete line completed)
|
|
if lastAccLine and firstFragLine.startswith(lastAccLine):
|
|
# Also check if there are more matching lines after
|
|
overlapLen = len(lastAccLine)
|
|
# Try to extend overlap with more lines
|
|
for i in range(1, min(len(accLines), len(fragLines))):
|
|
if accLines[-1-i].strip() == fragLines[i].strip():
|
|
overlapLen += len('\n' + fragLines[i])
|
|
else:
|
|
break
|
|
|
|
if overlapLen > 20: # Only if meaningful overlap
|
|
if mergeId:
|
|
JsonMergeLogger.logOverlap("string (partial line)", overlapLen, lastAccLine[:200], firstFragLine[:200])
|
|
return overlapLen
|
|
|
|
return 0
|
|
|
|
@staticmethod
|
|
def _mergeStrings(accStr: str, fragStr: str, overlapLength: int) -> str:
|
|
"""
|
|
Merge two JSON strings together, removing the overlap.
|
|
"""
|
|
if overlapLength > 0:
|
|
# Remove overlap from fragment and append
|
|
merged = accStr + fragStr[overlapLength:]
|
|
else:
|
|
# No overlap - just concatenate (might need comma or other separator)
|
|
# Try to add comma if needed
|
|
accTrimmed = accStr.rstrip().rstrip(',')
|
|
fragTrimmed = fragStr.lstrip().lstrip(',')
|
|
|
|
# Check if we need a separator
|
|
if accTrimmed and fragTrimmed:
|
|
# If accumulated ends with } or ] and fragment starts with { or [, we might need comma
|
|
if (accTrimmed[-1] in '}]' and fragTrimmed[0] in '{['):
|
|
merged = accTrimmed + ',' + fragTrimmed
|
|
else:
|
|
merged = accTrimmed + fragTrimmed
|
|
else:
|
|
merged = accStr + fragStr
|
|
|
|
return merged
|
|
|
|
@staticmethod
def _logTextPreview(text: str) -> None:
    """Log a preview of *text*: every line if there are <= 10, otherwise the
    first 5 and last 5 with an omission marker. (Extracted helper — this
    pattern was previously duplicated three times inside ``merge``.)"""
    lines = text.split('\n')
    if len(lines) > 10:
        JsonMergeLogger._log(f" (showing first 5 and last 5 of {len(lines)} lines)")
        for line in lines[:5]:
            JsonMergeLogger._log(f" {line}")
        JsonMergeLogger._log(f" ... ({len(lines) - 10} lines omitted) ...")
        for line in lines[-5:]:
            JsonMergeLogger._log(f" {line}")
    else:
        for line in lines:
            JsonMergeLogger._log(f" {line}")

@staticmethod
def merge(accumulated: str, newFragment: str) -> Tuple[str, bool]:
    """
    Merge two JSON fragments intelligently.

    Args:
        accumulated: Previously accumulated JSON string
        newFragment: New fragment JSON string

    Returns:
        Tuple of (merged_json_string, has_overlap):
        - merged_json_string: Merged JSON string (closed if no overlap, unclosed if overlap found)
        - has_overlap: True if overlap was found (iterations should continue), False if no overlap (iterations should stop)
    """
    # Start logging
    mergeId = JsonMergeLogger.startMerge(accumulated, newFragment)

    # Trivial cases: nothing accumulated yet, or an empty new fragment.
    if not accumulated:
        result = newFragment if newFragment else "{}"
        JsonMergeLogger.finishMerge(mergeId, result, True)
        return (result, False)  # No overlap if no accumulated data
    if not newFragment:
        JsonMergeLogger.finishMerge(mergeId, accumulated, True)
        return (accumulated, False)  # No overlap if no new fragment

    try:
        # Normalize both strings (drop code fences, normalize text, trim).
        accNormalized = stripCodeFences(normalizeJsonText(accumulated)).strip()
        fragNormalized = stripCodeFences(normalizeJsonText(newFragment)).strip()

        JsonMergeLogger._log(f"\n Normalized Accumulated ({len(accNormalized)} chars)")
        ModularJsonMerger._logTextPreview(accNormalized)
        JsonMergeLogger._log(f"\n Normalized New Fragment ({len(fragNormalized)} chars)")
        ModularJsonMerger._logTextPreview(fragNormalized)

        # Step 1: Find overlap between JSON strings
        JsonMergeLogger.logStep("PHASE 1", "Finding overlap between JSON strings", None)
        overlapLength = ModularJsonMerger._findStringOverlap(accNormalized, fragNormalized, mergeId)

        if overlapLength > 0:
            accSuffix = accNormalized[-overlapLength:]
            fragPrefix = fragNormalized[:overlapLength]
            JsonMergeLogger._log(f"\n Overlap found ({overlapLength} chars):")
            JsonMergeLogger._log(f" Accumulated suffix: {accSuffix}")
            JsonMergeLogger._log(f" Fragment prefix: {fragPrefix}")
        else:
            # CRITICAL: No overlap found - this means iterations should stop
            JsonMergeLogger._log(f"\n ⚠️ NO OVERLAP FOUND - This indicates iterations should stop")
            JsonMergeLogger._log(f" Closing JSON and returning final result")

            # Close the accumulated JSON (it's complete as far as we can tell)
            closedJson = closeJsonStructures(accNormalized)
            JsonMergeLogger._log(f"\n Closed JSON ({len(closedJson)} chars):")
            JsonMergeLogger._log(" " + "="*78)
            for line in closedJson.split('\n'):
                JsonMergeLogger._log(f" {line}")
            JsonMergeLogger._log(" " + "="*78)

            JsonMergeLogger.finishMerge(mergeId, closedJson, True)
            # Return closed JSON with has_overlap=False to indicate iterations should stop
            return (closedJson, False)

        # Step 2: Merge strings together (only if overlap was found)
        JsonMergeLogger.logStep("PHASE 2", f"Merging strings (overlap: {overlapLength} chars)", None)
        mergedString = ModularJsonMerger._mergeStrings(accNormalized, fragNormalized, overlapLength)

        JsonMergeLogger._log(f"\n Merged String ({len(mergedString)} chars)")
        ModularJsonMerger._logTextPreview(mergedString)

        # Step 3: Return merged string (with incomplete element at end for next iteration)
        JsonMergeLogger.logStep("PHASE 3", "Returning merged string (may be unclosed)", None)
        JsonMergeLogger._log(f"\n Returning merged string (preserving incomplete element at end for next iteration)")

        JsonMergeLogger.finishMerge(mergeId, mergedString, True)
        # Return merged string with has_overlap=True to indicate iterations should continue
        return (mergedString, True)

    except Exception as e:
        logger.error(f"Error in modular merger: {e}")
        JsonMergeLogger.logStep("ERROR", f"Exception occurred: {str(e)}", None, error=str(e))
        # Fallback: try to return accumulated if it already parses as JSON.
        try:
            _, accErr, _ = tryParseJson(accumulated)  # parsed value unused; only validity matters
            if accErr is None:
                JsonMergeLogger.finishMerge(mergeId, accumulated, False)
                return (accumulated, False)  # No overlap on error
        except Exception:
            pass
        # Last resort: return empty valid JSON
        fallback = json.dumps({"elements": []}, indent=2, ensure_ascii=False)
        JsonMergeLogger.finishMerge(mergeId, fallback, False)
        return (fallback, False)  # No overlap on error
|