# Copyright (c) 2025 Patrick Motsch # All rights reserved. """ Modular JSON Merger - Intelligent JSON Fragment Merging A clean, modular approach to merging JSON fragments that may be cut randomly. Designed to be simple, robust, and always return valid data. Architecture: 1. Data Extractor: Extracts all possible data from fragments (even incomplete) 2. Structure Detector: Detects JSON structure type (elements, documents, files, etc.) 3. Data Merger: Intelligently merges data with overlap detection 4. Result Builder: Always returns valid JSON structure """ import json import re import logging import os from datetime import datetime from typing import Dict, Any, List, Optional, Tuple, Union from modules.shared.jsonUtils import ( normalizeJsonText, stripCodeFences, closeJsonStructures, tryParseJson ) logger = logging.getLogger(__name__) class JsonMergeLogger: """Consolidated logger for JSON merging process.""" _logBuffer: List[str] = [] _mergeId: int = 0 _currentLogFile: Optional[str] = None _appendMode: bool = False @staticmethod def initializeLogFile(logFileName: Optional[str] = None): """Initialize a new log file for a test run.""" JsonMergeLogger._logBuffer = [] JsonMergeLogger._mergeId = 0 if logFileName: JsonMergeLogger._currentLogFile = logFileName JsonMergeLogger._appendMode = False # Clear existing file try: currentFileDir = os.path.dirname(os.path.abspath(__file__)) logFilePath = os.path.join(currentFileDir, logFileName) with open(logFilePath, 'w', encoding='utf-8') as f: f.write("") # Clear file except Exception: pass else: JsonMergeLogger._currentLogFile = None JsonMergeLogger._appendMode = False @staticmethod def startMerge(accumulated: str, newFragment: str) -> str: """Start a new merge operation and return merge ID.""" JsonMergeLogger._mergeId += 1 mergeId = f"merge_{JsonMergeLogger._mergeId}" JsonMergeLogger._log(f"{'='*80}") JsonMergeLogger._log(f"JSON MERGE OPERATION #{JsonMergeLogger._mergeId}") JsonMergeLogger._log(f"{'='*80}") 
JsonMergeLogger._log(f"Timestamp: {datetime.now().isoformat()}") JsonMergeLogger._log("") JsonMergeLogger._log("INPUT:") JsonMergeLogger._log(f" Accumulated length: {len(accumulated)} chars") JsonMergeLogger._log(f" New Fragment length: {len(newFragment)} chars") # Log only summary (first 5 and last 5 lines) to avoid log spam accLines = accumulated.split('\n') fragLines = newFragment.split('\n') JsonMergeLogger._log(f" Accumulated: {len(accLines)} lines (showing first 5 and last 5)") if len(accLines) > 10: for line in accLines[:5]: JsonMergeLogger._log(f" {line}") JsonMergeLogger._log(f" ... ({len(accLines) - 10} lines omitted) ...") for line in accLines[-5:]: JsonMergeLogger._log(f" {line}") else: for line in accLines: JsonMergeLogger._log(f" {line}") JsonMergeLogger._log(f" New Fragment: {len(fragLines)} lines (showing first 5 and last 5)") if len(fragLines) > 10: for line in fragLines[:5]: JsonMergeLogger._log(f" {line}") JsonMergeLogger._log(f" ... ({len(fragLines) - 10} lines omitted) ...") for line in fragLines[-5:]: JsonMergeLogger._log(f" {line}") else: for line in fragLines: JsonMergeLogger._log(f" {line}") JsonMergeLogger._log("") return mergeId @staticmethod def logStep(stepName: str, description: str, result: Any = None, error: Optional[str] = None): """Log a step with its result.""" JsonMergeLogger._log(f"STEP: {stepName}") JsonMergeLogger._log(f" Description: {description}") if error: JsonMergeLogger._log(f" ❌ ERROR: {error}") elif result is not None: if isinstance(result, str): resultLines = result.split('\n') JsonMergeLogger._log(f" ✅ Result (string, {len(result)} chars, {len(resultLines)} lines)") if len(resultLines) > 10: JsonMergeLogger._log(f" (showing first 5 and last 5 lines)") for line in resultLines[:5]: JsonMergeLogger._log(f" {line}") JsonMergeLogger._log(f" ... 
({len(resultLines) - 10} lines omitted) ...") for line in resultLines[-5:]: JsonMergeLogger._log(f" {line}") else: for line in resultLines: JsonMergeLogger._log(f" {line}") elif isinstance(result, dict): keys = list(result.keys()) JsonMergeLogger._log(f" ✅ Result (dict): keys={keys}, size={len(str(result))} chars") # Log full structure with JSON formatting - NO TRUNCATION try: jsonStr = json.dumps(result, indent=2, ensure_ascii=False) JsonMergeLogger._log(f" Full data (COMPLETE, {len(jsonStr)} chars):") JsonMergeLogger._log(" " + "="*76) for line in jsonStr.split('\n'): JsonMergeLogger._log(f" {line}") JsonMergeLogger._log(" " + "="*76) except Exception as e: JsonMergeLogger._log(f" Could not serialize: {e}") strRepr = str(result) strLines = strRepr.split('\n') JsonMergeLogger._log(f" String representation ({len(strRepr)} chars, {len(strLines)} lines)") if len(strLines) > 10: JsonMergeLogger._log(f" (showing first 5 and last 5 lines)") for line in strLines[:5]: JsonMergeLogger._log(f" {line}") JsonMergeLogger._log(f" ... 
({len(strLines) - 10} lines omitted) ...") for line in strLines[-5:]: JsonMergeLogger._log(f" {line}") else: for line in strLines: JsonMergeLogger._log(f" {line}") # Log structure details if "elements" in result: elemCount = len(result["elements"]) if isinstance(result["elements"], list) else 0 JsonMergeLogger._log(f" - elements: {elemCount} items") if isinstance(result["elements"], list) and elemCount > 0: JsonMergeLogger._log(f" First element type: {result['elements'][0].get('type', 'unknown') if isinstance(result['elements'][0], dict) else 'not a dict'}") if "documents" in result: docCount = len(result["documents"]) if isinstance(result["documents"], list) else 0 JsonMergeLogger._log(f" - documents: {docCount} items") elif isinstance(result, list): JsonMergeLogger._log(f" ✅ Result (list): {len(result)} items (COMPLETE)") if len(result) > 0: JsonMergeLogger._log(f" First item type: {type(result[0]).__name__}") try: jsonStr = json.dumps(result, indent=2, ensure_ascii=False) # ALL items JsonMergeLogger._log(f" All items (COMPLETE, {len(jsonStr)} chars):") JsonMergeLogger._log(" " + "="*76) for line in jsonStr.split('\n'): JsonMergeLogger._log(f" {line}") JsonMergeLogger._log(" " + "="*76) except Exception: strRepr = str(result) strLines = strRepr.split('\n') JsonMergeLogger._log(f" String representation ({len(strRepr)} chars, {len(strLines)} lines)") if len(strLines) > 10: JsonMergeLogger._log(f" (showing first 5 and last 5 lines)") for line in strLines[:5]: JsonMergeLogger._log(f" {line}") JsonMergeLogger._log(f" ... 
({len(strLines) - 10} lines omitted) ...") for line in strLines[-5:]: JsonMergeLogger._log(f" {line}") else: for line in strLines: JsonMergeLogger._log(f" {line}") else: JsonMergeLogger._log(f" ✅ Result: {type(result).__name__} = {str(result)[:200]}") else: JsonMergeLogger._log(f" ⏳ In progress...") JsonMergeLogger._log("") @staticmethod def logExtraction(strategy: str, success: bool, data: Any = None, error: Optional[str] = None): """Log extraction strategy result.""" status = "✅ SUCCESS" if success else "❌ FAILED" JsonMergeLogger._log(f" Extraction Strategy: {strategy} - {status}") if error: JsonMergeLogger._log(f" Error: {error}") elif data is not None: if isinstance(data, dict): keys = list(data.keys()) JsonMergeLogger._log(f" Extracted keys: {keys}") # Log full extracted data - NO TRUNCATION try: jsonStr = json.dumps(data, indent=2, ensure_ascii=False) JsonMergeLogger._log(f" Extracted data (COMPLETE, {len(jsonStr)} chars):") JsonMergeLogger._log(" " + "="*76) for line in jsonStr.split('\n'): JsonMergeLogger._log(f" {line}") JsonMergeLogger._log(" " + "="*76) except Exception as e: JsonMergeLogger._log(f" Could not serialize extracted data: {e}") strRepr = str(data) strLines = strRepr.split('\n') JsonMergeLogger._log(f" String representation ({len(strRepr)} chars, {len(strLines)} lines)") if len(strLines) > 10: JsonMergeLogger._log(f" (showing first 5 and last 5 lines)") for line in strLines[:5]: JsonMergeLogger._log(f" {line}") JsonMergeLogger._log(f" ... 
({len(strLines) - 10} lines omitted) ...") for line in strLines[-5:]: JsonMergeLogger._log(f" {line}") else: for line in strLines: JsonMergeLogger._log(f" {line}") elif isinstance(data, list): JsonMergeLogger._log(f" Extracted {len(data)} items (COMPLETE)") if len(data) > 0: try: jsonStr = json.dumps(data, indent=2, ensure_ascii=False) # ALL items JsonMergeLogger._log(f" All items (COMPLETE, {len(jsonStr)} chars):") JsonMergeLogger._log(" " + "="*76) for line in jsonStr.split('\n'): JsonMergeLogger._log(f" {line}") JsonMergeLogger._log(" " + "="*76) except Exception as e: JsonMergeLogger._log(f" Could not serialize list: {e}") strRepr = str(data) strLines = strRepr.split('\n') JsonMergeLogger._log(f" String representation ({len(strRepr)} chars, {len(strLines)} lines)") if len(strLines) > 10: JsonMergeLogger._log(f" (showing first 5 and last 5 lines)") for line in strLines[:5]: JsonMergeLogger._log(f" {line}") JsonMergeLogger._log(f" ... ({len(strLines) - 10} lines omitted) ...") for line in strLines[-5:]: JsonMergeLogger._log(f" {line}") else: for line in strLines: JsonMergeLogger._log(f" {line}") @staticmethod def logOverlap(overlapType: str, overlapLen: int, accSuffix: Any = None, fragPrefix: Any = None): """Log overlap detection result.""" JsonMergeLogger._log(f" Overlap Detection ({overlapType}):") JsonMergeLogger._log(f" Overlap length: {overlapLen}") if overlapLen > 0: JsonMergeLogger._log(f" ✅ Found overlap of {overlapLen} chars") if accSuffix is not None: if isinstance(accSuffix, str): JsonMergeLogger._log(f" Accumulated suffix (COMPLETE, {len(accSuffix)} chars):") JsonMergeLogger._log(" " + "="*76) for line in accSuffix.split('\n'): JsonMergeLogger._log(f" {line}") JsonMergeLogger._log(" " + "="*76) else: # For lists/arrays, only log summary to avoid log flooding if isinstance(accSuffix, list): JsonMergeLogger._log(f" Accumulated suffix: list with {len(accSuffix)} items") else: JsonMergeLogger._log(f" Accumulated suffix: {type(accSuffix).__name__}") if 
fragPrefix is not None: if isinstance(fragPrefix, str): prefixLines = fragPrefix.split('\n') JsonMergeLogger._log(f" Fragment prefix ({len(fragPrefix)} chars, {len(prefixLines)} lines)") if len(prefixLines) > 10: JsonMergeLogger._log(f" (showing first 5 and last 5 lines)") for line in prefixLines[:5]: JsonMergeLogger._log(f" {line}") JsonMergeLogger._log(f" ... ({len(prefixLines) - 10} lines omitted) ...") for line in prefixLines[-5:]: JsonMergeLogger._log(f" {line}") else: for line in prefixLines: JsonMergeLogger._log(f" {line}") else: # For lists/arrays, only log summary to avoid log flooding if isinstance(fragPrefix, list): JsonMergeLogger._log(f" Fragment prefix: list with {len(fragPrefix)} items") else: JsonMergeLogger._log(f" Fragment prefix: {type(fragPrefix).__name__}") else: JsonMergeLogger._log(f" ⚠️ No overlap detected - appending all") @staticmethod def logValidation(validationType: str, success: bool, error: Optional[str] = None): """Log validation result.""" status = "✅ VALID" if success else "❌ INVALID" JsonMergeLogger._log(f" Validation ({validationType}): {status}") if error: JsonMergeLogger._log(f" Error: {error}") @staticmethod def finishMerge(mergeId: str, finalResult: str, success: bool): """Finish merge operation and write log file.""" JsonMergeLogger._log("") JsonMergeLogger._log(f"{'='*80}") JsonMergeLogger._log(f"MERGE RESULT: {'✅ SUCCESS' if success else '❌ FAILED'}") JsonMergeLogger._log(f"{'='*80}") JsonMergeLogger._log(f"Final result length: {len(finalResult)} chars") JsonMergeLogger._log("Final result (COMPLETE):") JsonMergeLogger._log("="*80) for line in finalResult.split('\n'): JsonMergeLogger._log(line) JsonMergeLogger._log("="*80) JsonMergeLogger._log("") # Write log content to buffer (will be written at end of test run) logContent = "\n".join(JsonMergeLogger._logBuffer) # If we have a current log file, append to it if JsonMergeLogger._currentLogFile: try: currentFileDir = os.path.dirname(os.path.abspath(__file__)) logFilePath = 
os.path.join(currentFileDir, JsonMergeLogger._currentLogFile) mode = 'a' if JsonMergeLogger._appendMode else 'w' with open(logFilePath, mode, encoding='utf-8') as f: f.write(logContent) f.write("\n\n") # Add separator between merges JsonMergeLogger._appendMode = True # Next writes will append logger.debug(f"JSON merge log appended to: {logFilePath}") except Exception as e: logger.error(f"Failed to write merge log file: {e}") else: # No log file set - write individual file (fallback) currentFileDir = os.path.dirname(os.path.abspath(__file__)) logDir = currentFileDir os.makedirs(logDir, exist_ok=True) logFilePath = os.path.join(logDir, f"{mergeId}.txt") try: with open(logFilePath, 'w', encoding='utf-8') as f: f.write(logContent) logger.info(f"JSON merge log written to: {logFilePath}") except Exception as e: logger.error(f"Failed to write merge log file: {e}") # Clear buffer for next merge JsonMergeLogger._logBuffer = [] @staticmethod def _log(message: str): """Internal log method.""" JsonMergeLogger._logBuffer.append(message) logger.debug(message) class JsonDataExtractor: """Extracts data from JSON fragments, even if incomplete.""" @staticmethod def extract(jsonString: str, mergeId: Optional[str] = None, removeFromEnd: bool = True) -> Dict[str, Any]: """ Extract complete data from JSON fragment. For merging: We know exactly where to clean: - accumulated: remove incomplete parts at the END - newFragment: remove incomplete parts at the BEGINNING Simple approach: Remove incomplete parts at specified position, then parse. 
""" if mergeId: position = "END" if removeFromEnd else "BEGINNING" JsonMergeLogger.logStep("EXTRACTION", f"Extracting data from JSON fragment ({len(jsonString)} chars) - cleaning from {position}") if not jsonString or not jsonString.strip(): if mergeId: JsonMergeLogger.logExtraction("Empty input", False, error="Input is empty") return {} normalized = stripCodeFences(normalizeJsonText(jsonString)).strip() if not normalized: if mergeId: JsonMergeLogger.logExtraction("Normalization", False, error="Normalized string is empty") return {} # Try to parse as complete JSON first parsed, parseErr, _ = tryParseJson(normalized) if parseErr is None and parsed is not None: if isinstance(parsed, dict): finalResult = parsed elif isinstance(parsed, list): finalResult = {"elements": parsed} else: finalResult = {"elements": [parsed]} if parsed else {} if mergeId: JsonMergeLogger.logExtraction("Direct parsing", True, finalResult) JsonMergeLogger.logStep("EXTRACTION", "Direct parsing successful", finalResult) return finalResult if finalResult else {} # Remove incomplete parts from specified position if removeFromEnd: cleaned = JsonDataExtractor._removeIncompleteFromEnd(normalized) else: cleaned = JsonDataExtractor._removeIncompleteFromBeginning(normalized) if cleaned: # Close structures and try to parse closed = closeJsonStructures(cleaned) parsed, parseErr2, _ = tryParseJson(closed) if parseErr2 is None and parsed is not None: if isinstance(parsed, dict): finalResult = parsed elif isinstance(parsed, list): finalResult = {"elements": parsed} else: finalResult = {"elements": [parsed]} if parsed else {} if mergeId: JsonMergeLogger.logExtraction("Remove incomplete + close", True, finalResult) JsonMergeLogger.logStep("EXTRACTION", "Remove incomplete + close successful", finalResult) return finalResult if finalResult else {} # Return empty dict if nothing worked if mergeId: JsonMergeLogger.logStep("EXTRACTION", "No data extracted", {}, error="All strategies failed") return {} @staticmethod 
def _removeIncompleteFromEnd(jsonString: str) -> str: """ Remove incomplete parts from the END of JSON string. Goes through structure level by level, keeps complete elements, removes incomplete ones at the end. """ # Find first '{' or '[' to start startIdx = -1 for i, char in enumerate(jsonString): if char in '{[': startIdx = i break if startIdx == -1: return "" # Remove incomplete parts from end recursively cleaned = JsonDataExtractor._cleanJsonFromEnd(jsonString[startIdx:]) return cleaned @staticmethod def _removeIncompleteFromBeginning(jsonString: str) -> str: """ Remove incomplete parts from the BEGINNING of JSON string. Finds where valid JSON starts and removes everything before it. """ # Find first '{' or '[' to start startIdx = -1 for i, char in enumerate(jsonString): if char in '{[': startIdx = i break if startIdx == -1: return "" # Return from start position - beginning cleanup is just finding the start return jsonString[startIdx:] @staticmethod def _cleanJsonFromEnd(jsonStr: str) -> str: """ Recursively clean JSON from the END: keep complete elements, remove incomplete ones at the end. Goes through structure level by level. 
""" # Try to parse as-is first try: parsed = json.loads(jsonStr) return jsonStr except Exception: pass # If dict: go through each key-value pair, remove incomplete ones at the end if jsonStr.strip().startswith('{'): return JsonDataExtractor._cleanDictFromEnd(jsonStr) # If array: go through each element, remove incomplete ones at the end if jsonStr.strip().startswith('['): return JsonDataExtractor._cleanArrayFromEnd(jsonStr) return "" @staticmethod def _cleanDictFromEnd(jsonStr: str) -> str: """Clean dict from END: keep complete key-value pairs, remove incomplete ones at the end.""" if not jsonStr.strip().startswith('{'): return "" result = ['{'] i = 1 # Skip opening '{' first = True while i < len(jsonStr): # Skip whitespace while i < len(jsonStr) and jsonStr[i] in ' \n\r\t': i += 1 if i >= len(jsonStr): break # Check if we hit closing brace if jsonStr[i] == '}': break # Skip comma if jsonStr[i] == ',': i += 1 continue # Try to extract key-value pair keyStart = i # Find key (string) if jsonStr[i] == '"': i += 1 while i < len(jsonStr) and jsonStr[i] != '"': if jsonStr[i] == '\\': i += 2 else: i += 1 if i < len(jsonStr): i += 1 # Skip closing quote else: # Invalid key - stop here (incomplete at end) break # Skip whitespace and colon while i < len(jsonStr) and jsonStr[i] in ' \n\r\t:': i += 1 if i >= len(jsonStr): break # Try to extract value valueStart = i valueEnd = JsonDataExtractor._findCompleteValue(jsonStr, i) if valueEnd > valueStart: # Try to parse this key-value pair pairStr = jsonStr[keyStart:valueEnd] try: # Test if it's valid JSON testStr = '{' + pairStr + '}' json.loads(testStr) # Valid pair - add it if not first: result.append(',') result.append(pairStr) first = False i = valueEnd except Exception: # Invalid pair - stop here (incomplete at end) break else: # Incomplete value - stop here (incomplete at end) break result.append('}') return ''.join(result) @staticmethod def _cleanArrayFromEnd(jsonStr: str) -> str: """Clean array from END: keep complete 
elements, remove incomplete ones at the end.""" if not jsonStr.strip().startswith('['): return "" result = ['['] i = 1 # Skip opening '[' first = True while i < len(jsonStr): # Skip whitespace while i < len(jsonStr) and jsonStr[i] in ' \n\r\t': i += 1 if i >= len(jsonStr): break # Check if we hit closing bracket if jsonStr[i] == ']': break # Skip comma if jsonStr[i] == ',': i += 1 continue # Try to extract element elemStart = i elemEnd = JsonDataExtractor._findCompleteValue(jsonStr, i) if elemEnd > elemStart: # Try to parse this element elemStr = jsonStr[elemStart:elemEnd] try: # Test if it's valid JSON json.loads(elemStr) # Valid element - add it if not first: result.append(',') result.append(elemStr) first = False i = elemEnd except Exception: # Invalid element - stop here (incomplete at end) break else: # Incomplete element - stop here (incomplete at end) break result.append(']') return ''.join(result) @staticmethod def _findCompleteValue(jsonStr: str, start: int) -> int: """Find the end of a complete JSON value starting at start position.""" if start >= len(jsonStr): return start i = start # Skip whitespace while i < len(jsonStr) and jsonStr[i] in ' \n\r\t': i += 1 if i >= len(jsonStr): return start char = jsonStr[i] # String if char == '"': i += 1 while i < len(jsonStr): if jsonStr[i] == '\\': i += 2 elif jsonStr[i] == '"': return i + 1 else: i += 1 return start # Incomplete string # Number, boolean, null if char in '-0123456789tfn': while i < len(jsonStr) and jsonStr[i] not in ',}]': i += 1 return i # Object if char == '{': braceCount = 1 i += 1 while i < len(jsonStr) and braceCount > 0: if jsonStr[i] == '\\': i += 2 elif jsonStr[i] == '"': # Skip string i += 1 while i < len(jsonStr): if jsonStr[i] == '\\': i += 2 elif jsonStr[i] == '"': i += 1 break else: i += 1 elif jsonStr[i] == '{': braceCount += 1 i += 1 elif jsonStr[i] == '}': braceCount -= 1 i += 1 else: i += 1 if braceCount == 0: return i return start # Incomplete object # Array if char == '[': 
bracketCount = 1 i += 1 while i < len(jsonStr) and bracketCount > 0: if jsonStr[i] == '\\': i += 2 elif jsonStr[i] == '"': # Skip string i += 1 while i < len(jsonStr): if jsonStr[i] == '\\': i += 2 elif jsonStr[i] == '"': i += 1 break else: i += 1 elif jsonStr[i] == '[': bracketCount += 1 i += 1 elif jsonStr[i] == ']': bracketCount -= 1 i += 1 else: i += 1 if bracketCount == 0: return i return start # Incomplete array return start @staticmethod def _extractAllCompleteObjects(jsonString: str) -> List[Dict[str, Any]]: """ Extract ALL complete objects from JSON string using balanced brace matching. Ignores incomplete objects at the end. Core principle: Every fragment can be cut anywhere - extract only complete objects. """ foundObjs = [] braceCount = 0 startPos = -1 for i, char in enumerate(jsonString): if char == '{': if braceCount == 0: startPos = i braceCount += 1 elif char == '}': braceCount -= 1 if braceCount == 0 and startPos >= 0: # Found a complete object objStr = jsonString[startPos:i+1] try: obj = json.loads(objStr) if isinstance(obj, dict) and obj: foundObjs.append(obj) except Exception: # Not valid JSON - skip it pass startPos = -1 elif braceCount < 0: # Unbalanced - reset braceCount = 0 startPos = -1 # If we end with an incomplete object (startPos >= 0 and braceCount > 0), ignore it # It will be in the next fragment return foundObjs @staticmethod def _extractElements(jsonString: str) -> List[Dict[str, Any]]: """Extract elements array from JSON string - extracts ALL complete elements.""" elements = [] # Pattern 1: Look for "elements": [...] 
        # (including incomplete at end)
        elementsPattern = r'"elements"\s*:\s*\[(.*)'
        match = re.search(elementsPattern, jsonString, re.DOTALL)
        if match:
            elementsContent = match.group(1)
            # Extract ALL complete element objects using balanced brace matching
            braceCount = 0
            startPos = -1
            for i, char in enumerate(elementsContent):
                if char == '{':
                    if braceCount == 0:
                        startPos = i
                    braceCount += 1
                elif char == '}':
                    braceCount -= 1
                    if braceCount == 0 and startPos >= 0:
                        elementStr = elementsContent[startPos:i+1]
                        try:
                            element = json.loads(elementStr)
                            if isinstance(element, dict):
                                elements.append(element)
                        except Exception:
                            # Try to extract table rows from incomplete element
                            rows = JsonDataExtractor._extractTableRowsFromElement(elementStr)
                            if rows:
                                elements.append({
                                    "type": "table",
                                    "content": {
                                        "rows": rows
                                    }
                                })
                        startPos = -1
                    elif braceCount < 0:
                        break  # Unbalanced - stop
        # Pattern 2: Look for table structure directly (even if incomplete)
        if not elements:
            # Look for "type": "table" pattern
            # NOTE(review): non-greedy (.*?) with (?:\]|$) stops at the FIRST
            # ']' so only part of a nested rows array may be captured - the
            # row extractor below tolerates that.
            tablePattern = r'"type"\s*:\s*"table"[^}]*"rows"\s*:\s*\[(.*?)(?:\]|$)'
            tableMatch = re.search(tablePattern, jsonString, re.DOTALL)
            if tableMatch:
                rowsContent = tableMatch.group(1)
                rows = JsonDataExtractor._extractRowsFromContent(rowsContent)
                if rows:
                    elements.append({
                        "type": "table",
                        "content": {
                            "rows": rows
                        }
                    })
        # Pattern 3: Look for table rows directly (without structure)
        if not elements:
            rows = JsonDataExtractor._extractTableRows(jsonString)
            if rows:
                elements.append({
                    "type": "table",
                    "content": {
                        "rows": rows
                    }
                })
        return elements

    @staticmethod
    def _extractTableRowsFromElement(elementStr: str) -> List[List[str]]:
        """Extract table rows from incomplete element string."""
        # Look for rows array in element
        rowsPattern = r'"rows"\s*:\s*\[(.*?)(?:\]|$)'
        match = re.search(rowsPattern, elementStr, re.DOTALL)
        if match:
            return JsonDataExtractor._extractRowsFromContent(match.group(1))
        return []

    @staticmethod
    def _extractRowsFromContent(rowsContent: str) -> List[List[str]]:
        """Extract rows from rows content
        string."""
        rows = []
        # Extract all array patterns: ["value1", "value2"]
        # Use non-greedy matching but ensure we get complete arrays
        arrayPattern = r'\[(.*?)\]'
        arrayMatches = re.findall(arrayPattern, rowsContent)
        for arrayContent in arrayMatches:
            # Extract cells - handle both quoted strings and numbers
            # First try to find quoted strings
            cellPattern = r'"([^"]*)"'
            cells = re.findall(cellPattern, arrayContent)
            # If no quoted strings, try numbers or other values
            if not cells:
                # Try to find any values (numbers, booleans, etc.)
                valuePattern = r'(-?\d+\.?\d*|true|false|null)'
                cells = re.findall(valuePattern, arrayContent)
            # Only add rows with at least 1 cell (allow single-column tables)
            if len(cells) >= 1:
                rows.append(cells)
        return rows

    @staticmethod
    def _extractTableRows(jsonString: str) -> List[List[str]]:
        """Extract table rows from JSON string using multiple strategies."""
        rows = []
        # Strategy 1: Look for "rows": [[...], [...]]
        rowsPattern = r'"rows"\s*:\s*\[(.*?)(?:\]|$)'
        match = re.search(rowsPattern, jsonString, re.DOTALL)
        if match:
            rowsContent = match.group(1)
            rows = JsonDataExtractor._extractRowsFromContent(rowsContent)
            if rows:
                return rows
        # Strategy 2: Look for standalone array patterns ["value1", "value2"]
        # Pattern for complete arrays with 2 columns
        completeArrayPattern = r'\["([^"]*)",\s*"([^"]*)"\]'
        matches = re.findall(completeArrayPattern, jsonString)
        if len(matches) >= 2:  # Need at least 2 rows to be confident
            return [[m[0], m[1]] for m in matches]
        # Strategy 3: Extract any array patterns (more lenient)
        # Find all [ ...
        # ] patterns that contain quoted strings
        allArrays = re.findall(r'\[([^\]]*)\]', jsonString)
        for arrayContent in allArrays:
            # Extract quoted strings
            cells = re.findall(r'"([^"]*)"', arrayContent)
            if len(cells) >= 2:  # At least 2 columns
                rows.append(cells)
        # Only return if we have multiple rows (likely a table)
        if len(rows) >= 2:
            return rows
        return []

    @staticmethod
    def _extractDocuments(jsonString: str) -> List[Dict[str, Any]]:
        """
        Extract documents structure from JSON string - extracts ALL complete documents/chapters/sections.
        Ignores incomplete ones at the end.

        Core principle: Fragment can be cut anywhere - extract only complete objects.
        """
        documents = []
        # Pattern 1: Look for "documents": [...] structure (including incomplete at end)
        documentsPattern = r'"documents"\s*:\s*\[(.*)'
        match = re.search(documentsPattern, jsonString, re.DOTALL)
        if match:
            documentsContent = match.group(1)
            # Extract ALL complete document objects using balanced brace matching
            braceCount = 0
            startPos = -1
            for i, char in enumerate(documentsContent):
                if char == '{':
                    if braceCount == 0:
                        startPos = i
                    braceCount += 1
                elif char == '}':
                    braceCount -= 1
                    if braceCount == 0 and startPos >= 0:
                        # Found a complete document object
                        docStr = documentsContent[startPos:i+1]
                        try:
                            doc = json.loads(docStr)
                            if isinstance(doc, dict):
                                # Extract chapters/sections from document
                                chapters = JsonDataExtractor._extractChaptersFromDocument(docStr)
                                sections = JsonDataExtractor._extractSectionsFromDocument(docStr)
                                if chapters:
                                    doc["chapters"] = chapters
                                if sections:
                                    doc["sections"] = sections
                                if doc:
                                    documents.append(doc)
                        except Exception:
                            # Not valid JSON - try to extract chapters/sections directly
                            chapters = JsonDataExtractor._extractChaptersFromDocument(docStr)
                            sections = JsonDataExtractor._extractSectionsFromDocument(docStr)
                            if chapters or sections:
                                doc = {}
                                if chapters:
                                    doc["chapters"] = chapters
                                if sections:
                                    doc["sections"] = sections
                                if doc:
                                    documents.append(doc)
                        startPos = -1
                    elif braceCount < 0:
                        break
        # If we end
        # with an incomplete document (startPos >= 0 and braceCount > 0), ignore it
        # It will be in the next fragment
        if documents:
            return documents
        # Pattern 2: Look for "chapters": [...] pattern directly (fragment might start mid-document)
        chapters = JsonDataExtractor._extractChaptersFromString(jsonString)
        if chapters:
            documents.append({"chapters": chapters})
        # Pattern 3: Look for "sections": [...] pattern directly
        sections = JsonDataExtractor._extractSectionsFromString(jsonString)
        if sections:
            documents.append({"sections": sections})
        return documents

    @staticmethod
    def _extractChaptersFromDocument(docStr: str) -> List[Dict[str, Any]]:
        """Extract chapters array from document string."""
        # Thin alias: documents embed chapters in the same textual form.
        return JsonDataExtractor._extractChaptersFromString(docStr)

    @staticmethod
    def _extractChaptersFromString(jsonString: str) -> List[Dict[str, Any]]:
        """
        Extract chapters array from JSON string - extracts ALL complete chapters.
        Ignores incomplete chapters at the end.

        Core principle: Fragment can be cut anywhere - extract only complete objects.
        """
        chapters = []
        # Look for "chapters": [...]
        # pattern (including incomplete at end)
        chaptersPattern = r'"chapters"\s*:\s*\[(.*)'
        match = re.search(chaptersPattern, jsonString, re.DOTALL)
        if match:
            chaptersContent = match.group(1)
            # Extract ALL complete chapter objects using balanced brace matching
            braceCount = 0
            startPos = -1
            for i, char in enumerate(chaptersContent):
                if char == '{':
                    if braceCount == 0:
                        startPos = i
                    braceCount += 1
                elif char == '}':
                    braceCount -= 1
                    if braceCount == 0 and startPos >= 0:
                        # Found a complete chapter object
                        chapterStr = chaptersContent[startPos:i+1]
                        try:
                            chapter = json.loads(chapterStr)
                            if isinstance(chapter, dict):
                                chapters.append(chapter)
                        except Exception:
                            # Not valid JSON - skip it (incomplete chapter)
                            pass
                        startPos = -1
                    elif braceCount < 0:
                        # Unbalanced - stop here
                        break
        # If we end with an incomplete chapter (startPos >= 0 and braceCount > 0), ignore it
        # It will be in the next fragment
        # Also try to extract chapters that might be standalone (fragment starts mid-array)
        # Look for complete chapter objects anywhere in the string
        if not chapters:
            # Try to find complete chapter objects using balanced brace matching
            allObjs = JsonDataExtractor._extractAllCompleteObjects(jsonString)
            # Filter for objects that look like chapters (have id and title)
            for obj in allObjs:
                if isinstance(obj, dict) and "id" in obj and "title" in obj:
                    chapters.append(obj)
        return chapters

    @staticmethod
    def _extractSectionsFromDocument(docStr: str) -> List[Dict[str, Any]]:
        """Extract sections array from document string."""
        # Thin alias: documents embed sections in the same textual form.
        return JsonDataExtractor._extractSectionsFromString(docStr)

    @staticmethod
    def _extractSectionsFromString(jsonString: str) -> List[Dict[str, Any]]:
        """Extract sections array from JSON string, even if incomplete."""
        sections = []
        # Look for "sections": [...]
        sectionsPattern = r'"sections"\s*:\s*\[(.*?)(?:\]|$)'
        match = re.search(sectionsPattern, jsonString, re.DOTALL)
        if match:
            sectionsContent = match.group(1)
            # Extract section objects using balanced brace matching
            braceCount = 0
            startPos = -1
            for i, char in enumerate(sectionsContent):
                if char == '{':
                    if braceCount == 0:
                        startPos = i
                    braceCount += 1
                elif char == '}':
                    braceCount -= 1
                    if braceCount == 0 and startPos >= 0:
                        sectionStr = sectionsContent[startPos:i+1]
                        try:
                            section = json.loads(sectionStr)
                            if isinstance(section, dict):
                                sections.append(section)
                        except Exception:
                            # Incomplete section - try to extract what we can
                            idMatch = re.search(r'"id"\s*:\s*"([^"]*)"', sectionStr)
                            contentTypeMatch = re.search(r'"content_type"\s*:\s*"([^"]*)"', sectionStr)
                            if idMatch or contentTypeMatch:
                                section = {}
                                if idMatch:
                                    section["id"] = idMatch.group(1)
                                if contentTypeMatch:
                                    section["content_type"] = contentTypeMatch.group(1)
                                if section:
                                    sections.append(section)
                        startPos = -1
        return sections

    @staticmethod
    def _extractFiles(jsonString: str) -> List[Dict[str, Any]]:
        """Extract files array from JSON string, even if incomplete."""
        files = []
        # Look for "files": [...]
        filesPattern = r'"files"\s*:\s*\[(.*?)(?:\]|$)'
        match = re.search(filesPattern, jsonString, re.DOTALL)
        if match:
            filesContent = match.group(1)
            # Extract file objects using balanced brace matching
            braceCount = 0
            startPos = -1
            for i, char in enumerate(filesContent):
                if char == '{':
                    if braceCount == 0:
                        startPos = i
                    braceCount += 1
                elif char == '}':
                    braceCount -= 1
                    if braceCount == 0 and startPos >= 0:
                        fileStr = filesContent[startPos:i+1]
                        try:
                            fileObj = json.loads(fileStr)
                            if isinstance(fileObj, dict):
                                files.append(fileObj)
                        except Exception:
                            # Incomplete file - try to extract what we can
                            idMatch = re.search(r'"id"\s*:\s*"([^"]*)"', fileStr)
                            filenameMatch = re.search(r'"filename"\s*:\s*"([^"]*)"', fileStr)
                            if idMatch or filenameMatch:
                                fileObj = {}
                                if idMatch:
                                    fileObj["id"] = idMatch.group(1)
                                if filenameMatch:
                                    fileObj["filename"] = filenameMatch.group(1)
                                if fileObj:
                                    files.append(fileObj)
                        startPos = -1
        return files

    @staticmethod
    def _extractImages(jsonString: str) -> List[Dict[str, Any]]:
        """Extract images array from JSON string, even if incomplete."""
        images = []
        # Look for "images": [...]
imagesPattern = r'"images"\s*:\s*\[(.*?)(?:\]|$)' match = re.search(imagesPattern, jsonString, re.DOTALL) if match: imagesContent = match.group(1) # Extract image objects using balanced brace matching braceCount = 0 startPos = -1 for i, char in enumerate(imagesContent): if char == '{': if braceCount == 0: startPos = i braceCount += 1 elif char == '}': braceCount -= 1 if braceCount == 0 and startPos >= 0: imageStr = imagesContent[startPos:i+1] try: image = json.loads(imageStr) if isinstance(image, dict): images.append(image) except Exception: # Incomplete image - try to extract what we can idMatch = re.search(r'"id"\s*:\s*"([^"]*)"', imageStr) urlMatch = re.search(r'"url"\s*:\s*"([^"]*)"', imageStr) if idMatch or urlMatch: image = {} if idMatch: image["id"] = idMatch.group(1) if urlMatch: image["url"] = urlMatch.group(1) if image: images.append(image) startPos = -1 return images class JsonStructureDetector: """Detects JSON structure type from extracted data.""" @staticmethod def detect(data: Dict[str, Any], mergeId: Optional[str] = None) -> str: """ Detect structure type from data - GENERIC approach. Only checks for top-level keys, no content analysis. Returns: Structure type: "elements", "documents", "files", "images", or "unknown" """ if "elements" in data: structureType = "elements" elif "documents" in data: structureType = "documents" elif "files" in data: structureType = "files" elif "images" in data: structureType = "images" else: # Unknown structure - will be handled generically structureType = "unknown" if mergeId: JsonMergeLogger.logStep("DETECTION", f"Detected structure type: {structureType}", structureType) return structureType class JsonDataMerger: """Merges JSON data intelligently with overlap detection.""" @staticmethod def merge( accumulated: Dict[str, Any], newFragment: Dict[str, Any], structureType: str, mergeId: Optional[str] = None ) -> Dict[str, Any]: """ Merge two JSON data structures. 
Args: accumulated: Previously accumulated data newFragment: New fragment data structureType: Detected structure type mergeId: Optional merge ID for logging Returns: Merged data structure """ if mergeId: JsonMergeLogger.logStep("MERGING", f"Merging {structureType} structures", { "acc_keys": list(accumulated.keys()) if accumulated else [], "frag_keys": list(newFragment.keys()) if newFragment else [] }) if not accumulated: if mergeId: JsonMergeLogger.logStep("MERGING", "No accumulated data, returning fragment", newFragment) return newFragment if newFragment else {} if not newFragment: if mergeId: JsonMergeLogger.logStep("MERGING", "No fragment data, returning accumulated", accumulated) return accumulated # Merge based on structure type if structureType == "elements": result = JsonDataMerger._mergeElements(accumulated, newFragment) elif structureType == "documents": result = JsonDataMerger._mergeDocuments(accumulated, newFragment) elif structureType == "files": result = JsonDataMerger._mergeFiles(accumulated, newFragment) elif structureType == "images": result = JsonDataMerger._mergeImages(accumulated, newFragment) else: # Unknown structure - try to merge generically result = JsonDataMerger._mergeGeneric(accumulated, newFragment) if mergeId: JsonMergeLogger.logStep("MERGING", f"Merged {structureType} structures", result) return result @staticmethod def _mergeElements(accumulated: Dict[str, Any], newFragment: Dict[str, Any]) -> Dict[str, Any]: """Merge elements structures.""" accElements = accumulated.get("elements", []) fragElements = newFragment.get("elements", []) if not accElements: return {"elements": fragElements} if fragElements else accumulated if not fragElements: return {"elements": accElements} # Merge elements with overlap detection mergedElements = JsonDataMerger._mergeElementList(accElements, fragElements) return {"elements": mergedElements} @staticmethod def _mergeElementList(accElements: List[Dict[str, Any]], fragElements: List[Dict[str, Any]]) -> 
List[Dict[str, Any]]: """Merge two element lists with overlap detection.""" if not accElements: return fragElements if not fragElements: return accElements # Special handling: if both have table elements, merge them intelligently accTables = [e for e in accElements if isinstance(e, dict) and e.get("type") == "table"] fragTables = [e for e in fragElements if isinstance(e, dict) and e.get("type") == "table"] if accTables and fragTables: # Merge table elements mergedTable = JsonDataMerger._mergeTableElements(accTables[0], fragTables[0]) if mergedTable: # Replace tables with merged table otherAccElements = [e for e in accElements if not (isinstance(e, dict) and e.get("type") == "table")] otherFragElements = [e for e in fragElements if not (isinstance(e, dict) and e.get("type") == "table")] return otherAccElements + [mergedTable] + otherFragElements # Find overlap by comparing elements overlapStart = JsonDataMerger._findOverlap(accElements, fragElements, None, "elements") if overlapStart > 0: # Found overlap - remove overlapping elements from fragment merged = accElements + fragElements[overlapStart:] return merged else: # No overlap - append all return accElements + fragElements @staticmethod def _mergeTableElements(accTable: Dict[str, Any], fragTable: Dict[str, Any]) -> Dict[str, Any]: """Merge two table elements by merging their rows.""" accRows = JsonDataMerger._getTableRows(accTable) fragRows = JsonDataMerger._getTableRows(fragTable) if not accRows: return fragTable if not fragRows: return accTable # Find overlap in rows overlapStart = JsonDataMerger._findOverlap(accRows, fragRows, None, "table_rows") # Merge rows mergedRows = accRows + fragRows[overlapStart:] if overlapStart > 0 else accRows + fragRows # Build merged table mergedTable = accTable.copy() content = mergedTable.get("content", {}) if not isinstance(content, dict): content = {} content["rows"] = mergedRows # Preserve headers if "headers" not in content: fragContent = fragTable.get("content", {}) if 
isinstance(fragContent, dict) and "headers" in fragContent: content["headers"] = fragContent["headers"] mergedTable["content"] = content return mergedTable @staticmethod def _findOverlap(accList: List[Any], fragList: List[Any], mergeId: Optional[str] = None, overlapType: str = "generic") -> int: """Find overlap between two lists. Returns index where overlap starts in fragList.""" if not accList or not fragList: if mergeId: JsonMergeLogger.logOverlap(overlapType, 0) return 0 # Try to find longest common suffix/prefix maxOverlap = min(len(accList), len(fragList)) for overlapLen in range(maxOverlap, 0, -1): accSuffix = accList[-overlapLen:] fragPrefix = fragList[:overlapLen] # Compare elements if JsonDataMerger._listsEqual(accSuffix, fragPrefix): if mergeId: JsonMergeLogger.logOverlap(overlapType, overlapLen, accSuffix, fragPrefix) return overlapLen if mergeId: JsonMergeLogger.logOverlap(overlapType, 0) return 0 @staticmethod def _listsEqual(list1: List[Any], list2: List[Any]) -> bool: """Check if two lists are equal (deep comparison for dicts).""" if len(list1) != len(list2): return False for i in range(len(list1)): if isinstance(list1[i], dict) and isinstance(list2[i], dict): # Compare dicts by comparing their content if not JsonDataMerger._dictsEqual(list1[i], list2[i]): return False elif list1[i] != list2[i]: return False return True @staticmethod def _dictsEqual(dict1: Dict[str, Any], dict2: Dict[str, Any]) -> bool: """Check if two dicts are equal (comparing key content).""" # For table elements, compare rows if dict1.get("type") == "table" and dict2.get("type") == "table": rows1 = JsonDataMerger._getTableRows(dict1) rows2 = JsonDataMerger._getTableRows(dict2) return rows1 == rows2 # For other elements, compare type and key content if dict1.get("type") != dict2.get("type"): return False # Compare content content1 = dict1.get("content", {}) content2 = dict2.get("content", {}) if isinstance(content1, dict) and isinstance(content2, dict): # Compare rows for tables 
if "rows" in content1 and "rows" in content2: return content1["rows"] == content2["rows"] # Compare items for lists if "items" in content1 and "items" in content2: return content1["items"] == content2["items"] return dict1 == dict2 @staticmethod def _getTableRows(element: Dict[str, Any]) -> List[List[str]]: """Extract table rows from element.""" content = element.get("content", {}) if isinstance(content, dict): return content.get("rows", []) return element.get("rows", []) @staticmethod def _mergeDocuments(accumulated: Dict[str, Any], newFragment: Dict[str, Any]) -> Dict[str, Any]: """Merge documents structures.""" accDocs = accumulated.get("documents", []) fragDocs = newFragment.get("documents", []) if not accDocs: return {"documents": fragDocs} if fragDocs else accumulated if not fragDocs: return {"documents": accDocs} # Merge documents (simplified - would need proper merging logic) mergedDocs = accDocs + fragDocs return {"documents": mergedDocs} @staticmethod def _mergeFiles(accumulated: Dict[str, Any], newFragment: Dict[str, Any]) -> Dict[str, Any]: """Merge files structures.""" accFiles = accumulated.get("files", []) fragFiles = newFragment.get("files", []) if not accFiles: return {"files": fragFiles} if fragFiles else accumulated if not fragFiles: return {"files": accFiles} mergedFiles = accFiles + fragFiles return {"files": mergedFiles} @staticmethod def _mergeImages(accumulated: Dict[str, Any], newFragment: Dict[str, Any]) -> Dict[str, Any]: """Merge images structures.""" accImages = accumulated.get("images", []) fragImages = newFragment.get("images", []) if not accImages: return {"images": fragImages} if fragImages else accumulated if not fragImages: return {"images": accImages} mergedImages = accImages + fragImages return {"images": mergedImages} @staticmethod def _mergeGeneric(accumulated: Dict[str, Any], newFragment: Dict[str, Any]) -> Dict[str, Any]: """Generic merge for unknown structures.""" # Try to merge by combining keys merged = accumulated.copy() 
for key, value in newFragment.items(): if key in merged: # Key exists - try to merge values if isinstance(merged[key], list) and isinstance(value, list): merged[key] = merged[key] + value elif isinstance(merged[key], dict) and isinstance(value, dict): merged[key] = JsonDataMerger._mergeGeneric(merged[key], value) else: merged[key] = value else: merged[key] = value return merged class JsonResultBuilder: """Builds final JSON result, ensuring it's always valid.""" @staticmethod def build(mergedData: Dict[str, Any], structureType: str, mergeId: Optional[str] = None) -> str: """ Build final JSON string from merged data. Args: mergedData: Merged data structure structureType: Detected structure type Returns: Valid JSON string (never empty) """ if not mergedData: # Return empty structure based on type if structureType == "elements": return json.dumps({"elements": []}, indent=2, ensure_ascii=False) elif structureType == "documents": return json.dumps({"documents": [{}]}, indent=2, ensure_ascii=False) elif structureType == "files": return json.dumps({"files": []}, indent=2, ensure_ascii=False) elif structureType == "images": return json.dumps({"images": []}, indent=2, ensure_ascii=False) else: return json.dumps({}, indent=2, ensure_ascii=False) # Ensure structure is correct - GENERIC approach if structureType == "elements" and "elements" not in mergedData: # Try to wrap data in elements structure if isinstance(mergedData, dict): # Generic: If it has any data, wrap it as an element if mergedData: mergedData = {"elements": [mergedData]} if mergeId: JsonMergeLogger.logStep("BUILDING", "Wrapping single object as element (generic)", mergedData) else: # Empty dict - return empty elements mergedData = {"elements": []} elif structureType == "documents" and "documents" not in mergedData: # Try to wrap data in documents structure if isinstance(mergedData, dict): if mergedData: # Generic: Wrap single object in documents structure # Try to detect if it should be chapters or sections by 
checking accumulated data # But for now, use generic approach: wrap in documents with a generic key mergedData = {"documents": [mergedData]} if mergeId: JsonMergeLogger.logStep("BUILDING", "Wrapping single object in documents structure (generic)", mergedData) else: mergedData = {"documents": [{}]} elif structureType == "files" and "files" not in mergedData: # Try to wrap data in files structure if isinstance(mergedData, dict): if mergedData: mergedData = {"files": [mergedData]} if mergeId: JsonMergeLogger.logStep("BUILDING", "Wrapping single object in files structure (generic)", mergedData) else: mergedData = {"files": []} elif structureType == "images" and "images" not in mergedData: # Try to wrap data in images structure if isinstance(mergedData, dict): if mergedData: mergedData = {"images": [mergedData]} if mergeId: JsonMergeLogger.logStep("BUILDING", "Wrapping single object in images structure (generic)", mergedData) else: mergedData = {"images": []} elif structureType == "unknown" and isinstance(mergedData, dict) and mergedData: # Unknown structure but has data - wrap generically as elements mergedData = {"elements": [mergedData]} if mergeId: JsonMergeLogger.logStep("BUILDING", "Unknown structure, wrapping as elements (generic)", mergedData) # Clean data structure before serialization cleanedData = JsonResultBuilder._cleanDataStructure(mergedData) # Try to serialize try: jsonString = json.dumps(cleanedData, indent=2, ensure_ascii=False) # Validate the JSON string by trying to parse it try: parsed, parseErr, _ = tryParseJson(jsonString) if parseErr is None: # Valid JSON - return it return jsonString else: # Invalid JSON - try to repair logger.warning(f"Generated JSON is invalid: {parseErr}, attempting repair") repaired = closeJsonStructures(jsonString) parsed2, parseErr2, _ = tryParseJson(repaired) if parseErr2 is None: return repaired else: # Repair failed - return minimal valid structure logger.error(f"Repair failed: {parseErr2}, returning minimal structure") 
return json.dumps({"elements": []}, indent=2, ensure_ascii=False) except Exception as parseEx: # Parse validation failed - try repair logger.warning(f"Parse validation failed: {parseEx}, attempting repair") try: repaired = closeJsonStructures(jsonString) parsed2, parseErr2, _ = tryParseJson(repaired) if parseErr2 is None: return repaired except Exception: pass # Return minimal valid structure return json.dumps({"elements": []}, indent=2, ensure_ascii=False) except (TypeError, ValueError) as e: logger.error(f"Error serializing JSON: {e}") # Try to clean more aggressively and retry try: cleanedData2 = JsonResultBuilder._cleanDataStructure(cleanedData, aggressive=True) jsonString = json.dumps(cleanedData2, indent=2, ensure_ascii=False) # Validate parsed, parseErr, _ = tryParseJson(jsonString) if parseErr is None: return jsonString except Exception: pass # Fallback to empty structure return json.dumps({"elements": []}, indent=2, ensure_ascii=False) except Exception as e: logger.error(f"Unexpected error building JSON: {e}") # Fallback to empty structure return json.dumps({"elements": []}, indent=2, ensure_ascii=False) @staticmethod def _cleanDataStructure(data: Any, aggressive: bool = False) -> Any: """ Clean data structure to ensure it's JSON-serializable. Removes None values, ensures lists contain only valid items, and repairs incomplete structures. 
""" if data is None: return {} if aggressive else None if isinstance(data, dict): cleaned = {} for key, value in data.items(): if value is None and aggressive: continue # Skip None values in aggressive mode cleaned[key] = JsonResultBuilder._cleanDataStructure(value, aggressive) return cleaned elif isinstance(data, list): cleaned = [] for item in data: cleanedItem = JsonResultBuilder._cleanDataStructure(item, aggressive) if cleanedItem is not None or not aggressive: cleaned.append(cleanedItem) return cleaned elif isinstance(data, (str, int, float, bool)): return data else: # Unknown type - try to convert to string or skip if aggressive: return str(data) return data class ModularJsonMerger: """ Modular JSON Merger - Main entry point. Simple pipeline: 1. Find overlap between JSON strings 2. Merge strings together 3. Parse and clean the merged JSON """ @staticmethod def _findStringOverlap(accStr: str, fragStr: str, mergeId: Optional[str] = None) -> int: """ Find overlap between two JSON strings - GENERIC solution. Works for any JSON structure (arrays, objects, nested, minified, formatted). Uses multiple strategies to find overlap regardless of JSON format. Strategy: 1. Exact suffix/prefix match (fastest, works for any format) 2. Structure-aware: Find last complete JSON elements in accumulated that match start of fragment 3. Line-based: If JSON is formatted, use line matching (for better performance) 4. Partial match: Handle incomplete elements at cut point Returns the length of the overlap (number of characters). 
""" if not accStr or not fragStr: if mergeId: JsonMergeLogger.logOverlap("string", 0) return 0 # Strategy 1: Try exact suffix/prefix match (fastest, works for any format) maxOverlap = min(len(accStr), len(fragStr)) # Start from maximum possible overlap and work backwards for overlapLen in range(maxOverlap, 0, -1): accSuffix = accStr[-overlapLen:] fragPrefix = fragStr[:overlapLen] if accSuffix == fragPrefix: if mergeId: JsonMergeLogger.logOverlap("string (exact)", overlapLen, accSuffix[:200], fragPrefix[:200]) return overlapLen # Strategy 2: Structure-aware overlap detection (GENERIC - works for any JSON structure) # Find last complete JSON elements in accumulated and check if they appear at start of fragment overlapLen = ModularJsonMerger._findStructureBasedOverlap(accStr, fragStr, mergeId) if overlapLen > 0: return overlapLen # Strategy 3: Line-based overlap (works well for formatted JSON) # Only use if JSON appears to be formatted (has newlines) if '\n' in accStr and '\n' in fragStr: overlapLen = ModularJsonMerger._findLineBasedOverlap(accStr, fragStr, mergeId) if overlapLen > 0: return overlapLen # Strategy 4: Partial overlap (incomplete element at cut point) overlapLen = ModularJsonMerger._findPartialOverlap(accStr, fragStr, mergeId) if overlapLen > 0: return overlapLen if mergeId: JsonMergeLogger.logOverlap("string", 0) return 0 @staticmethod def _findStructureBasedOverlap(accStr: str, fragStr: str, mergeId: Optional[str] = None) -> int: """ Find overlap by detecting complete JSON elements (structure-aware, GENERIC). Works for ANY JSON structure: - Arrays: Finds last complete array elements - Objects: Finds last complete object properties - Nested structures: Recursively finds complete elements - Minified or formatted JSON: Structure-aware, not format-dependent - Any use case: section_content, chapter_structure, code_structure, etc. Strategy: Find last complete JSON elements in accumulated that match start of fragment. 
Uses balanced bracket/brace matching to identify complete elements regardless of format. """ accTrimmed = accStr.rstrip() fragTrimmed = fragStr.lstrip() if not accTrimmed or not fragTrimmed: return 0 # Find last complete elements in accumulated by parsing backwards # Look for complete array elements or object properties # Strategy: Find where accumulated has complete elements at the end # and check if fragment starts with the same elements # Use a sliding window approach: check different suffix lengths from accumulated maxCheckLength = min(2000, len(accTrimmed), len(fragTrimmed)) # Check in reverse order (largest to smallest) to find longest overlap first for checkLen in range(maxCheckLength, 50, -5): # Step by 5 for performance if checkLen > len(accTrimmed) or checkLen > len(fragTrimmed): continue accSuffix = accTrimmed[-checkLen:] fragPrefix = fragTrimmed[:checkLen] # Check if accSuffix ends with complete JSON element(s) and fragPrefix starts with same # A complete element ends with proper closing brackets/braces # Verify that accSuffix ends with complete structure # and fragPrefix starts with the same structure if ModularJsonMerger._isCompleteJsonElement(accSuffix) and \ ModularJsonMerger._startsWithSameElement(accSuffix, fragPrefix): # Found overlap! 
Verify it's meaningful (not just whitespace) if len(accSuffix.strip()) > 20: if mergeId: JsonMergeLogger.logOverlap("string (structure-based)", checkLen, accSuffix[:200], fragPrefix[:200]) return checkLen # Alternative: Try to find common substring that represents complete elements # Look for patterns like complete array rows or object properties # Check last 500 chars of accumulated against first 500 chars of fragment checkWindow = min(500, len(accTrimmed), len(fragTrimmed)) if checkWindow > 100: accWindow = accTrimmed[-checkWindow:] fragWindow = fragTrimmed[:checkWindow] # Find longest common substring that represents complete elements # Look for boundaries like ], [ or }, { or ", " for i in range(checkWindow - 50, 50, -5): accSub = accWindow[-i:] fragSub = fragWindow[:i] if accSub == fragSub: # Check if it's a complete element boundary if ModularJsonMerger._isCompleteElementBoundary(accSub): if mergeId: JsonMergeLogger.logOverlap("string (structure-boundary)", i, accSub[:200], fragSub[:200]) return i return 0 @staticmethod def _isCompleteJsonElement(jsonStr: str) -> bool: """Check if string ends with a complete JSON element (balanced brackets/braces).""" jsonStr = jsonStr.strip() if not jsonStr: return False # Check if it ends with complete structure markers # Complete array element: ends with ] or ], or ], # Complete object element: ends with } or }, or }, if jsonStr[-1] in ']}': # Check if brackets/braces are balanced braceCount = jsonStr.count('{') - jsonStr.count('}') bracketCount = jsonStr.count('[') - jsonStr.count(']') return braceCount == 0 and bracketCount == 0 return False @staticmethod def _startsWithSameElement(accSuffix: str, fragPrefix: str) -> bool: """Check if fragment prefix starts with the same element as accumulated suffix.""" # Normalize whitespace for comparison accNorm = accSuffix.strip() fragNorm = fragPrefix.strip() # Check if fragPrefix starts with accSuffix (or vice versa for partial matches) if fragNorm.startswith(accNorm): return True 
# Check if they have common prefix (for partial element completion) minLen = min(len(accNorm), len(fragNorm)) if minLen > 20: # Check if first 80% of accSuffix matches start of fragPrefix checkLen = int(minLen * 0.8) return accNorm[:checkLen] == fragNorm[:checkLen] return False @staticmethod def _isCompleteElementBoundary(jsonStr: str) -> bool: """Check if string represents a complete element boundary (e.g., ], [ or }, {).""" jsonStr = jsonStr.strip() if not jsonStr: return False # Check if it contains complete element boundaries # Pattern: ends with ], or }, or ],\n or },\n if jsonStr.rstrip().endswith(('],', '},', ']', '}')): return True # Check if it's a complete array element or object property if '],' in jsonStr or '},' in jsonStr: return True return False @staticmethod def _findLineBasedOverlap(accStr: str, fragStr: str, mergeId: Optional[str] = None) -> int: """ Find overlap using line-based matching (for formatted JSON). """ accLines = accStr.rstrip().split('\n') fragLines = fragStr.lstrip().split('\n') # Try to find matching lines from the end of accumulated at the start of fragment maxLinesToCheck = min(10, len(accLines), len(fragLines)) for numLines in range(maxLinesToCheck, 0, -1): # Get last N lines from accumulated (excluding empty lines) accLastLines = [line.strip() for line in accLines[-numLines:] if line.strip()] # Get first N lines from fragment (excluding empty lines) fragFirstLines = [line.strip() for line in fragLines[:numLines] if line.strip()] # Check if they match if len(accLastLines) > 0 and len(fragFirstLines) > 0: # Try to find where accLastLines match fragFirstLines for i in range(len(accLastLines)): # Check if accLastLines[i:] matches fragFirstLines[:len(accLastLines)-i] accSuffixLines = accLastLines[i:] fragPrefixLines = fragFirstLines[:len(accSuffixLines)] if accSuffixLines == fragPrefixLines and len(accSuffixLines) > 0: # Found overlap! 
Calculate character length accSuffixText = '\n'.join(accLastLines[i:]) fragPrefixText = '\n'.join(fragPrefixLines) # Find where this text appears in the original strings accPos = accStr.rfind(accSuffixText) fragPos = fragStr.find(fragPrefixText) if accPos >= 0 and fragPos == 0: # Found valid overlap overlapLen = len(accSuffixText) if mergeId: JsonMergeLogger.logOverlap("string (line-based)", overlapLen, accSuffixText[:200], fragPrefixText[:200]) return overlapLen return 0 @staticmethod def _findPartialOverlap(accStr: str, fragStr: str, mergeId: Optional[str] = None) -> int: """ Find partial overlap (incomplete element at cut point). """ accLines = accStr.rstrip().split('\n') fragLines = fragStr.lstrip().split('\n') if accLines and fragLines: lastAccLine = accLines[-1].strip() firstFragLine = fragLines[0].strip() # Check if lastAccLine is a prefix of firstFragLine (incomplete line completed) if lastAccLine and firstFragLine.startswith(lastAccLine): # Also check if there are more matching lines after overlapLen = len(lastAccLine) # Try to extend overlap with more lines for i in range(1, min(len(accLines), len(fragLines))): if accLines[-1-i].strip() == fragLines[i].strip(): overlapLen += len('\n' + fragLines[i]) else: break if overlapLen > 20: # Only if meaningful overlap if mergeId: JsonMergeLogger.logOverlap("string (partial line)", overlapLen, lastAccLine[:200], firstFragLine[:200]) return overlapLen return 0 @staticmethod def _mergeStrings(accStr: str, fragStr: str, overlapLength: int) -> str: """ Merge two JSON strings together, removing the overlap. Handles whitespace at cut points properly for seamless merging. 
""" if overlapLength > 0: # Remove overlap from fragment and append # CRITICAL: Handle whitespace properly - if accumulated ends with whitespace # and fragment starts with the same content, we need to preserve whitespace structure merged = accStr + fragStr[overlapLength:] else: # No overlap - just concatenate (might need comma or other separator) # CRITICAL: Preserve whitespace structure when merging # Get trailing whitespace from accumulated (spaces, tabs, but not newlines) accTrailingWs = "" i = len(accStr) - 1 while i >= 0 and accStr[i] in [' ', '\t']: accTrailingWs = accStr[i] + accTrailingWs i -= 1 # Get leading whitespace from fragment (spaces, tabs, but not newlines) fragLeadingWs = "" i = 0 while i < len(fragStr) and fragStr[i] in [' ', '\t']: fragLeadingWs += fragStr[i] i += 1 # Trim for content detection but preserve whitespace structure accTrimmed = accStr.rstrip().rstrip(',') fragTrimmed = fragStr.lstrip().lstrip(',') # Check if we need a separator if accTrimmed and fragTrimmed: # If accumulated ends with } or ] and fragment starts with { or [, we might need comma if (accTrimmed[-1] in '}]' and fragTrimmed[0] in '{['): # Add comma with appropriate whitespace merged = accTrimmed + ',' + fragLeadingWs + fragTrimmed else: # Merge with preserved whitespace structure # Use the whitespace from fragment (it knows the proper spacing) merged = accTrimmed + accTrailingWs + fragLeadingWs + fragTrimmed else: # One is empty - just concatenate with preserved whitespace merged = accStr + fragStr return merged @staticmethod def merge(accumulated: str, newFragment: str) -> Tuple[str, bool]: """ Merge two JSON fragments intelligently. 
Args: accumulated: Previously accumulated JSON string newFragment: New fragment JSON string Returns: Tuple of (merged_json_string, has_overlap): - merged_json_string: Merged JSON string (closed if no overlap, unclosed if overlap found) - has_overlap: True if overlap was found (iterations should continue), False if no overlap (iterations should stop) """ # Start logging mergeId = JsonMergeLogger.startMerge(accumulated, newFragment) if not accumulated: result = newFragment if newFragment else "{}" JsonMergeLogger.finishMerge(mergeId, result, True) return (result, False) # No overlap if no accumulated data if not newFragment: JsonMergeLogger.finishMerge(mergeId, accumulated, True) return (accumulated, False) # No overlap if no new fragment try: # Normalize both strings accNormalized = stripCodeFences(normalizeJsonText(accumulated)).strip() fragNormalized = stripCodeFences(normalizeJsonText(newFragment)).strip() JsonMergeLogger._log(f"\n Normalized Accumulated ({len(accNormalized)} chars)") accNormLines = accNormalized.split('\n') if len(accNormLines) > 10: JsonMergeLogger._log(f" (showing first 5 and last 5 of {len(accNormLines)} lines)") for line in accNormLines[:5]: JsonMergeLogger._log(f" {line}") JsonMergeLogger._log(f" ... ({len(accNormLines) - 10} lines omitted) ...") for line in accNormLines[-5:]: JsonMergeLogger._log(f" {line}") else: for line in accNormLines: JsonMergeLogger._log(f" {line}") JsonMergeLogger._log(f"\n Normalized New Fragment ({len(fragNormalized)} chars)") fragNormLines = fragNormalized.split('\n') if len(fragNormLines) > 10: JsonMergeLogger._log(f" (showing first 5 and last 5 of {len(fragNormLines)} lines)") for line in fragNormLines[:5]: JsonMergeLogger._log(f" {line}") JsonMergeLogger._log(f" ... 
({len(fragNormLines) - 10} lines omitted) ...") for line in fragNormLines[-5:]: JsonMergeLogger._log(f" {line}") else: for line in fragNormLines: JsonMergeLogger._log(f" {line}") # Step 1: Find overlap between JSON strings JsonMergeLogger.logStep("PHASE 1", "Finding overlap between JSON strings", None) overlapLength = ModularJsonMerger._findStringOverlap(accNormalized, fragNormalized, mergeId) if overlapLength > 0: accSuffix = accNormalized[-overlapLength:] fragPrefix = fragNormalized[:overlapLength] JsonMergeLogger._log(f"\n Overlap found ({overlapLength} chars):") JsonMergeLogger._log(f" Accumulated suffix: {accSuffix}") JsonMergeLogger._log(f" Fragment prefix: {fragPrefix}") else: # CRITICAL: No overlap found - this means iterations should stop JsonMergeLogger._log(f"\n ⚠️ NO OVERLAP FOUND - This indicates iterations should stop") JsonMergeLogger._log(f" Closing JSON and returning final result") # Close the accumulated JSON (it's complete as far as we can tell) closedJson = closeJsonStructures(accNormalized) JsonMergeLogger._log(f"\n Closed JSON ({len(closedJson)} chars):") JsonMergeLogger._log(" " + "="*78) for line in closedJson.split('\n'): JsonMergeLogger._log(f" {line}") JsonMergeLogger._log(" " + "="*78) JsonMergeLogger.finishMerge(mergeId, closedJson, True) # Return closed JSON with has_overlap=False to indicate iterations should stop return (closedJson, False) # Step 2: Merge strings together (only if overlap was found) JsonMergeLogger.logStep("PHASE 2", f"Merging strings (overlap: {overlapLength} chars)", None) mergedString = ModularJsonMerger._mergeStrings(accNormalized, fragNormalized, overlapLength) JsonMergeLogger._log(f"\n Merged String ({len(mergedString)} chars)") mergedLines = mergedString.split('\n') if len(mergedLines) > 10: JsonMergeLogger._log(f" (showing first 5 and last 5 of {len(mergedLines)} lines)") for line in mergedLines[:5]: JsonMergeLogger._log(f" {line}") JsonMergeLogger._log(f" ... 
({len(mergedLines) - 10} lines omitted) ...") for line in mergedLines[-5:]: JsonMergeLogger._log(f" {line}") else: for line in mergedLines: JsonMergeLogger._log(f" {line}") # Step 3: Return merged string (with incomplete element at end for next iteration) JsonMergeLogger.logStep("PHASE 3", "Returning merged string (may be unclosed)", None) JsonMergeLogger._log(f"\n Returning merged string (preserving incomplete element at end for next iteration)") JsonMergeLogger.finishMerge(mergeId, mergedString, True) # Return merged string with has_overlap=True to indicate iterations should continue return (mergedString, True) except Exception as e: logger.error(f"Error in modular merger: {e}") JsonMergeLogger.logStep("ERROR", f"Exception occurred: {str(e)}", None, error=str(e)) # Fallback: try to return accumulated if valid try: accParsed, accErr, _ = tryParseJson(accumulated) if accErr is None: JsonMergeLogger.finishMerge(mergeId, accumulated, False) return (accumulated, False) # No overlap on error except Exception: pass # Last resort: return empty valid JSON fallback = json.dumps({"elements": []}, indent=2, ensure_ascii=False) JsonMergeLogger.finishMerge(mergeId, fallback, False) return (fallback, False) # No overlap on error