# Copyright (c) 2025 Patrick Motsch # All rights reserved. """ JSON Response Handling Module Handles merging of JSON responses from multiple AI iterations, including: - Section merging with intelligent overlap detection - JSON fragment detection and merging - Deep recursive structure merging - Overlap detection for complex nested structures - String accumulation for iterative JSON generation """ import json import logging import re from typing import Dict, Any, List, Optional, Tuple from modules.shared.jsonUtils import extractJsonString, repairBrokenJson, extractSectionsFromDocument from modules.datamodels.datamodelAi import JsonAccumulationState logger = logging.getLogger(__name__) class JsonResponseHandler: """Handles JSON response merging and fragment detection for iterative AI generation.""" @staticmethod def mergeSectionsIntelligently( existingSections: List[Dict[str, Any]], newSections: List[Dict[str, Any]], iteration: int ) -> List[Dict[str, Any]]: """ Intelligently merge sections from multiple iterations. This is a GENERIC merging strategy that handles broken JSON iterations. The break can occur anywhere - in any section, at any depth. Merging strategies (in order of priority): 1. Same Section ID: Merge sections with identical IDs 2. Same Content-Type + Position: If last section is incomplete and new section continues it 3. Same Order: Merge sections with same order value 4. Structural Analysis: Detect continuation based on content structure Args: existingSections: Sections accumulated from previous iterations newSections: Sections extracted from current iteration iteration: Current iteration number Returns: Merged list of sections """ if not newSections: return existingSections if not existingSections: return newSections mergedSections = existingSections.copy() for newSection in newSections: merged = False # Strategy 1: Same Section ID - merge directly newSectionId = newSection.get("id") if newSectionId: for i, existingSection in enumerate(mergedSections): if existingSection.get("id") == newSectionId: # Merge sections with same ID mergedSections[i] = JsonResponseHandler.mergeSectionContent( existingSection, newSection, iteration ) merged = True logger.debug(f"Iteration {iteration}: Merged section by ID '{newSectionId}'") break if merged: continue # Strategy 2: Same Content-Type + Position (continuation detection) # Check if last section is incomplete and new section continues it if mergedSections: lastSection = mergedSections[-1] lastContentType = lastSection.get("content_type") newContentType = newSection.get("content_type") if lastContentType == newContentType: # Same content type - check if last section is incomplete if JsonResponseHandler.isSectionIncomplete(lastSection): # Last section is incomplete, merge with new section mergedSections[-1] = JsonResponseHandler.mergeSectionContent( lastSection, newSection, iteration ) merged = True logger.debug(f"Iteration {iteration}: Merged section by content-type continuation ({lastContentType})") continue # Strategy 3: Same Order value newOrder = newSection.get("order") if newOrder is not None: for i, existingSection in enumerate(mergedSections): existingOrder = existingSection.get("order") if existingOrder is not None and existingOrder == newOrder: # Merge sections with same order mergedSections[i] = JsonResponseHandler.mergeSectionContent( existingSection, newSection, iteration ) merged = True logger.debug(f"Iteration {iteration}: Merged section by order {newOrder}") break if merged: continue # Strategy 4: Structural Analysis - detect continuation # For code_block and table: if last section matches new section type, merge them if mergedSections: lastSection = mergedSections[-1] lastContentType = lastSection.get("content_type") newContentType = newSection.get("content_type") # Both are code blocks - merge them if lastContentType == "code_block" and newContentType == "code_block": mergedSections[-1] = JsonResponseHandler.mergeSectionContent( lastSection, newSection, iteration ) merged = True logger.debug(f"Iteration {iteration}: Merged code_block sections by structural analysis") continue # Both are tables - merge them (common case for broken JSON iterations) if lastContentType == "table" and newContentType == "table": mergedSections[-1] = JsonResponseHandler.mergeSectionContent( lastSection, newSection, iteration ) merged = True logger.debug(f"Iteration {iteration}: Merged table sections by structural analysis") continue # No merge strategy matched - add as new section if not merged: mergedSections.append(newSection) logger.debug(f"Iteration {iteration}: Added new section '{newSection.get('id', 'no-id')}' ({newSection.get('content_type', 'unknown')})") return mergedSections @staticmethod def isSectionIncomplete(section: Dict[str, Any]) -> bool: """ Check if a section is incomplete (broken at the end). This detects incomplete sections based on content analysis: - Code blocks: ends mid-line, ends with comma, ends with incomplete structure - Text sections: ends mid-sentence, ends with incomplete structure - Other types: check for incomplete elements """ contentType = section.get("content_type", "") elements = section.get("elements", []) if not elements: return False # Handle list of elements if isinstance(elements, list) and len(elements) > 0: lastElement = elements[-1] else: lastElement = elements if not isinstance(lastElement, dict): return False # Check code_block for incomplete code if contentType == "code_block": code = lastElement.get("code", "") if code: # Check if code ends incompletely: # - Ends with comma (incomplete CSV line) # - Ends with number but no newline (incomplete line) # - Ends mid-token (e.g., "23431,23" - incomplete number) codeStripped = code.rstrip() if codeStripped: # Check for incomplete patterns if codeStripped.endswith(',') or (',' in codeStripped and not codeStripped.endswith('\n')): # Ends with comma or has comma but no final newline - likely incomplete return True # Check if last line is incomplete (doesn't end with newline and has partial content) if not code.endswith('\n') and codeStripped: # No final newline - might be incomplete # More sophisticated: check if last number is complete lastLine = codeStripped.split('\n')[-1] if lastLine and ',' in lastLine: # Has commas but might be incomplete parts = lastLine.split(',') if parts and len(parts[-1]) < 5: # Last part is very short - might be incomplete return True # Check table for incomplete rows if contentType == "table": rows = lastElement.get("rows", []) if rows: # Check if last row is incomplete (ends with incomplete data) lastRow = rows[-1] if isinstance(rows, list) else [] if isinstance(lastRow, list) and lastRow: # CRITICAL: Check if last row doesn't have expected number of columns (if headers exist) # This is the PRIMARY indicator of incomplete table rows headers = lastElement.get("headers", []) if headers and isinstance(headers, list): expectedCols = len(headers) if len(lastRow) < expectedCols: logger.debug(f"Table section incomplete: last row has {len(lastRow)} columns, expected {expectedCols}") return True # Also check if last row ends with incomplete data (e.g., incomplete string) lastCell = lastRow[-1] if lastRow else "" if isinstance(lastCell, str): # If last cell is incomplete (ends with quote or is very short), section might be incomplete if lastCell.endswith('"') or (len(lastCell) < 3 and lastCell): logger.debug(f"Table section incomplete: last cell appears incomplete: '{lastCell}'") return True # Additional check: if last row has fewer cells than previous rows, it's likely incomplete if len(rows) > 1: prevRow = rows[-2] if isinstance(rows, list) and len(rows) > 1 else [] if isinstance(prevRow, list) and len(prevRow) > len(lastRow): logger.debug(f"Table section incomplete: last row has {len(lastRow)} cells, previous row has {len(prevRow)}") return True # Check paragraph/text for incomplete sentences if contentType in ["paragraph", "heading"]: text = lastElement.get("text", "") if text: # Simple heuristic: if doesn't end with sentence-ending punctuation textStripped = text.rstrip() if textStripped and not textStripped[-1] in '.!?': # Might be incomplete, but this is less reliable # Only mark as incomplete if very short (likely cut off) if len(textStripped) < 20: return True # Check lists for incomplete items if contentType in ["bullet_list", "numbered_list"]: items = lastElement.get("items", []) if items and isinstance(items, list): # Check if last item is incomplete (very short or ends with incomplete string) lastItem = items[-1] if items else None if isinstance(lastItem, str) and len(lastItem) < 3: return True # Check image for incomplete base64 data if contentType == "image": imageData = lastElement.get("base64Data", "") if imageData: # Base64 strings should end with padding ('=' or '==') # If it doesn't, it might be incomplete stripped = imageData.rstrip() if stripped and not stripped.endswith(('=', '==')): # Check if it's a valid base64 character sequence that was cut off if len(stripped) > 0 and stripped[-1] not in 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=': return True # If length is not a multiple of 4 (base64 requirement), it might be incomplete if len(stripped) % 4 != 0: return True # GENERIC CHECK: Recursively analyze structure for incompleteness # This works for ANY structure: arrays, objects, nested, primitives return JsonResponseHandler._isStructureIncomplete(lastElement) @staticmethod def _isStructureIncomplete(structure: Any, max_depth: int = 10) -> bool: """ GENERIC recursive check for incomplete structures. Detects incompleteness by analyzing patterns: - Arrays: Last item shorter than previous items, incomplete patterns - Objects: Last object has fewer keys than pattern, incomplete values - Strings: Very short, ends abruptly, incomplete patterns - Nested: Recursively checks nested structures Works for ANY JSON structure of any depth/complexity. """ if max_depth <= 0: return False # Arrays/Lists - check for incomplete patterns if isinstance(structure, list): if len(structure) == 0: return False # Check if last item is incomplete compared to previous items last_item = structure[-1] # If we have previous items, compare structure if len(structure) > 1: prev_item = structure[-2] # If last item is a list and previous is a list, check length if isinstance(last_item, list) and isinstance(prev_item, list): if len(last_item) < len(prev_item): return True # Last row/item has fewer elements - likely incomplete # If last item is a dict and previous is a dict, check keys if isinstance(last_item, dict) and isinstance(prev_item, dict): if len(last_item) < len(prev_item): return True # Last object has fewer keys - likely incomplete # Recursively check last item for incompleteness if JsonResponseHandler._isStructureIncomplete(last_item, max_depth - 1): return True # Objects/Dicts - check for incomplete values elif isinstance(structure, dict): for key, value in structure.items(): # Recursively check each value if JsonResponseHandler._isStructureIncomplete(value, max_depth - 1): return True # Check for incomplete strings if isinstance(value, str): # Very short strings might be incomplete if len(value) > 0 and len(value) < 3: return True # Strings ending with incomplete patterns (comma, quote, etc.) stripped = value.rstrip() if stripped and stripped.endswith((',', '"', '\\')): return True # Strings - check for incomplete patterns elif isinstance(structure, str): # Very short strings might be incomplete if len(structure) > 0 and len(structure) < 3: return True # Strings ending with incomplete patterns stripped = structure.rstrip() if stripped and stripped.endswith((',', '"', '\\')): return True return False @staticmethod def mergeSectionContent( existingSection: Dict[str, Any], newSection: Dict[str, Any], iteration: int ) -> Dict[str, Any]: """ Merge content from two sections. Handles different content types: - code_block: Append code, handle overlaps, merge incomplete lines - paragraph/heading: Append text - table: Merge rows - list: Merge items - Other: Merge elements """ contentType = existingSection.get("content_type", "") existingElements = existingSection.get("elements", []) newElements = newSection.get("elements", []) if not newElements: return existingSection # Handle list of elements if isinstance(existingElements, list): existingElem = existingElements[-1] if existingElements else {} else: existingElem = existingElements if isinstance(newElements, list): newElem = newElements[0] if newElements else {} else: newElem = newElements if not isinstance(existingElem, dict) or not isinstance(newElem, dict): return existingSection # Merge based on content type if contentType == "code_block": existingCode = existingElem.get("code", "") newCode = newElem.get("code", "") if existingCode and newCode: mergedCode = JsonResponseHandler.mergeCodeBlocks(existingCode, newCode, iteration) existingElem["code"] = mergedCode # Preserve language from existing or new if "language" not in existingElem and "language" in newElem: existingElem["language"] = newElem["language"] elif contentType in ["paragraph", "heading"]: existingText = existingElem.get("text", "") newText = newElem.get("text", "") if existingText and newText: # Append text with space if needed if existingText.rstrip() and not existingText.rstrip()[-1] in '.!?\n': mergedText = existingText.rstrip() + " " + newText.lstrip() else: mergedText = existingText.rstrip() + "\n" + newText.lstrip() existingElem["text"] = mergedText elif contentType == "table": # Merge table rows with sophisticated overlap detection # CRITICAL: Tables can have rows in two places: # 1. Direct: existingElem["rows"] (legacy format) # 2. Nested: existingElem["content"]["rows"] (current format) existingRows = None newRows = None # Check nested structure first (current format) if "content" in existingElem and isinstance(existingElem["content"], dict): existingRows = existingElem["content"].get("rows", []) # Fallback to direct structure (legacy format) if not existingRows: existingRows = existingElem.get("rows", []) # Check nested structure first (current format) if "content" in newElem and isinstance(newElem["content"], dict): newRows = newElem["content"].get("rows", []) # Fallback to direct structure (legacy format) if not newRows: newRows = newElem.get("rows", []) if existingRows and newRows: # Use sophisticated overlap detection that handles multiple overlapping rows mergedRows = JsonResponseHandler.mergeRowsWithOverlap(existingRows, newRows, iteration) # Store in nested structure (current format) if "content" not in existingElem: existingElem["content"] = {} existingElem["content"]["rows"] = mergedRows # Also set type if missing if "type" not in existingElem: existingElem["type"] = "table" logger.debug(f"Iteration {iteration}: Merged table rows - existing: {len(existingRows)}, new: {len(newRows)}, total: {len(mergedRows)}") elif newRows: # If existing has no rows but new does, use new rows if "content" not in existingElem: existingElem["content"] = {} existingElem["content"]["rows"] = newRows if "type" not in existingElem: existingElem["type"] = "table" # Preserve headers from existing (or use new if existing has none) # Headers can be in content.headers or directly in element existingHeaders = existingElem.get("content", {}).get("headers", []) if "content" in existingElem else existingElem.get("headers", []) newHeaders = newElem.get("content", {}).get("headers", []) if "content" in newElem else newElem.get("headers", []) if not existingHeaders and newHeaders: if "content" not in existingElem: existingElem["content"] = {} existingElem["content"]["headers"] = newHeaders # Preserve caption from existing (or use new if existing has none) existingCaption = existingElem.get("content", {}).get("caption") if "content" in existingElem else existingElem.get("caption") newCaption = newElem.get("content", {}).get("caption") if "content" in newElem else newElem.get("caption") if not existingCaption and newCaption: if "content" not in existingElem: existingElem["content"] = {} existingElem["content"]["caption"] = newCaption elif contentType in ["bullet_list", "numbered_list"]: # Merge list items with sophisticated overlap detection existingItems = existingElem.get("items", []) newItems = newElem.get("items", []) if existingItems and newItems: mergedItems = JsonResponseHandler.mergeItemsWithOverlap(existingItems, newItems, iteration) existingElem["items"] = mergedItems elif newItems: existingElem["items"] = newItems elif contentType == "image": # Images are typically complete - if new image is provided, replace existing # But check if existing image data is incomplete (e.g., base64 string cut off) existingImageData = existingElem.get("base64Data", "") newImageData = newElem.get("base64Data", "") if existingImageData and newImageData: # If existing image data doesn't end with valid base64 padding, it might be incomplete # Base64 padding is '=' or '==' at the end if not existingImageData.rstrip().endswith(('=', '==')): # Existing image might be incomplete - merge by appending new data # This handles cases where base64 string was cut off existingElem["base64Data"] = existingImageData + newImageData logger.debug(f"Iteration {iteration}: Merged incomplete image base64 data") else: # Existing image is complete - replace with new (or keep existing if new is empty) if newImageData: existingElem["base64Data"] = newImageData elif newImageData: existingElem["base64Data"] = newImageData # Preserve other image metadata if not existingElem.get("altText") and newElem.get("altText"): existingElem["altText"] = newElem["altText"] if not existingElem.get("caption") and newElem.get("caption"): existingElem["caption"] = newElem["caption"] else: # GENERIC FALLBACK: Use deep recursive merging for complex nested structures # This handles any content type with arbitrary depth and complexity merged_element = JsonResponseHandler.mergeDeepStructures( existingElem, newElem, iteration, f"section.{contentType}" ) existingElem = merged_element # Update section with merged content mergedSection = existingSection.copy() if isinstance(existingElements, list): # Update the last element in the list with merged content if existingElements: existingElements[-1] = existingElem mergedSection["elements"] = existingElements else: mergedSection["elements"] = existingElem # Preserve metadata from new section if missing in existing if "order" not in mergedSection and "order" in newSection: mergedSection["order"] = newSection["order"] return mergedSection @staticmethod def mergeCodeBlocks(existingCode: str, newCode: str, iteration: int) -> str: """ Merge two code blocks intelligently, handling overlaps and incomplete lines. """ if not existingCode: return newCode if not newCode: return existingCode existingLines = existingCode.rstrip().split('\n') newLines = newCode.strip().split('\n') if not existingLines or not newLines: return existingCode + "\n" + newCode lastExistingLine = existingLines[-1].strip() firstNewLine = newLines[0].strip() # Strategy 1: Exact overlap - remove duplicate line if lastExistingLine == firstNewLine: newLines = newLines[1:] logger.debug(f"Iteration {iteration}: Removed exact duplicate line in code merge") # Strategy 2: Incomplete line merge # If last existing line ends with comma or is incomplete, merge with first new line elif lastExistingLine.endswith(',') or (',' in lastExistingLine and len(lastExistingLine.split(',')[-1]) < 5): # Last line is incomplete - merge with first new line # Remove trailing comma from existing line mergedLine = lastExistingLine.rstrip(',') + ',' + firstNewLine.lstrip() existingLines[-1] = mergedLine newLines = newLines[1:] logger.debug(f"Iteration {iteration}: Merged incomplete line with continuation") # Strategy 3: Partial overlap detection # Check if first new line starts with the end of last existing line elif ',' in lastExistingLine and ',' in firstNewLine: lastExistingParts = lastExistingLine.split(',') firstNewParts = firstNewLine.split(',') # Check for overlap: if last part of existing matches first part of new if lastExistingParts and firstNewParts: lastExistingPart = lastExistingParts[-1].strip() firstNewPart = firstNewParts[0].strip() # If they match, there's overlap if lastExistingPart == firstNewPart and len(lastExistingParts) > 1: # Remove overlapping part from new line newLines[0] = ','.join(firstNewParts[1:]) logger.debug(f"Iteration {iteration}: Removed partial overlap in code merge") # Reconstruct merged code mergedCode = '\n'.join(existingLines) if newLines: if mergedCode and not mergedCode.endswith('\n'): mergedCode += '\n' mergedCode += '\n'.join(newLines) return mergedCode @staticmethod def detectAndParseJsonFragment( result: str, allSections: List[Dict[str, Any]] ) -> Optional[Dict[str, Any]]: """ GENERIC fragment detection for ANY JSON structure. Detects if response is a JSON fragment (continuation content) rather than full document structure. Works for ANY JSON type: arrays, objects, primitives, nested structures of any depth/complexity. Fragment = Any JSON that: 1. Does NOT have "documents" or "sections" keys (not full document structure) 2. Can be ANY structure: array, object, nested, primitive, etc. 3. Is continuation content that needs to be merged into existing sections Examples (all handled generically): - Array: [["37643", ...], ...] (table rows, list items, any array) - Object: {"rows": [...], "headers": [...]} (partial element) - Primitive: "continuation text" (rare but possible) - Nested: {"data": {"items": [...]}} (any nested structure) Returns fragment info dict with: - fragment_data: The parsed fragment content (ANY type) - target_section_id: ID of last incomplete section (generic, not type-specific) CRITICAL: Fully generic - no specific logic for tables, paragraphs, etc. """ try: extracted = extractJsonString(result) parsed = json.loads(extracted) # GENERIC fragment detection: Check if it's NOT a full document structure is_full_document = False if isinstance(parsed, dict): # Full document structure has "documents" or "sections" keys if "documents" in parsed or "sections" in parsed: is_full_document = True # If it's a full document structure, it's not a fragment if is_full_document: return None # Otherwise, it's a fragment (can be ANY structure: array, object, primitive, nested) # Find target: last incomplete section (generic, regardless of content type) target_section_id = JsonResponseHandler.findLastIncompleteSectionId(allSections) logger.info(f"Detected GENERIC JSON fragment (type: {type(parsed).__name__}), target: {target_section_id}") return { "fragment_data": parsed, # Can be ANY JSON structure "target_section_id": target_section_id } except Exception as e: logger.error(f"Error detecting JSON fragment: {e}") logger.debug(f"Fragment detection failed for result: {result[:500]}...") return None @staticmethod def findLastIncompleteSectionId( allSections: List[Dict[str, Any]] ) -> Optional[str]: """ GENERIC: Find the last incomplete section (regardless of content type). This is fully generic - works for ANY content type, ANY structure. Returns the ID of the last section that is incomplete, or None if all are complete. """ # Find the last incomplete section (generic, not type-specific) for section in reversed(allSections): if JsonResponseHandler.isSectionIncomplete(section): return section.get("id") # If no incomplete section found, return last section as fallback if allSections: return allSections[-1].get("id") return None @staticmethod def mergeFragmentIntoSection( fragment: Dict[str, Any], allSections: List[Dict[str, Any]], iteration: int ) -> Optional[List[Dict[str, Any]]]: """ GENERIC fragment merging for ANY JSON structure. Merges a JSON fragment (ANY structure: array, object, nested, primitive) into the last incomplete section. Uses ONLY deep recursive merging - no specific logic for content types. Handles ALL cases: 1. Fragments with overlap (detected and merged intelligently) 2. Fragments without overlap (continuation after cut-off, appended) 3. Any JSON structure (arrays, objects, nested, primitives) 4. Accumulative merging (uses merged data from past iterations) CRITICAL: Fully generic - works for ANY JSON structure, ANY content type. NO FALLBACKS: Returns None if merge fails (no target section found). """ fragment_data = fragment.get("fragment_data") target_section_id = fragment.get("target_section_id") if fragment_data is None: logger.error(f"Iteration {iteration}: ❌ Fragment has no fragment_data - merge FAILED") return None # Find the target section (last incomplete section, generic) target_section = None target_index = -1 if target_section_id: for i, section in enumerate(allSections): if section.get("id") == target_section_id: target_section = section target_index = i break # NO FALLBACKS: If target not found by ID, try to find incomplete section if not target_section: for i, section in enumerate(reversed(allSections)): if JsonResponseHandler.isSectionIncomplete(section): target_section = section target_index = len(allSections) - 1 - i break # NO FALLBACKS: If no target found, merge FAILS if not target_section: logger.error(f"Iteration {iteration}: ❌ MERGE FAILED - No target section found for fragment!") logger.error(f"Iteration {iteration}: Available sections: {[s.get('id') + ' (' + s.get('content_type', 'unknown') + ')' for s in allSections]}") return None # Get the last element from target section (where fragment will be merged) merged_section = target_section.copy() elements = merged_section.get("elements", []) if not isinstance(elements, list): elements = [elements] if elements else [] if not elements: elements = [{}] last_element = elements[-1] if elements else {} if not isinstance(last_element, dict): last_element = {} elements.append(last_element) # CRITICAL: GENERIC fragment merging for ALL structure types # Automatically detects the structure type and merges accordingly # Works for: tables, lists, code blocks, paragraphs, images, and any nested structures merged_element = JsonResponseHandler._mergeFragmentIntoElement( last_element, fragment_data, target_section, iteration, f"section.{target_section_id}.fragment" ) # Update elements with merged content elements[-1] = merged_element merged_section["elements"] = elements # Update allSections (this ensures accumulative merging - merged data is used for next iteration) merged_sections = allSections.copy() merged_sections[target_index] = merged_section logger.info(f"Iteration {iteration}: ✅ Merged GENERIC fragment (type: {type(fragment_data).__name__}) into section '{target_section_id}'") # Log merged JSON for debugging try: from modules.shared.debugLogger import writeDebugFile merged_json_str = json.dumps(merged_sections, indent=2, ensure_ascii=False) writeDebugFile(merged_json_str, f"merged_json_iteration_{iteration}.json") except Exception as e: logger.debug(f"Iteration {iteration}: Failed to write merged JSON debug file: {e}") return merged_sections @staticmethod def completeIncompleteStructures(allSections: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """ Complete any incomplete structures in sections by ensuring proper JSON structure. This ensures JSON is properly closed even if merge failed or iterations stopped early. Works generically for ANY structure type - recursively processes all nested structures. Returns sections with completed structures. """ completed_sections = [] for section in allSections: completed_section = JsonResponseHandler._completeStructure(section) completed_sections.append(completed_section) return completed_sections @staticmethod def _completeStructure(structure: Any) -> Any: """ Recursively complete incomplete structures by ensuring arrays/objects are properly structured. Works generically for ANY JSON structure - no specific logic for content types. """ if isinstance(structure, dict): completed = {} for key, value in structure.items(): completed[key] = JsonResponseHandler._completeStructure(value) return completed elif isinstance(structure, list): completed = [] for item in structure: completed.append(JsonResponseHandler._completeStructure(item)) return completed else: # Primitive value - return as is return structure @staticmethod def getContentTypeForFragment(fragment_type: str) -> str: """Map fragment type to content type.""" mapping = { "table_rows": "table", "table_element": "table", "code_lines": "code_block", "code_element": "code_block", "list_items": "bullet_list" } return mapping.get(fragment_type, "paragraph") @staticmethod def deepCompare(obj1: Any, obj2: Any, max_depth: int = 10) -> bool: """ Deep recursive comparison of two JSON-serializable objects. Handles nested structures of any depth and complexity. Args: obj1: First object to compare obj2: Second object to compare max_depth: Maximum recursion depth to prevent infinite loops Returns: True if objects are deeply equal, False otherwise """ if max_depth <= 0: return False # Type check if type(obj1) != type(obj2): return False # Primitive types if isinstance(obj1, (str, int, float, bool, type(None))): return obj1 == obj2 # Lists/arrays - compare element by element if isinstance(obj1, list): if len(obj1) != len(obj2): return False return all(JsonResponseHandler.deepCompare(item1, item2, max_depth - 1) for item1, item2 in zip(obj1, obj2)) # Dicts/objects - compare key by key if isinstance(obj1, dict): if set(obj1.keys()) != set(obj2.keys()): return False return all(JsonResponseHandler.deepCompare(obj1[key], obj2[key], max_depth - 1) for key in obj1.keys()) # Fallback for other types return obj1 == obj2 @staticmethod def findLongestCommonSuffix( existing_list: List[Any], new_list: List[Any], min_overlap: int = 1 ) -> int: """ Find the longest common suffix of existing_list that matches a prefix of new_list. This handles cases where multiple elements overlap: - existing: [A, B, C, D] - new: [C, D, E, F] - overlap: [C, D] (length 2) Returns the length of the overlap (0 if no overlap found). """ if not existing_list or not new_list: return 0 max_overlap = min(len(existing_list), len(new_list)) # Try all possible overlap lengths (from longest to shortest) for overlap_len in range(max_overlap, min_overlap - 1, -1): existing_suffix = existing_list[-overlap_len:] new_prefix = new_list[:overlap_len] # Deep compare suffix and prefix if all(JsonResponseHandler.deepCompare(existing_suffix[i], new_prefix[i]) for i in range(overlap_len)): return overlap_len return 0 @staticmethod def findPartialOverlap( existing_item: Any, new_item: Any ) -> Tuple[bool, Optional[Any]]: """ Detect if new_item completes an incomplete existing_item. Handles cases like: - existing: ["37643", "37649", "37657", "37663", "37691", "37693", "37699", "37717", "37747", "376"] - new: ["37643", "37649", ...] Returns (is_partial_overlap, merged_item) if partial overlap detected, else (False, None). """ # Check if both are lists if isinstance(existing_item, list) and isinstance(new_item, list): if not existing_item or not new_item: return False, None # Check if last element of existing is incomplete and matches first of new last_existing = existing_item[-1] first_new = new_item[0] # If last existing is a string and first new is a string if isinstance(last_existing, str) and isinstance(first_new, str): # Check if last existing is incomplete (very short, ends with number, etc.) if len(last_existing) < 10 and first_new.startswith(last_existing): # Partial overlap - merge them merged_last = last_existing + first_new[len(last_existing):] merged_item = existing_item[:-1] + [merged_last] + new_item[1:] return True, merged_item # Check if last existing is incomplete list and first new completes it if isinstance(last_existing, list) and isinstance(first_new, list): if len(last_existing) < len(first_new): # Check if last existing is prefix of first new if first_new[:len(last_existing)] == last_existing: # Merge: replace incomplete last with complete first merged_item = existing_item[:-1] + [first_new] + new_item[1:] return True, merged_item # Check if existing is incomplete string and new completes it if isinstance(existing_item, str) and isinstance(new_item, str): if len(existing_item) < 50 and new_item.startswith(existing_item): # Partial overlap merged = existing_item + new_item[len(existing_item):] return True, merged return False, None @staticmethod def mergeRowsWithOverlap( existing_rows: List[List[str]], new_rows: List[List[str]], iteration: int ) -> List[List[str]]: """ Merge table rows with sophisticated overlap detection. Handles multiple overlapping rows and partial overlaps. """ if not new_rows: return existing_rows if not existing_rows: return new_rows # Strategy 1: Find longest common suffix/prefix overlap overlap_len = JsonResponseHandler.findLongestCommonSuffix(existing_rows, new_rows, min_overlap=1) if overlap_len > 0: logger.debug(f"Iteration {iteration}: Found {overlap_len} overlapping table rows, removing duplicates") return existing_rows + new_rows[overlap_len:] # Strategy 2: Check for partial overlap in last row if len(existing_rows) > 0 and len(new_rows) > 0: last_existing = existing_rows[-1] first_new = new_rows[0] is_partial, merged_row = JsonResponseHandler.findPartialOverlap(last_existing, first_new) if is_partial: logger.debug(f"Iteration {iteration}: Found partial overlap in table rows, merging") return existing_rows[:-1] + [merged_row] + new_rows[1:] # Strategy 3: Simple first/last comparison (fallback) if isinstance(existing_rows[-1], list) and isinstance(new_rows[0], list): if list(existing_rows[-1]) == list(new_rows[0]): logger.debug(f"Iteration {iteration}: Removed duplicate table row (exact match)") return existing_rows + new_rows[1:] # No overlap detected - append all new rows return existing_rows + new_rows @staticmethod def mergeItemsWithOverlap( existing_items: List[str], new_items: List[str], iteration: int ) -> List[str]: """ Merge list items with sophisticated overlap detection. Handles multiple overlapping items and partial overlaps. """ if not new_items: return existing_items if not existing_items: return new_items # Strategy 1: Find longest common suffix/prefix overlap overlap_len = JsonResponseHandler.findLongestCommonSuffix(existing_items, new_items, min_overlap=1) if overlap_len > 0: logger.debug(f"Iteration {iteration}: Found {overlap_len} overlapping list items, removing duplicates") return existing_items + new_items[overlap_len:] # Strategy 2: Check for partial overlap in last item if len(existing_items) > 0 and len(new_items) > 0: is_partial, merged_item = JsonResponseHandler.findPartialOverlap(existing_items[-1], new_items[0]) if is_partial: logger.debug(f"Iteration {iteration}: Found partial overlap in list items, merging") return existing_items[:-1] + [merged_item] + new_items[1:] # Strategy 3: Simple first/last comparison (fallback) if existing_items[-1] == new_items[0]: logger.debug(f"Iteration {iteration}: Removed duplicate list item (exact match)") return existing_items + new_items[1:] # No overlap detected - append all new items return existing_items + new_items @staticmethod def mergeDeepStructures( existing: Any, new: Any, iteration: int, path: str = "root" ) -> Any: """ FULLY GENERIC recursive merge for ANY JSON structure of arbitrary depth/complexity. Handles ALL cases generically: 1. Arrays/Lists: Overlap detection (suffix/prefix), partial overlap, no overlap (continuation) 2. Objects/Dicts: Key-by-key merge with overlap detection for nested structures 3. Primitives: Equality check, replacement if different 4. Nested structures: Recursively handles any depth/complexity Overlap detection strategies (all generic): - Array overlap: Finds longest common suffix/prefix, handles partial overlaps - Object overlap: Detected recursively through key matching and deep comparison - No overlap: Appends/merges continuation content after cut-off point CRITICAL: Fully generic - no specific logic for content types. Works for ANY JSON structure: arrays, objects, nested, primitives, any combination. """ # Type check if type(existing) != type(new): # Types don't match - return new (replacement) logger.debug(f"Iteration {iteration}: Types don't match at {path} ({type(existing).__name__} vs {type(new).__name__}), replacing") return new # Lists/arrays - GENERIC merge with overlap detection if isinstance(existing, list) and isinstance(new, list): if not new: return existing if not existing: return new # Strategy 1: Find longest common suffix/prefix overlap (handles multiple overlapping elements) overlap_len = JsonResponseHandler.findLongestCommonSuffix(existing, new, min_overlap=1) if overlap_len > 0: logger.debug(f"Iteration {iteration}: Found {overlap_len} overlapping elements at {path}, removing duplicates") return existing + new[overlap_len:] # Strategy 2: Check for partial overlap in last element (incomplete element completion) if len(existing) > 0 and len(new) > 0: is_partial, merged_item = JsonResponseHandler.findPartialOverlap(existing[-1], new[0]) if is_partial: logger.debug(f"Iteration {iteration}: Found partial overlap at {path}, merging incomplete element") return existing[:-1] + [merged_item] + new[1:] # Strategy 3: No overlap detected - continuation after cut-off point # This handles the case where new data starts exactly after the cut-off logger.debug(f"Iteration {iteration}: No overlap at {path}, appending continuation content ({len(new)} items)") return existing + new # Dicts/objects - GENERIC merge with recursive overlap detection if isinstance(existing, dict) and isinstance(new, dict): merged = existing.copy() # Check for object-level overlap: if new object is subset/superset of existing # This handles cases where same object structure appears in both existing_keys = set(existing.keys()) new_keys = set(new.keys()) # If new is subset of existing and values match, it's overlap (skip) if new_keys.issubset(existing_keys): all_match = True for key in new_keys: if not JsonResponseHandler.deepCompare(existing[key], new[key]): all_match = False break if all_match: logger.debug(f"Iteration {iteration}: Object at {path} is subset overlap, skipping") return existing # Merge key-by-key with recursive overlap detection for key, new_value in new.items(): if key in merged: # Key exists - merge recursively (handles nested overlap detection) merged[key] = JsonResponseHandler.mergeDeepStructures( merged[key], new_value, iteration, f"{path}.{key}" ) else: # New key - add it (continuation content) merged[key] = new_value logger.debug(f"Iteration {iteration}: Added new key '{key}' at {path} (continuation)") return merged # Primitives - equality check if existing == new: return existing # Different primitive values - return new (continuation/replacement) logger.debug(f"Iteration {iteration}: Primitive at {path} differs, using new value") return new @staticmethod def _mergeFragmentIntoElement( last_element: Dict[str, Any], fragment_data: Any, target_section: Dict[str, Any], iteration: int, path: str ) -> Dict[str, Any]: """ GENERIC fragment merging for ALL structure types. Automatically detects the structure type and merges fragments accordingly. Works for: tables, lists, code blocks, paragraphs, images, and any nested structures. Strategy: 1. Analyze last_element structure to determine content location (content.rows, content.items, etc.) 2. Detect fragment type (array, object, primitive) 3. Merge fragment into appropriate location using mergeDeepStructures Args: last_element: The existing element to merge into fragment_data: The fragment data to merge (can be any JSON structure) target_section: The target section (for content_type detection) iteration: Current iteration number path: Path for logging Returns: Merged element """ contentType = target_section.get("content_type", "") elementType = last_element.get("type", "") # Determine the content structure path based on element type and content type # This handles both nested (content.rows) and flat (rows) structures contentPath = None fragmentIsArray = isinstance(fragment_data, list) and len(fragment_data) > 0 # Detect structure type and determine merge path if contentType == "table" or elementType == "table": # Tables: merge into content.rows or rows if "content" in last_element and isinstance(last_element["content"], dict): contentPath = "content.rows" else: contentPath = "rows" elif contentType in ["bullet_list", "numbered_list", "list"] or elementType in ["bullet_list", "numbered_list", "list"]: # Lists: merge into content.items or items if "content" in last_element and isinstance(last_element["content"], dict): contentPath = "content.items" else: contentPath = "items" elif contentType == "code_block" or elementType == "code_block": # Code blocks: merge into content.code or code if "content" in last_element and isinstance(last_element["content"], dict): contentPath = "content.code" else: contentPath = "code" elif contentType in ["paragraph", "heading"] or elementType in ["paragraph", "heading"]: # Text: merge into content.text or text if "content" in last_element and isinstance(last_element["content"], dict): contentPath = "content.text" else: contentPath = "text" elif contentType == "image" or elementType == "image": # Images: merge into base64Data contentPath = "base64Data" # If we have a specific content path, merge into that location if contentPath: # Split path (e.g., "content.rows" -> ["content", "rows"]) pathParts = contentPath.split(".") # Ensure nested structure exists current = last_element for i, part in enumerate(pathParts[:-1]): if part not in current: current[part] = {} elif not isinstance(current[part], dict): current[part] = {} current = current[part] # Get existing content at target path targetKey = pathParts[-1] existingContent = current.get(targetKey, []) # Merge fragment into existing content # CRITICAL: Handle both array fragments and object fragments generically if fragmentIsArray: # Fragment is an array - merge arrays if isinstance(existingContent, list): # Check if fragment is array of arrays (e.g., table rows) or array of primitives if len(fragment_data) > 0 and isinstance(fragment_data[0], list): # Array of arrays - use rows merge for tables, generic merge for others if contentPath.endswith(".rows"): mergedContent = JsonResponseHandler.mergeRowsWithOverlap(existingContent, fragment_data, iteration) else: # Generic array-of-arrays merge mergedContent = JsonResponseHandler.mergeDeepStructures( existingContent, fragment_data, iteration, f"{path}.{targetKey}" ) else: # Array of primitives - use items merge for lists, generic merge for others if contentPath.endswith(".items"): mergedContent = JsonResponseHandler.mergeItemsWithOverlap(existingContent, fragment_data, iteration) else: # Generic array merge using mergeDeepStructures mergedContent = JsonResponseHandler.mergeDeepStructures( existingContent, fragment_data, iteration, f"{path}.{targetKey}" ) else: # Existing content is not a list - replace with fragment mergedContent = fragment_data elif isinstance(fragment_data, dict): # Fragment is an object - check if it contains nested content (e.g., {"content": {"rows": [...]}}) # If fragment has same structure as target, merge nested content if "content" in fragment_data and isinstance(fragment_data["content"], dict): fragmentNested = fragment_data["content"] # Check if fragment has the same key as our target (e.g., fragment.content.rows) if targetKey in fragmentNested: # Fragment has nested content matching our target - merge that content fragmentNestedContent = fragmentNested[targetKey] if isinstance(existingContent, list) and isinstance(fragmentNestedContent, list): # Both are lists - merge them if contentPath.endswith(".rows"): mergedContent = JsonResponseHandler.mergeRowsWithOverlap(existingContent, fragmentNestedContent, iteration) elif contentPath.endswith(".items"): mergedContent = JsonResponseHandler.mergeItemsWithOverlap(existingContent, fragmentNestedContent, iteration) else: mergedContent = JsonResponseHandler.mergeDeepStructures( existingContent, fragmentNestedContent, iteration, f"{path}.{targetKey}" ) else: # Use deep merge for nested content mergedContent = JsonResponseHandler.mergeDeepStructures( existingContent if existingContent else {}, fragmentNestedContent, iteration, f"{path}.{targetKey}" ) else: # Fragment has different structure - merge entire fragment object mergedContent = JsonResponseHandler.mergeDeepStructures( existingContent if existingContent else {}, fragment_data, iteration, f"{path}.{targetKey}" ) else: # Fragment is a simple object - use deep merge mergedContent = JsonResponseHandler.mergeDeepStructures( existingContent if existingContent else {}, fragment_data, iteration, f"{path}.{targetKey}" ) else: # Fragment is a primitive or unknown type - use deep merge mergedContent = JsonResponseHandler.mergeDeepStructures( existingContent if existingContent else {}, fragment_data, iteration, f"{path}.{targetKey}" ) # Update the merged content current[targetKey] = mergedContent # Ensure type is set if elementType and "type" not in last_element: last_element["type"] = elementType elif contentType and "type" not in last_element: last_element["type"] = contentType logger.info(f"Iteration {iteration}: ✅ Merged fragment into {contentPath} for section '{target_section.get('id')}'") return last_element # No specific content path - use generic deep merge # This handles any structure type generically merged_element = JsonResponseHandler.mergeDeepStructures( last_element, fragment_data, iteration, path ) logger.info(f"Iteration {iteration}: ✅ Merged GENERIC fragment (type: {type(fragment_data).__name__}) into section '{target_section.get('id')}'") return merged_element @staticmethod def cleanEncodingIssues(jsonString: str) -> str: """ GENERIC function to remove problematic encoding parts from JSON string. Works for ANY JSON structure - removes problematic characters/bytes. Args: jsonString: JSON string that may have encoding issues Returns: Cleaned JSON string """ try: # Try to decode/encode to detect issues jsonString.encode('utf-8').decode('utf-8') return jsonString except UnicodeError: # Remove problematic parts cleaned = jsonString.encode('utf-8', errors='ignore').decode('utf-8', errors='ignore') logger.warning("Removed encoding issues from JSON string") return cleaned @staticmethod def mergeJsonStringsWithOverlap( accumulated: str, newFragment: str ) -> Tuple[str, bool]: """ Merge JSON fragments intelligently using modular parser. Uses the new ModularJsonMerger for clean, robust merging. Falls back to legacy code only if new merger fails completely. Args: accumulated: Previously accumulated JSON string (may be incomplete/fragmented) newFragment: New fragment string to append (may be incomplete/fragmented) Returns: Tuple of (merged_json_string, has_overlap): - merged_json_string: Combined JSON string with fragments properly merged - has_overlap: True if overlap was found (iterations should continue), False if no overlap (iterations should stop) """ if not accumulated: result = newFragment if newFragment else "{}" return (result, False) # No overlap if no accumulated data if not newFragment: return (accumulated, False) # No overlap if no new fragment # Use new modular merger try: from .subJsonMerger import ModularJsonMerger result, hasOverlap = ModularJsonMerger.merge(accumulated, newFragment) # IMPORTANT: ModularJsonMerger returns unclosed JSON if overlap found (with incomplete element at end) # If no overlap, returns closed JSON (iterations should stop) if result and result.strip() and result.strip() != "{}": # Return result with overlap flag return (result, hasOverlap) except Exception as e: logger.debug(f"Modular merger failed, using fallback: {e}") # Fallback to legacy merger (simplified) from modules.shared.jsonUtils import normalizeJsonText, stripCodeFences, closeJsonStructures, tryParseJson accumulatedExtracted = stripCodeFences(normalizeJsonText(accumulated)).strip() newFragmentExtracted = stripCodeFences(normalizeJsonText(newFragment)).strip() # Try simple string merge with repair try: # Close structures accClosed = closeJsonStructures(accumulatedExtracted) if accumulatedExtracted else "{}" fragClosed = closeJsonStructures(newFragmentExtracted) if newFragmentExtracted else "{}" # Try to parse both accParsed, accErr, _ = tryParseJson(accClosed) fragParsed, fragErr, _ = tryParseJson(fragClosed) # If both parse, merge structurally if accErr is None and fragErr is None: merged = JsonResponseHandler._mergeParsedJson(accParsed, fragParsed) if merged: result = json.dumps(merged, indent=2, ensure_ascii=False) return (result, False) # No overlap in fallback - close and stop # If only accumulated parses, return it if accErr is None and accParsed: result = json.dumps(accParsed, indent=2, ensure_ascii=False) return (result, False) # No overlap - close and stop except Exception: pass # Last resort: return accumulated (at least we have that) - close it if accumulatedExtracted: try: closed = closeJsonStructures(accumulatedExtracted) return (closed, False) # No overlap - close and stop except Exception: return (accumulatedExtracted, False) # No overlap - return as-is result = accumulated if accumulated else "{}" return (result, False) # No overlap - return as-is @staticmethod def _mergeParsedJson(accParsed: Any, fragParsed: Any) -> Optional[Dict[str, Any]]: """Simple merge of two parsed JSON objects.""" if isinstance(accParsed, dict) and isinstance(fragParsed, dict): # Merge dicts merged = accParsed.copy() # Merge elements if both have them if "elements" in accParsed and "elements" in fragParsed: accElements = accParsed.get("elements", []) fragElements = fragParsed.get("elements", []) # Simple merge - append new elements merged["elements"] = accElements + fragElements elif "elements" in fragParsed: merged["elements"] = fragParsed["elements"] # Merge other keys for key, value in fragParsed.items(): if key != "elements": if key in merged and isinstance(merged[key], list) and isinstance(value, list): merged[key] = merged[key] + value else: merged[key] = value return merged return None @staticmethod def _normalizeToElementsStructure( jsonString: str, originalString: str ) -> Optional[Dict[str, Any]]: """ Normalize any JSON structure (Dict, List, None, or parse error) to {"elements": [...]} format. Handles: - Dict with "elements" → return as-is - Dict without "elements" but with "type" → wrap in elements array - List → wrap in elements structure - Parse error → try repairBrokenJson - None → return None Args: jsonString: Extracted JSON string originalString: Original string (for context) Returns: Normalized Dict with "elements" array, or None if normalization fails """ if not jsonString: return None from modules.shared.jsonUtils import tryParseJson, repairBrokenJson, closeJsonStructures # Try to parse directly first try: parsed = json.loads(jsonString) parseErr = None except Exception as e: parseErr = e parsed = None # If parsing failed, try closing structures first (for incomplete fragments) if parseErr is not None: try: closed = closeJsonStructures(jsonString) parsed = json.loads(closed) parseErr = None except Exception: pass # If still failed, try repairBrokenJson ONLY if it looks like document structure # For other structures (like section_content), use fragment detection instead if parseErr is not None: # Check if this looks like a document structure (has "documents" or "sections") isDocumentStructure = '"documents"' in jsonString or '"sections"' in jsonString if isDocumentStructure: # Use repairBrokenJson for document structures repaired = repairBrokenJson(jsonString) if repaired: parsed = repaired parseErr = None else: # Still can't parse - try to detect fragment structure return JsonResponseHandler._detectAndNormalizeFragment(jsonString, originalString) else: # For non-document structures, skip repairBrokenJson and go straight to fragment detection # repairBrokenJson tries to extract "sections" which doesn't work for other structures return JsonResponseHandler._detectAndNormalizeFragment(jsonString, originalString) # Normalize based on type if parsed is None: return None elif isinstance(parsed, dict): # Already a dict if "elements" in parsed: return parsed elif "type" in parsed: # Single element - wrap in elements array return {"elements": [parsed]} else: # Unknown dict structure - try to extract elements return JsonResponseHandler._extractElementsFromDict(parsed) elif isinstance(parsed, list): # List - check if it's a list of elements or a fragment if parsed and isinstance(parsed[0], dict) and "type" in parsed[0]: # List of elements return {"elements": parsed} else: # Fragment list (e.g., array of rows) - detect structure return JsonResponseHandler._detectAndNormalizeFragment(jsonString, originalString) else: # Primitive type - can't normalize return None @staticmethod def _detectAndNormalizeFragment( jsonString: str, originalString: str ) -> Optional[Dict[str, Any]]: """ Detect fragment structure and normalize it. Fragments can be: - Array of arrays (table rows): `[["row1"], ["row2"]]` or `["1947", "16883"], ["1948", "16889"]` - Array of strings (list items): `["item1", "item2"]` - Incomplete structure: `["item1", "item2", ` (ends with comma) - Partial object: `{"type": "table", "content": {"rows": [["1947"...` (cut mid-string) Returns normalized structure or None if detection fails. """ jsonStripped = jsonString.strip() # Strategy 1: Check if it's an array fragment if jsonStripped.startswith('['): # Try to parse as array from modules.shared.jsonUtils import tryParseJson, closeJsonStructures # Close incomplete structures closed = closeJsonStructures(jsonStripped) parsed, parseErr, _ = tryParseJson(closed) if parseErr is None and isinstance(parsed, list): # Check structure: array of arrays (table rows) or array of strings (list items) if parsed and isinstance(parsed[0], list): # Array of arrays - likely table rows fragment return { "elements": [{ "type": "table", "content": { "rows": parsed } }] } elif parsed and isinstance(parsed[0], str): # Array of strings - likely list items fragment return { "elements": [{ "type": "bullet_list", "content": { "items": parsed } }] } elif parseErr is not None: # Can't parse - try regex extraction for table rows rows = JsonResponseHandler._extractRowsFromFragment(jsonStripped) if rows: return { "elements": [{ "type": "table", "content": { "rows": rows } }] } # Strategy 2: Check if it's a partial object (cut mid-structure) # Look for patterns like: {"elements": [...] or {"type": "table"... if jsonStripped.startswith('{'): from modules.shared.jsonUtils import tryParseJson, closeJsonStructures # Try to close and parse closed = closeJsonStructures(jsonStripped) parsed, parseErr, _ = tryParseJson(closed) if parseErr is None and isinstance(parsed, dict): # Successfully parsed - normalize it return JsonResponseHandler._normalizeToElementsStructure(closed, originalString) elif parseErr is not None: # Can't parse - try to extract table rows from the raw string # This handles cases like: {"elements": [{"type": "table", "content": {"rows": [["1947"... rows = JsonResponseHandler._extractRowsFromFragment(jsonStripped) if rows: return { "elements": [{ "type": "table", "content": { "rows": rows } }] } # Try to extract any array patterns that might be table rows # Look for patterns like: ["1947", "10000"], ["1948", "10100"] import re # Pattern: ["value1", "value2"], ["value3", "value4"] rowPattern = r'\["([^"]*)",\s*"([^"]*)"\]' matches = re.findall(rowPattern, jsonStripped) if matches and len(matches) >= 2: # Found multiple row patterns - likely table rows rows = [[match[0], match[1]] for match in matches] return { "elements": [{ "type": "table", "content": { "rows": rows } }] } # Strategy 3: Try to extract rows from any text (even if not starting with [ or {) rows = JsonResponseHandler._extractRowsFromFragment(jsonStripped) if rows: return { "elements": [{ "type": "table", "content": { "rows": rows } }] } return None @staticmethod def _extractElementsFromDict(d: Dict[str, Any]) -> Dict[str, Any]: """ Try to extract elements from unknown dict structure. Returns normalized structure or empty elements array. """ # Check common patterns if "sections" in d: # Document structure with sections sections = d.get("sections", []) elements = [] for section in sections: if isinstance(section, dict) and "elements" in section: elements.extend(section.get("elements", [])) return {"elements": elements} # Unknown structure - return empty return {"elements": []} @staticmethod def _mergeJsonStructuresGeneric( accumulatedObj: Dict[str, Any], newFragmentObj: Dict[str, Any], accumulatedRaw: str, newFragmentRaw: str, overlapElements: Optional[List[Dict[str, Any]]] = None ) -> Optional[Dict[str, Any]]: """ GENERIC merge of two JSON structures, handling overlaps and missing parts. Strategy: 1. Extract elements from both structures (both are normalized to {"elements": [...]}) 2. Use overlap elements if provided to identify merge point 3. Detect if both have same structure (same content type) 4. Group elements by type 5. Merge elements of same type using content-type-specific logic with overlap detection 6. Handle overlaps and missing parts intelligently Args: accumulatedObj: Normalized accumulated JSON object (guaranteed to have "elements") newFragmentObj: Normalized new fragment JSON object (guaranteed to have "elements") accumulatedRaw: Raw accumulated string (for fragment detection) newFragmentRaw: Raw new fragment string (for fragment detection) overlapElements: Optional list of overlap elements from continuation response Returns: Merged JSON object or None if merging fails """ try: # Step 1: Extract elements (both are normalized, so this should always work) accumulatedElements = accumulatedObj.get("elements", []) if isinstance(accumulatedObj, dict) else [] newFragmentElements = newFragmentObj.get("elements", []) if isinstance(newFragmentObj, dict) else [] if not accumulatedElements and not newFragmentElements: # No elements found - try to extract from raw strings # Try to extract any valid JSON structure from raw strings from modules.shared.jsonUtils import tryParseJson, closeJsonStructures # Try accumulated first if accumulatedRaw: try: closedAccumulated = closeJsonStructures(accumulatedRaw) parsed, parseErr, _ = tryParseJson(closedAccumulated) if parseErr is None and parsed: normalized = JsonResponseHandler._normalizeToElementsStructure(closedAccumulated, accumulatedRaw) if normalized: return normalized except Exception: pass # Try new fragment if newFragmentRaw: try: closedFragment = closeJsonStructures(newFragmentRaw) parsed, parseErr, _ = tryParseJson(closedFragment) if parseErr is None and parsed: normalized = JsonResponseHandler._normalizeToElementsStructure(closedFragment, newFragmentRaw) if normalized: return normalized except Exception: pass # If still nothing, return empty structure (never None) return {"elements": []} # Step 2: Use overlap elements to identify merge point # If overlap elements are provided, use them to find where to merge if overlapElements and isinstance(overlapElements, list) and len(overlapElements) > 0: # Find overlap in accumulated elements overlapStartIndex = JsonResponseHandler._findOverlapStartIndex(accumulatedElements, overlapElements) if overlapStartIndex >= 0: # Remove overlapping elements from accumulated (they'll be replaced by continuation) accumulatedElements = accumulatedElements[:overlapStartIndex] logger.debug(f"Found overlap at index {overlapStartIndex}, removed {len(accumulatedElements) - overlapStartIndex} overlapping elements") # Step 3: Detect if newFragment is a continuation fragment # Check if newFragment starts with array elements (fragment, not full JSON) isFragment = JsonResponseHandler._isFragment(newFragmentRaw, newFragmentElements) # Step 4: Group elements by type for intelligent merging accumulatedByType = {} for elem in accumulatedElements: if isinstance(elem, dict): elemType = elem.get("type", "unknown") if elemType not in accumulatedByType: accumulatedByType[elemType] = [] accumulatedByType[elemType].append(elem) newFragmentByType = {} for elem in newFragmentElements: if isinstance(elem, dict): elemType = elem.get("type", "unknown") if elemType not in newFragmentByType: newFragmentByType[elemType] = [] newFragmentByType[elemType].append(elem) # Step 5: Merge elements intelligently mergedElements = [] allTypes = set(accumulatedByType.keys()) | set(newFragmentByType.keys()) for elemType in allTypes: accElems = accumulatedByType.get(elemType, []) fragElems = newFragmentByType.get(elemType, []) if not accElems: # Only in fragment - add all mergedElements.extend(fragElems) elif not fragElems: # Only in accumulated - add all mergedElements.extend(accElems) else: # Both have elements of this type - merge them using content-type-specific logic mergedElem = JsonResponseHandler._mergeElementsOfSameTypeGeneric( accElems[0], fragElems[0], elemType, accumulatedRaw, newFragmentRaw, isFragment ) if mergedElem: mergedElements.append(mergedElem) # Step 6: Reconstruct base structure if mergedElements: return {"elements": mergedElements} else: # No merged elements - return accumulated if available (NEVER return None) if accumulatedElements: return {"elements": accumulatedElements} # If no accumulated, return new fragment if available if newFragmentElements: return {"elements": newFragmentElements} # Last resort: return empty structure (never None) return {"elements": []} except Exception as e: logger.debug(f"Structure-based merge failed: {e}") import traceback logger.debug(traceback.format_exc()) return None @staticmethod def _isFragment(jsonString: str, elements: List[Dict[str, Any]]) -> bool: """ Detect if JSON string is a fragment (not a complete JSON object). Fragments: - Start with `[` but not `[{"` (array fragment, not full elements array) - Start with array elements like `["cell1", "cell2"],` (table rows fragment) - Don't have full structure (missing outer object with "elements") - Are continuations of previous structure """ jsonStripped = jsonString.strip() # Check if it starts with array (fragment) if jsonStripped.startswith('['): # Check if it's a full elements array `[{"type": ...}]` or a fragment `["cell1", "cell2"]` if jsonStripped.startswith('[{"') or jsonStripped.startswith('[{'): # Could be full structure - check if it has "type" field if elements and isinstance(elements[0], dict) and "type" in elements[0]: return False # Full structure # Otherwise it's a fragment (array of primitives or incomplete) return True # Check if it starts with object but missing "elements" wrapper if jsonStripped.startswith('{'): # Check if it has "elements" field if '"elements"' not in jsonStripped[:200]: # Check first 200 chars # Might be a single element fragment return True # Check if elements are incomplete (no full structure) if elements and isinstance(elements[0], dict): # Check if first element is missing required fields firstElem = elements[0] if "type" not in firstElem and "content" not in firstElem: return True return False @staticmethod def _mergeElementsOfSameTypeGeneric( accumulatedElem: Dict[str, Any], newFragmentElem: Dict[str, Any], elemType: str, accumulatedRaw: str, newFragmentRaw: str, isFragment: bool ) -> Optional[Dict[str, Any]]: """ GENERIC merge of two elements of the same type, with content-type-specific optimizations. Content-type-specific merging: - table: Merge rows arrays with overlap detection - paragraph: Merge text content - code_block: Merge code strings - bullet_list/numbered_list: Merge items arrays - heading: Use new fragment (usually complete) - image: Use new fragment (usually complete) - Other: Generic deep merge Args: accumulatedElem: Accumulated element newFragmentElem: New fragment element elemType: Content type (table, paragraph, etc.) accumulatedRaw: Raw accumulated string newFragmentRaw: Raw new fragment string isFragment: Whether newFragment is a fragment (continuation) Returns: Merged element or None if merging fails """ if elemType == "table": return JsonResponseHandler._mergeTableElementsGeneric( accumulatedElem, newFragmentElem, accumulatedRaw, newFragmentRaw, isFragment ) elif elemType == "paragraph": return JsonResponseHandler._mergeParagraphElements( accumulatedElem, newFragmentElem, isFragment ) elif elemType == "code_block": return JsonResponseHandler._mergeCodeBlockElements( accumulatedElem, newFragmentElem, isFragment ) elif elemType in ["bullet_list", "numbered_list"]: return JsonResponseHandler._mergeListElements( accumulatedElem, newFragmentElem, isFragment ) elif elemType in ["heading", "image"]: # Usually complete - use new fragment if it exists, otherwise accumulated return newFragmentElem if newFragmentElem else accumulatedElem else: # Generic merge: use mergeDeepStructures return JsonResponseHandler.mergeDeepStructures( accumulatedElem, newFragmentElem, 0, f"element_merge.{elemType}" ) @staticmethod def _mergeTableElementsGeneric( accumulatedElem: Dict[str, Any], newFragmentElem: Dict[str, Any], accumulatedRaw: str, newFragmentRaw: str, isFragment: bool ) -> Dict[str, Any]: """ GENERIC merge of two table elements with content-type-specific optimizations. Handles: - Overlapping rows (detect duplicates by comparing row content) - Missing headers (complete with existing headers) - Incomplete rows (complete with null values if needed) - Fragment rows (if newFragment is a fragment, extract rows from raw string) Args: accumulatedElem: Accumulated table element newFragmentElem: New fragment table element accumulatedRaw: Raw accumulated string (for fragment detection) newFragmentRaw: Raw new fragment string (for fragment extraction) isFragment: Whether newFragment is a fragment Returns: Merged table element """ # Extract content (handle both nested and flat structures) accContent = accumulatedElem.get("content", {}) if not accContent and "rows" in accumulatedElem: accContent = accumulatedElem fragContent = newFragmentElem.get("content", {}) if not fragContent and "rows" in newFragmentElem: fragContent = newFragmentElem # Extract rows accRows = accContent.get("rows", []) if isinstance(accContent, dict) else [] # If fragment, try to extract rows from raw string fragRows = fragContent.get("rows", []) if isinstance(fragContent, dict) else [] if isFragment and not fragRows: fragRows = JsonResponseHandler._extractRowsFromFragment(newFragmentRaw) # Extract headers (complete missing with existing) accHeaders = accContent.get("headers", []) if isinstance(accContent, dict) else [] fragHeaders = fragContent.get("headers", []) if isinstance(fragContent, dict) else [] mergedHeaders = accHeaders if accHeaders else fragHeaders # Merge rows with overlap detection mergedRows = JsonResponseHandler._mergeRowsWithOverlapDetection(accRows, fragRows) # Reconstruct table element mergedContent = { "headers": mergedHeaders, "rows": mergedRows } # Preserve other fields (caption, etc.) if isinstance(accContent, dict) and "caption" in accContent: mergedContent["caption"] = accContent["caption"] elif isinstance(fragContent, dict) and "caption" in fragContent: mergedContent["caption"] = fragContent["caption"] return { "type": "table", "content": mergedContent } @staticmethod def _extractRowsFromFragment(fragmentRaw: str) -> List[List[str]]: """ Extract table rows from fragment string. Handles fragments like: - `["1947", "16883"], ["1948", "16889"], ...` - `"rows": [["1947", "10000"], ["1948", "10100"]...` - Incomplete fragments cut mid-string Also handles fragments with more than 2 columns. """ import re rows = [] # Pattern 1: Array of arrays with 2 columns `["cell1", "cell2"], ["cell3", "cell4"]` # This pattern matches complete arrays: ["value1", "value2"] pattern2Col = r'\["([^"]*)",\s*"([^"]*)"\]' matches2Col = re.findall(pattern2Col, fragmentRaw) if matches2Col and len(matches2Col) >= 2: # Need at least 2 rows to be confident for match in matches2Col: if len(match) == 2: rows.append([match[0], match[1]]) if rows: return rows # Pattern 2: Array of arrays with variable columns (more robust) # Find all array patterns: ["...", "...", ...] # Use non-greedy matching but ensure we get complete arrays arrayPattern = r'\[(.*?)\]' arrayMatches = re.findall(arrayPattern, fragmentRaw) # Filter to only arrays that look like table rows (have multiple quoted values) validArrays = [] for arrayContent in arrayMatches: # Extract quoted strings from array content cellPattern = r'"([^"]*)"' cells = re.findall(cellPattern, arrayContent) # Only consider arrays with 2+ cells (likely table rows) if len(cells) >= 2: validArrays.append(cells) if validArrays and len(validArrays) >= 2: # Need at least 2 rows return validArrays # Pattern 3: Look for "rows": [...] pattern in incomplete JSON # This handles cases like: "rows": [["1947", "10000"], ["1948", "10100"]... rowsPattern = r'"rows"\s*:\s*\[(.*?)(?:\]|$)' rowsMatch = re.search(rowsPattern, fragmentRaw, re.DOTALL) if rowsMatch: rowsContent = rowsMatch.group(1) # Extract all array patterns from rows content arrayPattern = r'\[(.*?)\]' arrayMatches = re.findall(arrayPattern, rowsContent) for arrayContent in arrayMatches: cellPattern = r'"([^"]*)"' cells = re.findall(cellPattern, arrayContent) if len(cells) >= 2: # At least 2 columns rows.append(cells) if rows: return rows # Pattern 4: Try to parse as JSON array (handles complete arrays) from modules.shared.jsonUtils import tryParseJson, closeJsonStructures # Try to close incomplete structures closed = closeJsonStructures(fragmentRaw.strip()) parsed, parseErr, _ = tryParseJson(closed) if parseErr is None and isinstance(parsed, list): if parsed and isinstance(parsed[0], list): # Array of arrays - table rows return parsed elif parsed and isinstance(parsed[0], str): # Array of strings - might be single column table return [[item] for item in parsed] # Pattern 5: Last resort - extract any array patterns we can find # Even if incomplete, try to extract what we can if not rows: # Find all patterns like ["value1", "value2"] even if incomplete # Use a more lenient pattern that handles incomplete strings incompletePattern = r'\["([^"]*)"(?:,\s*"([^"]*)")?' incompleteMatches = re.findall(incompletePattern, fragmentRaw) for match in incompleteMatches: if match[0]: # First value exists if match[1]: # Second value exists rows.append([match[0], match[1]]) else: # Only one value - might be incomplete, skip for now pass return rows @staticmethod def _mergeParagraphElements( accumulatedElem: Dict[str, Any], newFragmentElem: Dict[str, Any], isFragment: bool ) -> Dict[str, Any]: """Merge two paragraph elements.""" accContent = accumulatedElem.get("content", {}) fragContent = newFragmentElem.get("content", {}) accText = accContent.get("text", "") if isinstance(accContent, dict) else "" fragText = fragContent.get("text", "") if isinstance(fragContent, dict) else "" # Merge text (remove overlap if fragment) mergedText = accText + fragText if not isFragment else (accText.rstrip() + " " + fragText.lstrip()) return { "type": "paragraph", "content": {"text": mergedText} } @staticmethod def _mergeCodeBlockElements( accumulatedElem: Dict[str, Any], newFragmentElem: Dict[str, Any], isFragment: bool ) -> Dict[str, Any]: """Merge two code block elements.""" accContent = accumulatedElem.get("content", {}) fragContent = newFragmentElem.get("content", {}) accCode = accContent.get("code", "") if isinstance(accContent, dict) else "" fragCode = fragContent.get("code", "") if isinstance(fragContent, dict) else "" accLanguage = accContent.get("language") if isinstance(accContent, dict) else None fragLanguage = fragContent.get("language") if isinstance(fragContent, dict) else None mergedCode = accCode + "\n" + fragCode if fragCode else accCode mergedLanguage = accLanguage or fragLanguage result = { "type": "code_block", "content": {"code": mergedCode} } if mergedLanguage: result["content"]["language"] = mergedLanguage return result @staticmethod def _mergeListElements( accumulatedElem: Dict[str, Any], newFragmentElem: Dict[str, Any], isFragment: bool ) -> Dict[str, Any]: """Merge two list elements (bullet_list or numbered_list).""" accContent = accumulatedElem.get("content", {}) fragContent = newFragmentElem.get("content", {}) accItems = accContent.get("items", []) if isinstance(accContent, dict) else [] fragItems = fragContent.get("items", []) if isinstance(fragContent, dict) else [] # Merge items with overlap detection mergedItems = JsonResponseHandler._mergeItemsWithOverlapDetection(accItems, fragItems) elemType = accumulatedElem.get("type") or newFragmentElem.get("type") return { "type": elemType, "content": {"items": mergedItems} } @staticmethod def _findOverlapStartIndex( accumulatedElements: List[Dict[str, Any]], overlapElements: List[Dict[str, Any]] ) -> int: """ Find the start index in accumulatedElements where overlapElements begin. This helps identify where to merge continuation elements by matching the overlap elements with the end of accumulated elements. Args: accumulatedElements: List of accumulated elements overlapElements: List of overlap elements from continuation response Returns: Index where overlap starts, or -1 if not found """ if not overlapElements or not accumulatedElements: return -1 # Try to find overlap by matching element structures # Start from the end of accumulatedElements and work backwards overlapLen = len(overlapElements) accLen = len(accumulatedElements) if overlapLen > accLen: return -1 # Try matching from different start positions for startIdx in range(max(0, accLen - overlapLen), accLen): # Check if elements from startIdx match overlapElements matches = True for i in range(min(overlapLen, accLen - startIdx)): accElem = accumulatedElements[startIdx + i] overlapElem = overlapElements[i] # Compare element types if isinstance(accElem, dict) and isinstance(overlapElem, dict): accType = accElem.get("type") overlapType = overlapElem.get("type") if accType != overlapType: matches = False break # For tables, compare row counts or last rows if accType == "table": accRows = accElem.get("rows", []) or (accElem.get("content", {}).get("rows", []) if isinstance(accElem.get("content"), dict) else []) overlapRows = overlapElem.get("rows", []) or (overlapElem.get("content", {}).get("rows", []) if isinstance(overlapElem.get("content"), dict) else []) if accRows and overlapRows: # Check if last rows match if len(accRows) >= len(overlapRows): lastAccRows = accRows[-len(overlapRows):] if lastAccRows != overlapRows: matches = False break # For lists, compare items elif accType in ["bullet_list", "numbered_list"]: accItems = accElem.get("items", []) or (accElem.get("content", {}).get("items", []) if isinstance(accElem.get("content"), dict) else []) overlapItems = overlapElem.get("items", []) or (overlapElem.get("content", {}).get("items", []) if isinstance(overlapElem.get("content"), dict) else []) if accItems and overlapItems: if len(accItems) >= len(overlapItems): lastAccItems = accItems[-len(overlapItems):] if lastAccItems != overlapItems: matches = False break else: matches = False break if matches: return startIdx return -1 @staticmethod def _mergeRowsWithOverlapDetection( accRows: List[List[str]], fragRows: List[List[str]] ) -> List[List[str]]: """ Merge two row arrays, detecting and removing overlaps. Overlap detection: Compare rows to find duplicates. Missing parts: Complete with null values if needed. """ if not accRows: return fragRows if not fragRows: return accRows # Find overlap by comparing last rows of accRows with first rows of fragRows overlapStart = 0 maxOverlap = min(len(accRows), len(fragRows)) # Find the longest overlap for overlapLen in range(maxOverlap, 0, -1): accSuffix = accRows[-overlapLen:] fragPrefix = fragRows[:overlapLen] # Compare rows (exact match) if accSuffix == fragPrefix: overlapStart = overlapLen break # Merge: accumulated rows + non-overlapping fragment rows merged = accRows + fragRows[overlapStart:] return merged @staticmethod def _mergeItemsWithOverlapDetection( accItems: List[str], fragItems: List[str] ) -> List[str]: """ Merge two item arrays (for lists), detecting and removing overlaps. Overlap detection: Compare items to find duplicates. """ if not accItems: return fragItems if not fragItems: return accItems # Find overlap by comparing last items of accItems with first items of fragItems overlapStart = 0 maxOverlap = min(len(accItems), len(fragItems)) # Find the longest overlap for overlapLen in range(maxOverlap, 0, -1): accSuffix = accItems[-overlapLen:] fragPrefix = fragItems[:overlapLen] # Compare items (exact match) if accSuffix == fragPrefix: overlapStart = overlapLen break # Merge: accumulated items + non-overlapping fragment items merged = accItems + fragItems[overlapStart:] return merged @staticmethod def _extractOverlapAndContinuation(jsonString: str) -> Tuple[Optional[List[Dict[str, Any]]], Optional[str]]: """ Extract overlap and continuation sections from AI response with explicit overlap structure. Expected format: { "overlap": [...], // Elements to repeat for merging "continuation": [...] // New elements to add } Or alternative format: { "overlap": "...", // Overlap as string "continuation": "..." // Continuation as string } Args: jsonString: JSON string that may contain overlap/continuation structure Returns: Tuple of (overlap_elements, continuation_json_string) or (None, None) if not found """ if not jsonString: return None, None from modules.shared.jsonUtils import stripCodeFences, normalizeJsonText, tryParseJson, closeJsonStructures # Extract and normalize JSON extracted = stripCodeFences(normalizeJsonText(jsonString)).strip() if not extracted: return None, None # Try to parse try: closed = closeJsonStructures(extracted) parsed, parseErr, _ = tryParseJson(closed) if parseErr is None and isinstance(parsed, dict): # Check for overlap/continuation structure overlap = parsed.get("overlap") continuation = parsed.get("continuation") if overlap is not None and continuation is not None: # Found explicit overlap structure overlapElements = None continuationJson = None # Extract overlap elements if isinstance(overlap, list): overlapElements = overlap elif isinstance(overlap, str): # Overlap is a string - try to parse it try: overlapParsed, _, _ = tryParseJson(closeJsonStructures(overlap)) if isinstance(overlapParsed, list): overlapElements = overlapParsed except Exception: pass # Extract continuation JSON if isinstance(continuation, (dict, list)): continuationJson = json.dumps(continuation, indent=2, ensure_ascii=False) elif isinstance(continuation, str): continuationJson = continuation if overlapElements is not None and continuationJson: return overlapElements, continuationJson except Exception: pass return None, None @staticmethod def _mergeWithExplicitOverlap( accumulated: str, continuationJson: str, overlapElements: List[Dict[str, Any]] ) -> str: """ Merge accumulated JSON with continuation JSON using explicit overlap information. Strategy: 1. Find overlap in accumulated using overlapElements 2. Remove overlapping elements from accumulated 3. Append continuation JSON Args: accumulated: Previously accumulated JSON string continuationJson: Continuation JSON string (new content) overlapElements: List of overlap elements from AI response Returns: Merged JSON string """ if not accumulated: return continuationJson if not continuationJson: return accumulated from modules.shared.jsonUtils import stripCodeFences, normalizeJsonText, tryParseJson, closeJsonStructures # Normalize accumulated accumulatedExtracted = stripCodeFences(normalizeJsonText(accumulated)).strip() accumulatedNormalized = JsonResponseHandler._normalizeToElementsStructure( accumulatedExtracted, accumulated ) # Normalize continuation continuationExtracted = stripCodeFences(normalizeJsonText(continuationJson)).strip() continuationNormalized = JsonResponseHandler._normalizeToElementsStructure( continuationExtracted, continuationJson ) # If both normalized successfully, use structure-based merge with overlap if accumulatedNormalized and continuationNormalized: merged = JsonResponseHandler._mergeJsonStructuresGeneric( accumulatedNormalized, continuationNormalized, accumulatedExtracted, continuationExtracted, overlapElements=overlapElements ) if merged: return json.dumps(merged, indent=2, ensure_ascii=False) # Fallback: use overlap elements to find merge point in accumulated # Find where overlap elements match in accumulated if accumulatedNormalized and overlapElements: accumulatedElements = accumulatedNormalized.get("elements", []) overlapStartIndex = JsonResponseHandler._findOverlapStartIndex(accumulatedElements, overlapElements) if overlapStartIndex >= 0: # Remove overlapping elements accumulatedElements = accumulatedElements[:overlapStartIndex] accumulatedNormalized["elements"] = accumulatedElements # Merge continuation if continuationNormalized: continuationElements = continuationNormalized.get("elements", []) accumulatedElements.extend(continuationElements) accumulatedNormalized["elements"] = accumulatedElements return json.dumps(accumulatedNormalized, indent=2, ensure_ascii=False) # Last resort: simple concatenation return JsonResponseHandler._mergeJsonStringsWithOverlapFallback(accumulated, continuationJson) @staticmethod def _extractValidJsonPrefix(jsonString: str) -> str: """ Extract the longest valid JSON prefix from a string that may be cut randomly. Strategy: 1. Try to find the longest prefix that can be closed and parsed 2. Handle random cuts (mid-string, mid-number, etc.) 3. Return the longest valid prefix found Args: jsonString: JSON string that may be cut randomly Returns: Longest valid JSON prefix, or empty string if none found """ if not jsonString or not jsonString.strip(): return "" from modules.shared.jsonUtils import tryParseJson, closeJsonStructures # Strategy 1: Try progressive truncation to find longest valid JSON # Use binary search-like approach for efficiency bestValid = "" bestLength = 0 maxLen = len(jsonString) # Generate test lengths: full, 95%, 90%, ..., 10% testLengths = [] for percent in range(100, 9, -5): testLen = int(maxLen * percent / 100) if testLen > bestLength: testLengths.append(testLen) # Also test specific points near the end (common cut points) for offset in [200, 100, 50, 20, 10, 5, 2, 1]: if maxLen > offset: testLen = maxLen - offset if testLen > bestLength: testLengths.append(testLen) # Sort and deduplicate testLengths = sorted(set(testLengths), reverse=True) for testLen in testLengths: if testLen <= bestLength: continue # Already found better testStr = jsonString[:testLen] if not testStr.strip(): continue # Try to close and parse try: closed = closeJsonStructures(testStr) parsed, parseErr, _ = tryParseJson(closed) if parseErr is None and parsed is not None: # Valid JSON found if testLen > bestLength: bestValid = closed bestLength = testLen except Exception: continue # Strategy 2: If we found valid JSON, return it if bestValid: return bestValid # Strategy 3: Try to extract balanced JSON (find first complete structure) jsonStripped = jsonString.strip() if jsonStripped.startswith('{') or jsonStripped.startswith('['): # Try to extract balanced JSON from modules.shared.jsonUtils import extractFirstBalancedJson balanced = extractFirstBalancedJson(jsonStripped) if balanced and balanced != jsonStripped: try: closed = closeJsonStructures(balanced) parsed, parseErr, _ = tryParseJson(closed) if parseErr is None: return closed except Exception: pass # Strategy 4: Try to repair by removing incomplete trailing structures # Find the last complete element/item before the cut try: # For arrays: find last complete element if jsonStripped.startswith('['): # Find last complete array element lastComma = jsonStripped.rfind(',') if lastComma > 0: # Try prefix up to last comma prefix = jsonStripped[:lastComma].strip() if prefix.endswith(','): prefix = prefix[:-1].strip() if prefix: closed = closeJsonStructures(prefix + ']') parsed, parseErr, _ = tryParseJson(closed) if parseErr is None: return closed # For objects: find last complete key-value pair elif jsonStripped.startswith('{'): # Find last complete key-value pair lastComma = jsonStripped.rfind(',') if lastComma > 0: # Try prefix up to last comma prefix = jsonStripped[:lastComma].strip() if prefix.endswith(','): prefix = prefix[:-1].strip() if prefix: closed = closeJsonStructures(prefix + '}') parsed, parseErr, _ = tryParseJson(closed) if parseErr is None: return closed except Exception: pass # Last resort: return empty (caller will handle) return "" @staticmethod def _smartConcatenate(accumulated: str, newFragment: str) -> str: """ Smart concatenation that tries to merge JSON fragments intelligently. Strategy: 1. Extract valid JSON from both fragments 2. Parse both as JSON objects/arrays 3. Merge them structurally 4. Return valid JSON Args: accumulated: Accumulated JSON string newFragment: New fragment to append Returns: Merged string with valid JSON, or empty if merging not possible """ if not accumulated or not newFragment: return "" from modules.shared.jsonUtils import closeJsonStructures, tryParseJson # Extract valid JSON prefixes from both accumulatedValid = JsonResponseHandler._extractValidJsonPrefix(accumulated) newFragmentValid = JsonResponseHandler._extractValidJsonPrefix(newFragment) if not accumulatedValid: accumulatedValid = accumulated if not newFragmentValid: newFragmentValid = newFragment # Try to parse both try: closedAccumulated = closeJsonStructures(accumulatedValid) parsedAccumulated, parseErr1, _ = tryParseJson(closedAccumulated) closedNewFragment = closeJsonStructures(newFragmentValid) parsedNewFragment, parseErr2, _ = tryParseJson(closedNewFragment) # If both parse successfully, merge structurally if parseErr1 is None and parseErr2 is None: # Normalize both to elements structure accNormalized = JsonResponseHandler._normalizeToElementsStructure(closedAccumulated, accumulated) newNormalized = JsonResponseHandler._normalizeToElementsStructure(closedNewFragment, newFragment) if accNormalized and newNormalized: merged = JsonResponseHandler._mergeJsonStructuresGeneric( accNormalized, newNormalized, closedAccumulated, closedNewFragment ) if merged: return json.dumps(merged, indent=2, ensure_ascii=False) # If only accumulated parses, return it if parseErr1 is None and parsedAccumulated: return json.dumps(parsedAccumulated, indent=2, ensure_ascii=False) # If only new fragment parses, return it if parseErr2 is None and parsedNewFragment: return json.dumps(parsedNewFragment, indent=2, ensure_ascii=False) except Exception: pass # Fallback: Try simple string concatenation with repair accumulatedStripped = accumulated.strip() newFragmentStripped = newFragment.strip() # If accumulated doesn't end with } or ], it might be incomplete if accumulatedStripped and not accumulatedStripped.endswith(('}', ']')): try: closedAccumulated = closeJsonStructures(accumulatedStripped) # Check if newFragment starts with continuation if newFragmentStripped.startswith(','): # Remove leading comma and append merged = closedAccumulated.rstrip() + newFragmentStripped.lstrip(',').strip() elif newFragmentStripped.startswith(('}', ']')): # Fragment starts with closing - might be completing accumulated merged = closedAccumulated.rstrip() + newFragmentStripped else: # Try to append as continuation # Check if we need a comma separator if not closedAccumulated.rstrip().endswith((',', '[', '{')): merged = closedAccumulated.rstrip() + ',' + newFragmentStripped else: merged = closedAccumulated.rstrip() + newFragmentStripped # Try to repair and parse the merged result repaired = closeJsonStructures(merged) parsed, parseErr, _ = tryParseJson(repaired) if parseErr is None: return json.dumps(parsed, indent=2, ensure_ascii=False) except Exception: pass # If smart concatenation failed, return empty (caller will handle) return "" @staticmethod def _mergeJsonStringsWithOverlapFallback( accumulated: str, newFragment: str ) -> str: """ Fallback overlap detection using string comparison. Used when both strings are complete JSON structures or fragments. CRITICAL: Never returns empty JSON - always returns at least accumulated. """ if not accumulated: return newFragment if newFragment else "{}" if not newFragment: return accumulated from modules.shared.jsonUtils import tryParseJson, closeJsonStructures # Strategy 1: Try to extract valid JSON parts from both fragments # This handles random cuts better by finding the longest valid prefix/suffix # Extract valid JSON from accumulated (find longest valid prefix) accumulatedValid = JsonResponseHandler._extractValidJsonPrefix(accumulated) # Extract valid JSON from newFragment (find longest valid prefix) newFragmentValid = JsonResponseHandler._extractValidJsonPrefix(newFragment) # If we have valid JSON from both, try structure-based merge if accumulatedValid and newFragmentValid: try: parsedAccumulated, parseErr1, _ = tryParseJson(closeJsonStructures(accumulatedValid)) parsedNewFragment, parseErr2, _ = tryParseJson(closeJsonStructures(newFragmentValid)) if parseErr1 is None and parseErr2 is None: # Both are valid JSON - try structure merge accNormalized = JsonResponseHandler._normalizeToElementsStructure(accumulatedValid, accumulated) newNormalized = JsonResponseHandler._normalizeToElementsStructure(newFragmentValid, newFragment) if accNormalized and newNormalized: merged = JsonResponseHandler._mergeJsonStructuresGeneric( accNormalized, newNormalized, accumulatedValid, newFragmentValid ) if merged: return json.dumps(merged, indent=2, ensure_ascii=False) except Exception: pass # Strategy 2: Find longest common suffix/prefix match (character-level overlap) maxOverlapLen = min(len(accumulated), len(newFragment)) # Start from maximum possible overlap down to 1 character # But limit to reasonable overlap (max 50% of shorter string) maxReasonableOverlap = min(maxOverlapLen, min(len(accumulated), len(newFragment)) // 2) for overlapLen in range(maxReasonableOverlap, 0, -1): accumulatedSuffix = accumulated[-overlapLen:] newFragmentPrefix = newFragment[:overlapLen] if accumulatedSuffix == newFragmentPrefix: # Found overlap - remove duplicate part logger.debug(f"Found overlap of {overlapLen} characters, removing duplicate") merged = accumulated + newFragment[overlapLen:] # Ensure result is not empty if merged and merged.strip(): return merged # Strategy 3: No overlap found - try smart concatenation # Check if we can append newFragment to accumulated without breaking JSON structure merged = JsonResponseHandler._smartConcatenate(accumulated, newFragment) if merged and merged.strip(): return merged # Strategy 4: Last resort - simple concatenation (but ensure non-empty and valid JSON) result = accumulated + newFragment if not result or result.strip() in ['{}', '[]', '']: # Return accumulated as fallback (at least we have that) return accumulated if accumulated else "{}" # CRITICAL: Try to repair and validate the merged result try: repaired = closeJsonStructures(result) parsed, parseErr, _ = tryParseJson(repaired) if parseErr is None: # Valid JSON - return it return json.dumps(parsed, indent=2, ensure_ascii=False) else: # Still invalid - try to extract valid parts validPrefix = JsonResponseHandler._extractValidJsonPrefix(result) if validPrefix: parsedPrefix, parseErr2, _ = tryParseJson(validPrefix) if parseErr2 is None: return json.dumps(parsedPrefix, indent=2, ensure_ascii=False) except Exception: pass # If repair failed, return accumulated (at least we have that) if accumulated: try: repairedAccumulated = closeJsonStructures(accumulated) parsedAcc, parseErrAcc, _ = tryParseJson(repairedAccumulated) if parseErrAcc is None: return json.dumps(parsedAcc, indent=2, ensure_ascii=False) except Exception: pass return accumulated # Last resort: return empty structure return "{}" @staticmethod def isJsonComplete(parsedJson: Dict[str, Any]) -> bool: """ GENERIC function to check if parsed JSON structure is complete. Works for ANY JSON structure - no specific logic for content types. Completeness checks (all generic): - All arrays are properly closed - All objects are properly closed - No incomplete structures - Recursive validation of nested structures Args: parsedJson: Parsed JSON object Returns: True if JSON is complete, False otherwise """ def _checkStructureComplete(obj: Any, depth: int = 0) -> bool: """Recursively check if structure is complete.""" if depth > 50: # Prevent infinite recursion return True if isinstance(obj, dict): # Check all values recursively for value in obj.values(): if not _checkStructureComplete(value, depth + 1): return False return True elif isinstance(obj, list): # Check all items recursively for item in obj: if not _checkStructureComplete(item, depth + 1): return False return True else: # Primitive value - always complete return True try: return _checkStructureComplete(parsedJson) except Exception as e: logger.debug(f"Error checking JSON completeness: {e}") return False @staticmethod def finalizeJson(parsedJson: Dict[str, Any]) -> Dict[str, Any]: """ GENERIC function to finalize complete JSON by adding missing closing elements and repairing corruption. Works for ANY JSON structure - no specific logic for content types. Steps (all generic): 1. Analyze structure for missing closing elements (recursively) 2. Add closing brackets/braces where needed 3. Repair any remaining corruption 4. Validate final structure Args: parsedJson: Parsed JSON object that needs finalization Returns: Finalized JSON object """ # For now, just return as-is since parsing succeeded # If needed, can add logic to check for incomplete structures # and add closing elements return parsedJson @staticmethod def extractKpiValuesFromJson( parsedJson: Dict[str, Any], kpis: List[Dict[str, Any]] ) -> List[Dict[str, Any]]: """ Extract current KPI values from parsed JSON and update KPI objects. Args: parsedJson: Parsed JSON object kpis: List of KPI objects (will be updated with currentValue) Returns: Updated list of KPI objects with currentValue set """ updatedKpis = [] for kpi in kpis: kpiId = kpi.get("id") jsonPath = kpi.get("jsonPath") if not kpiId or not jsonPath: continue # Create copy of KPI object updatedKpi = kpi.copy() try: # Extract value using JSON path # Simple path format: "sections[0].elements[0].items" or "sections[0].elements[0].rows" value = JsonResponseHandler._extractValueByPath(parsedJson, jsonPath) # Handle None (path doesn't exist - incomplete JSON) if value is None: updatedKpi["currentValue"] = kpi.get("currentValue", 0) logger.debug(f"KPI {kpiId} path {jsonPath} not found in JSON (incomplete), keeping current value {updatedKpi['currentValue']}") # Count items/rows/elements based on type elif isinstance(value, list): updatedKpi["currentValue"] = len(value) logger.debug(f"Extracted KPI {kpiId} from path {jsonPath}: list with {len(value)} items") elif isinstance(value, (int, float)): updatedKpi["currentValue"] = int(value) logger.debug(f"Extracted KPI {kpiId} from path {jsonPath}: numeric value {int(value)}") else: updatedKpi["currentValue"] = 0 logger.debug(f"Extracted KPI {kpiId} from path {jsonPath}: non-list/non-numeric value, set to 0") except Exception as e: logger.warning(f"Error extracting KPI {kpiId} from path {jsonPath}: {e}") updatedKpi["currentValue"] = kpi.get("currentValue", 0) updatedKpis.append(updatedKpi) return updatedKpis @staticmethod def extractKpiValuesFromIncompleteJson( jsonString: str, kpis: List[Dict[str, Any]] ) -> List[Dict[str, Any]]: """ Extract KPI values from incomplete JSON string. Uses existing JSON completion function to close incomplete structures, then extracts KPIs. Args: jsonString: Incomplete JSON string kpis: List of KPI objects Returns: Updated list of KPI objects with currentValue set """ updatedKpis = [] for kpi in kpis: kpiId = kpi.get("id") jsonPath = kpi.get("jsonPath") if not kpiId or not jsonPath: continue updatedKpi = kpi.copy() try: # Use existing JSON completion function to close incomplete structures from modules.shared.jsonUtils import extractJsonString, closeJsonStructures # Extract JSON string and complete it with missing closing elements extracted = extractJsonString(jsonString) completed = closeJsonStructures(extracted) # Parse completed JSON parsed = json.loads(completed) # Extract value using path value = JsonResponseHandler._extractValueByPath(parsed, jsonPath) # Handle None (path doesn't exist - incomplete JSON) if value is None: updatedKpi["currentValue"] = kpi.get("currentValue", 0) logger.debug(f"KPI {kpiId} path {jsonPath} not found in completed JSON (still incomplete), keeping current value {updatedKpi['currentValue']}") # Count items/rows/elements based on type elif isinstance(value, list): updatedKpi["currentValue"] = len(value) logger.debug(f"Extracted KPI {kpiId} from completed JSON: list with {len(value)} items") elif isinstance(value, (int, float)): updatedKpi["currentValue"] = int(value) logger.debug(f"Extracted KPI {kpiId} from completed JSON: numeric value {int(value)}") else: updatedKpi["currentValue"] = 0 logger.debug(f"Extracted KPI {kpiId} from completed JSON: non-list/non-numeric value, set to 0") except Exception as e: logger.warning(f"Error extracting KPI {kpiId} from incomplete JSON: {e}") updatedKpi["currentValue"] = kpi.get("currentValue", 0) updatedKpis.append(updatedKpi) return updatedKpis @staticmethod def _extractValueByPath(obj: Any, path: str) -> Any: """ Extract value from object using dot-notation path with array indices. Example: "sections[0].elements[0].items" Returns None if path doesn't exist (for incomplete JSON handling). """ parts = path.split('.') current = obj for part in parts: if '[' in part and ']' in part: # Handle array access: "sections[0]" key = part[:part.index('[')] index = int(part[part.index('[') + 1:part.index(']')]) if key: if isinstance(current, dict): current = current.get(key) if current is None: return None # Key doesn't exist else: return None # Can't access key on non-dict if isinstance(current, list): if 0 <= index < len(current): current = current[index] else: # Index out of range - return None for incomplete JSON return None else: # Not a list, can't index return None else: # Handle dict access if isinstance(current, dict): current = current.get(part) if current is None: return None # Key doesn't exist else: return None # Can't access key on non-dict return current @staticmethod def validateKpiProgression( accumulationState: JsonAccumulationState, updatedKpis: List[Dict[str, Any]] ) -> Tuple[bool, str]: """ Validate KPI progression from parsed JSON. Validation rules: - Proceed if: At least ONE KPI increased - Stop if: Any KPI went backwards → return (False, "KPI went backwards") - Stop if: No KPIs progressed → return (False, "No progress") - Finish if: All KPIs completed OR JSON is complete → return (True, "Complete") Args: accumulationState: Current accumulation state (contains kpis) updatedKpis: Updated KPI objects with currentValue set Returns: Tuple of (shouldProceed, reason) """ if not accumulationState.kpis: # No KPIs defined - always proceed return True, "No KPIs defined" # Build dict of last values for comparison lastValues = {kpi.get("id"): kpi.get("currentValue", 0) for kpi in accumulationState.kpis} logger.debug(f"KPI validation: lastValues = {lastValues}") logger.debug(f"KPI validation: updatedKpis = {[(kpi.get('id'), kpi.get('currentValue')) for kpi in updatedKpis]}") # Check if any KPI went backwards for updatedKpi in updatedKpis: kpiId = updatedKpi.get("id") currentValue = updatedKpi.get("currentValue", 0) if kpiId in lastValues: lastValue = lastValues[kpiId] if currentValue < lastValue: logger.warning(f"KPI {kpiId} went BACKWARDS: {lastValue} → {currentValue}") return False, f"KPI {kpiId} went backwards" # Check if all KPIs are completed allCompleted = True for updatedKpi in updatedKpis: targetValue = updatedKpi.get("targetValue", 0) currentValue = updatedKpi.get("currentValue", 0) if currentValue < targetValue: allCompleted = False break if allCompleted: logger.info("All KPIs completed") return True, "All KPIs completed" # Check if at least one KPI progressed atLeastOneProgressed = False for updatedKpi in updatedKpis: kpiId = updatedKpi.get("id") currentValue = updatedKpi.get("currentValue", 0) if kpiId in lastValues: lastValue = lastValues[kpiId] if currentValue > lastValue: atLeastOneProgressed = True logger.info(f"KPI {kpiId} progressed: {lastValue} → {currentValue}") break else: # First time seeing this KPI - if it has a value, it's progress if currentValue > 0: atLeastOneProgressed = True logger.info(f"KPI {kpiId} initialized: {currentValue}") break if not atLeastOneProgressed: logger.warning(f"No KPIs progressed. Last values: {lastValues}, Current values: {[(kpi.get('id'), kpi.get('currentValue')) for kpi in updatedKpis]}") return False, "No progress" return True, "Progress detected" @staticmethod def accumulateAndParseJsonFragments( accumulatedJsonString: str, newFragmentString: str, allSections: List[Dict[str, Any]], iteration: int ) -> Tuple[str, List[Dict[str, Any]], bool, Optional[Dict[str, Any]]]: """ Accumulate JSON fragments and parse when complete. GENERIC function that handles: 1. Concatenating JSON strings with overlap detection 2. Parsing the accumulated string 3. Extracting sections (partial if incomplete, final if complete) 4. Determining completion status Args: accumulatedJsonString: Previously accumulated JSON string newFragmentString: New fragment string from current iteration allSections: Sections extracted so far (for prompt context) iteration: Current iteration number Returns: Tuple of: - accumulatedJsonString: Updated accumulated string - sections: Extracted sections (partial if incomplete, final if complete) - isComplete: True if JSON is complete and valid - parsedResult: Parsed JSON object (if parsing succeeded) """ # Step 1: Clean encoding issues from accumulated string (check end of first delivered part) cleanedAccumulated = JsonResponseHandler.cleanEncodingIssues(accumulatedJsonString) # Step 2: Clean encoding issues from new fragment cleanedFragment = JsonResponseHandler.cleanEncodingIssues(newFragmentString) # Step 3: Concatenate with overlap handling combinedString, hasOverlap = JsonResponseHandler.mergeJsonStringsWithOverlap( cleanedAccumulated, cleanedFragment ) # Note: hasOverlap indicates if iterations should continue, but this function # doesn't control iterations, so we just use the merged string # Step 4: Try to parse try: extracted = extractJsonString(combinedString) parsedResult = json.loads(extracted) # Step 5: Parsing succeeded - check completeness isComplete = JsonResponseHandler.isJsonComplete(parsedResult) if isComplete: # Step 6: Complete JSON - finalize finalizedJson = JsonResponseHandler.finalizeJson(parsedResult) sections = extractSectionsFromDocument(finalizedJson) logger.info(f"Iteration {iteration}: JSON accumulation complete, extracted {len(sections)} sections") return combinedString, sections, True, finalizedJson else: # Step 7: Incomplete but parseable - extract partial sections sections = extractSectionsFromDocument(parsedResult) logger.info(f"Iteration {iteration}: JSON accumulation incomplete but parseable, extracted {len(sections)} partial sections") return combinedString, sections, False, parsedResult except json.JSONDecodeError: # Step 8: Still broken - repair and extract partial sections repaired = repairBrokenJson(combinedString) if repaired: sections = extractSectionsFromDocument(repaired) logger.info(f"Iteration {iteration}: JSON accumulation repaired, extracted {len(sections)} sections") return combinedString, sections, False, repaired else: # Repair failed - continue with data BEFORE merging the problematic piece # Return previous accumulated string (before adding new fragment) # This ensures we don't lose previously accumulated data logger.warning(f"Iteration {iteration}: Repair failed, continuing with previous accumulated data") return accumulatedJsonString, [], False, None