""" JSON Response Handling Module Handles merging of JSON responses from multiple AI iterations, including: - Section merging with intelligent overlap detection - JSON fragment detection and merging - Deep recursive structure merging - Overlap detection for complex nested structures - String accumulation for iterative JSON generation """ import json import logging import re from typing import Dict, Any, List, Optional, Tuple from modules.shared.jsonUtils import extractJsonString, repairBrokenJson, extractSectionsFromDocument from modules.datamodels.datamodelAi import JsonAccumulationState logger = logging.getLogger(__name__) class JsonResponseHandler: """Handles JSON response merging and fragment detection for iterative AI generation.""" @staticmethod def mergeSectionsIntelligently( existingSections: List[Dict[str, Any]], newSections: List[Dict[str, Any]], iteration: int ) -> List[Dict[str, Any]]: """ Intelligently merge sections from multiple iterations. This is a GENERIC merging strategy that handles broken JSON iterations. The break can occur anywhere - in any section, at any depth. Merging strategies (in order of priority): 1. Same Section ID: Merge sections with identical IDs 2. Same Content-Type + Position: If last section is incomplete and new section continues it 3. Same Order: Merge sections with same order value 4. 
Structural Analysis: Detect continuation based on content structure Args: existingSections: Sections accumulated from previous iterations newSections: Sections extracted from current iteration iteration: Current iteration number Returns: Merged list of sections """ if not newSections: return existingSections if not existingSections: return newSections mergedSections = existingSections.copy() for newSection in newSections: merged = False # Strategy 1: Same Section ID - merge directly newSectionId = newSection.get("id") if newSectionId: for i, existingSection in enumerate(mergedSections): if existingSection.get("id") == newSectionId: # Merge sections with same ID mergedSections[i] = JsonResponseHandler.mergeSectionContent( existingSection, newSection, iteration ) merged = True logger.debug(f"Iteration {iteration}: Merged section by ID '{newSectionId}'") break if merged: continue # Strategy 2: Same Content-Type + Position (continuation detection) # Check if last section is incomplete and new section continues it if mergedSections: lastSection = mergedSections[-1] lastContentType = lastSection.get("content_type") newContentType = newSection.get("content_type") if lastContentType == newContentType: # Same content type - check if last section is incomplete if JsonResponseHandler.isSectionIncomplete(lastSection): # Last section is incomplete, merge with new section mergedSections[-1] = JsonResponseHandler.mergeSectionContent( lastSection, newSection, iteration ) merged = True logger.debug(f"Iteration {iteration}: Merged section by content-type continuation ({lastContentType})") continue # Strategy 3: Same Order value newOrder = newSection.get("order") if newOrder is not None: for i, existingSection in enumerate(mergedSections): existingOrder = existingSection.get("order") if existingOrder is not None and existingOrder == newOrder: # Merge sections with same order mergedSections[i] = JsonResponseHandler.mergeSectionContent( existingSection, newSection, iteration ) merged = 
True logger.debug(f"Iteration {iteration}: Merged section by order {newOrder}") break if merged: continue # Strategy 4: Structural Analysis - detect continuation # For code_block and table: if last section matches new section type, merge them if mergedSections: lastSection = mergedSections[-1] lastContentType = lastSection.get("content_type") newContentType = newSection.get("content_type") # Both are code blocks - merge them if lastContentType == "code_block" and newContentType == "code_block": mergedSections[-1] = JsonResponseHandler.mergeSectionContent( lastSection, newSection, iteration ) merged = True logger.debug(f"Iteration {iteration}: Merged code_block sections by structural analysis") continue # Both are tables - merge them (common case for broken JSON iterations) if lastContentType == "table" and newContentType == "table": mergedSections[-1] = JsonResponseHandler.mergeSectionContent( lastSection, newSection, iteration ) merged = True logger.debug(f"Iteration {iteration}: Merged table sections by structural analysis") continue # No merge strategy matched - add as new section if not merged: mergedSections.append(newSection) logger.debug(f"Iteration {iteration}: Added new section '{newSection.get('id', 'no-id')}' ({newSection.get('content_type', 'unknown')})") return mergedSections @staticmethod def isSectionIncomplete(section: Dict[str, Any]) -> bool: """ Check if a section is incomplete (broken at the end). 
This detects incomplete sections based on content analysis: - Code blocks: ends mid-line, ends with comma, ends with incomplete structure - Text sections: ends mid-sentence, ends with incomplete structure - Other types: check for incomplete elements """ contentType = section.get("content_type", "") elements = section.get("elements", []) if not elements: return False # Handle list of elements if isinstance(elements, list) and len(elements) > 0: lastElement = elements[-1] else: lastElement = elements if not isinstance(lastElement, dict): return False # Check code_block for incomplete code if contentType == "code_block": code = lastElement.get("code", "") if code: # Check if code ends incompletely: # - Ends with comma (incomplete CSV line) # - Ends with number but no newline (incomplete line) # - Ends mid-token (e.g., "23431,23" - incomplete number) codeStripped = code.rstrip() if codeStripped: # Check for incomplete patterns if codeStripped.endswith(',') or (',' in codeStripped and not codeStripped.endswith('\n')): # Ends with comma or has comma but no final newline - likely incomplete return True # Check if last line is incomplete (doesn't end with newline and has partial content) if not code.endswith('\n') and codeStripped: # No final newline - might be incomplete # More sophisticated: check if last number is complete lastLine = codeStripped.split('\n')[-1] if lastLine and ',' in lastLine: # Has commas but might be incomplete parts = lastLine.split(',') if parts and len(parts[-1]) < 5: # Last part is very short - might be incomplete return True # Check table for incomplete rows if contentType == "table": rows = lastElement.get("rows", []) if rows: # Check if last row is incomplete (ends with incomplete data) lastRow = rows[-1] if isinstance(rows, list) else [] if isinstance(lastRow, list) and lastRow: # CRITICAL: Check if last row doesn't have expected number of columns (if headers exist) # This is the PRIMARY indicator of incomplete table rows headers = 
lastElement.get("headers", []) if headers and isinstance(headers, list): expectedCols = len(headers) if len(lastRow) < expectedCols: logger.debug(f"Table section incomplete: last row has {len(lastRow)} columns, expected {expectedCols}") return True # Also check if last row ends with incomplete data (e.g., incomplete string) lastCell = lastRow[-1] if lastRow else "" if isinstance(lastCell, str): # If last cell is incomplete (ends with quote or is very short), section might be incomplete if lastCell.endswith('"') or (len(lastCell) < 3 and lastCell): logger.debug(f"Table section incomplete: last cell appears incomplete: '{lastCell}'") return True # Additional check: if last row has fewer cells than previous rows, it's likely incomplete if len(rows) > 1: prevRow = rows[-2] if isinstance(rows, list) and len(rows) > 1 else [] if isinstance(prevRow, list) and len(prevRow) > len(lastRow): logger.debug(f"Table section incomplete: last row has {len(lastRow)} cells, previous row has {len(prevRow)}") return True # Check paragraph/text for incomplete sentences if contentType in ["paragraph", "heading"]: text = lastElement.get("text", "") if text: # Simple heuristic: if doesn't end with sentence-ending punctuation textStripped = text.rstrip() if textStripped and not textStripped[-1] in '.!?': # Might be incomplete, but this is less reliable # Only mark as incomplete if very short (likely cut off) if len(textStripped) < 20: return True # Check lists for incomplete items if contentType in ["bullet_list", "numbered_list"]: items = lastElement.get("items", []) if items and isinstance(items, list): # Check if last item is incomplete (very short or ends with incomplete string) lastItem = items[-1] if items else None if isinstance(lastItem, str) and len(lastItem) < 3: return True # Check image for incomplete base64 data if contentType == "image": imageData = lastElement.get("base64Data", "") if imageData: # Base64 strings should end with padding ('=' or '==') # If it doesn't, it might 
be incomplete stripped = imageData.rstrip() if stripped and not stripped.endswith(('=', '==')): # Check if it's a valid base64 character sequence that was cut off if len(stripped) > 0 and stripped[-1] not in 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=': return True # If length is not a multiple of 4 (base64 requirement), it might be incomplete if len(stripped) % 4 != 0: return True # GENERIC CHECK: Recursively analyze structure for incompleteness # This works for ANY structure: arrays, objects, nested, primitives return JsonResponseHandler._isStructureIncomplete(lastElement) @staticmethod def _isStructureIncomplete(structure: Any, max_depth: int = 10) -> bool: """ GENERIC recursive check for incomplete structures. Detects incompleteness by analyzing patterns: - Arrays: Last item shorter than previous items, incomplete patterns - Objects: Last object has fewer keys than pattern, incomplete values - Strings: Very short, ends abruptly, incomplete patterns - Nested: Recursively checks nested structures Works for ANY JSON structure of any depth/complexity. 
""" if max_depth <= 0: return False # Arrays/Lists - check for incomplete patterns if isinstance(structure, list): if len(structure) == 0: return False # Check if last item is incomplete compared to previous items last_item = structure[-1] # If we have previous items, compare structure if len(structure) > 1: prev_item = structure[-2] # If last item is a list and previous is a list, check length if isinstance(last_item, list) and isinstance(prev_item, list): if len(last_item) < len(prev_item): return True # Last row/item has fewer elements - likely incomplete # If last item is a dict and previous is a dict, check keys if isinstance(last_item, dict) and isinstance(prev_item, dict): if len(last_item) < len(prev_item): return True # Last object has fewer keys - likely incomplete # Recursively check last item for incompleteness if JsonResponseHandler._isStructureIncomplete(last_item, max_depth - 1): return True # Objects/Dicts - check for incomplete values elif isinstance(structure, dict): for key, value in structure.items(): # Recursively check each value if JsonResponseHandler._isStructureIncomplete(value, max_depth - 1): return True # Check for incomplete strings if isinstance(value, str): # Very short strings might be incomplete if len(value) > 0 and len(value) < 3: return True # Strings ending with incomplete patterns (comma, quote, etc.) stripped = value.rstrip() if stripped and stripped.endswith((',', '"', '\\')): return True # Strings - check for incomplete patterns elif isinstance(structure, str): # Very short strings might be incomplete if len(structure) > 0 and len(structure) < 3: return True # Strings ending with incomplete patterns stripped = structure.rstrip() if stripped and stripped.endswith((',', '"', '\\')): return True return False @staticmethod def mergeSectionContent( existingSection: Dict[str, Any], newSection: Dict[str, Any], iteration: int ) -> Dict[str, Any]: """ Merge content from two sections. 
    @staticmethod
    def mergeSectionContent(
        existingSection: Dict[str, Any],
        newSection: Dict[str, Any],
        iteration: int
    ) -> Dict[str, Any]:
        """
        Merge the content of two sections of the same content type.

        Per-type behavior:
        - code_block: code appended via mergeCodeBlocks (overlap-aware)
        - paragraph/heading: text appended (space vs newline depending on
          whether the existing text ends a sentence)
        - table: rows merged via mergeRowsWithOverlap; headers/caption
          backfilled from the new element when missing
        - bullet_list/numbered_list: items merged via mergeItemsWithOverlap
        - image: base64 data appended when the existing payload lacks '='
          padding (looks truncated), otherwise replaced
        - anything else: generic deep recursive merge (mergeDeepStructures)

        Side effects: the existing element dict (and the existing elements
        list) are mutated in place; a shallow copy of the section dict is
        returned.

        Args:
            existingSection: Accumulated section (merge target).
            newSection: Newly extracted section (merge source).
            iteration: Current iteration number (logging only).

        Returns:
            The merged section (shallow copy of existingSection).
        """
        contentType = existingSection.get("content_type", "")
        existingElements = existingSection.get("elements", [])
        newElements = newSection.get("elements", [])

        if not newElements:
            return existingSection

        # Merge boundary: LAST existing element against FIRST new element
        # (the cut happened between those two).
        if isinstance(existingElements, list):
            existingElem = existingElements[-1] if existingElements else {}
        else:
            existingElem = existingElements
        if isinstance(newElements, list):
            newElem = newElements[0] if newElements else {}
        else:
            newElem = newElements

        # NOTE(review): only the first new element is merged; any further
        # elements in newElements are dropped here — confirm intended.
        if not isinstance(existingElem, dict) or not isinstance(newElem, dict):
            return existingSection

        if contentType == "code_block":
            existingCode = existingElem.get("code", "")
            newCode = newElem.get("code", "")
            if existingCode and newCode:
                mergedCode = JsonResponseHandler.mergeCodeBlocks(existingCode, newCode, iteration)
                existingElem["code"] = mergedCode
            # Preserve language from existing or new.
            if "language" not in existingElem and "language" in newElem:
                existingElem["language"] = newElem["language"]

        elif contentType in ["paragraph", "heading"]:
            existingText = existingElem.get("text", "")
            newText = newElem.get("text", "")
            if existingText and newText:
                # Mid-sentence cut -> join with a space; otherwise newline.
                if existingText.rstrip() and not existingText.rstrip()[-1] in '.!?\n':
                    mergedText = existingText.rstrip() + " " + newText.lstrip()
                else:
                    mergedText = existingText.rstrip() + "\n" + newText.lstrip()
                existingElem["text"] = mergedText

        elif contentType == "table":
            # Merge table rows with overlap detection (handles multiple
            # duplicated rows at the seam).
            existingRows = existingElem.get("rows", [])
            newRows = newElem.get("rows", [])
            if existingRows and newRows:
                mergedRows = JsonResponseHandler.mergeRowsWithOverlap(existingRows, newRows, iteration)
                existingElem["rows"] = mergedRows
                logger.debug(f"Iteration {iteration}: Merged table rows - existing: {len(existingRows)}, new: {len(newRows)}, total: {len(mergedRows)}")
            elif newRows:
                # Existing has no rows - adopt the new ones wholesale.
                existingElem["rows"] = newRows
            # Backfill headers/caption from the new element when missing.
            if not existingElem.get("headers") and newElem.get("headers"):
                existingElem["headers"] = newElem["headers"]
            if not existingElem.get("caption") and newElem.get("caption"):
                existingElem["caption"] = newElem.get("caption")

        elif contentType in ["bullet_list", "numbered_list"]:
            # Merge list items with overlap detection.
            existingItems = existingElem.get("items", [])
            newItems = newElem.get("items", [])
            if existingItems and newItems:
                mergedItems = JsonResponseHandler.mergeItemsWithOverlap(existingItems, newItems, iteration)
                existingElem["items"] = mergedItems
            elif newItems:
                existingElem["items"] = newItems

        elif contentType == "image":
            # Images are usually complete; append only when the existing
            # base64 payload is missing its '=' padding (looks cut off).
            existingImageData = existingElem.get("base64Data", "")
            newImageData = newElem.get("base64Data", "")
            if existingImageData and newImageData:
                if not existingImageData.rstrip().endswith(('=', '==')):
                    # Existing payload looks truncated - append continuation.
                    existingElem["base64Data"] = existingImageData + newImageData
                    logger.debug(f"Iteration {iteration}: Merged incomplete image base64 data")
                else:
                    # Existing image is complete - replace with the new one.
                    if newImageData:
                        existingElem["base64Data"] = newImageData
            elif newImageData:
                existingElem["base64Data"] = newImageData
            # Preserve other image metadata.
            if not existingElem.get("altText") and newElem.get("altText"):
                existingElem["altText"] = newElem["altText"]
            if not existingElem.get("caption") and newElem.get("caption"):
                existingElem["caption"] = newElem["caption"]

        else:
            # Generic fallback: deep recursive merge for any other content
            # type with arbitrary nesting.
            merged_element = JsonResponseHandler.mergeDeepStructures(
                existingElem, newElem, iteration, f"section.{contentType}"
            )
            existingElem = merged_element

        # Re-attach the (possibly rebound) merged element to the section.
        mergedSection = existingSection.copy()
        if isinstance(existingElements, list):
            if existingElements:
                existingElements[-1] = existingElem
            # NOTE(review): if existingElements was an EMPTY list the merged
            # element is silently dropped here — confirm intended.
            mergedSection["elements"] = existingElements
        else:
            mergedSection["elements"] = existingElem

        # Backfill "order" metadata from the new section when missing.
        if "order" not in mergedSection and "order" in newSection:
            mergedSection["order"] = newSection["order"]

        return mergedSection
""" if not existingCode: return newCode if not newCode: return existingCode existingLines = existingCode.rstrip().split('\n') newLines = newCode.strip().split('\n') if not existingLines or not newLines: return existingCode + "\n" + newCode lastExistingLine = existingLines[-1].strip() firstNewLine = newLines[0].strip() # Strategy 1: Exact overlap - remove duplicate line if lastExistingLine == firstNewLine: newLines = newLines[1:] logger.debug(f"Iteration {iteration}: Removed exact duplicate line in code merge") # Strategy 2: Incomplete line merge # If last existing line ends with comma or is incomplete, merge with first new line elif lastExistingLine.endswith(',') or (',' in lastExistingLine and len(lastExistingLine.split(',')[-1]) < 5): # Last line is incomplete - merge with first new line # Remove trailing comma from existing line mergedLine = lastExistingLine.rstrip(',') + ',' + firstNewLine.lstrip() existingLines[-1] = mergedLine newLines = newLines[1:] logger.debug(f"Iteration {iteration}: Merged incomplete line with continuation") # Strategy 3: Partial overlap detection # Check if first new line starts with the end of last existing line elif ',' in lastExistingLine and ',' in firstNewLine: lastExistingParts = lastExistingLine.split(',') firstNewParts = firstNewLine.split(',') # Check for overlap: if last part of existing matches first part of new if lastExistingParts and firstNewParts: lastExistingPart = lastExistingParts[-1].strip() firstNewPart = firstNewParts[0].strip() # If they match, there's overlap if lastExistingPart == firstNewPart and len(lastExistingParts) > 1: # Remove overlapping part from new line newLines[0] = ','.join(firstNewParts[1:]) logger.debug(f"Iteration {iteration}: Removed partial overlap in code merge") # Reconstruct merged code mergedCode = '\n'.join(existingLines) if newLines: if mergedCode and not mergedCode.endswith('\n'): mergedCode += '\n' mergedCode += '\n'.join(newLines) return mergedCode @staticmethod def 
    @staticmethod
    def detectAndParseJsonFragment(
        result: str,
        allSections: List[Dict[str, Any]]
    ) -> Optional[Dict[str, Any]]:
        """
        Detect whether an AI response is a JSON fragment rather than a full
        document.

        A fragment is any parseable JSON that is NOT a dict containing a
        "documents" or "sections" key — it may be an array, an object, a
        primitive, or any nested structure.  Fragments are continuation
        content that must be merged into the last incomplete section.

        Args:
            result: Raw AI response text (JSON is extracted from it).
            allSections: Sections accumulated so far, used to pick the
                merge target.

        Returns:
            None for full documents or unparseable input; otherwise a dict:
            - "fragment_data": the parsed fragment (any JSON type)
            - "target_section_id": id of the last incomplete section
              (or the last section as fallback; may be None)
        """
        try:
            extracted = extractJsonString(result)
            parsed = json.loads(extracted)

            # A full document is a dict carrying "documents" or "sections";
            # everything else counts as a fragment.
            is_full_document = False
            if isinstance(parsed, dict):
                if "documents" in parsed or "sections" in parsed:
                    is_full_document = True
            if is_full_document:
                return None

            # Pick the merge target generically, regardless of content type.
            target_section_id = JsonResponseHandler.findLastIncompleteSectionId(allSections)
            logger.info(f"Detected GENERIC JSON fragment (type: {type(parsed).__name__}), target: {target_section_id}")
            return {
                "fragment_data": parsed,  # Can be ANY JSON structure
                "target_section_id": target_section_id
            }
        except Exception as e:
            # Extraction/parse failure: treat as "not a fragment".
            logger.error(f"Error detecting JSON fragment: {e}")
            logger.debug(f"Fragment detection failed for result: {result[:500]}...")
            return None

    @staticmethod
    def findLastIncompleteSectionId(
        allSections: List[Dict[str, Any]]
    ) -> Optional[str]:
        """
        Return the id of the last section that looks incomplete.

        Scans sections from the end; if none is flagged incomplete the LAST
        section's id is returned as a fallback (fragments most plausibly
        continue the tail of the document).

        Args:
            allSections: Sections accumulated so far.

        Returns:
            A section id, or None when allSections is empty (or the chosen
            section has no "id" key).
        """
        for section in reversed(allSections):
            if JsonResponseHandler.isSectionIncomplete(section):
                return section.get("id")
        # Fallback: last section, even if it looks complete.
        if allSections:
            return allSections[-1].get("id")
        return None
    @staticmethod
    def mergeFragmentIntoSection(
        fragment: Dict[str, Any],
        allSections: List[Dict[str, Any]],
        iteration: int
    ) -> Optional[List[Dict[str, Any]]]:
        """
        Merge a JSON fragment (any structure) into the last incomplete
        section via generic deep recursive merging.

        Target resolution, in order:
        1. Section whose id equals fragment["target_section_id"].
        2. Last section flagged incomplete by isSectionIncomplete.
        3. None found -> merge fails (returns None; no fallbacks).

        Args:
            fragment: Dict with "fragment_data" (any JSON) and
                "target_section_id" (may be None).
            allSections: Sections accumulated so far (list itself is
                shallow-copied; the target entry is replaced).
            iteration: Current iteration number (logging only).

        Returns:
            New sections list with the fragment merged in, or None when
            fragment_data is missing or no target section was found.
        """
        fragment_data = fragment.get("fragment_data")
        target_section_id = fragment.get("target_section_id")

        if fragment_data is None:
            logger.error(f"Iteration {iteration}: ❌ Fragment has no fragment_data - merge FAILED")
            return None

        # 1) Find the target section by id.
        target_section = None
        target_index = -1
        if target_section_id:
            for i, section in enumerate(allSections):
                if section.get("id") == target_section_id:
                    target_section = section
                    target_index = i
                    break

        # 2) Fallback lookup: last incomplete section (index computed back
        #    from the reversed position).
        if not target_section:
            for i, section in enumerate(reversed(allSections)):
                if JsonResponseHandler.isSectionIncomplete(section):
                    target_section = section
                    target_index = len(allSections) - 1 - i
                    break

        # 3) NO FALLBACKS beyond that: without a target the merge fails.
        if not target_section:
            logger.error(f"Iteration {iteration}: ❌ MERGE FAILED - No target section found for fragment!")
            logger.error(f"Iteration {iteration}: Available sections: {[s.get('id') + ' (' + s.get('content_type', 'unknown') + ')' for s in allSections]}")
            return None

        # Normalize "elements" to a non-empty list whose last entry is a
        # dict - that entry is the merge anchor.
        # NOTE(review): .copy() is shallow; the elements list/element dicts
        # may be shared with the original section — confirm acceptable.
        merged_section = target_section.copy()
        elements = merged_section.get("elements", [])
        if not isinstance(elements, list):
            elements = [elements] if elements else []
        if not elements:
            elements = [{}]
        last_element = elements[-1] if elements else {}
        if not isinstance(last_element, dict):
            # Non-dict tail: keep it and append a fresh dict as the anchor.
            last_element = {}
            elements.append(last_element)

        # Single generic merge path for ALL fragment shapes (arrays,
        # objects, nested, primitives), including overlap detection and
        # plain continuation-after-cut-off.
        merged_element = JsonResponseHandler.mergeDeepStructures(
            last_element, fragment_data, iteration, f"section.{target_section_id}.fragment"
        )
        elements[-1] = merged_element
        merged_section["elements"] = elements

        # Replace the target entry so the next iteration sees merged data.
        merged_sections = allSections.copy()
        merged_sections[target_index] = merged_section
        logger.info(f"Iteration {iteration}: ✅ Merged GENERIC fragment (type: {type(fragment_data).__name__}) into section '{target_section_id}'")

        # Best-effort debug dump of the merged state; failures are ignored.
        try:
            from modules.shared.debugLogger import writeDebugFile
            merged_json_str = json.dumps(merged_sections, indent=2, ensure_ascii=False)
            writeDebugFile(merged_json_str, f"merged_json_iteration_{iteration}.json")
        except Exception as e:
            logger.debug(f"Iteration {iteration}: Failed to write merged JSON debug file: {e}")

        return merged_sections
""" completed_sections = [] for section in allSections: completed_section = JsonResponseHandler._completeStructure(section) completed_sections.append(completed_section) return completed_sections @staticmethod def _completeStructure(structure: Any) -> Any: """ Recursively complete incomplete structures by ensuring arrays/objects are properly structured. Works generically for ANY JSON structure - no specific logic for content types. """ if isinstance(structure, dict): completed = {} for key, value in structure.items(): completed[key] = JsonResponseHandler._completeStructure(value) return completed elif isinstance(structure, list): completed = [] for item in structure: completed.append(JsonResponseHandler._completeStructure(item)) return completed else: # Primitive value - return as is return structure @staticmethod def getContentTypeForFragment(fragment_type: str) -> str: """Map fragment type to content type.""" mapping = { "table_rows": "table", "table_element": "table", "code_lines": "code_block", "code_element": "code_block", "list_items": "bullet_list" } return mapping.get(fragment_type, "paragraph") @staticmethod def deepCompare(obj1: Any, obj2: Any, max_depth: int = 10) -> bool: """ Deep recursive comparison of two JSON-serializable objects. Handles nested structures of any depth and complexity. 
Args: obj1: First object to compare obj2: Second object to compare max_depth: Maximum recursion depth to prevent infinite loops Returns: True if objects are deeply equal, False otherwise """ if max_depth <= 0: return False # Type check if type(obj1) != type(obj2): return False # Primitive types if isinstance(obj1, (str, int, float, bool, type(None))): return obj1 == obj2 # Lists/arrays - compare element by element if isinstance(obj1, list): if len(obj1) != len(obj2): return False return all(JsonResponseHandler.deepCompare(item1, item2, max_depth - 1) for item1, item2 in zip(obj1, obj2)) # Dicts/objects - compare key by key if isinstance(obj1, dict): if set(obj1.keys()) != set(obj2.keys()): return False return all(JsonResponseHandler.deepCompare(obj1[key], obj2[key], max_depth - 1) for key in obj1.keys()) # Fallback for other types return obj1 == obj2 @staticmethod def findLongestCommonSuffix( existing_list: List[Any], new_list: List[Any], min_overlap: int = 1 ) -> int: """ Find the longest common suffix of existing_list that matches a prefix of new_list. This handles cases where multiple elements overlap: - existing: [A, B, C, D] - new: [C, D, E, F] - overlap: [C, D] (length 2) Returns the length of the overlap (0 if no overlap found). """ if not existing_list or not new_list: return 0 max_overlap = min(len(existing_list), len(new_list)) # Try all possible overlap lengths (from longest to shortest) for overlap_len in range(max_overlap, min_overlap - 1, -1): existing_suffix = existing_list[-overlap_len:] new_prefix = new_list[:overlap_len] # Deep compare suffix and prefix if all(JsonResponseHandler.deepCompare(existing_suffix[i], new_prefix[i]) for i in range(overlap_len)): return overlap_len return 0 @staticmethod def findPartialOverlap( existing_item: Any, new_item: Any ) -> Tuple[bool, Optional[Any]]: """ Detect if new_item completes an incomplete existing_item. 
Handles cases like: - existing: ["37643", "37649", "37657", "37663", "37691", "37693", "37699", "37717", "37747", "376"] - new: ["37643", "37649", ...] Returns (is_partial_overlap, merged_item) if partial overlap detected, else (False, None). """ # Check if both are lists if isinstance(existing_item, list) and isinstance(new_item, list): if not existing_item or not new_item: return False, None # Check if last element of existing is incomplete and matches first of new last_existing = existing_item[-1] first_new = new_item[0] # If last existing is a string and first new is a string if isinstance(last_existing, str) and isinstance(first_new, str): # Check if last existing is incomplete (very short, ends with number, etc.) if len(last_existing) < 10 and first_new.startswith(last_existing): # Partial overlap - merge them merged_last = last_existing + first_new[len(last_existing):] merged_item = existing_item[:-1] + [merged_last] + new_item[1:] return True, merged_item # Check if last existing is incomplete list and first new completes it if isinstance(last_existing, list) and isinstance(first_new, list): if len(last_existing) < len(first_new): # Check if last existing is prefix of first new if first_new[:len(last_existing)] == last_existing: # Merge: replace incomplete last with complete first merged_item = existing_item[:-1] + [first_new] + new_item[1:] return True, merged_item # Check if existing is incomplete string and new completes it if isinstance(existing_item, str) and isinstance(new_item, str): if len(existing_item) < 50 and new_item.startswith(existing_item): # Partial overlap merged = existing_item + new_item[len(existing_item):] return True, merged return False, None @staticmethod def mergeRowsWithOverlap( existing_rows: List[List[str]], new_rows: List[List[str]], iteration: int ) -> List[List[str]]: """ Merge table rows with sophisticated overlap detection. Handles multiple overlapping rows and partial overlaps. 
""" if not new_rows: return existing_rows if not existing_rows: return new_rows # Strategy 1: Find longest common suffix/prefix overlap overlap_len = JsonResponseHandler.findLongestCommonSuffix(existing_rows, new_rows, min_overlap=1) if overlap_len > 0: logger.debug(f"Iteration {iteration}: Found {overlap_len} overlapping table rows, removing duplicates") return existing_rows + new_rows[overlap_len:] # Strategy 2: Check for partial overlap in last row if len(existing_rows) > 0 and len(new_rows) > 0: last_existing = existing_rows[-1] first_new = new_rows[0] is_partial, merged_row = JsonResponseHandler.findPartialOverlap(last_existing, first_new) if is_partial: logger.debug(f"Iteration {iteration}: Found partial overlap in table rows, merging") return existing_rows[:-1] + [merged_row] + new_rows[1:] # Strategy 3: Simple first/last comparison (fallback) if isinstance(existing_rows[-1], list) and isinstance(new_rows[0], list): if list(existing_rows[-1]) == list(new_rows[0]): logger.debug(f"Iteration {iteration}: Removed duplicate table row (exact match)") return existing_rows + new_rows[1:] # No overlap detected - append all new rows return existing_rows + new_rows @staticmethod def mergeItemsWithOverlap( existing_items: List[str], new_items: List[str], iteration: int ) -> List[str]: """ Merge list items with sophisticated overlap detection. Handles multiple overlapping items and partial overlaps. 
""" if not new_items: return existing_items if not existing_items: return new_items # Strategy 1: Find longest common suffix/prefix overlap overlap_len = JsonResponseHandler.findLongestCommonSuffix(existing_items, new_items, min_overlap=1) if overlap_len > 0: logger.debug(f"Iteration {iteration}: Found {overlap_len} overlapping list items, removing duplicates") return existing_items + new_items[overlap_len:] # Strategy 2: Check for partial overlap in last item if len(existing_items) > 0 and len(new_items) > 0: is_partial, merged_item = JsonResponseHandler.findPartialOverlap(existing_items[-1], new_items[0]) if is_partial: logger.debug(f"Iteration {iteration}: Found partial overlap in list items, merging") return existing_items[:-1] + [merged_item] + new_items[1:] # Strategy 3: Simple first/last comparison (fallback) if existing_items[-1] == new_items[0]: logger.debug(f"Iteration {iteration}: Removed duplicate list item (exact match)") return existing_items + new_items[1:] # No overlap detected - append all new items return existing_items + new_items @staticmethod def mergeDeepStructures( existing: Any, new: Any, iteration: int, path: str = "root" ) -> Any: """ FULLY GENERIC recursive merge for ANY JSON structure of arbitrary depth/complexity. Handles ALL cases generically: 1. Arrays/Lists: Overlap detection (suffix/prefix), partial overlap, no overlap (continuation) 2. Objects/Dicts: Key-by-key merge with overlap detection for nested structures 3. Primitives: Equality check, replacement if different 4. Nested structures: Recursively handles any depth/complexity Overlap detection strategies (all generic): - Array overlap: Finds longest common suffix/prefix, handles partial overlaps - Object overlap: Detected recursively through key matching and deep comparison - No overlap: Appends/merges continuation content after cut-off point CRITICAL: Fully generic - no specific logic for content types. 
Works for ANY JSON structure: arrays, objects, nested, primitives, any combination. """ # Type check if type(existing) != type(new): # Types don't match - return new (replacement) logger.debug(f"Iteration {iteration}: Types don't match at {path} ({type(existing).__name__} vs {type(new).__name__}), replacing") return new # Lists/arrays - GENERIC merge with overlap detection if isinstance(existing, list) and isinstance(new, list): if not new: return existing if not existing: return new # Strategy 1: Find longest common suffix/prefix overlap (handles multiple overlapping elements) overlap_len = JsonResponseHandler.findLongestCommonSuffix(existing, new, min_overlap=1) if overlap_len > 0: logger.debug(f"Iteration {iteration}: Found {overlap_len} overlapping elements at {path}, removing duplicates") return existing + new[overlap_len:] # Strategy 2: Check for partial overlap in last element (incomplete element completion) if len(existing) > 0 and len(new) > 0: is_partial, merged_item = JsonResponseHandler.findPartialOverlap(existing[-1], new[0]) if is_partial: logger.debug(f"Iteration {iteration}: Found partial overlap at {path}, merging incomplete element") return existing[:-1] + [merged_item] + new[1:] # Strategy 3: No overlap detected - continuation after cut-off point # This handles the case where new data starts exactly after the cut-off logger.debug(f"Iteration {iteration}: No overlap at {path}, appending continuation content ({len(new)} items)") return existing + new # Dicts/objects - GENERIC merge with recursive overlap detection if isinstance(existing, dict) and isinstance(new, dict): merged = existing.copy() # Check for object-level overlap: if new object is subset/superset of existing # This handles cases where same object structure appears in both existing_keys = set(existing.keys()) new_keys = set(new.keys()) # If new is subset of existing and values match, it's overlap (skip) if new_keys.issubset(existing_keys): all_match = True for key in new_keys: if not 
JsonResponseHandler.deepCompare(existing[key], new[key]): all_match = False break if all_match: logger.debug(f"Iteration {iteration}: Object at {path} is subset overlap, skipping") return existing # Merge key-by-key with recursive overlap detection for key, new_value in new.items(): if key in merged: # Key exists - merge recursively (handles nested overlap detection) merged[key] = JsonResponseHandler.mergeDeepStructures( merged[key], new_value, iteration, f"{path}.{key}" ) else: # New key - add it (continuation content) merged[key] = new_value logger.debug(f"Iteration {iteration}: Added new key '{key}' at {path} (continuation)") return merged # Primitives - equality check if existing == new: return existing # Different primitive values - return new (continuation/replacement) logger.debug(f"Iteration {iteration}: Primitive at {path} differs, using new value") return new @staticmethod def cleanEncodingIssues(jsonString: str) -> str: """ GENERIC function to remove problematic encoding parts from JSON string. Works for ANY JSON structure - removes problematic characters/bytes. Args: jsonString: JSON string that may have encoding issues Returns: Cleaned JSON string """ try: # Try to decode/encode to detect issues jsonString.encode('utf-8').decode('utf-8') return jsonString except UnicodeError: # Remove problematic parts cleaned = jsonString.encode('utf-8', errors='ignore').decode('utf-8', errors='ignore') logger.warning("Removed encoding issues from JSON string") return cleaned @staticmethod def mergeJsonStringsWithOverlap( accumulated: str, newFragment: str ) -> str: """ GENERIC function to merge two JSON strings, handling overlaps intelligently. Works for ANY JSON structure - no specific logic for content types. Overlap scenarios (all handled generically): - Exact continuation: newFragment starts exactly where accumulated ends - Partial overlap: newFragment overlaps with end of accumulated - Full overlap: newFragment is subset of accumulated Strategy: 1. 
Find longest common suffix/prefix match (string-based comparison) 2. Remove duplicate content 3. Concatenate remaining parts Args: accumulated: Previously accumulated JSON string newFragment: New fragment string to append Returns: Combined JSON string with overlaps removed """ if not accumulated: return newFragment if not newFragment: return accumulated # Find longest common suffix/prefix match # Try different overlap lengths (from longest to shortest) # Overlaps can be as small as 1 character, so we check all possible lengths maxOverlapLen = min(len(accumulated), len(newFragment)) # Start from maximum possible overlap down to 1 character # This ensures we find the longest overlap, even if it's just 1 character for overlapLen in range(maxOverlapLen, 0, -1): accumulatedSuffix = accumulated[-overlapLen:] newFragmentPrefix = newFragment[:overlapLen] if accumulatedSuffix == newFragmentPrefix: # Found overlap - remove duplicate part logger.debug(f"Found overlap of {overlapLen} characters, removing duplicate") return accumulated + newFragment[overlapLen:] # No overlap found - simple concatenation return accumulated + newFragment @staticmethod def isJsonComplete(parsedJson: Dict[str, Any]) -> bool: """ GENERIC function to check if parsed JSON structure is complete. Works for ANY JSON structure - no specific logic for content types. 
Completeness checks (all generic): - All arrays are properly closed - All objects are properly closed - No incomplete structures - Recursive validation of nested structures Args: parsedJson: Parsed JSON object Returns: True if JSON is complete, False otherwise """ def _checkStructureComplete(obj: Any, depth: int = 0) -> bool: """Recursively check if structure is complete.""" if depth > 50: # Prevent infinite recursion return True if isinstance(obj, dict): # Check all values recursively for value in obj.values(): if not _checkStructureComplete(value, depth + 1): return False return True elif isinstance(obj, list): # Check all items recursively for item in obj: if not _checkStructureComplete(item, depth + 1): return False return True else: # Primitive value - always complete return True try: return _checkStructureComplete(parsedJson) except Exception as e: logger.debug(f"Error checking JSON completeness: {e}") return False @staticmethod def finalizeJson(parsedJson: Dict[str, Any]) -> Dict[str, Any]: """ GENERIC function to finalize complete JSON by adding missing closing elements and repairing corruption. Works for ANY JSON structure - no specific logic for content types. Steps (all generic): 1. Analyze structure for missing closing elements (recursively) 2. Add closing brackets/braces where needed 3. Repair any remaining corruption 4. Validate final structure Args: parsedJson: Parsed JSON object that needs finalization Returns: Finalized JSON object """ # For now, just return as-is since parsing succeeded # If needed, can add logic to check for incomplete structures # and add closing elements return parsedJson @staticmethod def extractKpiValuesFromJson( parsedJson: Dict[str, Any], kpis: List[Dict[str, Any]] ) -> List[Dict[str, Any]]: """ Extract current KPI values from parsed JSON and update KPI objects. 
Args: parsedJson: Parsed JSON object kpis: List of KPI objects (will be updated with currentValue) Returns: Updated list of KPI objects with currentValue set """ updatedKpis = [] for kpi in kpis: kpiId = kpi.get("id") jsonPath = kpi.get("jsonPath") if not kpiId or not jsonPath: continue # Create copy of KPI object updatedKpi = kpi.copy() try: # Extract value using JSON path # Simple path format: "sections[0].elements[0].items" or "sections[0].elements[0].rows" value = JsonResponseHandler._extractValueByPath(parsedJson, jsonPath) # Handle None (path doesn't exist - incomplete JSON) if value is None: updatedKpi["currentValue"] = kpi.get("currentValue", 0) logger.debug(f"KPI {kpiId} path {jsonPath} not found in JSON (incomplete), keeping current value {updatedKpi['currentValue']}") # Count items/rows/elements based on type elif isinstance(value, list): updatedKpi["currentValue"] = len(value) logger.debug(f"Extracted KPI {kpiId} from path {jsonPath}: list with {len(value)} items") elif isinstance(value, (int, float)): updatedKpi["currentValue"] = int(value) logger.debug(f"Extracted KPI {kpiId} from path {jsonPath}: numeric value {int(value)}") else: updatedKpi["currentValue"] = 0 logger.debug(f"Extracted KPI {kpiId} from path {jsonPath}: non-list/non-numeric value, set to 0") except Exception as e: logger.warning(f"Error extracting KPI {kpiId} from path {jsonPath}: {e}") updatedKpi["currentValue"] = kpi.get("currentValue", 0) updatedKpis.append(updatedKpi) return updatedKpis @staticmethod def extractKpiValuesFromIncompleteJson( jsonString: str, kpis: List[Dict[str, Any]] ) -> List[Dict[str, Any]]: """ Extract KPI values from incomplete JSON string. Uses existing JSON completion function to close incomplete structures, then extracts KPIs. 
Args: jsonString: Incomplete JSON string kpis: List of KPI objects Returns: Updated list of KPI objects with currentValue set """ updatedKpis = [] for kpi in kpis: kpiId = kpi.get("id") jsonPath = kpi.get("jsonPath") if not kpiId or not jsonPath: continue updatedKpi = kpi.copy() try: # Use existing JSON completion function to close incomplete structures from modules.shared.jsonUtils import extractJsonString, closeJsonStructures # Extract JSON string and complete it with missing closing elements extracted = extractJsonString(jsonString) completed = closeJsonStructures(extracted) # Parse completed JSON parsed = json.loads(completed) # Extract value using path value = JsonResponseHandler._extractValueByPath(parsed, jsonPath) # Handle None (path doesn't exist - incomplete JSON) if value is None: updatedKpi["currentValue"] = kpi.get("currentValue", 0) logger.debug(f"KPI {kpiId} path {jsonPath} not found in completed JSON (still incomplete), keeping current value {updatedKpi['currentValue']}") # Count items/rows/elements based on type elif isinstance(value, list): updatedKpi["currentValue"] = len(value) logger.debug(f"Extracted KPI {kpiId} from completed JSON: list with {len(value)} items") elif isinstance(value, (int, float)): updatedKpi["currentValue"] = int(value) logger.debug(f"Extracted KPI {kpiId} from completed JSON: numeric value {int(value)}") else: updatedKpi["currentValue"] = 0 logger.debug(f"Extracted KPI {kpiId} from completed JSON: non-list/non-numeric value, set to 0") except Exception as e: logger.warning(f"Error extracting KPI {kpiId} from incomplete JSON: {e}") updatedKpi["currentValue"] = kpi.get("currentValue", 0) updatedKpis.append(updatedKpi) return updatedKpis @staticmethod def _extractValueByPath(obj: Any, path: str) -> Any: """ Extract value from object using dot-notation path with array indices. Example: "sections[0].elements[0].items" Returns None if path doesn't exist (for incomplete JSON handling). 
""" parts = path.split('.') current = obj for part in parts: if '[' in part and ']' in part: # Handle array access: "sections[0]" key = part[:part.index('[')] index = int(part[part.index('[') + 1:part.index(']')]) if key: if isinstance(current, dict): current = current.get(key) if current is None: return None # Key doesn't exist else: return None # Can't access key on non-dict if isinstance(current, list): if 0 <= index < len(current): current = current[index] else: # Index out of range - return None for incomplete JSON return None else: # Not a list, can't index return None else: # Handle dict access if isinstance(current, dict): current = current.get(part) if current is None: return None # Key doesn't exist else: return None # Can't access key on non-dict return current @staticmethod def validateKpiProgression( accumulationState: JsonAccumulationState, updatedKpis: List[Dict[str, Any]] ) -> Tuple[bool, str]: """ Validate KPI progression from parsed JSON. Validation rules: - Proceed if: At least ONE KPI increased - Stop if: Any KPI went backwards → return (False, "KPI went backwards") - Stop if: No KPIs progressed → return (False, "No progress") - Finish if: All KPIs completed OR JSON is complete → return (True, "Complete") Args: accumulationState: Current accumulation state (contains kpis) updatedKpis: Updated KPI objects with currentValue set Returns: Tuple of (shouldProceed, reason) """ if not accumulationState.kpis: # No KPIs defined - always proceed return True, "No KPIs defined" # Build dict of last values for comparison lastValues = {kpi.get("id"): kpi.get("currentValue", 0) for kpi in accumulationState.kpis} logger.debug(f"KPI validation: lastValues = {lastValues}") logger.debug(f"KPI validation: updatedKpis = {[(kpi.get('id'), kpi.get('currentValue')) for kpi in updatedKpis]}") # Check if any KPI went backwards for updatedKpi in updatedKpis: kpiId = updatedKpi.get("id") currentValue = updatedKpi.get("currentValue", 0) if kpiId in lastValues: lastValue = 
lastValues[kpiId] if currentValue < lastValue: logger.warning(f"KPI {kpiId} went BACKWARDS: {lastValue} → {currentValue}") return False, f"KPI {kpiId} went backwards" # Check if all KPIs are completed allCompleted = True for updatedKpi in updatedKpis: targetValue = updatedKpi.get("targetValue", 0) currentValue = updatedKpi.get("currentValue", 0) if currentValue < targetValue: allCompleted = False break if allCompleted: logger.info("All KPIs completed") return True, "All KPIs completed" # Check if at least one KPI progressed atLeastOneProgressed = False for updatedKpi in updatedKpis: kpiId = updatedKpi.get("id") currentValue = updatedKpi.get("currentValue", 0) if kpiId in lastValues: lastValue = lastValues[kpiId] if currentValue > lastValue: atLeastOneProgressed = True logger.info(f"KPI {kpiId} progressed: {lastValue} → {currentValue}") break else: # First time seeing this KPI - if it has a value, it's progress if currentValue > 0: atLeastOneProgressed = True logger.info(f"KPI {kpiId} initialized: {currentValue}") break if not atLeastOneProgressed: logger.warning(f"No KPIs progressed. Last values: {lastValues}, Current values: {[(kpi.get('id'), kpi.get('currentValue')) for kpi in updatedKpis]}") return False, "No progress" return True, "Progress detected" @staticmethod def accumulateAndParseJsonFragments( accumulatedJsonString: str, newFragmentString: str, allSections: List[Dict[str, Any]], iteration: int ) -> Tuple[str, List[Dict[str, Any]], bool, Optional[Dict[str, Any]]]: """ Accumulate JSON fragments and parse when complete. GENERIC function that handles: 1. Concatenating JSON strings with overlap detection 2. Parsing the accumulated string 3. Extracting sections (partial if incomplete, final if complete) 4. 
Determining completion status Args: accumulatedJsonString: Previously accumulated JSON string newFragmentString: New fragment string from current iteration allSections: Sections extracted so far (for prompt context) iteration: Current iteration number Returns: Tuple of: - accumulatedJsonString: Updated accumulated string - sections: Extracted sections (partial if incomplete, final if complete) - isComplete: True if JSON is complete and valid - parsedResult: Parsed JSON object (if parsing succeeded) """ # Step 1: Clean encoding issues from accumulated string (check end of first delivered part) cleanedAccumulated = JsonResponseHandler.cleanEncodingIssues(accumulatedJsonString) # Step 2: Clean encoding issues from new fragment cleanedFragment = JsonResponseHandler.cleanEncodingIssues(newFragmentString) # Step 3: Concatenate with overlap handling combinedString = JsonResponseHandler.mergeJsonStringsWithOverlap( cleanedAccumulated, cleanedFragment ) # Step 4: Try to parse try: extracted = extractJsonString(combinedString) parsedResult = json.loads(extracted) # Step 5: Parsing succeeded - check completeness isComplete = JsonResponseHandler.isJsonComplete(parsedResult) if isComplete: # Step 6: Complete JSON - finalize finalizedJson = JsonResponseHandler.finalizeJson(parsedResult) sections = extractSectionsFromDocument(finalizedJson) logger.info(f"Iteration {iteration}: JSON accumulation complete, extracted {len(sections)} sections") return combinedString, sections, True, finalizedJson else: # Step 7: Incomplete but parseable - extract partial sections sections = extractSectionsFromDocument(parsedResult) logger.info(f"Iteration {iteration}: JSON accumulation incomplete but parseable, extracted {len(sections)} partial sections") return combinedString, sections, False, parsedResult except json.JSONDecodeError: # Step 8: Still broken - repair and extract partial sections repaired = repairBrokenJson(combinedString) if repaired: sections = extractSectionsFromDocument(repaired) 
logger.info(f"Iteration {iteration}: JSON accumulation repaired, extracted {len(sections)} sections") return combinedString, sections, False, repaired else: # Repair failed - continue with data BEFORE merging the problematic piece # Return previous accumulated string (before adding new fragment) # This ensures we don't lose previously accumulated data logger.warning(f"Iteration {iteration}: Repair failed, continuing with previous accumulated data") return accumulatedJsonString, [], False, None