# gateway/modules/services/serviceAi/subJsonResponseHandling.py
# ValueOn AG 64590aa61e fixes
# 2026-01-04 20:01:34 +01:00
# 3134 lines, 139 KiB, Python
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
JSON Response Handling Module
Handles merging of JSON responses from multiple AI iterations, including:
- Section merging with intelligent overlap detection
- JSON fragment detection and merging
- Deep recursive structure merging
- Overlap detection for complex nested structures
- String accumulation for iterative JSON generation
"""
import json
import logging
import re
from typing import Dict, Any, List, Optional, Tuple
from modules.shared.jsonUtils import extractJsonString, repairBrokenJson, extractSectionsFromDocument
from modules.datamodels.datamodelAi import JsonAccumulationState
logger = logging.getLogger(__name__)
class JsonResponseHandler:
"""Handles JSON response merging and fragment detection for iterative AI generation."""
@staticmethod
def mergeSectionsIntelligently(
    existingSections: List[Dict[str, Any]],
    newSections: List[Dict[str, Any]],
    iteration: int
) -> List[Dict[str, Any]]:
    """
    Intelligently merge sections from multiple iterations.

    This is a GENERIC merging strategy that handles broken JSON iterations.
    The break can occur anywhere - in any section, at any depth.

    Merging strategies (in order of priority):
    1. Same Section ID: merge sections with identical IDs
    2. Same Content-Type + Position: if the last accumulated section is
       incomplete and the new section has the same content type, treat the
       new section as its continuation
    3. Same Order: merge sections with the same "order" value
    4. Structural Analysis: adjacent code_block/table sections are merged
       even without other signals (common case for broken JSON iterations)

    Args:
        existingSections: Sections accumulated from previous iterations.
        newSections: Sections extracted from the current iteration.
        iteration: Current iteration number (used only in log messages).

    Returns:
        Merged list of sections; new sections matching no strategy are appended.
    """
    # Trivial cases: one side empty - nothing to merge.
    if not newSections:
        return existingSections
    if not existingSections:
        return newSections
    # NOTE(review): shallow copy - merging mutates the original section dicts
    # in place via mergeSectionContent; callers appear to rely on this style.
    mergedSections = existingSections.copy()
    for newSection in newSections:
        merged = False
        # Strategy 1: Same Section ID - merge directly.
        newSectionId = newSection.get("id")
        if newSectionId:
            for i, existingSection in enumerate(mergedSections):
                if existingSection.get("id") == newSectionId:
                    # Merge sections with same ID.
                    mergedSections[i] = JsonResponseHandler.mergeSectionContent(
                        existingSection, newSection, iteration
                    )
                    merged = True
                    logger.debug(f"Iteration {iteration}: Merged section by ID '{newSectionId}'")
                    break
        if merged:
            continue
        # Strategy 2: Same Content-Type + Position (continuation detection).
        # Only the LAST accumulated section is a continuation candidate.
        if mergedSections:
            lastSection = mergedSections[-1]
            lastContentType = lastSection.get("content_type")
            newContentType = newSection.get("content_type")
            if lastContentType == newContentType:
                # Same content type - merge only if the last section looks cut off.
                if JsonResponseHandler.isSectionIncomplete(lastSection):
                    # Last section is incomplete, merge with new section.
                    mergedSections[-1] = JsonResponseHandler.mergeSectionContent(
                        lastSection, newSection, iteration
                    )
                    merged = True
                    logger.debug(f"Iteration {iteration}: Merged section by content-type continuation ({lastContentType})")
                    continue
        # Strategy 3: Same Order value (first match wins).
        newOrder = newSection.get("order")
        if newOrder is not None:
            for i, existingSection in enumerate(mergedSections):
                existingOrder = existingSection.get("order")
                if existingOrder is not None and existingOrder == newOrder:
                    # Merge sections with same order.
                    mergedSections[i] = JsonResponseHandler.mergeSectionContent(
                        existingSection, newSection, iteration
                    )
                    merged = True
                    logger.debug(f"Iteration {iteration}: Merged section by order {newOrder}")
                    break
        if merged:
            continue
        # Strategy 4: Structural Analysis - merge boundary sections of the same
        # "streaming-prone" type (code_block/table) even without an incompleteness signal.
        if mergedSections:
            lastSection = mergedSections[-1]
            lastContentType = lastSection.get("content_type")
            newContentType = newSection.get("content_type")
            # Both are code blocks - merge them.
            if lastContentType == "code_block" and newContentType == "code_block":
                mergedSections[-1] = JsonResponseHandler.mergeSectionContent(
                    lastSection, newSection, iteration
                )
                merged = True
                logger.debug(f"Iteration {iteration}: Merged code_block sections by structural analysis")
                continue
            # Both are tables - merge them (common case for broken JSON iterations).
            if lastContentType == "table" and newContentType == "table":
                mergedSections[-1] = JsonResponseHandler.mergeSectionContent(
                    lastSection, newSection, iteration
                )
                merged = True
                logger.debug(f"Iteration {iteration}: Merged table sections by structural analysis")
                continue
        # No merge strategy matched - add as new section.
        if not merged:
            mergedSections.append(newSection)
            logger.debug(f"Iteration {iteration}: Added new section '{newSection.get('id', 'no-id')}' ({newSection.get('content_type', 'unknown')})")
    return mergedSections
@staticmethod
def isSectionIncomplete(section: Dict[str, Any]) -> bool:
"""
Check if a section is incomplete (broken at the end).
This detects incomplete sections based on content analysis:
- Code blocks: ends mid-line, ends with comma, ends with incomplete structure
- Text sections: ends mid-sentence, ends with incomplete structure
- Other types: check for incomplete elements
"""
contentType = section.get("content_type", "")
elements = section.get("elements", [])
if not elements:
return False
# Handle list of elements
if isinstance(elements, list) and len(elements) > 0:
lastElement = elements[-1]
else:
lastElement = elements
if not isinstance(lastElement, dict):
return False
# Check code_block for incomplete code
if contentType == "code_block":
code = lastElement.get("code", "")
if code:
# Check if code ends incompletely:
# - Ends with comma (incomplete CSV line)
# - Ends with number but no newline (incomplete line)
# - Ends mid-token (e.g., "23431,23" - incomplete number)
codeStripped = code.rstrip()
if codeStripped:
# Check for incomplete patterns
if codeStripped.endswith(',') or (',' in codeStripped and not codeStripped.endswith('\n')):
# Ends with comma or has comma but no final newline - likely incomplete
return True
# Check if last line is incomplete (doesn't end with newline and has partial content)
if not code.endswith('\n') and codeStripped:
# No final newline - might be incomplete
# More sophisticated: check if last number is complete
lastLine = codeStripped.split('\n')[-1]
if lastLine and ',' in lastLine:
# Has commas but might be incomplete
parts = lastLine.split(',')
if parts and len(parts[-1]) < 5: # Last part is very short - might be incomplete
return True
# Check table for incomplete rows
if contentType == "table":
rows = lastElement.get("rows", [])
if rows:
# Check if last row is incomplete (ends with incomplete data)
lastRow = rows[-1] if isinstance(rows, list) else []
if isinstance(lastRow, list) and lastRow:
# CRITICAL: Check if last row doesn't have expected number of columns (if headers exist)
# This is the PRIMARY indicator of incomplete table rows
headers = lastElement.get("headers", [])
if headers and isinstance(headers, list):
expectedCols = len(headers)
if len(lastRow) < expectedCols:
logger.debug(f"Table section incomplete: last row has {len(lastRow)} columns, expected {expectedCols}")
return True
# Also check if last row ends with incomplete data (e.g., incomplete string)
lastCell = lastRow[-1] if lastRow else ""
if isinstance(lastCell, str):
# If last cell is incomplete (ends with quote or is very short), section might be incomplete
if lastCell.endswith('"') or (len(lastCell) < 3 and lastCell):
logger.debug(f"Table section incomplete: last cell appears incomplete: '{lastCell}'")
return True
# Additional check: if last row has fewer cells than previous rows, it's likely incomplete
if len(rows) > 1:
prevRow = rows[-2] if isinstance(rows, list) and len(rows) > 1 else []
if isinstance(prevRow, list) and len(prevRow) > len(lastRow):
logger.debug(f"Table section incomplete: last row has {len(lastRow)} cells, previous row has {len(prevRow)}")
return True
# Check paragraph/text for incomplete sentences
if contentType in ["paragraph", "heading"]:
text = lastElement.get("text", "")
if text:
# Simple heuristic: if doesn't end with sentence-ending punctuation
textStripped = text.rstrip()
if textStripped and not textStripped[-1] in '.!?':
# Might be incomplete, but this is less reliable
# Only mark as incomplete if very short (likely cut off)
if len(textStripped) < 20:
return True
# Check lists for incomplete items
if contentType in ["bullet_list", "numbered_list"]:
items = lastElement.get("items", [])
if items and isinstance(items, list):
# Check if last item is incomplete (very short or ends with incomplete string)
lastItem = items[-1] if items else None
if isinstance(lastItem, str) and len(lastItem) < 3:
return True
# Check image for incomplete base64 data
if contentType == "image":
imageData = lastElement.get("base64Data", "")
if imageData:
# Base64 strings should end with padding ('=' or '==')
# If it doesn't, it might be incomplete
stripped = imageData.rstrip()
if stripped and not stripped.endswith(('=', '==')):
# Check if it's a valid base64 character sequence that was cut off
if len(stripped) > 0 and stripped[-1] not in 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=':
return True
# If length is not a multiple of 4 (base64 requirement), it might be incomplete
if len(stripped) % 4 != 0:
return True
# GENERIC CHECK: Recursively analyze structure for incompleteness
# This works for ANY structure: arrays, objects, nested, primitives
return JsonResponseHandler._isStructureIncomplete(lastElement)
@staticmethod
def _isStructureIncomplete(structure: Any, max_depth: int = 10) -> bool:
"""
GENERIC recursive check for incomplete structures.
Detects incompleteness by analyzing patterns:
- Arrays: Last item shorter than previous items, incomplete patterns
- Objects: Last object has fewer keys than pattern, incomplete values
- Strings: Very short, ends abruptly, incomplete patterns
- Nested: Recursively checks nested structures
Works for ANY JSON structure of any depth/complexity.
"""
if max_depth <= 0:
return False
# Arrays/Lists - check for incomplete patterns
if isinstance(structure, list):
if len(structure) == 0:
return False
# Check if last item is incomplete compared to previous items
last_item = structure[-1]
# If we have previous items, compare structure
if len(structure) > 1:
prev_item = structure[-2]
# If last item is a list and previous is a list, check length
if isinstance(last_item, list) and isinstance(prev_item, list):
if len(last_item) < len(prev_item):
return True # Last row/item has fewer elements - likely incomplete
# If last item is a dict and previous is a dict, check keys
if isinstance(last_item, dict) and isinstance(prev_item, dict):
if len(last_item) < len(prev_item):
return True # Last object has fewer keys - likely incomplete
# Recursively check last item for incompleteness
if JsonResponseHandler._isStructureIncomplete(last_item, max_depth - 1):
return True
# Objects/Dicts - check for incomplete values
elif isinstance(structure, dict):
for key, value in structure.items():
# Recursively check each value
if JsonResponseHandler._isStructureIncomplete(value, max_depth - 1):
return True
# Check for incomplete strings
if isinstance(value, str):
# Very short strings might be incomplete
if len(value) > 0 and len(value) < 3:
return True
# Strings ending with incomplete patterns (comma, quote, etc.)
stripped = value.rstrip()
if stripped and stripped.endswith((',', '"', '\\')):
return True
# Strings - check for incomplete patterns
elif isinstance(structure, str):
# Very short strings might be incomplete
if len(structure) > 0 and len(structure) < 3:
return True
# Strings ending with incomplete patterns
stripped = structure.rstrip()
if stripped and stripped.endswith((',', '"', '\\')):
return True
return False
@staticmethod
def mergeSectionContent(
    existingSection: Dict[str, Any],
    newSection: Dict[str, Any],
    iteration: int
) -> Dict[str, Any]:
    """
    Merge content from two sections of the same logical section.

    Joins the LAST element of the existing section with the FIRST element of
    the new section, per content type:
    - code_block: append code via mergeCodeBlocks (overlap-aware)
    - paragraph/heading: append text with space/newline glue
    - table: merge rows with overlap detection; supports both the nested
      element["content"]["rows"] format and the legacy element["rows"] format
    - bullet_list/numbered_list: merge items with overlap detection
    - image: append base64 data when the existing payload looks truncated,
      otherwise replace it
    - anything else: generic deep recursive merge (mergeDeepStructures)

    Args:
        existingSection: Accumulated section (MUTATED in place via its elements).
        newSection: Section from the current iteration.
        iteration: Current iteration number (used for logging only).

    Returns:
        A shallow copy of existingSection with merged "elements"
        (and "order" taken from newSection if missing).
    """
    contentType = existingSection.get("content_type", "")
    existingElements = existingSection.get("elements", [])
    newElements = newSection.get("elements", [])
    # Nothing new to merge - keep the existing section untouched.
    if not newElements:
        return existingSection
    # The merge happens at the seam: last existing element vs first new element.
    if isinstance(existingElements, list):
        existingElem = existingElements[-1] if existingElements else {}
    else:
        existingElem = existingElements
    if isinstance(newElements, list):
        newElem = newElements[0] if newElements else {}
    else:
        newElem = newElements
    # Only dict elements can be merged field-by-field.
    if not isinstance(existingElem, dict) or not isinstance(newElem, dict):
        return existingSection
    # Merge based on content type.
    # NOTE(review): existingElem is mutated in place below, so the original
    # existingSection's elements are updated as a side effect.
    if contentType == "code_block":
        existingCode = existingElem.get("code", "")
        newCode = newElem.get("code", "")
        if existingCode and newCode:
            mergedCode = JsonResponseHandler.mergeCodeBlocks(existingCode, newCode, iteration)
            existingElem["code"] = mergedCode
        # Preserve language from existing or new.
        if "language" not in existingElem and "language" in newElem:
            existingElem["language"] = newElem["language"]
    elif contentType in ["paragraph", "heading"]:
        existingText = existingElem.get("text", "")
        newText = newElem.get("text", "")
        if existingText and newText:
            # Glue with a space when the existing text stops mid-sentence,
            # otherwise start the continuation on a new line.
            if existingText.rstrip() and not existingText.rstrip()[-1] in '.!?\n':
                mergedText = existingText.rstrip() + " " + newText.lstrip()
            else:
                mergedText = existingText.rstrip() + "\n" + newText.lstrip()
            existingElem["text"] = mergedText
    elif contentType == "table":
        # Merge table rows with sophisticated overlap detection.
        # Rows can live in two places:
        # 1. Nested: element["content"]["rows"] (current format)
        # 2. Direct: element["rows"] (legacy format)
        existingRows = None
        newRows = None
        # Check nested structure first (current format).
        if "content" in existingElem and isinstance(existingElem["content"], dict):
            existingRows = existingElem["content"].get("rows", [])
        # Fallback to direct structure (legacy format).
        if not existingRows:
            existingRows = existingElem.get("rows", [])
        # Check nested structure first (current format).
        if "content" in newElem and isinstance(newElem["content"], dict):
            newRows = newElem["content"].get("rows", [])
        # Fallback to direct structure (legacy format).
        if not newRows:
            newRows = newElem.get("rows", [])
        if existingRows and newRows:
            # Overlap-aware merge that handles multiple duplicated boundary rows.
            mergedRows = JsonResponseHandler.mergeRowsWithOverlap(existingRows, newRows, iteration)
            # Always store the result in the nested (current) format.
            if "content" not in existingElem:
                existingElem["content"] = {}
            existingElem["content"]["rows"] = mergedRows
            # Also set type if missing.
            if "type" not in existingElem:
                existingElem["type"] = "table"
            logger.debug(f"Iteration {iteration}: Merged table rows - existing: {len(existingRows)}, new: {len(newRows)}, total: {len(mergedRows)}")
        elif newRows:
            # Existing element has no rows yet - adopt the new rows wholesale.
            if "content" not in existingElem:
                existingElem["content"] = {}
            existingElem["content"]["rows"] = newRows
            if "type" not in existingElem:
                existingElem["type"] = "table"
        # Preserve headers from existing (or adopt new ones if existing has none).
        # Headers can be in content.headers or directly on the element.
        existingHeaders = existingElem.get("content", {}).get("headers", []) if "content" in existingElem else existingElem.get("headers", [])
        newHeaders = newElem.get("content", {}).get("headers", []) if "content" in newElem else newElem.get("headers", [])
        if not existingHeaders and newHeaders:
            if "content" not in existingElem:
                existingElem["content"] = {}
            existingElem["content"]["headers"] = newHeaders
        # Preserve caption from existing (or adopt the new one if existing has none).
        existingCaption = existingElem.get("content", {}).get("caption") if "content" in existingElem else existingElem.get("caption")
        newCaption = newElem.get("content", {}).get("caption") if "content" in newElem else newElem.get("caption")
        if not existingCaption and newCaption:
            if "content" not in existingElem:
                existingElem["content"] = {}
            existingElem["content"]["caption"] = newCaption
    elif contentType in ["bullet_list", "numbered_list"]:
        # Merge list items with sophisticated overlap detection.
        existingItems = existingElem.get("items", [])
        newItems = newElem.get("items", [])
        if existingItems and newItems:
            mergedItems = JsonResponseHandler.mergeItemsWithOverlap(existingItems, newItems, iteration)
            existingElem["items"] = mergedItems
        elif newItems:
            existingElem["items"] = newItems
    elif contentType == "image":
        # Images are typically complete - if a new image arrives, replace the
        # existing one, unless the existing base64 payload looks truncated.
        existingImageData = existingElem.get("base64Data", "")
        newImageData = newElem.get("base64Data", "")
        if existingImageData and newImageData:
            # Base64 padding is '=' at the end; a payload without it may be cut off.
            if not existingImageData.rstrip().endswith(('=', '==')):
                # Existing image looks incomplete - continue the payload.
                existingElem["base64Data"] = existingImageData + newImageData
                logger.debug(f"Iteration {iteration}: Merged incomplete image base64 data")
            else:
                # Existing image is complete - replace with the new payload.
                if newImageData:
                    existingElem["base64Data"] = newImageData
        elif newImageData:
            existingElem["base64Data"] = newImageData
        # Preserve other image metadata.
        if not existingElem.get("altText") and newElem.get("altText"):
            existingElem["altText"] = newElem["altText"]
        if not existingElem.get("caption") and newElem.get("caption"):
            existingElem["caption"] = newElem["caption"]
    else:
        # GENERIC FALLBACK: deep recursive merge for any other content type
        # with arbitrary depth and complexity.
        merged_element = JsonResponseHandler.mergeDeepStructures(
            existingElem,
            newElem,
            iteration,
            f"section.{contentType}"
        )
        existingElem = merged_element
    # Update section with merged content (shallow copy; elements list shared).
    mergedSection = existingSection.copy()
    if isinstance(existingElements, list):
        # Write the merged element back over the last slot.
        if existingElements:
            existingElements[-1] = existingElem
        mergedSection["elements"] = existingElements
    else:
        mergedSection["elements"] = existingElem
    # Preserve metadata from the new section if missing in the existing one.
    if "order" not in mergedSection and "order" in newSection:
        mergedSection["order"] = newSection["order"]
    return mergedSection
@staticmethod
def mergeCodeBlocks(existingCode: str, newCode: str, iteration: int) -> str:
"""
Merge two code blocks intelligently, handling overlaps and incomplete lines.
"""
if not existingCode:
return newCode
if not newCode:
return existingCode
existingLines = existingCode.rstrip().split('\n')
newLines = newCode.strip().split('\n')
if not existingLines or not newLines:
return existingCode + "\n" + newCode
lastExistingLine = existingLines[-1].strip()
firstNewLine = newLines[0].strip()
# Strategy 1: Exact overlap - remove duplicate line
if lastExistingLine == firstNewLine:
newLines = newLines[1:]
logger.debug(f"Iteration {iteration}: Removed exact duplicate line in code merge")
# Strategy 2: Incomplete line merge
# If last existing line ends with comma or is incomplete, merge with first new line
elif lastExistingLine.endswith(',') or (',' in lastExistingLine and len(lastExistingLine.split(',')[-1]) < 5):
# Last line is incomplete - merge with first new line
# Remove trailing comma from existing line
mergedLine = lastExistingLine.rstrip(',') + ',' + firstNewLine.lstrip()
existingLines[-1] = mergedLine
newLines = newLines[1:]
logger.debug(f"Iteration {iteration}: Merged incomplete line with continuation")
# Strategy 3: Partial overlap detection
# Check if first new line starts with the end of last existing line
elif ',' in lastExistingLine and ',' in firstNewLine:
lastExistingParts = lastExistingLine.split(',')
firstNewParts = firstNewLine.split(',')
# Check for overlap: if last part of existing matches first part of new
if lastExistingParts and firstNewParts:
lastExistingPart = lastExistingParts[-1].strip()
firstNewPart = firstNewParts[0].strip()
# If they match, there's overlap
if lastExistingPart == firstNewPart and len(lastExistingParts) > 1:
# Remove overlapping part from new line
newLines[0] = ','.join(firstNewParts[1:])
logger.debug(f"Iteration {iteration}: Removed partial overlap in code merge")
# Reconstruct merged code
mergedCode = '\n'.join(existingLines)
if newLines:
if mergedCode and not mergedCode.endswith('\n'):
mergedCode += '\n'
mergedCode += '\n'.join(newLines)
return mergedCode
@staticmethod
def detectAndParseJsonFragment(
    result: str,
    allSections: List[Dict[str, Any]]
) -> Optional[Dict[str, Any]]:
    """
    Decide whether a raw model response is a JSON *fragment* rather than a
    full document structure, and parse it if so.

    A fragment is any parseable JSON value (array, object, primitive, or an
    arbitrarily nested structure) whose top level does NOT carry the
    "documents" or "sections" keys that mark a complete document.

    Args:
        result: Raw response text from the current iteration.
        allSections: Sections accumulated so far; used to pick the merge target.

    Returns:
        {"fragment_data": <parsed value>, "target_section_id": <id or None>}
        for fragments; None when the response is a full document structure
        or cannot be parsed at all.
    """
    try:
        parsed = json.loads(extractJsonString(result))
        # A dict exposing "documents"/"sections" is a full document, not a fragment.
        if isinstance(parsed, dict) and ("documents" in parsed or "sections" in parsed):
            return None
        # Anything else is continuation content - aim it at the last
        # incomplete section (fully generic, not type-specific).
        target_section_id = JsonResponseHandler.findLastIncompleteSectionId(allSections)
        logger.info(f"Detected GENERIC JSON fragment (type: {type(parsed).__name__}), target: {target_section_id}")
        return {
            "fragment_data": parsed,
            "target_section_id": target_section_id
        }
    except Exception as e:
        logger.error(f"Error detecting JSON fragment: {e}")
        logger.debug(f"Fragment detection failed for result: {result[:500]}...")
        return None
@staticmethod
def findLastIncompleteSectionId(
allSections: List[Dict[str, Any]]
) -> Optional[str]:
"""
GENERIC: Find the last incomplete section (regardless of content type).
This is fully generic - works for ANY content type, ANY structure.
Returns the ID of the last section that is incomplete, or None if all are complete.
"""
# Find the last incomplete section (generic, not type-specific)
for section in reversed(allSections):
if JsonResponseHandler.isSectionIncomplete(section):
return section.get("id")
# If no incomplete section found, return last section as fallback
if allSections:
return allSections[-1].get("id")
return None
@staticmethod
def mergeFragmentIntoSection(
    fragment: Dict[str, Any],
    allSections: List[Dict[str, Any]],
    iteration: int
) -> Optional[List[Dict[str, Any]]]:
    """
    GENERIC fragment merging for ANY JSON structure.

    Merges a fragment (array, object, nested, primitive) into the last
    incomplete section using the generic element merger. Handles:
    1. Fragments with overlap (detected and merged intelligently)
    2. Fragments without overlap (appended as continuation)
    3. Accumulative merging (the merged result feeds the next iteration)

    Args:
        fragment: {"fragment_data": <any JSON value>, "target_section_id": <id or None>}.
        allSections: Accumulated sections; a shallow copy with the merged
            target section swapped in is returned.
        iteration: Current iteration number (used for logging/debug files).

    Returns:
        Updated sections list, or None when no fragment data or no target
        section could be found (NO FALLBACKS - the merge then fails).
    """
    fragment_data = fragment.get("fragment_data")
    target_section_id = fragment.get("target_section_id")
    if fragment_data is None:
        logger.error(f"Iteration {iteration}: ❌ Fragment has no fragment_data - merge FAILED")
        return None
    # Resolve the merge target: prefer the section explicitly named by id.
    target_section = None
    target_index = -1
    if target_section_id:
        for i, section in enumerate(allSections):
            if section.get("id") == target_section_id:
                target_section = section
                target_index = i
                break
    # Otherwise fall back to the last structurally incomplete section.
    if not target_section:
        for i, section in enumerate(reversed(allSections)):
            if JsonResponseHandler.isSectionIncomplete(section):
                target_section = section
                target_index = len(allSections) - 1 - i
                break
    # NO FALLBACKS: if no target found, the merge fails.
    if not target_section:
        logger.error(f"Iteration {iteration}: ❌ MERGE FAILED - No target section found for fragment!")
        # BUGFIX: build the diagnostic with .get() defaults - sections without
        # an "id" previously crashed this error path (None + str TypeError).
        sectionSummaries = [
            f"{s.get('id', 'no-id')} ({s.get('content_type', 'unknown')})"
            for s in allSections
        ]
        logger.error(f"Iteration {iteration}: Available sections: {sectionSummaries}")
        return None
    # The fragment is merged into the LAST element of the target section.
    merged_section = target_section.copy()
    elements = merged_section.get("elements", [])
    if not isinstance(elements, list):
        elements = [elements] if elements else []
    if not elements:
        elements = [{}]
    last_element = elements[-1] if elements else {}
    if not isinstance(last_element, dict):
        # Non-dict tail cannot be merged into - append a fresh element instead.
        last_element = {}
        elements.append(last_element)
    # GENERIC merge: detects the structure type (table rows, list items,
    # code, text, nested objects, ...) and merges accordingly.
    merged_element = JsonResponseHandler._mergeFragmentIntoElement(
        last_element,
        fragment_data,
        target_section,
        iteration,
        f"section.{target_section_id}.fragment"
    )
    elements[-1] = merged_element
    merged_section["elements"] = elements
    # Swap the merged section back in so the next iteration sees the
    # accumulated result.
    merged_sections = allSections.copy()
    merged_sections[target_index] = merged_section
    logger.info(f"Iteration {iteration}: ✅ Merged GENERIC fragment (type: {type(fragment_data).__name__}) into section '{target_section_id}'")
    # Best-effort debug dump of the merged document - never fatal.
    try:
        from modules.shared.debugLogger import writeDebugFile
        merged_json_str = json.dumps(merged_sections, indent=2, ensure_ascii=False)
        writeDebugFile(merged_json_str, f"merged_json_iteration_{iteration}.json")
    except Exception as e:
        logger.debug(f"Iteration {iteration}: Failed to write merged JSON debug file: {e}")
    return merged_sections
@staticmethod
def completeIncompleteStructures(allSections: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
Complete any incomplete structures in sections by ensuring proper JSON structure.
This ensures JSON is properly closed even if merge failed or iterations stopped early.
Works generically for ANY structure type - recursively processes all nested structures.
Returns sections with completed structures.
"""
completed_sections = []
for section in allSections:
completed_section = JsonResponseHandler._completeStructure(section)
completed_sections.append(completed_section)
return completed_sections
@staticmethod
def _completeStructure(structure: Any) -> Any:
"""
Recursively complete incomplete structures by ensuring arrays/objects are properly structured.
Works generically for ANY JSON structure - no specific logic for content types.
"""
if isinstance(structure, dict):
completed = {}
for key, value in structure.items():
completed[key] = JsonResponseHandler._completeStructure(value)
return completed
elif isinstance(structure, list):
completed = []
for item in structure:
completed.append(JsonResponseHandler._completeStructure(item))
return completed
else:
# Primitive value - return as is
return structure
@staticmethod
def getContentTypeForFragment(fragment_type: str) -> str:
"""Map fragment type to content type."""
mapping = {
"table_rows": "table",
"table_element": "table",
"code_lines": "code_block",
"code_element": "code_block",
"list_items": "bullet_list"
}
return mapping.get(fragment_type, "paragraph")
@staticmethod
def deepCompare(obj1: Any, obj2: Any, max_depth: int = 10) -> bool:
"""
Deep recursive comparison of two JSON-serializable objects.
Handles nested structures of any depth and complexity.
Args:
obj1: First object to compare
obj2: Second object to compare
max_depth: Maximum recursion depth to prevent infinite loops
Returns:
True if objects are deeply equal, False otherwise
"""
if max_depth <= 0:
return False
# Type check
if type(obj1) != type(obj2):
return False
# Primitive types
if isinstance(obj1, (str, int, float, bool, type(None))):
return obj1 == obj2
# Lists/arrays - compare element by element
if isinstance(obj1, list):
if len(obj1) != len(obj2):
return False
return all(JsonResponseHandler.deepCompare(item1, item2, max_depth - 1)
for item1, item2 in zip(obj1, obj2))
# Dicts/objects - compare key by key
if isinstance(obj1, dict):
if set(obj1.keys()) != set(obj2.keys()):
return False
return all(JsonResponseHandler.deepCompare(obj1[key], obj2[key], max_depth - 1)
for key in obj1.keys())
# Fallback for other types
return obj1 == obj2
@staticmethod
def findLongestCommonSuffix(
existing_list: List[Any],
new_list: List[Any],
min_overlap: int = 1
) -> int:
"""
Find the longest common suffix of existing_list that matches a prefix of new_list.
This handles cases where multiple elements overlap:
- existing: [A, B, C, D]
- new: [C, D, E, F]
- overlap: [C, D] (length 2)
Returns the length of the overlap (0 if no overlap found).
"""
if not existing_list or not new_list:
return 0
max_overlap = min(len(existing_list), len(new_list))
# Try all possible overlap lengths (from longest to shortest)
for overlap_len in range(max_overlap, min_overlap - 1, -1):
existing_suffix = existing_list[-overlap_len:]
new_prefix = new_list[:overlap_len]
# Deep compare suffix and prefix
if all(JsonResponseHandler.deepCompare(existing_suffix[i], new_prefix[i])
for i in range(overlap_len)):
return overlap_len
return 0
@staticmethod
def findPartialOverlap(
existing_item: Any,
new_item: Any
) -> Tuple[bool, Optional[Any]]:
"""
Detect if new_item completes an incomplete existing_item.
Handles cases like:
- existing: ["37643", "37649", "37657", "37663", "37691", "37693", "37699", "37717", "37747", "376"]
- new: ["37643", "37649", ...]
Returns (is_partial_overlap, merged_item) if partial overlap detected, else (False, None).
"""
# Check if both are lists
if isinstance(existing_item, list) and isinstance(new_item, list):
if not existing_item or not new_item:
return False, None
# Check if last element of existing is incomplete and matches first of new
last_existing = existing_item[-1]
first_new = new_item[0]
# If last existing is a string and first new is a string
if isinstance(last_existing, str) and isinstance(first_new, str):
# Check if last existing is incomplete (very short, ends with number, etc.)
if len(last_existing) < 10 and first_new.startswith(last_existing):
# Partial overlap - merge them
merged_last = last_existing + first_new[len(last_existing):]
merged_item = existing_item[:-1] + [merged_last] + new_item[1:]
return True, merged_item
# Check if last existing is incomplete list and first new completes it
if isinstance(last_existing, list) and isinstance(first_new, list):
if len(last_existing) < len(first_new):
# Check if last existing is prefix of first new
if first_new[:len(last_existing)] == last_existing:
# Merge: replace incomplete last with complete first
merged_item = existing_item[:-1] + [first_new] + new_item[1:]
return True, merged_item
# Check if existing is incomplete string and new completes it
if isinstance(existing_item, str) and isinstance(new_item, str):
if len(existing_item) < 50 and new_item.startswith(existing_item):
# Partial overlap
merged = existing_item + new_item[len(existing_item):]
return True, merged
return False, None
@staticmethod
def mergeRowsWithOverlap(
existing_rows: List[List[str]],
new_rows: List[List[str]],
iteration: int
) -> List[List[str]]:
"""
Merge table rows with sophisticated overlap detection.
Handles multiple overlapping rows and partial overlaps.
"""
if not new_rows:
return existing_rows
if not existing_rows:
return new_rows
# Strategy 1: Find longest common suffix/prefix overlap
overlap_len = JsonResponseHandler.findLongestCommonSuffix(existing_rows, new_rows, min_overlap=1)
if overlap_len > 0:
logger.debug(f"Iteration {iteration}: Found {overlap_len} overlapping table rows, removing duplicates")
return existing_rows + new_rows[overlap_len:]
# Strategy 2: Check for partial overlap in last row
if len(existing_rows) > 0 and len(new_rows) > 0:
last_existing = existing_rows[-1]
first_new = new_rows[0]
is_partial, merged_row = JsonResponseHandler.findPartialOverlap(last_existing, first_new)
if is_partial:
logger.debug(f"Iteration {iteration}: Found partial overlap in table rows, merging")
return existing_rows[:-1] + [merged_row] + new_rows[1:]
# Strategy 3: Simple first/last comparison (fallback)
if isinstance(existing_rows[-1], list) and isinstance(new_rows[0], list):
if list(existing_rows[-1]) == list(new_rows[0]):
logger.debug(f"Iteration {iteration}: Removed duplicate table row (exact match)")
return existing_rows + new_rows[1:]
# No overlap detected - append all new rows
return existing_rows + new_rows
@staticmethod
def mergeItemsWithOverlap(
existing_items: List[str],
new_items: List[str],
iteration: int
) -> List[str]:
"""
Merge list items with sophisticated overlap detection.
Handles multiple overlapping items and partial overlaps.
"""
if not new_items:
return existing_items
if not existing_items:
return new_items
# Strategy 1: Find longest common suffix/prefix overlap
overlap_len = JsonResponseHandler.findLongestCommonSuffix(existing_items, new_items, min_overlap=1)
if overlap_len > 0:
logger.debug(f"Iteration {iteration}: Found {overlap_len} overlapping list items, removing duplicates")
return existing_items + new_items[overlap_len:]
# Strategy 2: Check for partial overlap in last item
if len(existing_items) > 0 and len(new_items) > 0:
is_partial, merged_item = JsonResponseHandler.findPartialOverlap(existing_items[-1], new_items[0])
if is_partial:
logger.debug(f"Iteration {iteration}: Found partial overlap in list items, merging")
return existing_items[:-1] + [merged_item] + new_items[1:]
# Strategy 3: Simple first/last comparison (fallback)
if existing_items[-1] == new_items[0]:
logger.debug(f"Iteration {iteration}: Removed duplicate list item (exact match)")
return existing_items + new_items[1:]
# No overlap detected - append all new items
return existing_items + new_items
@staticmethod
def mergeDeepStructures(
existing: Any,
new: Any,
iteration: int,
path: str = "root"
) -> Any:
"""
FULLY GENERIC recursive merge for ANY JSON structure of arbitrary depth/complexity.
Handles ALL cases generically:
1. Arrays/Lists: Overlap detection (suffix/prefix), partial overlap, no overlap (continuation)
2. Objects/Dicts: Key-by-key merge with overlap detection for nested structures
3. Primitives: Equality check, replacement if different
4. Nested structures: Recursively handles any depth/complexity
Overlap detection strategies (all generic):
- Array overlap: Finds longest common suffix/prefix, handles partial overlaps
- Object overlap: Detected recursively through key matching and deep comparison
- No overlap: Appends/merges continuation content after cut-off point
CRITICAL: Fully generic - no specific logic for content types.
Works for ANY JSON structure: arrays, objects, nested, primitives, any combination.
"""
# Type check
if type(existing) != type(new):
# Types don't match - return new (replacement)
logger.debug(f"Iteration {iteration}: Types don't match at {path} ({type(existing).__name__} vs {type(new).__name__}), replacing")
return new
# Lists/arrays - GENERIC merge with overlap detection
if isinstance(existing, list) and isinstance(new, list):
if not new:
return existing
if not existing:
return new
# Strategy 1: Find longest common suffix/prefix overlap (handles multiple overlapping elements)
overlap_len = JsonResponseHandler.findLongestCommonSuffix(existing, new, min_overlap=1)
if overlap_len > 0:
logger.debug(f"Iteration {iteration}: Found {overlap_len} overlapping elements at {path}, removing duplicates")
return existing + new[overlap_len:]
# Strategy 2: Check for partial overlap in last element (incomplete element completion)
if len(existing) > 0 and len(new) > 0:
is_partial, merged_item = JsonResponseHandler.findPartialOverlap(existing[-1], new[0])
if is_partial:
logger.debug(f"Iteration {iteration}: Found partial overlap at {path}, merging incomplete element")
return existing[:-1] + [merged_item] + new[1:]
# Strategy 3: No overlap detected - continuation after cut-off point
# This handles the case where new data starts exactly after the cut-off
logger.debug(f"Iteration {iteration}: No overlap at {path}, appending continuation content ({len(new)} items)")
return existing + new
# Dicts/objects - GENERIC merge with recursive overlap detection
if isinstance(existing, dict) and isinstance(new, dict):
merged = existing.copy()
# Check for object-level overlap: if new object is subset/superset of existing
# This handles cases where same object structure appears in both
existing_keys = set(existing.keys())
new_keys = set(new.keys())
# If new is subset of existing and values match, it's overlap (skip)
if new_keys.issubset(existing_keys):
all_match = True
for key in new_keys:
if not JsonResponseHandler.deepCompare(existing[key], new[key]):
all_match = False
break
if all_match:
logger.debug(f"Iteration {iteration}: Object at {path} is subset overlap, skipping")
return existing
# Merge key-by-key with recursive overlap detection
for key, new_value in new.items():
if key in merged:
# Key exists - merge recursively (handles nested overlap detection)
merged[key] = JsonResponseHandler.mergeDeepStructures(
merged[key],
new_value,
iteration,
f"{path}.{key}"
)
else:
# New key - add it (continuation content)
merged[key] = new_value
logger.debug(f"Iteration {iteration}: Added new key '{key}' at {path} (continuation)")
return merged
# Primitives - equality check
if existing == new:
return existing
# Different primitive values - return new (continuation/replacement)
logger.debug(f"Iteration {iteration}: Primitive at {path} differs, using new value")
return new
    @staticmethod
    def _mergeFragmentIntoElement(
        last_element: Dict[str, Any],
        fragment_data: Any,
        target_section: Dict[str, Any],
        iteration: int,
        path: str
    ) -> Dict[str, Any]:
        """
        GENERIC fragment merging for ALL structure types.

        Automatically detects the structure type and merges fragments accordingly.
        Works for: tables, lists, code blocks, paragraphs, images, and any nested structures.

        Strategy:
        1. Analyze last_element structure to determine content location (content.rows, content.items, etc.)
        2. Detect fragment type (array, object, primitive)
        3. Merge fragment into appropriate location using mergeDeepStructures

        Args:
            last_element: The existing element to merge into (mutated in place and
                returned when a content path is found)
            fragment_data: The fragment data to merge (can be any JSON structure)
            target_section: The target section (for content_type detection)
            iteration: Current iteration number (used only for logging)
            path: Path for logging

        Returns:
            Merged element
        """
        contentType = target_section.get("content_type", "")
        elementType = last_element.get("type", "")
        # Determine the content structure path based on element type and content type
        # This handles both nested (content.rows) and flat (rows) structures
        contentPath = None
        fragmentIsArray = isinstance(fragment_data, list) and len(fragment_data) > 0
        # Detect structure type and determine merge path
        if contentType == "table" or elementType == "table":
            # Tables: merge into content.rows or rows
            if "content" in last_element and isinstance(last_element["content"], dict):
                contentPath = "content.rows"
            else:
                contentPath = "rows"
        elif contentType in ["bullet_list", "numbered_list", "list"] or elementType in ["bullet_list", "numbered_list", "list"]:
            # Lists: merge into content.items or items
            if "content" in last_element and isinstance(last_element["content"], dict):
                contentPath = "content.items"
            else:
                contentPath = "items"
        elif contentType == "code_block" or elementType == "code_block":
            # Code blocks: merge into content.code or code
            if "content" in last_element and isinstance(last_element["content"], dict):
                contentPath = "content.code"
            else:
                contentPath = "code"
        elif contentType in ["paragraph", "heading"] or elementType in ["paragraph", "heading"]:
            # Text: merge into content.text or text
            if "content" in last_element and isinstance(last_element["content"], dict):
                contentPath = "content.text"
            else:
                contentPath = "text"
        elif contentType == "image" or elementType == "image":
            # Images: merge into base64Data
            contentPath = "base64Data"
        # If we have a specific content path, merge into that location
        if contentPath:
            # Split path (e.g., "content.rows" -> ["content", "rows"])
            pathParts = contentPath.split(".")
            # Ensure nested structure exists; a non-dict intermediate value is
            # overwritten with an empty dict so the path can be walked.
            current = last_element
            for i, part in enumerate(pathParts[:-1]):
                if part not in current:
                    current[part] = {}
                elif not isinstance(current[part], dict):
                    current[part] = {}
                current = current[part]
            # Get existing content at target path
            targetKey = pathParts[-1]
            existingContent = current.get(targetKey, [])
            # Merge fragment into existing content
            # CRITICAL: Handle both array fragments and object fragments generically
            if fragmentIsArray:
                # Fragment is an array - merge arrays
                if isinstance(existingContent, list):
                    # Check if fragment is array of arrays (e.g., table rows) or array of primitives
                    if len(fragment_data) > 0 and isinstance(fragment_data[0], list):
                        # Array of arrays - use rows merge for tables, generic merge for others
                        # NOTE: a flat "rows" path does not end with ".rows" - only the
                        # nested "content.rows" path takes the specialized rows merge.
                        if contentPath.endswith(".rows"):
                            mergedContent = JsonResponseHandler.mergeRowsWithOverlap(existingContent, fragment_data, iteration)
                        else:
                            # Generic array-of-arrays merge
                            mergedContent = JsonResponseHandler.mergeDeepStructures(
                                existingContent,
                                fragment_data,
                                iteration,
                                f"{path}.{targetKey}"
                            )
                    else:
                        # Array of primitives - use items merge for lists, generic merge for others
                        # NOTE: same as above - only the nested "content.items" path matches.
                        if contentPath.endswith(".items"):
                            mergedContent = JsonResponseHandler.mergeItemsWithOverlap(existingContent, fragment_data, iteration)
                        else:
                            # Generic array merge using mergeDeepStructures
                            mergedContent = JsonResponseHandler.mergeDeepStructures(
                                existingContent,
                                fragment_data,
                                iteration,
                                f"{path}.{targetKey}"
                            )
                else:
                    # Existing content is not a list - replace with fragment
                    mergedContent = fragment_data
            elif isinstance(fragment_data, dict):
                # Fragment is an object - check if it contains nested content (e.g., {"content": {"rows": [...]}})
                # If fragment has same structure as target, merge nested content
                if "content" in fragment_data and isinstance(fragment_data["content"], dict):
                    fragmentNested = fragment_data["content"]
                    # Check if fragment has the same key as our target (e.g., fragment.content.rows)
                    if targetKey in fragmentNested:
                        # Fragment has nested content matching our target - merge that content
                        fragmentNestedContent = fragmentNested[targetKey]
                        if isinstance(existingContent, list) and isinstance(fragmentNestedContent, list):
                            # Both are lists - merge them
                            if contentPath.endswith(".rows"):
                                mergedContent = JsonResponseHandler.mergeRowsWithOverlap(existingContent, fragmentNestedContent, iteration)
                            elif contentPath.endswith(".items"):
                                mergedContent = JsonResponseHandler.mergeItemsWithOverlap(existingContent, fragmentNestedContent, iteration)
                            else:
                                mergedContent = JsonResponseHandler.mergeDeepStructures(
                                    existingContent,
                                    fragmentNestedContent,
                                    iteration,
                                    f"{path}.{targetKey}"
                                )
                        else:
                            # Use deep merge for nested content
                            mergedContent = JsonResponseHandler.mergeDeepStructures(
                                existingContent if existingContent else {},
                                fragmentNestedContent,
                                iteration,
                                f"{path}.{targetKey}"
                            )
                    else:
                        # Fragment has different structure - merge entire fragment object
                        mergedContent = JsonResponseHandler.mergeDeepStructures(
                            existingContent if existingContent else {},
                            fragment_data,
                            iteration,
                            f"{path}.{targetKey}"
                        )
                else:
                    # Fragment is a simple object - use deep merge
                    mergedContent = JsonResponseHandler.mergeDeepStructures(
                        existingContent if existingContent else {},
                        fragment_data,
                        iteration,
                        f"{path}.{targetKey}"
                    )
            else:
                # Fragment is a primitive or unknown type - use deep merge
                mergedContent = JsonResponseHandler.mergeDeepStructures(
                    existingContent if existingContent else {},
                    fragment_data,
                    iteration,
                    f"{path}.{targetKey}"
                )
            # Update the merged content
            current[targetKey] = mergedContent
            # Ensure type is set
            # NOTE(review): elementType is read from last_element["type"], so it is
            # truthy only when "type" is already present - the first branch can never
            # fire; only the contentType fallback below is effective. Confirm intent.
            if elementType and "type" not in last_element:
                last_element["type"] = elementType
            elif contentType and "type" not in last_element:
                last_element["type"] = contentType
            logger.info(f"Iteration {iteration}: ✅ Merged fragment into {contentPath} for section '{target_section.get('id')}'")
            return last_element
        # No specific content path - use generic deep merge
        # This handles any structure type generically
        merged_element = JsonResponseHandler.mergeDeepStructures(
            last_element,
            fragment_data,
            iteration,
            path
        )
        logger.info(f"Iteration {iteration}: ✅ Merged GENERIC fragment (type: {type(fragment_data).__name__}) into section '{target_section.get('id')}'")
        return merged_element
@staticmethod
def cleanEncodingIssues(jsonString: str) -> str:
"""
GENERIC function to remove problematic encoding parts from JSON string.
Works for ANY JSON structure - removes problematic characters/bytes.
Args:
jsonString: JSON string that may have encoding issues
Returns:
Cleaned JSON string
"""
try:
# Try to decode/encode to detect issues
jsonString.encode('utf-8').decode('utf-8')
return jsonString
except UnicodeError:
# Remove problematic parts
cleaned = jsonString.encode('utf-8', errors='ignore').decode('utf-8', errors='ignore')
logger.warning("Removed encoding issues from JSON string")
return cleaned
    @staticmethod
    def mergeJsonStringsWithOverlap(
        accumulated: str,
        newFragment: str
    ) -> Tuple[str, bool]:
        """
        Merge JSON fragments intelligently using modular parser.

        Uses the new ModularJsonMerger for clean, robust merging.
        Falls back to legacy code only if new merger fails completely.

        Args:
            accumulated: Previously accumulated JSON string (may be incomplete/fragmented)
            newFragment: New fragment string to append (may be incomplete/fragmented)

        Returns:
            Tuple of (merged_json_string, has_overlap):
            - merged_json_string: Combined JSON string with fragments properly merged
            - has_overlap: True if overlap was found (iterations should continue),
              False if no overlap (iterations should stop)
        """
        # Degenerate inputs: nothing to merge on one side.
        if not accumulated:
            result = newFragment if newFragment else "{}"
            return (result, False)  # No overlap if no accumulated data
        if not newFragment:
            return (accumulated, False)  # No overlap if no new fragment
        # Use new modular merger
        try:
            from modules.services.serviceAi.subJsonMerger import ModularJsonMerger
            result, hasOverlap = ModularJsonMerger.merge(accumulated, newFragment)
            # IMPORTANT: ModularJsonMerger returns unclosed JSON if overlap found (with incomplete element at end)
            # If no overlap, returns closed JSON (iterations should stop)
            if result and result.strip() and result.strip() != "{}":
                # Return result with overlap flag
                return (result, hasOverlap)
            # An empty/"{}" result deliberately falls through to the fallback.
        except Exception as e:
            logger.debug(f"Modular merger failed, using fallback: {e}")
        # Fallback to legacy merger (simplified)
        from modules.shared.jsonUtils import normalizeJsonText, stripCodeFences, closeJsonStructures, tryParseJson
        accumulatedExtracted = stripCodeFences(normalizeJsonText(accumulated)).strip()
        newFragmentExtracted = stripCodeFences(normalizeJsonText(newFragment)).strip()
        # Try simple string merge with repair
        try:
            # Close structures (balance brackets/braces of incomplete fragments)
            accClosed = closeJsonStructures(accumulatedExtracted) if accumulatedExtracted else "{}"
            fragClosed = closeJsonStructures(newFragmentExtracted) if newFragmentExtracted else "{}"
            # Try to parse both
            accParsed, accErr, _ = tryParseJson(accClosed)
            fragParsed, fragErr, _ = tryParseJson(fragClosed)
            # If both parse, merge structurally
            if accErr is None and fragErr is None:
                merged = JsonResponseHandler._mergeParsedJson(accParsed, fragParsed)
                if merged:
                    result = json.dumps(merged, indent=2, ensure_ascii=False)
                    return (result, False)  # No overlap in fallback - close and stop
            # If only accumulated parses, return it
            if accErr is None and accParsed:
                result = json.dumps(accParsed, indent=2, ensure_ascii=False)
                return (result, False)  # No overlap - close and stop
        except Exception:
            pass
        # Last resort: return accumulated (at least we have that) - close it
        if accumulatedExtracted:
            try:
                closed = closeJsonStructures(accumulatedExtracted)
                return (closed, False)  # No overlap - close and stop
            except Exception:
                return (accumulatedExtracted, False)  # No overlap - return as-is
        result = accumulated if accumulated else "{}"
        return (result, False)  # No overlap - return as-is
@staticmethod
def _mergeParsedJson(accParsed: Any, fragParsed: Any) -> Optional[Dict[str, Any]]:
"""Simple merge of two parsed JSON objects."""
if isinstance(accParsed, dict) and isinstance(fragParsed, dict):
# Merge dicts
merged = accParsed.copy()
# Merge elements if both have them
if "elements" in accParsed and "elements" in fragParsed:
accElements = accParsed.get("elements", [])
fragElements = fragParsed.get("elements", [])
# Simple merge - append new elements
merged["elements"] = accElements + fragElements
elif "elements" in fragParsed:
merged["elements"] = fragParsed["elements"]
# Merge other keys
for key, value in fragParsed.items():
if key != "elements":
if key in merged and isinstance(merged[key], list) and isinstance(value, list):
merged[key] = merged[key] + value
else:
merged[key] = value
return merged
return None
@staticmethod
def _normalizeToElementsStructure(
jsonString: str,
originalString: str
) -> Optional[Dict[str, Any]]:
"""
Normalize any JSON structure (Dict, List, None, or parse error) to {"elements": [...]} format.
Handles:
- Dict with "elements" → return as-is
- Dict without "elements" but with "type" → wrap in elements array
- List → wrap in elements structure
- Parse error → try repairBrokenJson
- None → return None
Args:
jsonString: Extracted JSON string
originalString: Original string (for context)
Returns:
Normalized Dict with "elements" array, or None if normalization fails
"""
if not jsonString:
return None
from modules.shared.jsonUtils import tryParseJson, repairBrokenJson, closeJsonStructures
# Try to parse directly first
try:
parsed = json.loads(jsonString)
parseErr = None
except Exception as e:
parseErr = e
parsed = None
# If parsing failed, try closing structures first (for incomplete fragments)
if parseErr is not None:
try:
closed = closeJsonStructures(jsonString)
parsed = json.loads(closed)
parseErr = None
except Exception:
pass
# If still failed, try repairBrokenJson ONLY if it looks like document structure
# For other structures (like section_content), use fragment detection instead
if parseErr is not None:
# Check if this looks like a document structure (has "documents" or "sections")
isDocumentStructure = '"documents"' in jsonString or '"sections"' in jsonString
if isDocumentStructure:
# Use repairBrokenJson for document structures
repaired = repairBrokenJson(jsonString)
if repaired:
parsed = repaired
parseErr = None
else:
# Still can't parse - try to detect fragment structure
return JsonResponseHandler._detectAndNormalizeFragment(jsonString, originalString)
else:
# For non-document structures, skip repairBrokenJson and go straight to fragment detection
# repairBrokenJson tries to extract "sections" which doesn't work for other structures
return JsonResponseHandler._detectAndNormalizeFragment(jsonString, originalString)
# Normalize based on type
if parsed is None:
return None
elif isinstance(parsed, dict):
# Already a dict
if "elements" in parsed:
return parsed
elif "type" in parsed:
# Single element - wrap in elements array
return {"elements": [parsed]}
else:
# Unknown dict structure - try to extract elements
return JsonResponseHandler._extractElementsFromDict(parsed)
elif isinstance(parsed, list):
# List - check if it's a list of elements or a fragment
if parsed and isinstance(parsed[0], dict) and "type" in parsed[0]:
# List of elements
return {"elements": parsed}
else:
# Fragment list (e.g., array of rows) - detect structure
return JsonResponseHandler._detectAndNormalizeFragment(jsonString, originalString)
else:
# Primitive type - can't normalize
return None
@staticmethod
def _detectAndNormalizeFragment(
jsonString: str,
originalString: str
) -> Optional[Dict[str, Any]]:
"""
Detect fragment structure and normalize it.
Fragments can be:
- Array of arrays (table rows): `[["row1"], ["row2"]]` or `["1947", "16883"], ["1948", "16889"]`
- Array of strings (list items): `["item1", "item2"]`
- Incomplete structure: `["item1", "item2", ` (ends with comma)
- Partial object: `{"type": "table", "content": {"rows": [["1947"...` (cut mid-string)
Returns normalized structure or None if detection fails.
"""
jsonStripped = jsonString.strip()
# Strategy 1: Check if it's an array fragment
if jsonStripped.startswith('['):
# Try to parse as array
from modules.shared.jsonUtils import tryParseJson, closeJsonStructures
# Close incomplete structures
closed = closeJsonStructures(jsonStripped)
parsed, parseErr, _ = tryParseJson(closed)
if parseErr is None and isinstance(parsed, list):
# Check structure: array of arrays (table rows) or array of strings (list items)
if parsed and isinstance(parsed[0], list):
# Array of arrays - likely table rows fragment
return {
"elements": [{
"type": "table",
"content": {
"rows": parsed
}
}]
}
elif parsed and isinstance(parsed[0], str):
# Array of strings - likely list items fragment
return {
"elements": [{
"type": "bullet_list",
"content": {
"items": parsed
}
}]
}
elif parseErr is not None:
# Can't parse - try regex extraction for table rows
rows = JsonResponseHandler._extractRowsFromFragment(jsonStripped)
if rows:
return {
"elements": [{
"type": "table",
"content": {
"rows": rows
}
}]
}
# Strategy 2: Check if it's a partial object (cut mid-structure)
# Look for patterns like: {"elements": [...] or {"type": "table"...
if jsonStripped.startswith('{'):
from modules.shared.jsonUtils import tryParseJson, closeJsonStructures
# Try to close and parse
closed = closeJsonStructures(jsonStripped)
parsed, parseErr, _ = tryParseJson(closed)
if parseErr is None and isinstance(parsed, dict):
# Successfully parsed - normalize it
return JsonResponseHandler._normalizeToElementsStructure(closed, originalString)
elif parseErr is not None:
# Can't parse - try to extract table rows from the raw string
# This handles cases like: {"elements": [{"type": "table", "content": {"rows": [["1947"...
rows = JsonResponseHandler._extractRowsFromFragment(jsonStripped)
if rows:
return {
"elements": [{
"type": "table",
"content": {
"rows": rows
}
}]
}
# Try to extract any array patterns that might be table rows
# Look for patterns like: ["1947", "10000"], ["1948", "10100"]
import re
# Pattern: ["value1", "value2"], ["value3", "value4"]
rowPattern = r'\["([^"]*)",\s*"([^"]*)"\]'
matches = re.findall(rowPattern, jsonStripped)
if matches and len(matches) >= 2:
# Found multiple row patterns - likely table rows
rows = [[match[0], match[1]] for match in matches]
return {
"elements": [{
"type": "table",
"content": {
"rows": rows
}
}]
}
# Strategy 3: Try to extract rows from any text (even if not starting with [ or {)
rows = JsonResponseHandler._extractRowsFromFragment(jsonStripped)
if rows:
return {
"elements": [{
"type": "table",
"content": {
"rows": rows
}
}]
}
return None
@staticmethod
def _extractElementsFromDict(d: Dict[str, Any]) -> Dict[str, Any]:
"""
Try to extract elements from unknown dict structure.
Returns normalized structure or empty elements array.
"""
# Check common patterns
if "sections" in d:
# Document structure with sections
sections = d.get("sections", [])
elements = []
for section in sections:
if isinstance(section, dict) and "elements" in section:
elements.extend(section.get("elements", []))
return {"elements": elements}
# Unknown structure - return empty
return {"elements": []}
@staticmethod
def _mergeJsonStructuresGeneric(
accumulatedObj: Dict[str, Any],
newFragmentObj: Dict[str, Any],
accumulatedRaw: str,
newFragmentRaw: str,
overlapElements: Optional[List[Dict[str, Any]]] = None
) -> Optional[Dict[str, Any]]:
"""
GENERIC merge of two JSON structures, handling overlaps and missing parts.
Strategy:
1. Extract elements from both structures (both are normalized to {"elements": [...]})
2. Use overlap elements if provided to identify merge point
3. Detect if both have same structure (same content type)
4. Group elements by type
5. Merge elements of same type using content-type-specific logic with overlap detection
6. Handle overlaps and missing parts intelligently
Args:
accumulatedObj: Normalized accumulated JSON object (guaranteed to have "elements")
newFragmentObj: Normalized new fragment JSON object (guaranteed to have "elements")
accumulatedRaw: Raw accumulated string (for fragment detection)
newFragmentRaw: Raw new fragment string (for fragment detection)
overlapElements: Optional list of overlap elements from continuation response
Returns:
Merged JSON object or None if merging fails
"""
try:
# Step 1: Extract elements (both are normalized, so this should always work)
accumulatedElements = accumulatedObj.get("elements", []) if isinstance(accumulatedObj, dict) else []
newFragmentElements = newFragmentObj.get("elements", []) if isinstance(newFragmentObj, dict) else []
if not accumulatedElements and not newFragmentElements:
# No elements found - try to extract from raw strings
# Try to extract any valid JSON structure from raw strings
from modules.shared.jsonUtils import tryParseJson, closeJsonStructures
# Try accumulated first
if accumulatedRaw:
try:
closedAccumulated = closeJsonStructures(accumulatedRaw)
parsed, parseErr, _ = tryParseJson(closedAccumulated)
if parseErr is None and parsed:
normalized = JsonResponseHandler._normalizeToElementsStructure(closedAccumulated, accumulatedRaw)
if normalized:
return normalized
except Exception:
pass
# Try new fragment
if newFragmentRaw:
try:
closedFragment = closeJsonStructures(newFragmentRaw)
parsed, parseErr, _ = tryParseJson(closedFragment)
if parseErr is None and parsed:
normalized = JsonResponseHandler._normalizeToElementsStructure(closedFragment, newFragmentRaw)
if normalized:
return normalized
except Exception:
pass
# If still nothing, return empty structure (never None)
return {"elements": []}
# Step 2: Use overlap elements to identify merge point
# If overlap elements are provided, use them to find where to merge
if overlapElements and isinstance(overlapElements, list) and len(overlapElements) > 0:
# Find overlap in accumulated elements
overlapStartIndex = JsonResponseHandler._findOverlapStartIndex(accumulatedElements, overlapElements)
if overlapStartIndex >= 0:
# Remove overlapping elements from accumulated (they'll be replaced by continuation)
accumulatedElements = accumulatedElements[:overlapStartIndex]
logger.debug(f"Found overlap at index {overlapStartIndex}, removed {len(accumulatedElements) - overlapStartIndex} overlapping elements")
# Step 3: Detect if newFragment is a continuation fragment
# Check if newFragment starts with array elements (fragment, not full JSON)
isFragment = JsonResponseHandler._isFragment(newFragmentRaw, newFragmentElements)
# Step 4: Group elements by type for intelligent merging
accumulatedByType = {}
for elem in accumulatedElements:
if isinstance(elem, dict):
elemType = elem.get("type", "unknown")
if elemType not in accumulatedByType:
accumulatedByType[elemType] = []
accumulatedByType[elemType].append(elem)
newFragmentByType = {}
for elem in newFragmentElements:
if isinstance(elem, dict):
elemType = elem.get("type", "unknown")
if elemType not in newFragmentByType:
newFragmentByType[elemType] = []
newFragmentByType[elemType].append(elem)
# Step 5: Merge elements intelligently
mergedElements = []
allTypes = set(accumulatedByType.keys()) | set(newFragmentByType.keys())
for elemType in allTypes:
accElems = accumulatedByType.get(elemType, [])
fragElems = newFragmentByType.get(elemType, [])
if not accElems:
# Only in fragment - add all
mergedElements.extend(fragElems)
elif not fragElems:
# Only in accumulated - add all
mergedElements.extend(accElems)
else:
# Both have elements of this type - merge them using content-type-specific logic
mergedElem = JsonResponseHandler._mergeElementsOfSameTypeGeneric(
accElems[0], fragElems[0], elemType, accumulatedRaw, newFragmentRaw, isFragment
)
if mergedElem:
mergedElements.append(mergedElem)
# Step 6: Reconstruct base structure
if mergedElements:
return {"elements": mergedElements}
else:
# No merged elements - return accumulated if available (NEVER return None)
if accumulatedElements:
return {"elements": accumulatedElements}
# If no accumulated, return new fragment if available
if newFragmentElements:
return {"elements": newFragmentElements}
# Last resort: return empty structure (never None)
return {"elements": []}
except Exception as e:
logger.debug(f"Structure-based merge failed: {e}")
import traceback
logger.debug(traceback.format_exc())
return None
@staticmethod
def _isFragment(jsonString: str, elements: List[Dict[str, Any]]) -> bool:
"""
Detect if JSON string is a fragment (not a complete JSON object).
Fragments:
- Start with `[` but not `[{"` (array fragment, not full elements array)
- Start with array elements like `["cell1", "cell2"],` (table rows fragment)
- Don't have full structure (missing outer object with "elements")
- Are continuations of previous structure
"""
jsonStripped = jsonString.strip()
# Check if it starts with array (fragment)
if jsonStripped.startswith('['):
# Check if it's a full elements array `[{"type": ...}]` or a fragment `["cell1", "cell2"]`
if jsonStripped.startswith('[{"') or jsonStripped.startswith('[{'):
# Could be full structure - check if it has "type" field
if elements and isinstance(elements[0], dict) and "type" in elements[0]:
return False # Full structure
# Otherwise it's a fragment (array of primitives or incomplete)
return True
# Check if it starts with object but missing "elements" wrapper
if jsonStripped.startswith('{'):
# Check if it has "elements" field
if '"elements"' not in jsonStripped[:200]: # Check first 200 chars
# Might be a single element fragment
return True
# Check if elements are incomplete (no full structure)
if elements and isinstance(elements[0], dict):
# Check if first element is missing required fields
firstElem = elements[0]
if "type" not in firstElem and "content" not in firstElem:
return True
return False
@staticmethod
def _mergeElementsOfSameTypeGeneric(
accumulatedElem: Dict[str, Any],
newFragmentElem: Dict[str, Any],
elemType: str,
accumulatedRaw: str,
newFragmentRaw: str,
isFragment: bool
) -> Optional[Dict[str, Any]]:
"""
GENERIC merge of two elements of the same type, with content-type-specific optimizations.
Content-type-specific merging:
- table: Merge rows arrays with overlap detection
- paragraph: Merge text content
- code_block: Merge code strings
- bullet_list/numbered_list: Merge items arrays
- heading: Use new fragment (usually complete)
- image: Use new fragment (usually complete)
- Other: Generic deep merge
Args:
accumulatedElem: Accumulated element
newFragmentElem: New fragment element
elemType: Content type (table, paragraph, etc.)
accumulatedRaw: Raw accumulated string
newFragmentRaw: Raw new fragment string
isFragment: Whether newFragment is a fragment (continuation)
Returns:
Merged element or None if merging fails
"""
if elemType == "table":
return JsonResponseHandler._mergeTableElementsGeneric(
accumulatedElem, newFragmentElem, accumulatedRaw, newFragmentRaw, isFragment
)
elif elemType == "paragraph":
return JsonResponseHandler._mergeParagraphElements(
accumulatedElem, newFragmentElem, isFragment
)
elif elemType == "code_block":
return JsonResponseHandler._mergeCodeBlockElements(
accumulatedElem, newFragmentElem, isFragment
)
elif elemType in ["bullet_list", "numbered_list"]:
return JsonResponseHandler._mergeListElements(
accumulatedElem, newFragmentElem, isFragment
)
elif elemType in ["heading", "image"]:
# Usually complete - use new fragment if it exists, otherwise accumulated
return newFragmentElem if newFragmentElem else accumulatedElem
else:
# Generic merge: use mergeDeepStructures
return JsonResponseHandler.mergeDeepStructures(
accumulatedElem, newFragmentElem, 0, f"element_merge.{elemType}"
)
@staticmethod
def _mergeTableElementsGeneric(
accumulatedElem: Dict[str, Any],
newFragmentElem: Dict[str, Any],
accumulatedRaw: str,
newFragmentRaw: str,
isFragment: bool
) -> Dict[str, Any]:
"""
GENERIC merge of two table elements with content-type-specific optimizations.
Handles:
- Overlapping rows (detect duplicates by comparing row content)
- Missing headers (complete with existing headers)
- Incomplete rows (complete with null values if needed)
- Fragment rows (if newFragment is a fragment, extract rows from raw string)
Args:
accumulatedElem: Accumulated table element
newFragmentElem: New fragment table element
accumulatedRaw: Raw accumulated string (for fragment detection)
newFragmentRaw: Raw new fragment string (for fragment extraction)
isFragment: Whether newFragment is a fragment
Returns:
Merged table element
"""
# Extract content (handle both nested and flat structures)
accContent = accumulatedElem.get("content", {})
if not accContent and "rows" in accumulatedElem:
accContent = accumulatedElem
fragContent = newFragmentElem.get("content", {})
if not fragContent and "rows" in newFragmentElem:
fragContent = newFragmentElem
# Extract rows
accRows = accContent.get("rows", []) if isinstance(accContent, dict) else []
# If fragment, try to extract rows from raw string
fragRows = fragContent.get("rows", []) if isinstance(fragContent, dict) else []
if isFragment and not fragRows:
fragRows = JsonResponseHandler._extractRowsFromFragment(newFragmentRaw)
# Extract headers (complete missing with existing)
accHeaders = accContent.get("headers", []) if isinstance(accContent, dict) else []
fragHeaders = fragContent.get("headers", []) if isinstance(fragContent, dict) else []
mergedHeaders = accHeaders if accHeaders else fragHeaders
# Merge rows with overlap detection
mergedRows = JsonResponseHandler._mergeRowsWithOverlapDetection(accRows, fragRows)
# Reconstruct table element
mergedContent = {
"headers": mergedHeaders,
"rows": mergedRows
}
# Preserve other fields (caption, etc.)
if isinstance(accContent, dict) and "caption" in accContent:
mergedContent["caption"] = accContent["caption"]
elif isinstance(fragContent, dict) and "caption" in fragContent:
mergedContent["caption"] = fragContent["caption"]
return {
"type": "table",
"content": mergedContent
}
    @staticmethod
    def _extractRowsFromFragment(fragmentRaw: str) -> List[List[str]]:
        """
        Extract table rows from fragment string.
        Handles fragments like:
        - `["1947", "16883"], ["1948", "16889"], ...`
        - `"rows": [["1947", "10000"], ["1948", "10100"]...`
        - Incomplete fragments cut mid-string
        Also handles fragments with more than 2 columns.

        Args:
            fragmentRaw: Raw (possibly truncated) JSON fragment text.

        Returns:
            List of rows (each row a list of cell strings); empty list when no
            plausible rows could be recovered by any of the patterns below.
        """
        import re  # NOTE: shadows the module-level import; harmless but redundant
        rows = []
        # Pattern 1: Array of arrays with 2 columns `["cell1", "cell2"], ["cell3", "cell4"]`
        # This pattern matches complete arrays: ["value1", "value2"]
        pattern2Col = r'\["([^"]*)",\s*"([^"]*)"\]'
        matches2Col = re.findall(pattern2Col, fragmentRaw)
        if matches2Col and len(matches2Col) >= 2:  # Need at least 2 rows to be confident
            for match in matches2Col:
                if len(match) == 2:  # findall with 2 groups always yields 2-tuples; defensive check
                    rows.append([match[0], match[1]])
            if rows:
                return rows
        # Pattern 2: Array of arrays with variable columns (more robust)
        # Find all array patterns: ["...", "...", ...]
        # Use non-greedy matching but ensure we get complete arrays
        arrayPattern = r'\[(.*?)\]'
        arrayMatches = re.findall(arrayPattern, fragmentRaw)
        # Filter to only arrays that look like table rows (have multiple quoted values)
        validArrays = []
        for arrayContent in arrayMatches:
            # Extract quoted strings from array content
            cellPattern = r'"([^"]*)"'
            cells = re.findall(cellPattern, arrayContent)
            # Only consider arrays with 2+ cells (likely table rows)
            if len(cells) >= 2:
                validArrays.append(cells)
        if validArrays and len(validArrays) >= 2:  # Need at least 2 rows
            return validArrays
        # Pattern 3: Look for "rows": [...] pattern in incomplete JSON
        # This handles cases like: "rows": [["1947", "10000"], ["1948", "10100"]...
        rowsPattern = r'"rows"\s*:\s*\[(.*?)(?:\]|$)'
        rowsMatch = re.search(rowsPattern, fragmentRaw, re.DOTALL)
        if rowsMatch:
            rowsContent = rowsMatch.group(1)
            # Extract all array patterns from rows content
            arrayPattern = r'\[(.*?)\]'
            arrayMatches = re.findall(arrayPattern, rowsContent)
            for arrayContent in arrayMatches:
                cellPattern = r'"([^"]*)"'
                cells = re.findall(cellPattern, arrayContent)
                if len(cells) >= 2:  # At least 2 columns
                    rows.append(cells)
            if rows:
                return rows
        # Pattern 4: Try to parse as JSON array (handles complete arrays)
        from modules.shared.jsonUtils import tryParseJson, closeJsonStructures
        # Try to close incomplete structures
        closed = closeJsonStructures(fragmentRaw.strip())
        parsed, parseErr, _ = tryParseJson(closed)
        if parseErr is None and isinstance(parsed, list):
            if parsed and isinstance(parsed[0], list):
                # Array of arrays - table rows
                return parsed
            elif parsed and isinstance(parsed[0], str):
                # Array of strings - might be single column table
                return [[item] for item in parsed]
        # Pattern 5: Last resort - extract any array patterns we can find
        # Even if incomplete, try to extract what we can
        if not rows:
            # Find all patterns like ["value1", "value2"] even if incomplete
            # Use a more lenient pattern that handles incomplete strings
            incompletePattern = r'\["([^"]*)"(?:,\s*"([^"]*)")?'
            incompleteMatches = re.findall(incompletePattern, fragmentRaw)
            for match in incompleteMatches:
                if match[0]:  # First value exists
                    if match[1]:  # Second value exists
                        rows.append([match[0], match[1]])
                    else:
                        # Only one value - might be incomplete, skip for now
                        # (single-cell rows are deliberately dropped here)
                        pass
        return rows
@staticmethod
def _mergeParagraphElements(
accumulatedElem: Dict[str, Any],
newFragmentElem: Dict[str, Any],
isFragment: bool
) -> Dict[str, Any]:
"""Merge two paragraph elements."""
accContent = accumulatedElem.get("content", {})
fragContent = newFragmentElem.get("content", {})
accText = accContent.get("text", "") if isinstance(accContent, dict) else ""
fragText = fragContent.get("text", "") if isinstance(fragContent, dict) else ""
# Merge text (remove overlap if fragment)
mergedText = accText + fragText if not isFragment else (accText.rstrip() + " " + fragText.lstrip())
return {
"type": "paragraph",
"content": {"text": mergedText}
}
@staticmethod
def _mergeCodeBlockElements(
accumulatedElem: Dict[str, Any],
newFragmentElem: Dict[str, Any],
isFragment: bool
) -> Dict[str, Any]:
"""Merge two code block elements."""
accContent = accumulatedElem.get("content", {})
fragContent = newFragmentElem.get("content", {})
accCode = accContent.get("code", "") if isinstance(accContent, dict) else ""
fragCode = fragContent.get("code", "") if isinstance(fragContent, dict) else ""
accLanguage = accContent.get("language") if isinstance(accContent, dict) else None
fragLanguage = fragContent.get("language") if isinstance(fragContent, dict) else None
mergedCode = accCode + "\n" + fragCode if fragCode else accCode
mergedLanguage = accLanguage or fragLanguage
result = {
"type": "code_block",
"content": {"code": mergedCode}
}
if mergedLanguage:
result["content"]["language"] = mergedLanguage
return result
@staticmethod
def _mergeListElements(
accumulatedElem: Dict[str, Any],
newFragmentElem: Dict[str, Any],
isFragment: bool
) -> Dict[str, Any]:
"""Merge two list elements (bullet_list or numbered_list)."""
accContent = accumulatedElem.get("content", {})
fragContent = newFragmentElem.get("content", {})
accItems = accContent.get("items", []) if isinstance(accContent, dict) else []
fragItems = fragContent.get("items", []) if isinstance(fragContent, dict) else []
# Merge items with overlap detection
mergedItems = JsonResponseHandler._mergeItemsWithOverlapDetection(accItems, fragItems)
elemType = accumulatedElem.get("type") or newFragmentElem.get("type")
return {
"type": elemType,
"content": {"items": mergedItems}
}
    @staticmethod
    def _findOverlapStartIndex(
        accumulatedElements: List[Dict[str, Any]],
        overlapElements: List[Dict[str, Any]]
    ) -> int:
        """
        Find the start index in accumulatedElements where overlapElements begin.
        This helps identify where to merge continuation elements by matching
        the overlap elements with the end of accumulated elements.
        Args:
            accumulatedElements: List of accumulated elements
            overlapElements: List of overlap elements from continuation response
        Returns:
            Index where overlap starts, or -1 if not found
        """
        if not overlapElements or not accumulatedElements:
            return -1
        # Try to find overlap by matching element structures
        # Start from the end of accumulatedElements and work backwards
        overlapLen = len(overlapElements)
        accLen = len(accumulatedElements)
        if overlapLen > accLen:
            return -1
        # Try matching from different start positions
        # NOTE(review): windows near the end compare fewer than overlapLen
        # elements (min(overlapLen, accLen - startIdx)), so a partial match at
        # the tail can be accepted - this is deliberately permissive.
        for startIdx in range(max(0, accLen - overlapLen), accLen):
            # Check if elements from startIdx match overlapElements
            matches = True
            for i in range(min(overlapLen, accLen - startIdx)):
                accElem = accumulatedElements[startIdx + i]
                overlapElem = overlapElements[i]
                # Compare element types
                if isinstance(accElem, dict) and isinstance(overlapElem, dict):
                    accType = accElem.get("type")
                    overlapType = overlapElem.get("type")
                    if accType != overlapType:
                        matches = False
                        break
                    # For tables, compare row counts or last rows
                    # (rows may be flat on the element or nested under "content")
                    if accType == "table":
                        accRows = accElem.get("rows", []) or (accElem.get("content", {}).get("rows", []) if isinstance(accElem.get("content"), dict) else [])
                        overlapRows = overlapElem.get("rows", []) or (overlapElem.get("content", {}).get("rows", []) if isinstance(overlapElem.get("content"), dict) else [])
                        if accRows and overlapRows:
                            # Check if last rows match
                            if len(accRows) >= len(overlapRows):
                                lastAccRows = accRows[-len(overlapRows):]
                                if lastAccRows != overlapRows:
                                    matches = False
                                    break
                    # For lists, compare items
                    elif accType in ["bullet_list", "numbered_list"]:
                        accItems = accElem.get("items", []) or (accElem.get("content", {}).get("items", []) if isinstance(accElem.get("content"), dict) else [])
                        overlapItems = overlapElem.get("items", []) or (overlapElem.get("content", {}).get("items", []) if isinstance(overlapElem.get("content"), dict) else [])
                        if accItems and overlapItems:
                            if len(accItems) >= len(overlapItems):
                                lastAccItems = accItems[-len(overlapItems):]
                                if lastAccItems != overlapItems:
                                    matches = False
                                    break
                else:
                    # Non-dict elements cannot be compared structurally.
                    matches = False
                    break
            if matches:
                return startIdx
        return -1
@staticmethod
def _mergeRowsWithOverlapDetection(
accRows: List[List[str]],
fragRows: List[List[str]]
) -> List[List[str]]:
"""
Merge two row arrays, detecting and removing overlaps.
Overlap detection: Compare rows to find duplicates.
Missing parts: Complete with null values if needed.
"""
if not accRows:
return fragRows
if not fragRows:
return accRows
# Find overlap by comparing last rows of accRows with first rows of fragRows
overlapStart = 0
maxOverlap = min(len(accRows), len(fragRows))
# Find the longest overlap
for overlapLen in range(maxOverlap, 0, -1):
accSuffix = accRows[-overlapLen:]
fragPrefix = fragRows[:overlapLen]
# Compare rows (exact match)
if accSuffix == fragPrefix:
overlapStart = overlapLen
break
# Merge: accumulated rows + non-overlapping fragment rows
merged = accRows + fragRows[overlapStart:]
return merged
@staticmethod
def _mergeItemsWithOverlapDetection(
accItems: List[str],
fragItems: List[str]
) -> List[str]:
"""
Merge two item arrays (for lists), detecting and removing overlaps.
Overlap detection: Compare items to find duplicates.
"""
if not accItems:
return fragItems
if not fragItems:
return accItems
# Find overlap by comparing last items of accItems with first items of fragItems
overlapStart = 0
maxOverlap = min(len(accItems), len(fragItems))
# Find the longest overlap
for overlapLen in range(maxOverlap, 0, -1):
accSuffix = accItems[-overlapLen:]
fragPrefix = fragItems[:overlapLen]
# Compare items (exact match)
if accSuffix == fragPrefix:
overlapStart = overlapLen
break
# Merge: accumulated items + non-overlapping fragment items
merged = accItems + fragItems[overlapStart:]
return merged
@staticmethod
def _extractOverlapAndContinuation(jsonString: str) -> Tuple[Optional[List[Dict[str, Any]]], Optional[str]]:
"""
Extract overlap and continuation sections from AI response with explicit overlap structure.
Expected format:
{
"overlap": [...], // Elements to repeat for merging
"continuation": [...] // New elements to add
}
Or alternative format:
{
"overlap": "...", // Overlap as string
"continuation": "..." // Continuation as string
}
Args:
jsonString: JSON string that may contain overlap/continuation structure
Returns:
Tuple of (overlap_elements, continuation_json_string) or (None, None) if not found
"""
if not jsonString:
return None, None
from modules.shared.jsonUtils import stripCodeFences, normalizeJsonText, tryParseJson, closeJsonStructures
# Extract and normalize JSON
extracted = stripCodeFences(normalizeJsonText(jsonString)).strip()
if not extracted:
return None, None
# Try to parse
try:
closed = closeJsonStructures(extracted)
parsed, parseErr, _ = tryParseJson(closed)
if parseErr is None and isinstance(parsed, dict):
# Check for overlap/continuation structure
overlap = parsed.get("overlap")
continuation = parsed.get("continuation")
if overlap is not None and continuation is not None:
# Found explicit overlap structure
overlapElements = None
continuationJson = None
# Extract overlap elements
if isinstance(overlap, list):
overlapElements = overlap
elif isinstance(overlap, str):
# Overlap is a string - try to parse it
try:
overlapParsed, _, _ = tryParseJson(closeJsonStructures(overlap))
if isinstance(overlapParsed, list):
overlapElements = overlapParsed
except Exception:
pass
# Extract continuation JSON
if isinstance(continuation, (dict, list)):
continuationJson = json.dumps(continuation, indent=2, ensure_ascii=False)
elif isinstance(continuation, str):
continuationJson = continuation
if overlapElements is not None and continuationJson:
return overlapElements, continuationJson
except Exception:
pass
return None, None
@staticmethod
def _mergeWithExplicitOverlap(
accumulated: str,
continuationJson: str,
overlapElements: List[Dict[str, Any]]
) -> str:
"""
Merge accumulated JSON with continuation JSON using explicit overlap information.
Strategy:
1. Find overlap in accumulated using overlapElements
2. Remove overlapping elements from accumulated
3. Append continuation JSON
Args:
accumulated: Previously accumulated JSON string
continuationJson: Continuation JSON string (new content)
overlapElements: List of overlap elements from AI response
Returns:
Merged JSON string
"""
if not accumulated:
return continuationJson
if not continuationJson:
return accumulated
from modules.shared.jsonUtils import stripCodeFences, normalizeJsonText, tryParseJson, closeJsonStructures
# Normalize accumulated
accumulatedExtracted = stripCodeFences(normalizeJsonText(accumulated)).strip()
accumulatedNormalized = JsonResponseHandler._normalizeToElementsStructure(
accumulatedExtracted, accumulated
)
# Normalize continuation
continuationExtracted = stripCodeFences(normalizeJsonText(continuationJson)).strip()
continuationNormalized = JsonResponseHandler._normalizeToElementsStructure(
continuationExtracted, continuationJson
)
# If both normalized successfully, use structure-based merge with overlap
if accumulatedNormalized and continuationNormalized:
merged = JsonResponseHandler._mergeJsonStructuresGeneric(
accumulatedNormalized, continuationNormalized, accumulatedExtracted, continuationExtracted,
overlapElements=overlapElements
)
if merged:
return json.dumps(merged, indent=2, ensure_ascii=False)
# Fallback: use overlap elements to find merge point in accumulated
# Find where overlap elements match in accumulated
if accumulatedNormalized and overlapElements:
accumulatedElements = accumulatedNormalized.get("elements", [])
overlapStartIndex = JsonResponseHandler._findOverlapStartIndex(accumulatedElements, overlapElements)
if overlapStartIndex >= 0:
# Remove overlapping elements
accumulatedElements = accumulatedElements[:overlapStartIndex]
accumulatedNormalized["elements"] = accumulatedElements
# Merge continuation
if continuationNormalized:
continuationElements = continuationNormalized.get("elements", [])
accumulatedElements.extend(continuationElements)
accumulatedNormalized["elements"] = accumulatedElements
return json.dumps(accumulatedNormalized, indent=2, ensure_ascii=False)
# Last resort: simple concatenation
return JsonResponseHandler._mergeJsonStringsWithOverlapFallback(accumulated, continuationJson)
    @staticmethod
    def _extractValidJsonPrefix(jsonString: str) -> str:
        """
        Extract the longest valid JSON prefix from a string that may be cut randomly.
        Strategy:
        1. Try to find the longest prefix that can be closed and parsed
        2. Handle random cuts (mid-string, mid-number, etc.)
        3. Return the longest valid prefix found
        Args:
            jsonString: JSON string that may be cut randomly
        Returns:
            Longest valid JSON prefix, or empty string if none found

        NOTE: the returned string is the *closed* (repaired) form of the prefix,
        not the raw prefix itself.
        """
        if not jsonString or not jsonString.strip():
            return ""
        from modules.shared.jsonUtils import tryParseJson, closeJsonStructures
        # Strategy 1: Try progressive truncation to find longest valid JSON
        # Use binary search-like approach for efficiency
        bestValid = ""
        bestLength = 0
        maxLen = len(jsonString)
        # Generate test lengths: full, 95%, 90%, ..., 10%
        testLengths = []
        for percent in range(100, 9, -5):
            testLen = int(maxLen * percent / 100)
            if testLen > bestLength:
                testLengths.append(testLen)
        # Also test specific points near the end (common cut points)
        for offset in [200, 100, 50, 20, 10, 5, 2, 1]:
            if maxLen > offset:
                testLen = maxLen - offset
                if testLen > bestLength:
                    testLengths.append(testLen)
        # Sort and deduplicate
        testLengths = sorted(set(testLengths), reverse=True)
        # Descending order means the first candidate that parses is the longest;
        # later (shorter) candidates are skipped via the bestLength guard.
        for testLen in testLengths:
            if testLen <= bestLength:
                continue  # Already found better
            testStr = jsonString[:testLen]
            if not testStr.strip():
                continue
            # Try to close and parse
            try:
                closed = closeJsonStructures(testStr)
                parsed, parseErr, _ = tryParseJson(closed)
                if parseErr is None and parsed is not None:
                    # Valid JSON found
                    if testLen > bestLength:
                        bestValid = closed
                        bestLength = testLen
            except Exception:
                continue
        # Strategy 2: If we found valid JSON, return it
        if bestValid:
            return bestValid
        # Strategy 3: Try to extract balanced JSON (find first complete structure)
        jsonStripped = jsonString.strip()
        if jsonStripped.startswith('{') or jsonStripped.startswith('['):
            # Try to extract balanced JSON
            from modules.shared.jsonUtils import extractFirstBalancedJson
            balanced = extractFirstBalancedJson(jsonStripped)
            if balanced and balanced != jsonStripped:
                try:
                    closed = closeJsonStructures(balanced)
                    parsed, parseErr, _ = tryParseJson(closed)
                    if parseErr is None:
                        return closed
                except Exception:
                    pass
        # Strategy 4: Try to repair by removing incomplete trailing structures
        # Find the last complete element/item before the cut
        try:
            # For arrays: find last complete element
            if jsonStripped.startswith('['):
                # Find last complete array element
                lastComma = jsonStripped.rfind(',')
                if lastComma > 0:
                    # Try prefix up to last comma
                    prefix = jsonStripped[:lastComma].strip()
                    if prefix.endswith(','):
                        prefix = prefix[:-1].strip()
                    if prefix:
                        closed = closeJsonStructures(prefix + ']')
                        parsed, parseErr, _ = tryParseJson(closed)
                        if parseErr is None:
                            return closed
            # For objects: find last complete key-value pair
            elif jsonStripped.startswith('{'):
                # Find last complete key-value pair
                lastComma = jsonStripped.rfind(',')
                if lastComma > 0:
                    # Try prefix up to last comma
                    prefix = jsonStripped[:lastComma].strip()
                    if prefix.endswith(','):
                        prefix = prefix[:-1].strip()
                    if prefix:
                        closed = closeJsonStructures(prefix + '}')
                        parsed, parseErr, _ = tryParseJson(closed)
                        if parseErr is None:
                            return closed
        except Exception:
            pass
        # Last resort: return empty (caller will handle)
        return ""
    @staticmethod
    def _smartConcatenate(accumulated: str, newFragment: str) -> str:
        """
        Smart concatenation that tries to merge JSON fragments intelligently.
        Strategy:
        1. Extract valid JSON from both fragments
        2. Parse both as JSON objects/arrays
        3. Merge them structurally
        4. Return valid JSON
        Args:
            accumulated: Accumulated JSON string
            newFragment: New fragment to append
        Returns:
            Merged string with valid JSON, or empty if merging not possible
        """
        if not accumulated or not newFragment:
            return ""
        from modules.shared.jsonUtils import closeJsonStructures, tryParseJson
        # Extract valid JSON prefixes from both
        accumulatedValid = JsonResponseHandler._extractValidJsonPrefix(accumulated)
        newFragmentValid = JsonResponseHandler._extractValidJsonPrefix(newFragment)
        # Fall back to the raw strings when no valid prefix could be found.
        if not accumulatedValid:
            accumulatedValid = accumulated
        if not newFragmentValid:
            newFragmentValid = newFragment
        # Try to parse both
        try:
            closedAccumulated = closeJsonStructures(accumulatedValid)
            parsedAccumulated, parseErr1, _ = tryParseJson(closedAccumulated)
            closedNewFragment = closeJsonStructures(newFragmentValid)
            parsedNewFragment, parseErr2, _ = tryParseJson(closedNewFragment)
            # If both parse successfully, merge structurally
            if parseErr1 is None and parseErr2 is None:
                # Normalize both to elements structure
                accNormalized = JsonResponseHandler._normalizeToElementsStructure(closedAccumulated, accumulated)
                newNormalized = JsonResponseHandler._normalizeToElementsStructure(closedNewFragment, newFragment)
                if accNormalized and newNormalized:
                    merged = JsonResponseHandler._mergeJsonStructuresGeneric(
                        accNormalized, newNormalized, closedAccumulated, closedNewFragment
                    )
                    if merged:
                        return json.dumps(merged, indent=2, ensure_ascii=False)
            # If only accumulated parses, return it
            # NOTE(review): this branch also fires when BOTH parsed but the
            # structural merge above produced nothing - the fragment is then
            # dropped in favor of the accumulated document.
            if parseErr1 is None and parsedAccumulated:
                return json.dumps(parsedAccumulated, indent=2, ensure_ascii=False)
            # If only new fragment parses, return it
            if parseErr2 is None and parsedNewFragment:
                return json.dumps(parsedNewFragment, indent=2, ensure_ascii=False)
        except Exception:
            pass
        # Fallback: Try simple string concatenation with repair
        accumulatedStripped = accumulated.strip()
        newFragmentStripped = newFragment.strip()
        # If accumulated doesn't end with } or ], it might be incomplete
        if accumulatedStripped and not accumulatedStripped.endswith(('}', ']')):
            try:
                closedAccumulated = closeJsonStructures(accumulatedStripped)
                # Check if newFragment starts with continuation
                if newFragmentStripped.startswith(','):
                    # Remove leading comma and append
                    merged = closedAccumulated.rstrip() + newFragmentStripped.lstrip(',').strip()
                elif newFragmentStripped.startswith(('}', ']')):
                    # Fragment starts with closing - might be completing accumulated
                    merged = closedAccumulated.rstrip() + newFragmentStripped
                else:
                    # Try to append as continuation
                    # Check if we need a comma separator
                    if not closedAccumulated.rstrip().endswith((',', '[', '{')):
                        merged = closedAccumulated.rstrip() + ',' + newFragmentStripped
                    else:
                        merged = closedAccumulated.rstrip() + newFragmentStripped
                # Try to repair and parse the merged result
                repaired = closeJsonStructures(merged)
                parsed, parseErr, _ = tryParseJson(repaired)
                if parseErr is None:
                    return json.dumps(parsed, indent=2, ensure_ascii=False)
            except Exception:
                pass
        # If smart concatenation failed, return empty (caller will handle)
        return ""
    @staticmethod
    def _mergeJsonStringsWithOverlapFallback(
        accumulated: str,
        newFragment: str
    ) -> str:
        """
        Fallback overlap detection using string comparison.
        Used when both strings are complete JSON structures or fragments.
        CRITICAL: Never returns empty JSON - always returns at least accumulated.

        Strategies, tried in order:
        1. Structure-based merge of the valid prefixes of both strings
        2. Character-level suffix/prefix overlap removal
        3. Smart concatenation (_smartConcatenate)
        4. Plain concatenation with repair, falling back to accumulated
        """
        if not accumulated:
            return newFragment if newFragment else "{}"
        if not newFragment:
            return accumulated
        from modules.shared.jsonUtils import tryParseJson, closeJsonStructures
        # Strategy 1: Try to extract valid JSON parts from both fragments
        # This handles random cuts better by finding the longest valid prefix/suffix
        # Extract valid JSON from accumulated (find longest valid prefix)
        accumulatedValid = JsonResponseHandler._extractValidJsonPrefix(accumulated)
        # Extract valid JSON from newFragment (find longest valid prefix)
        newFragmentValid = JsonResponseHandler._extractValidJsonPrefix(newFragment)
        # If we have valid JSON from both, try structure-based merge
        if accumulatedValid and newFragmentValid:
            try:
                parsedAccumulated, parseErr1, _ = tryParseJson(closeJsonStructures(accumulatedValid))
                parsedNewFragment, parseErr2, _ = tryParseJson(closeJsonStructures(newFragmentValid))
                if parseErr1 is None and parseErr2 is None:
                    # Both are valid JSON - try structure merge
                    accNormalized = JsonResponseHandler._normalizeToElementsStructure(accumulatedValid, accumulated)
                    newNormalized = JsonResponseHandler._normalizeToElementsStructure(newFragmentValid, newFragment)
                    if accNormalized and newNormalized:
                        merged = JsonResponseHandler._mergeJsonStructuresGeneric(
                            accNormalized, newNormalized, accumulatedValid, newFragmentValid
                        )
                        if merged:
                            return json.dumps(merged, indent=2, ensure_ascii=False)
            except Exception:
                pass
        # Strategy 2: Find longest common suffix/prefix match (character-level overlap)
        maxOverlapLen = min(len(accumulated), len(newFragment))
        # Start from maximum possible overlap down to 1 character
        # But limit to reasonable overlap (max 50% of shorter string)
        maxReasonableOverlap = min(maxOverlapLen, min(len(accumulated), len(newFragment)) // 2)
        for overlapLen in range(maxReasonableOverlap, 0, -1):
            accumulatedSuffix = accumulated[-overlapLen:]
            newFragmentPrefix = newFragment[:overlapLen]
            if accumulatedSuffix == newFragmentPrefix:
                # Found overlap - remove duplicate part
                logger.debug(f"Found overlap of {overlapLen} characters, removing duplicate")
                merged = accumulated + newFragment[overlapLen:]
                # Ensure result is not empty
                if merged and merged.strip():
                    return merged
        # Strategy 3: No overlap found - try smart concatenation
        # Check if we can append newFragment to accumulated without breaking JSON structure
        merged = JsonResponseHandler._smartConcatenate(accumulated, newFragment)
        if merged and merged.strip():
            return merged
        # Strategy 4: Last resort - simple concatenation (but ensure non-empty and valid JSON)
        result = accumulated + newFragment
        if not result or result.strip() in ['{}', '[]', '']:
            # Return accumulated as fallback (at least we have that)
            return accumulated if accumulated else "{}"
        # CRITICAL: Try to repair and validate the merged result
        try:
            repaired = closeJsonStructures(result)
            parsed, parseErr, _ = tryParseJson(repaired)
            if parseErr is None:
                # Valid JSON - return it
                return json.dumps(parsed, indent=2, ensure_ascii=False)
            else:
                # Still invalid - try to extract valid parts
                validPrefix = JsonResponseHandler._extractValidJsonPrefix(result)
                if validPrefix:
                    parsedPrefix, parseErr2, _ = tryParseJson(validPrefix)
                    if parseErr2 is None:
                        return json.dumps(parsedPrefix, indent=2, ensure_ascii=False)
        except Exception:
            pass
        # If repair failed, return accumulated (at least we have that)
        if accumulated:
            try:
                repairedAccumulated = closeJsonStructures(accumulated)
                parsedAcc, parseErrAcc, _ = tryParseJson(repairedAccumulated)
                if parseErrAcc is None:
                    return json.dumps(parsedAcc, indent=2, ensure_ascii=False)
            except Exception:
                pass
            return accumulated
        # Last resort: return empty structure
        return "{}"
@staticmethod
def isJsonComplete(parsedJson: Dict[str, Any]) -> bool:
"""
GENERIC function to check if parsed JSON structure is complete.
Works for ANY JSON structure - no specific logic for content types.
Completeness checks (all generic):
- All arrays are properly closed
- All objects are properly closed
- No incomplete structures
- Recursive validation of nested structures
Args:
parsedJson: Parsed JSON object
Returns:
True if JSON is complete, False otherwise
"""
def _checkStructureComplete(obj: Any, depth: int = 0) -> bool:
"""Recursively check if structure is complete."""
if depth > 50: # Prevent infinite recursion
return True
if isinstance(obj, dict):
# Check all values recursively
for value in obj.values():
if not _checkStructureComplete(value, depth + 1):
return False
return True
elif isinstance(obj, list):
# Check all items recursively
for item in obj:
if not _checkStructureComplete(item, depth + 1):
return False
return True
else:
# Primitive value - always complete
return True
try:
return _checkStructureComplete(parsedJson)
except Exception as e:
logger.debug(f"Error checking JSON completeness: {e}")
return False
@staticmethod
def finalizeJson(parsedJson: Dict[str, Any]) -> Dict[str, Any]:
"""
GENERIC function to finalize complete JSON by adding missing closing elements and repairing corruption.
Works for ANY JSON structure - no specific logic for content types.
Steps (all generic):
1. Analyze structure for missing closing elements (recursively)
2. Add closing brackets/braces where needed
3. Repair any remaining corruption
4. Validate final structure
Args:
parsedJson: Parsed JSON object that needs finalization
Returns:
Finalized JSON object
"""
# For now, just return as-is since parsing succeeded
# If needed, can add logic to check for incomplete structures
# and add closing elements
return parsedJson
@staticmethod
def extractKpiValuesFromJson(
parsedJson: Dict[str, Any],
kpis: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
"""
Extract current KPI values from parsed JSON and update KPI objects.
Args:
parsedJson: Parsed JSON object
kpis: List of KPI objects (will be updated with currentValue)
Returns:
Updated list of KPI objects with currentValue set
"""
updatedKpis = []
for kpi in kpis:
kpiId = kpi.get("id")
jsonPath = kpi.get("jsonPath")
if not kpiId or not jsonPath:
continue
# Create copy of KPI object
updatedKpi = kpi.copy()
try:
# Extract value using JSON path
# Simple path format: "sections[0].elements[0].items" or "sections[0].elements[0].rows"
value = JsonResponseHandler._extractValueByPath(parsedJson, jsonPath)
# Handle None (path doesn't exist - incomplete JSON)
if value is None:
updatedKpi["currentValue"] = kpi.get("currentValue", 0)
logger.debug(f"KPI {kpiId} path {jsonPath} not found in JSON (incomplete), keeping current value {updatedKpi['currentValue']}")
# Count items/rows/elements based on type
elif isinstance(value, list):
updatedKpi["currentValue"] = len(value)
logger.debug(f"Extracted KPI {kpiId} from path {jsonPath}: list with {len(value)} items")
elif isinstance(value, (int, float)):
updatedKpi["currentValue"] = int(value)
logger.debug(f"Extracted KPI {kpiId} from path {jsonPath}: numeric value {int(value)}")
else:
updatedKpi["currentValue"] = 0
logger.debug(f"Extracted KPI {kpiId} from path {jsonPath}: non-list/non-numeric value, set to 0")
except Exception as e:
logger.warning(f"Error extracting KPI {kpiId} from path {jsonPath}: {e}")
updatedKpi["currentValue"] = kpi.get("currentValue", 0)
updatedKpis.append(updatedKpi)
return updatedKpis
@staticmethod
def extractKpiValuesFromIncompleteJson(
jsonString: str,
kpis: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
"""
Extract KPI values from incomplete JSON string.
Uses existing JSON completion function to close incomplete structures, then extracts KPIs.
Args:
jsonString: Incomplete JSON string
kpis: List of KPI objects
Returns:
Updated list of KPI objects with currentValue set
"""
updatedKpis = []
for kpi in kpis:
kpiId = kpi.get("id")
jsonPath = kpi.get("jsonPath")
if not kpiId or not jsonPath:
continue
updatedKpi = kpi.copy()
try:
# Use existing JSON completion function to close incomplete structures
from modules.shared.jsonUtils import extractJsonString, closeJsonStructures
# Extract JSON string and complete it with missing closing elements
extracted = extractJsonString(jsonString)
completed = closeJsonStructures(extracted)
# Parse completed JSON
parsed = json.loads(completed)
# Extract value using path
value = JsonResponseHandler._extractValueByPath(parsed, jsonPath)
# Handle None (path doesn't exist - incomplete JSON)
if value is None:
updatedKpi["currentValue"] = kpi.get("currentValue", 0)
logger.debug(f"KPI {kpiId} path {jsonPath} not found in completed JSON (still incomplete), keeping current value {updatedKpi['currentValue']}")
# Count items/rows/elements based on type
elif isinstance(value, list):
updatedKpi["currentValue"] = len(value)
logger.debug(f"Extracted KPI {kpiId} from completed JSON: list with {len(value)} items")
elif isinstance(value, (int, float)):
updatedKpi["currentValue"] = int(value)
logger.debug(f"Extracted KPI {kpiId} from completed JSON: numeric value {int(value)}")
else:
updatedKpi["currentValue"] = 0
logger.debug(f"Extracted KPI {kpiId} from completed JSON: non-list/non-numeric value, set to 0")
except Exception as e:
logger.warning(f"Error extracting KPI {kpiId} from incomplete JSON: {e}")
updatedKpi["currentValue"] = kpi.get("currentValue", 0)
updatedKpis.append(updatedKpi)
return updatedKpis
@staticmethod
def _extractValueByPath(obj: Any, path: str) -> Any:
"""
Extract value from object using dot-notation path with array indices.
Example: "sections[0].elements[0].items"
Returns None if path doesn't exist (for incomplete JSON handling).
"""
parts = path.split('.')
current = obj
for part in parts:
if '[' in part and ']' in part:
# Handle array access: "sections[0]"
key = part[:part.index('[')]
index = int(part[part.index('[') + 1:part.index(']')])
if key:
if isinstance(current, dict):
current = current.get(key)
if current is None:
return None # Key doesn't exist
else:
return None # Can't access key on non-dict
if isinstance(current, list):
if 0 <= index < len(current):
current = current[index]
else:
# Index out of range - return None for incomplete JSON
return None
else:
# Not a list, can't index
return None
else:
# Handle dict access
if isinstance(current, dict):
current = current.get(part)
if current is None:
return None # Key doesn't exist
else:
return None # Can't access key on non-dict
return current
@staticmethod
def validateKpiProgression(
accumulationState: JsonAccumulationState,
updatedKpis: List[Dict[str, Any]]
) -> Tuple[bool, str]:
"""
Validate KPI progression from parsed JSON.
Validation rules:
- Proceed if: At least ONE KPI increased
- Stop if: Any KPI went backwards → return (False, "KPI went backwards")
- Stop if: No KPIs progressed → return (False, "No progress")
- Finish if: All KPIs completed OR JSON is complete → return (True, "Complete")
Args:
accumulationState: Current accumulation state (contains kpis)
updatedKpis: Updated KPI objects with currentValue set
Returns:
Tuple of (shouldProceed, reason)
"""
if not accumulationState.kpis:
# No KPIs defined - always proceed
return True, "No KPIs defined"
# Build dict of last values for comparison
lastValues = {kpi.get("id"): kpi.get("currentValue", 0) for kpi in accumulationState.kpis}
logger.debug(f"KPI validation: lastValues = {lastValues}")
logger.debug(f"KPI validation: updatedKpis = {[(kpi.get('id'), kpi.get('currentValue')) for kpi in updatedKpis]}")
# Check if any KPI went backwards
for updatedKpi in updatedKpis:
kpiId = updatedKpi.get("id")
currentValue = updatedKpi.get("currentValue", 0)
if kpiId in lastValues:
lastValue = lastValues[kpiId]
if currentValue < lastValue:
logger.warning(f"KPI {kpiId} went BACKWARDS: {lastValue}{currentValue}")
return False, f"KPI {kpiId} went backwards"
# Check if all KPIs are completed
allCompleted = True
for updatedKpi in updatedKpis:
targetValue = updatedKpi.get("targetValue", 0)
currentValue = updatedKpi.get("currentValue", 0)
if currentValue < targetValue:
allCompleted = False
break
if allCompleted:
logger.info("All KPIs completed")
return True, "All KPIs completed"
# Check if at least one KPI progressed
atLeastOneProgressed = False
for updatedKpi in updatedKpis:
kpiId = updatedKpi.get("id")
currentValue = updatedKpi.get("currentValue", 0)
if kpiId in lastValues:
lastValue = lastValues[kpiId]
if currentValue > lastValue:
atLeastOneProgressed = True
logger.info(f"KPI {kpiId} progressed: {lastValue}{currentValue}")
break
else:
# First time seeing this KPI - if it has a value, it's progress
if currentValue > 0:
atLeastOneProgressed = True
logger.info(f"KPI {kpiId} initialized: {currentValue}")
break
if not atLeastOneProgressed:
logger.warning(f"No KPIs progressed. Last values: {lastValues}, Current values: {[(kpi.get('id'), kpi.get('currentValue')) for kpi in updatedKpis]}")
return False, "No progress"
return True, "Progress detected"
@staticmethod
def accumulateAndParseJsonFragments(
accumulatedJsonString: str,
newFragmentString: str,
allSections: List[Dict[str, Any]],
iteration: int
) -> Tuple[str, List[Dict[str, Any]], bool, Optional[Dict[str, Any]]]:
"""
Accumulate JSON fragments and parse when complete.
GENERIC function that handles:
1. Concatenating JSON strings with overlap detection
2. Parsing the accumulated string
3. Extracting sections (partial if incomplete, final if complete)
4. Determining completion status
Args:
accumulatedJsonString: Previously accumulated JSON string
newFragmentString: New fragment string from current iteration
allSections: Sections extracted so far (for prompt context)
iteration: Current iteration number
Returns:
Tuple of:
- accumulatedJsonString: Updated accumulated string
- sections: Extracted sections (partial if incomplete, final if complete)
- isComplete: True if JSON is complete and valid
- parsedResult: Parsed JSON object (if parsing succeeded)
"""
# Step 1: Clean encoding issues from accumulated string (check end of first delivered part)
cleanedAccumulated = JsonResponseHandler.cleanEncodingIssues(accumulatedJsonString)
# Step 2: Clean encoding issues from new fragment
cleanedFragment = JsonResponseHandler.cleanEncodingIssues(newFragmentString)
# Step 3: Concatenate with overlap handling
combinedString, hasOverlap = JsonResponseHandler.mergeJsonStringsWithOverlap(
cleanedAccumulated,
cleanedFragment
)
# Note: hasOverlap indicates if iterations should continue, but this function
# doesn't control iterations, so we just use the merged string
# Step 4: Try to parse
try:
extracted = extractJsonString(combinedString)
parsedResult = json.loads(extracted)
# Step 5: Parsing succeeded - check completeness
isComplete = JsonResponseHandler.isJsonComplete(parsedResult)
if isComplete:
# Step 6: Complete JSON - finalize
finalizedJson = JsonResponseHandler.finalizeJson(parsedResult)
sections = extractSectionsFromDocument(finalizedJson)
logger.info(f"Iteration {iteration}: JSON accumulation complete, extracted {len(sections)} sections")
return combinedString, sections, True, finalizedJson
else:
# Step 7: Incomplete but parseable - extract partial sections
sections = extractSectionsFromDocument(parsedResult)
logger.info(f"Iteration {iteration}: JSON accumulation incomplete but parseable, extracted {len(sections)} partial sections")
return combinedString, sections, False, parsedResult
except json.JSONDecodeError:
# Step 8: Still broken - repair and extract partial sections
repaired = repairBrokenJson(combinedString)
if repaired:
sections = extractSectionsFromDocument(repaired)
logger.info(f"Iteration {iteration}: JSON accumulation repaired, extracted {len(sections)} sections")
return combinedString, sections, False, repaired
else:
# Repair failed - continue with data BEFORE merging the problematic piece
# Return previous accumulated string (before adding new fragment)
# This ensures we don't lose previously accumulated data
logger.warning(f"Iteration {iteration}: Repair failed, continuing with previous accumulated data")
return accumulatedJsonString, [], False, None