Fixed the JSON merging chain for cut-off mappings with a fully dynamic JSON merger engine that handles any JSON structure and complexity

This commit is contained in:
ValueOn AG 2025-11-30 17:35:19 +01:00
parent 11bb127a43
commit 3ccd284a58
11 changed files with 1263 additions and 577 deletions

View file

@ -16,6 +16,7 @@ from modules.shared.jsonUtils import (
buildContinuationContext,
parseJsonWithModel
)
from modules.services.serviceAi.subJsonResponseHandling import JsonResponseHandler
logger = logging.getLogger(__name__)
@ -304,7 +305,39 @@ Respond with ONLY a JSON object in this exact format:
# Extract sections from response (handles both valid and broken JSON)
# Only for document generation (JSON responses)
extractedSections, wasJsonComplete, parsedResult = self._extractSectionsFromResponse(result, iteration, debugPrefix)
# CRITICAL: Pass allSections to enable fragment detection and merging
extractedSections, wasJsonComplete, parsedResult = self._extractSectionsFromResponse(
result, iteration, debugPrefix, allSections
)
# CRITICAL: Handle JSON fragments (continuation content)
# Fragment merging happens inside _extractSectionsFromResponse and updates allSections in place
# If no sections extracted but fragment was merged, allSections was updated in place
# Check if fragment was merged by checking if allSections was modified
if not extractedSections and allSections:
# Fragment was detected and merged directly into allSections (side effect in _extractSectionsFromResponse)
logger.info(f"Iteration {iteration}: JSON fragment detected and merged, continuing")
# Don't break - fragment was merged, continue to get more content if needed
# Check if we should continue based on JSON completeness
shouldContinue = self._shouldContinueGeneration(
allSections,
iteration,
wasJsonComplete,
result
)
if shouldContinue:
if iterationOperationId:
self.services.chat.progressLogUpdate(iterationOperationId, 0.8, "Fragment merged, continuing")
self.services.chat.progressLogFinish(iterationOperationId, True)
continue
else:
# Done - fragment was merged and JSON is complete
if iterationOperationId:
self.services.chat.progressLogFinish(iterationOperationId, True)
if operationId:
self.services.chat.progressLogUpdate(operationId, 0.95, f"Generation complete ({iteration} iterations, fragment merged)")
logger.info(f"Generation complete after {iteration} iterations: fragment merged")
break
# Extract document metadata from first iteration if available
if iteration == 1 and parsedResult and not documentMetadata:
@ -321,14 +354,15 @@ Respond with ONLY a JSON object in this exact format:
if not wasJsonComplete:
logger.warning(f"Iteration {iteration}: No sections extracted from broken JSON, continuing for another attempt")
continue
# If JSON was complete but no sections extracted - this is an error, stop
# If JSON was complete but no sections extracted - check if it was a fragment
# Fragments are handled above, so if we get here and it's complete, it's an error
logger.warning(f"Iteration {iteration}: No sections extracted from complete JSON, stopping")
break
# Merge new sections with existing sections intelligently
# This handles the STANDARD CASE: broken JSON iterations must be merged together
# The break can occur anywhere - in any section, at any depth
allSections = self._mergeSectionsIntelligently(allSections, extractedSections, iteration)
allSections = JsonResponseHandler.mergeSectionsIntelligently(allSections, extractedSections, iteration)
# Check if we should continue (completion detection)
# Simple logic: JSON completeness determines continuation
@ -370,484 +404,24 @@ Respond with ONLY a JSON object in this exact format:
return final_result
def _mergeSectionsIntelligently(
self,
existingSections: List[Dict[str, Any]],
newSections: List[Dict[str, Any]],
iteration: int
) -> List[Dict[str, Any]]:
"""
Intelligently merge sections from multiple iterations.
This is a GENERIC merging strategy that handles broken JSON iterations.
The break can occur anywhere - in any section, at any depth.
Merging strategies (in order of priority):
1. Same Section ID: Merge sections with identical IDs
2. Same Content-Type + Position: If last section is incomplete and new section continues it
3. Same Order: Merge sections with same order value
4. Structural Analysis: Detect continuation based on content structure
Args:
existingSections: Sections accumulated from previous iterations
newSections: Sections extracted from current iteration
iteration: Current iteration number
Returns:
Merged list of sections
"""
if not newSections:
return existingSections
if not existingSections:
return newSections
mergedSections = existingSections.copy()
for newSection in newSections:
merged = False
# Strategy 1: Same Section ID - merge directly
newSectionId = newSection.get("id")
if newSectionId:
for i, existingSection in enumerate(mergedSections):
if existingSection.get("id") == newSectionId:
# Merge sections with same ID
mergedSections[i] = self._mergeSectionContent(existingSection, newSection, iteration)
merged = True
logger.debug(f"Iteration {iteration}: Merged section by ID '{newSectionId}'")
break
if merged:
continue
# Strategy 2: Same Content-Type + Position (continuation detection)
# Check if last section is incomplete and new section continues it
if mergedSections:
lastSection = mergedSections[-1]
lastContentType = lastSection.get("content_type")
newContentType = newSection.get("content_type")
if lastContentType == newContentType:
# Same content type - check if last section is incomplete
if self._isSectionIncomplete(lastSection):
# Last section is incomplete, merge with new section
mergedSections[-1] = self._mergeSectionContent(lastSection, newSection, iteration)
merged = True
logger.debug(f"Iteration {iteration}: Merged section by content-type continuation ({lastContentType})")
continue
# Strategy 3: Same Order value
newOrder = newSection.get("order")
if newOrder is not None:
for i, existingSection in enumerate(mergedSections):
existingOrder = existingSection.get("order")
if existingOrder is not None and existingOrder == newOrder:
# Merge sections with same order
mergedSections[i] = self._mergeSectionContent(existingSection, newSection, iteration)
merged = True
logger.debug(f"Iteration {iteration}: Merged section by order {newOrder}")
break
if merged:
continue
# Strategy 4: Structural Analysis - detect continuation
# For code_block and table: if last section matches new section type, merge them
if mergedSections:
lastSection = mergedSections[-1]
lastContentType = lastSection.get("content_type")
newContentType = newSection.get("content_type")
# Both are code blocks - merge them
if lastContentType == "code_block" and newContentType == "code_block":
mergedSections[-1] = self._mergeSectionContent(lastSection, newSection, iteration)
merged = True
logger.debug(f"Iteration {iteration}: Merged code_block sections by structural analysis")
continue
# Both are tables - merge them (common case for broken JSON iterations)
if lastContentType == "table" and newContentType == "table":
mergedSections[-1] = self._mergeSectionContent(lastSection, newSection, iteration)
merged = True
logger.debug(f"Iteration {iteration}: Merged table sections by structural analysis")
continue
# No merge strategy matched - add as new section
if not merged:
mergedSections.append(newSection)
logger.debug(f"Iteration {iteration}: Added new section '{newSection.get('id', 'no-id')}' ({newSection.get('content_type', 'unknown')})")
return mergedSections
def _isSectionIncomplete(self, section: Dict[str, Any]) -> bool:
"""
Check if a section is incomplete (broken at the end).
This detects incomplete sections based on content analysis:
- Code blocks: ends mid-line, ends with comma, ends with incomplete structure
- Text sections: ends mid-sentence, ends with incomplete structure
- Other types: check for incomplete elements
"""
contentType = section.get("content_type", "")
elements = section.get("elements", [])
if not elements:
return False
# Handle list of elements
if isinstance(elements, list) and len(elements) > 0:
lastElement = elements[-1]
else:
lastElement = elements
if not isinstance(lastElement, dict):
return False
# Check code_block for incomplete code
if contentType == "code_block":
code = lastElement.get("code", "")
if code:
# Check if code ends incompletely:
# - Ends with comma (incomplete CSV line)
# - Ends with number but no newline (incomplete line)
# - Ends mid-token (e.g., "23431,23" - incomplete number)
codeStripped = code.rstrip()
if codeStripped:
# Check for incomplete patterns
if codeStripped.endswith(',') or (',' in codeStripped and not codeStripped.endswith('\n')):
# Ends with comma or has comma but no final newline - likely incomplete
return True
# Check if last line is incomplete (doesn't end with newline and has partial content)
if not code.endswith('\n') and codeStripped:
# No final newline - might be incomplete
# More sophisticated: check if last number is complete
lastLine = codeStripped.split('\n')[-1]
if lastLine and ',' in lastLine:
# Has commas but might be incomplete
parts = lastLine.split(',')
if parts and len(parts[-1]) < 5: # Last part is very short - might be incomplete
return True
# Check table for incomplete rows
if contentType == "table":
rows = lastElement.get("rows", [])
if rows:
# Check if last row is incomplete (ends with incomplete data)
lastRow = rows[-1] if isinstance(rows, list) else []
if isinstance(lastRow, list) and lastRow:
# Check if last row ends with incomplete data (e.g., incomplete string)
lastCell = lastRow[-1] if lastRow else ""
if isinstance(lastCell, str):
# If last cell is incomplete (ends with quote or is very short), section might be incomplete
if lastCell.endswith('"') or (len(lastCell) < 3 and lastCell):
return True
# Also check if last row doesn't have expected number of columns (if headers exist)
headers = lastElement.get("headers", [])
if headers and isinstance(headers, list):
expectedCols = len(headers)
if len(lastRow) < expectedCols:
return True
# Check paragraph/text for incomplete sentences
if contentType in ["paragraph", "heading"]:
text = lastElement.get("text", "")
if text:
# Simple heuristic: if doesn't end with sentence-ending punctuation
textStripped = text.rstrip()
if textStripped and not textStripped[-1] in '.!?':
# Might be incomplete, but this is less reliable
# Only mark as incomplete if very short (likely cut off)
if len(textStripped) < 20:
return True
# Check lists for incomplete items
if contentType in ["bullet_list", "numbered_list"]:
items = lastElement.get("items", [])
if items and isinstance(items, list):
# Check if last item is incomplete (very short or ends with incomplete string)
lastItem = items[-1] if items else None
if isinstance(lastItem, str) and len(lastItem) < 3:
return True
# Check if items array seems incomplete (e.g., expected count not reached)
# This is harder to detect without context, so we rely on other heuristics
# Check image for incomplete base64 data
if contentType == "image":
imageData = lastElement.get("base64Data", "")
if imageData:
# Base64 strings should end with padding ('=' or '==')
# If it doesn't, it might be incomplete
stripped = imageData.rstrip()
if stripped and not stripped.endswith(('=', '==')):
# Check if it's a valid base64 character sequence that was cut off
# Base64 uses A-Z, a-z, 0-9, +, /, and = for padding
if len(stripped) > 0 and stripped[-1] not in 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=':
return True
# If length is not a multiple of 4 (base64 requirement), it might be incomplete
if len(stripped) % 4 != 0:
return True
# GENERIC CHECK: Look for incomplete structures in any element
# Check if element has arrays/lists that might be incomplete
for key, value in lastElement.items():
if isinstance(value, list) and len(value) > 0:
# Check last item in list
lastItem = value[-1]
if isinstance(lastItem, str):
# If last string item is very short, might be incomplete
if len(lastItem) < 3:
return True
elif isinstance(lastItem, dict):
# If last dict item has very few keys, might be incomplete
if len(lastItem) < 2:
return True
elif isinstance(value, str):
# Check if string ends abruptly (no punctuation, very short)
if len(value) > 0 and len(value) < 10 and not value[-1] in '.!?\n':
return True
return False
def _mergeSectionContent(
    self,
    existingSection: Dict[str, Any],
    newSection: Dict[str, Any],
    iteration: int
) -> Dict[str, Any]:
    """
    Merge the content of ``newSection`` into ``existingSection``.

    Only the LAST element of the existing section and the FIRST element of
    the new section are merged — the assumption is that the JSON cut
    happened at that seam. NOTE: the existing section's element dict is
    mutated in place; the returned section is a shallow copy of
    ``existingSection`` that points at the same (mutated) elements.

    Per content type:
    - code_block: delegate to _mergeCodeBlocks (overlap/incomplete-line aware)
    - paragraph/heading: append text
    - table: concatenate rows, dropping an exact duplicate seam row
    - bullet_list/numbered_list: concatenate items
    - image: append base64 data only when the existing payload looks truncated
    - anything else: generic field-wise merge (lists concatenated, dicts
      shallow-merged, strings joined with a newline)

    Args:
        existingSection: Accumulated section (merge target).
        newSection: Freshly extracted section (merge source).
        iteration: Iteration counter, used only for log messages.

    Returns:
        The merged section dict.
    """
    contentType = existingSection.get("content_type", "")
    existingElements = existingSection.get("elements", [])
    newElements = newSection.get("elements", [])
    if not newElements:
        # Nothing to merge in.
        return existingSection
    # "elements" may be a list of element dicts or a single element dict;
    # pick the seam element on each side.
    if isinstance(existingElements, list):
        existingElem = existingElements[-1] if existingElements else {}
    else:
        existingElem = existingElements
    if isinstance(newElements, list):
        newElem = newElements[0] if newElements else {}
    else:
        newElem = newElements
    if not isinstance(existingElem, dict) or not isinstance(newElem, dict):
        # Non-dict elements cannot be merged field-wise; keep the existing side.
        return existingSection
    # Merge based on content type.
    if contentType == "code_block":
        existingCode = existingElem.get("code", "")
        newCode = newElem.get("code", "")
        if existingCode and newCode:
            # Overlap-aware line-level merge.
            mergedCode = self._mergeCodeBlocks(existingCode, newCode, iteration)
            existingElem["code"] = mergedCode
        # Preserve language from existing or new.
        if "language" not in existingElem and "language" in newElem:
            existingElem["language"] = newElem["language"]
    elif contentType in ["paragraph", "heading"]:
        existingText = existingElem.get("text", "")
        newText = newElem.get("text", "")
        if existingText and newText:
            # Append text: join with a space if the existing text ends
            # mid-sentence, otherwise start a new line.
            if existingText.rstrip() and not existingText.rstrip()[-1] in '.!?\n':
                mergedText = existingText.rstrip() + " " + newText.lstrip()
            else:
                mergedText = existingText.rstrip() + "\n" + newText.lstrip()
            existingElem["text"] = mergedText
    elif contentType == "table":
        # Merge table rows with overlap detection.
        existingRows = existingElem.get("rows", [])
        newRows = newElem.get("rows", [])
        if existingRows and newRows:
            # CRITICAL: detect and remove overlaps before merging.
            # Check if last existing row matches first new row (exact overlap).
            if len(existingRows) > 0 and len(newRows) > 0:
                lastExistingRow = existingRows[-1]
                firstNewRow = newRows[0]
                # Compare rows (handle both list and tuple formats).
                if isinstance(lastExistingRow, (list, tuple)) and isinstance(firstNewRow, (list, tuple)):
                    if list(lastExistingRow) == list(firstNewRow):
                        # Exact duplicate - remove first new row.
                        newRows = newRows[1:]
                        logger.debug(f"Iteration {iteration}: Removed duplicate table row (exact match)")
            # Combine rows from both sections (after removing overlaps).
            existingElem["rows"] = existingRows + newRows
            logger.debug(f"Iteration {iteration}: Merged table rows - existing: {len(existingRows)}, new: {len(newRows)}, total: {len(existingRows) + len(newRows)}")
        elif newRows:
            # If existing has no rows but new does, use new rows.
            existingElem["rows"] = newRows
        # Preserve headers from existing (or use new if existing has none).
        if not existingElem.get("headers") and newElem.get("headers"):
            existingElem["headers"] = newElem["headers"]
        # Preserve caption from existing (or use new if existing has none).
        if not existingElem.get("caption") and newElem.get("caption"):
            existingElem["caption"] = newElem["caption"]
    elif contentType in ["bullet_list", "numbered_list"]:
        # Merge list items by concatenation.
        existingItems = existingElem.get("items", [])
        newItems = newElem.get("items", [])
        if existingItems and newItems:
            existingElem["items"] = existingItems + newItems
    elif contentType == "image":
        # Images are typically complete - if new image is provided, replace
        # existing. But check if existing image data is incomplete (e.g.
        # base64 string cut off).
        existingImageData = existingElem.get("base64Data", "")
        newImageData = newElem.get("base64Data", "")
        if existingImageData and newImageData:
            # If existing image data doesn't end with valid base64 padding,
            # it might be incomplete ('=' or '==' at the end).
            if not existingImageData.rstrip().endswith(('=', '==')):
                # Existing image might be incomplete - merge by appending new
                # data. This handles cases where the base64 string was cut off.
                existingElem["base64Data"] = existingImageData + newImageData
                logger.debug(f"Iteration {iteration}: Merged incomplete image base64 data")
            else:
                # Existing image is complete - replace with new (or keep
                # existing if new is empty).
                if newImageData:
                    existingElem["base64Data"] = newImageData
        elif newImageData:
            existingElem["base64Data"] = newImageData
        # Preserve other image metadata.
        if not existingElem.get("altText") and newElem.get("altText"):
            existingElem["altText"] = newElem["altText"]
        if not existingElem.get("caption") and newElem.get("caption"):
            existingElem["caption"] = newElem["caption"]
    else:
        # GENERIC FALLBACK: handle any other content types or unknown
        # structures. Try to merge common array/list fields generically;
        # only the FIRST matching list field is merged (note the break).
        for key in ["items", "rows", "columns", "cells", "elements", "data", "content"]:
            if key in existingElem and key in newElem:
                existingValue = existingElem[key]
                newValue = newElem[key]
                if isinstance(existingValue, list) and isinstance(newValue, list):
                    # Merge lists by concatenation.
                    existingElem[key] = existingValue + newValue
                    logger.debug(f"Iteration {iteration}: Merged generic list field '{key}' - existing: {len(existingValue)}, new: {len(newValue)}")
                    break
        # Then merge all fields from newElem into existingElem. This handles
        # cases where the objects have different structures.
        for key, value in newElem.items():
            if key not in existingElem:
                # New field - add it.
                existingElem[key] = value
            elif isinstance(existingElem[key], list) and isinstance(value, list):
                # Both are lists - merge them.
                existingElem[key] = existingElem[key] + value
            elif isinstance(existingElem[key], dict) and isinstance(value, dict):
                # Both are dicts - shallow merge (new values win per key).
                existingElem[key].update(value)
            elif isinstance(existingElem[key], str) and isinstance(value, str):
                # Both are strings - append new to existing on a new line.
                existingElem[key] = existingElem[key] + "\n" + value
    # Update section with merged content. Shallow copy: element dicts are
    # shared with (and were mutated via) existingSection.
    mergedSection = existingSection.copy()
    if isinstance(existingElements, list):
        # Update the last element in the list with merged content.
        if existingElements:
            existingElements[-1] = existingElem
        mergedSection["elements"] = existingElements
    else:
        mergedSection["elements"] = existingElem
    # Preserve metadata from new section if missing in existing.
    if "order" not in mergedSection and "order" in newSection:
        mergedSection["order"] = newSection["order"]
    return mergedSection
def _mergeCodeBlocks(self, existingCode: str, newCode: str, iteration: int) -> str:
"""
Merge two code blocks intelligently, handling overlaps and incomplete lines.
"""
if not existingCode:
return newCode
if not newCode:
return existingCode
existingLines = existingCode.rstrip().split('\n')
newLines = newCode.strip().split('\n')
if not existingLines or not newLines:
return existingCode + "\n" + newCode
lastExistingLine = existingLines[-1].strip()
firstNewLine = newLines[0].strip()
# Strategy 1: Exact overlap - remove duplicate line
if lastExistingLine == firstNewLine:
newLines = newLines[1:]
logger.debug(f"Iteration {iteration}: Removed exact duplicate line in code merge")
# Strategy 2: Incomplete line merge
# If last existing line ends with comma or is incomplete, merge with first new line
elif lastExistingLine.endswith(',') or (',' in lastExistingLine and len(lastExistingLine.split(',')[-1]) < 5):
# Last line is incomplete - merge with first new line
# Remove trailing comma from existing line
mergedLine = lastExistingLine.rstrip(',') + ',' + firstNewLine.lstrip()
existingLines[-1] = mergedLine
newLines = newLines[1:]
logger.debug(f"Iteration {iteration}: Merged incomplete line with continuation")
# Strategy 3: Partial overlap detection
# Check if first new line starts with the end of last existing line
elif ',' in lastExistingLine and ',' in firstNewLine:
lastExistingParts = lastExistingLine.split(',')
firstNewParts = firstNewLine.split(',')
# Check for overlap: if last part of existing matches first part of new
if lastExistingParts and firstNewParts:
lastExistingPart = lastExistingParts[-1].strip()
firstNewPart = firstNewParts[0].strip()
# If they match, there's overlap
if lastExistingPart == firstNewPart and len(lastExistingParts) > 1:
# Remove overlapping part from new line
newLines[0] = ','.join(firstNewParts[1:])
logger.debug(f"Iteration {iteration}: Removed partial overlap in code merge")
# Reconstruct merged code
mergedCode = '\n'.join(existingLines)
if newLines:
if mergedCode and not mergedCode.endswith('\n'):
mergedCode += '\n'
mergedCode += '\n'.join(newLines)
return mergedCode
# JSON merging logic moved to subJsonResponseHandling.py
def _extractSectionsFromResponse(
self,
result: str,
iteration: int,
debugPrefix: str
debugPrefix: str,
allSections: List[Dict[str, Any]] = None
) -> Tuple[List[Dict[str, Any]], bool, Optional[Dict[str, Any]]]:
"""
Extract sections from AI response, handling both valid and broken JSON.
Uses repair mechanism for broken JSON.
Handles JSON fragments (continuation content) that need to be merged into existing sections.
Determines completion based on JSON structure (complete JSON = complete, broken/incomplete = incomplete).
Returns (sections, wasJsonComplete, parsedResult)
"""
if allSections is None:
allSections = []
# First, try to parse as valid JSON
# CRITICAL: JSON completeness is determined by parsing, NOT by last character check!
@ -862,6 +436,20 @@ Respond with ONLY a JSON object in this exact format:
# Extract sections from parsed JSON
sections = extractSectionsFromDocument(parsed_result)
# CRITICAL: If no sections extracted but we have existing sections, check if it's a fragment
if not sections and allSections:
fragment = JsonResponseHandler.detectAndParseJsonFragment(result, allSections)
if fragment:
logger.info(f"Iteration {iteration}: Detected JSON fragment ({fragment.get('fragment_type')}), merging into existing sections")
# Merge fragment into existing sections
merged_sections = JsonResponseHandler.mergeFragmentIntoSection(fragment, allSections, iteration)
# Update allSections in place (this is a side effect, but necessary for continuation)
# Note: This modifies the caller's allSections list
allSections[:] = merged_sections
# Return empty list to indicate we merged directly (not new sections)
# But mark as incomplete so loop continues if needed
return [], False, parsed_result
# JSON parsed successfully = complete
logger.info(f"Iteration {iteration}: JSON parsed successfully - marking as complete")
return sections, True, parsed_result
@ -885,7 +473,7 @@ Respond with ONLY a JSON object in this exact format:
# Repair failed - but we should still continue to allow AI to retry
logger.warning(f"Iteration {iteration}: All repair strategies failed, but continuing to allow retry")
return [], False, None # Mark as incomplete so loop continues
except Exception as e:
logger.error(f"Iteration {iteration}: Unexpected error during parsing: {str(e)}")
return [], False, None
@ -1413,8 +1001,3 @@ Respond with ONLY a JSON object in this exact format:
self.services.chat.progressLogFinish(aiOperationId, False)
raise
# DEPRECATED METHODS REMOVED:
# - callAiDocuments() - replaced by callAiContent()
# - callAiText() - replaced by callAiContent()
# All call sites have been updated to use callAiContent()

File diff suppressed because it is too large Load diff

View file

@ -497,13 +497,11 @@ class RendererDocx(BaseRenderer):
# Extract title from prompt if not provided
if not title or title == "Generated Document":
# Look for "create a ... document" or "generate a ... report"
import re
title_match = re.search(r'(?:create|generate|make)\s+a\s+([^,]+?)(?:\s+document|\s+report|\s+summary)', userPrompt.lower())
if title_match:
structure['title'] = title_match.group(1).strip().title()
# Extract sections from numbered lists in prompt
import re
section_pattern = r'(\d+)\)?\s*([^,]+?)(?:\s*[,:]|\s*$)'
sections = re.findall(section_pattern, userPrompt)
@ -849,7 +847,6 @@ class RendererDocx(BaseRenderer):
Returns the content with tables replaced by placeholders.
"""
import csv
import io
lines = content.split('\n')
processed_lines = []

View file

@ -95,7 +95,7 @@ class RendererXlsx(BaseRenderer):
# Title
sheet['A1'] = title
sheet['A1'].font = Font(size=16, bold=True)
sheet['A1'].alignment = Alignment(horizontal='center')
sheet['A1'].alignment = Alignment(horizontal='left')
# Generation info
sheet['A3'] = "Generated:"
@ -325,7 +325,7 @@ class RendererXlsx(BaseRenderer):
def _getDefaultStyleSet(self) -> Dict[str, Any]:
"""Default Excel style set - used when no style instructions present."""
return {
"title": {"font_size": 16, "color": "#FF1F4E79", "bold": True, "align": "center"},
"title": {"font_size": 16, "color": "#FF1F4E79", "bold": True, "align": "left"},
"heading": {"font_size": 14, "color": "#FF2F2F2F", "bold": True, "align": "left"},
"table_header": {"background": "#FF4F4F4F", "text_color": "#FFFFFFFF", "bold": True, "align": "center"},
"table_cell": {"background": "#FFFFFFFF", "text_color": "#FF2F2F2F", "bold": False, "align": "left"},
@ -543,8 +543,9 @@ class RendererXlsx(BaseRenderer):
try:
# Sheet title
sheet['A1'] = sheetTitle
sheet['A1'].font = Font(size=16, bold=True, color=self._getSafeColor(styles.get("title", {}).get("color", "FF1F4E79")))
sheet['A1'].alignment = Alignment(horizontal="center")
title_style = styles.get("title", {})
sheet['A1'].font = Font(size=16, bold=True, color=self._getSafeColor(title_style.get("color", "FF1F4E79")))
sheet['A1'].alignment = Alignment(horizontal=title_style.get("align", "left"))
# Get table data from elements (canonical JSON format)
elements = section.get("elements", [])
@ -592,7 +593,7 @@ class RendererXlsx(BaseRenderer):
sheet['A1'] = documentTitle
# Safety check for title style
title_style = styles.get("title", {"font_size": 16, "bold": True, "color": "#FF1F4E79", "align": "center"})
title_style = styles.get("title", {"font_size": 16, "bold": True, "color": "#FF1F4E79", "align": "left"})
try:
safe_color = self._getSafeColor(title_style["color"])
sheet['A1'].font = Font(size=title_style["font_size"], bold=title_style["bold"], color=safe_color)

View file

@ -271,12 +271,6 @@ class UtilsService:
def jsonTryParse(self, text) -> tuple:
    # Thin delegation to the shared jsonUtils helper; returns the
    # (parsed-object-or-None, error-or-None, cleaned-text) tuple
    # produced by tryParseJson.
    return jsonUtils.tryParseJson(text)
def jsonParseOrRaise(self, text):
    # Thin delegation to jsonUtils.parseJsonOrRaise: returns the parsed
    # object or raises the underlying JSON parse error.
    return jsonUtils.parseJsonOrRaise(text)
def jsonMergeRootLists(self, parts):
    # Thin delegation to jsonUtils.mergeRootLists: merges root-level list
    # values across multiple JSON parts (first dict part wins for
    # non-list keys).
    return jsonUtils.mergeRootLists(parts)
# ===== Enum utility functions =====
def mapToEnum(self, enum_class, value_str, default_value):

View file

@ -159,7 +159,6 @@ def storeDebugMessageAndDocuments(message, currentUser) -> None:
"""
try:
import json
from datetime import datetime, UTC
# Create base debug directory (use base debug dir, not prompts subdirectory)
baseDebugDir = _getBaseDebugDir()

View file

@ -97,47 +97,6 @@ def tryParseJson(text: Union[str, bytes]) -> Tuple[Optional[Union[Dict, List]],
return None, e, cleaned
def parseJsonOrRaise(text: Union[str, bytes]) -> Union[Dict, List]:
    """
    Parse *text* as JSON via tryParseJson; raise the parse error on failure.

    On failure a bounded preview of the cleaned input is logged before the
    original exception is propagated.
    """
    parsed, parseError, cleanedText = tryParseJson(text)
    if parseError is None:
        return parsed
    logger.error(f"parse_json_or_raise failed: {parseError}. Cleaned preview: {cleanedText[:200]}...")
    raise parseError
def mergeRootLists(jsonParts: List[Union[str, Dict, List]]) -> Dict[str, Any]:
    """
    Generic merger for root-level lists across multiple JSON parts.

    The first dict part is taken as the base; for each subsequent dict part:
    - a list value whose key also holds a list in the base is appended
    - a key absent from the base is added
    - any other key keeps the base's original value
    Sets ``continuation`` to None in the result if the base had that key.

    String/bytes parts are parsed with tryParseJson; unparsable parts and
    non-dict roots (e.g. bare lists) are skipped.

    Args:
        jsonParts: Mixed list of dicts, lists, or raw JSON strings.

    Returns:
        The merged dict, or {} when no part yields a dict.
    """
    parsed: List[Dict[str, Any]] = []
    for part in jsonParts:
        if isinstance(part, (dict, list)):
            obj = part
        else:
            obj, err, _ = tryParseJson(part)
            if err is not None or not isinstance(obj, (dict, list)):
                continue
        if isinstance(obj, dict):
            parsed.append(obj)
    if not parsed:
        return {}
    # BUGFIX: copy list values instead of aliasing them. The original did a
    # shallow `dict(parsed[0])` and then extend()ed the shared lists, which
    # mutated the caller's first input part as a side effect.
    base: Dict[str, Any] = {
        k: (list(v) if isinstance(v, list) else v) for k, v in parsed[0].items()
    }
    for obj in parsed[1:]:
        for k, v in obj.items():
            if isinstance(v, list) and isinstance(base.get(k), list):
                base[k].extend(v)
            elif k not in base:
                base[k] = v
    # A merged result is, by construction, no longer a continuation.
    if 'continuation' in base:
        base['continuation'] = None
    return base
def repairBrokenJson(text: str) -> Optional[Dict[str, Any]]:
"""
Attempt to repair broken JSON using multiple strategies.

View file

@ -271,7 +271,20 @@ class MethodAi(MethodBase):
# Prepare extraction options
self.services.chat.progressLogUpdate(operationId, 0.3, "Preparing extraction options")
extractionOptions = parameters.extractionOptions
extractionOptionsParam = parameters.get("extractionOptions")
# Convert dict to ExtractionOptions object if needed, or create defaults
if extractionOptionsParam:
if isinstance(extractionOptionsParam, dict):
# Convert dict to ExtractionOptions object
extractionOptions = ExtractionOptions(**extractionOptionsParam)
elif isinstance(extractionOptionsParam, ExtractionOptions):
extractionOptions = extractionOptionsParam
else:
# Invalid type, use defaults
extractionOptions = None
else:
extractionOptions = None
# If extractionOptions not provided, create defaults
if not extractionOptions:
@ -297,10 +310,21 @@ class MethodAi(MethodBase):
# Build ActionDocuments from ContentExtracted results
self.services.chat.progressLogUpdate(operationId, 0.8, "Building result documents")
actionDocuments = []
for extracted in extractedResults:
# Map extracted results back to original documents by index (results are in same order)
for i, extracted in enumerate(extractedResults):
# Get original document name if available
originalDoc = chatDocuments[i] if i < len(chatDocuments) else None
if originalDoc and hasattr(originalDoc, 'fileName') and originalDoc.fileName:
# Use original filename with "extracted_" prefix
baseName = originalDoc.fileName.rsplit('.', 1)[0] if '.' in originalDoc.fileName else originalDoc.fileName
documentName = f"{baseName}_extracted_{extracted.id}.json"
else:
# Fallback to generic name with index
documentName = f"document_{i+1:03d}_extracted_{extracted.id}.json"
# Store ContentExtracted object in ActionDocument.documentData
actionDoc = ActionDocument(
documentName=f"extracted_{extracted.id}.json",
documentName=documentName,
documentData=extracted, # ContentExtracted object
mimeType="application/json"
)

View file

@ -22,7 +22,7 @@ class ContentValidator:
self.services = services
self.learningEngine = learningEngine
async def validateContent(self, documents: List[Any], intent: Dict[str, Any], taskStep: Optional[Any] = None, actionName: Optional[str] = None) -> Dict[str, Any]:
async def validateContent(self, documents: List[Any], intent: Dict[str, Any], taskStep: Optional[Any] = None, actionName: Optional[str] = None, actionParameters: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
"""Validates delivered content against user intent using AI (single attempt; parse-or-fail)
Args:
@ -30,8 +30,9 @@ class ContentValidator:
intent: Workflow-level intent dict (for format requirements)
taskStep: Optional TaskStep object (preferred source for objective)
actionName: Optional action name (e.g., "ai.process", "ai.webResearch") that created the documents
actionParameters: Optional action parameters used during execution (e.g., {"columnsPerRow": 10, "researchDepth": "deep"})
"""
return await self._validateWithAI(documents, intent, taskStep, actionName)
return await self._validateWithAI(documents, intent, taskStep, actionName, actionParameters)
def _analyzeDocuments(self, documents: List[Any]) -> List[Dict[str, Any]]:
"""Generic document analysis - create simple summaries with metadata."""
@ -368,7 +369,7 @@ class ContentValidator:
return False
async def _validateWithAI(self, documents: List[Any], intent: Dict[str, Any], taskStep: Optional[Any] = None, actionName: Optional[str] = None) -> Dict[str, Any]:
async def _validateWithAI(self, documents: List[Any], intent: Dict[str, Any], taskStep: Optional[Any] = None, actionName: Optional[str] = None, actionParameters: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
"""AI-based comprehensive validation - generic approach"""
try:
if not hasattr(self, 'services') or not self.services or not hasattr(self.services, 'ai'):
@ -430,48 +431,91 @@ class ContentValidator:
actionDescription = "Content processing"
actionContext = f"\nDOCUMENTS CREATED BY: {actionDescription} ({actionName})"
# Format success criteria for display
criteriaDisplay = json.dumps(successCriteria, ensure_ascii=False) if successCriteria else "[]"
# Build action parameters context
actionParamsContext = ""
if actionParameters and isinstance(actionParameters, dict) and len(actionParameters) > 0:
# Filter out documentList and other large/redundant parameters for clarity
relevantParams = {k: v for k, v in actionParameters.items()
if k not in ['documentList', 'connections'] and v is not None}
if relevantParams:
paramsJson = json.dumps(relevantParams, ensure_ascii=False, indent=2)
actionParamsContext = f"\nACTION PARAMETERS USED: {paramsJson}"
# Build successCriteriaMet example - show proper array format
criteriaMetExample = json.dumps([False] * criteriaCount) if criteriaCount > 0 else "[]"
# Format success criteria for display with index numbers
if successCriteria:
criteriaDisplay = "\n".join([f"[{i}] {criterion}" for i, criterion in enumerate(successCriteria)])
else:
criteriaDisplay = "[]"
promptBase = f"""TASK VALIDATION
=== TASK INFORMATION ===
{objectiveLabel}: '{objectiveText}'
EXPECTED DATA TYPE: {dataType}
EXPECTED FORMATS: {expectedFormats if expectedFormats else ['any']}
SUCCESS CRITERIA ({criteriaCount} items): {criteriaDisplay}{actionContext}
EXPECTED FORMATS: {expectedFormats if expectedFormats else ['any']}{actionContext}{actionParamsContext}
=== VALIDATION INSTRUCTIONS ===
VALIDATION CONTEXT:
You have METADATA (filename, format, size, mimeType) and STRUCTURE SUMMARY (if available: sections, tables, captions, IDs, statistics).
VALIDATION PRINCIPLES:
1. Format compatibility: Match delivered format to expected format
2. Structure validation: Use structure summary to verify requirements (section count, table captions, IDs, section types, etc.)
3. Filename appropriateness: Check if filename suggests correct content type
4. Document count: Verify number matches expectations
5. Size sanity: Only flag if clearly wrong (<1KB for complex content or suspiciously large)
1. EVIDENCE-BASED VALIDATION (CRITICAL): Claims must match structure evidence. If structure shows different values than claimed, trust the structure evidence, not claims.
2. INDEPENDENT CRITERIA EVALUATION (CRITICAL): For criteriaMapping reason field - address ONLY the specific criterion requirement. Do not mention other criteria or other issues.
3. PRIORITY: Missing data > Formatting issues. Always prioritize data completeness over format correctness.
4. Structure validation: Use structure summary (statistics, counts, structure metadata) as PRIMARY evidence. Compare with task requirements.
5. Discrepancy detection: If task requires specific quantities/amounts but structure shows different values, classify as missing_data or incomplete_data, not success.
6. Format compatibility: Match delivered format to expected format (secondary priority after data completeness)
7. Filename appropriateness: Check if filename suggests correct content type
8. Document count: Verify number matches expectations
LIMITATIONS:
- Cannot validate: Content accuracy, data correctness, formatting details, or requirements requiring full content reading
- If structure summary unavailable, validate only metadata (format, filename, count, size)
SCORING GUIDELINES:
- Format matches + reasonable structure qualityScore: 0.8-1.0
- Format matches but structure issues qualityScore: 0.7-0.8
- Data complete + format matches + structure matches requirements qualityScore: 0.9-1.0
- Data complete but format/structure issues qualityScore: 0.7-0.9
- Missing/incomplete data (even if format correct) qualityScore: <0.7
- Claims don't match structure evidence → qualityScore: <0.6 (trust structure, not claims)
- Format mismatch qualityScore: <0.7
- Only suggest improvements for CLEAR metadata/structure issues
OUTPUT FORMAT (JSON only):
VALIDATION LOGIC:
- If structure shows fewer quantities/amounts than required gapType: missing_data or incomplete_data
- If structure shows wrong organization but correct quantity gapType: wrong_structure
- If structure matches requirements but format wrong gapType: wrong_format
- If claims say "X delivered" but structure shows "Y" (Y < X) overallSuccess: false, gapType: missing_data
- Always trust structure statistics over any claims or descriptions
IMPROVEMENT SUGGESTIONS PRIORITY (CRITICAL):
- Order by CRITERIA PRIORITY first, then gapType priority: missing_data > incomplete_data > wrong_structure > wrong_format
- [0] MUST address the HIGHEST PRIORITY unmet criterion (check criteriaMapping for which criteria are unmet)
- If multiple criteria are unmet, prioritize by: data completeness > structure > format
- gapType indicates the PRIMARY issue, but improvement suggestions must prioritize based on unmet criteria order
=== OUTPUT FORMAT (JSON TEMPLATE) ===
{{
"overallSuccess": false,
"qualityScore": 0.0,
"dataTypeMatch": false,
"formatMatch": false,
"documentCount": {len(documents)},
"successCriteriaMet": {criteriaMetExample},
"criteriaMapping": [
{{
"index": 0,
"criterion": "exact_criterion_text_from_data_section",
"met": false,
"reason": "explanation_about_this_criterion_based_on_structure_evidence"
}}
],
"gapAnalysis": "Brief description of gaps based on metadata/structure only. If validation is limited, state this clearly.",
"gapType": "missing_data" | "wrong_structure" | "wrong_format" | "incomplete_data" | "no_gap",
"structureComparison": {{
"required": {{}},
"found": {{}},
"gap": {{}}
}},
"improvementSuggestions": [],
"validationDetails": [
{{
@ -482,6 +526,15 @@ OUTPUT FORMAT (JSON only):
]
}}
OUTPUT FORMAT NOTES:
- criteriaMapping reason: Address ONLY the specific criterion requirement.
- improvementSuggestions: [0] = highest priority unmet criterion from criteriaMapping. Order: unmet criteria by index first (data completeness > structure > format), then by gapType priority.
=== DATA ===
SUCCESS CRITERIA TO VALIDATE in criteriaMapping array:
{criteriaDisplay}
DELIVERED DOCUMENTS ({len(documents)} items):
"""
@ -522,7 +575,6 @@ DELIVERED DOCUMENTS ({len(documents)} items):
# Proactively fix Python-style booleans (False/True -> false/true) BEFORE parsing
# This handles booleans in any context: standalone, in lists, in dicts, etc.
import re
# Use word boundaries but also handle cases where booleans are in brackets/arrays
# Replace False/True regardless of context (word boundary handles string matching correctly)
normalizedJson = re.sub(r'\bFalse\b', 'false', extractedJson)
@ -544,8 +596,10 @@ DELIVERED DOCUMENTS ({len(documents)} items):
quality = aiResult.get("qualityScore")
details = aiResult.get("validationDetails")
gap = aiResult.get("gapAnalysis", "")
criteria = aiResult.get("successCriteriaMet")
improvements = aiResult.get("improvementSuggestions", [])
gap_type = aiResult.get("gapType", "")
structure_comp = aiResult.get("structureComparison", {})
criteria_mapping = aiResult.get("criteriaMapping", [])
# Normalize while keeping failures explicit
normalized = {
@ -553,10 +607,12 @@ DELIVERED DOCUMENTS ({len(documents)} items):
"qualityScore": float(quality) if isinstance(quality, (int, float)) else None,
"documentCount": len(documentSummaries),
"gapAnalysis": gap if gap else "",
"gapType": gap_type if gap_type else "",
"structureComparison": structure_comp if structure_comp else {},
"criteriaMapping": criteria_mapping if isinstance(criteria_mapping, list) else [],
"validationDetails": details if isinstance(details, list) else [{
"documentName": "AI Validation",
"gapAnalysis": gap,
"successCriteriaMet": criteria if isinstance(criteria, list) else []
"gapAnalysis": gap
}],
"improvementSuggestions": improvements,
"schemaCompliant": True,
@ -585,7 +641,7 @@ DELIVERED DOCUMENTS ({len(documents)} items):
"dataTypeMatch": False,
"formatMatch": False,
"documentCount": 0,
"successCriteriaMet": [],
"criteriaMapping": [],
"gapAnalysis": errorMessage,
"improvementSuggestions": [],
"validationDetails": [],

View file

@ -133,8 +133,10 @@ class DynamicMode(BaseMode):
# Pass ALL documents to validator - validator decides what to validate (generic approach)
# Pass taskStep so validator can use task.objective and format fields
# Pass action name so validator knows which action created the documents
# Pass action parameters so validator can verify parameter-specific requirements
actionName = selection.get('action', 'unknown')
validationResult = await self.contentValidator.validateContent(result.documents, self.workflowIntent, taskStep, actionName)
actionParameters = selection.get('parameters', {})
validationResult = await self.contentValidator.validateContent(result.documents, self.workflowIntent, taskStep, actionName, actionParameters)
observation.contentValidation = validationResult
quality_score = validationResult.get('qualityScore', 0.0)
if quality_score is None:
@ -807,9 +809,9 @@ class DynamicMode(BaseMode):
'documentsCount': observation.documentsCount,
'previews': [p.model_dump(exclude_none=True) if hasattr(p, 'model_dump') else p.dict() for p in observation.previews] if observation.previews else [],
'notes': observation.notes,
'contentValidation': observation.contentValidation if observation.contentValidation else {},
'contentAnalysis': observation.contentAnalysis if observation.contentAnalysis else {}
}
# Note: contentValidation is shown separately in CONTENT VALIDATION section, not duplicated here
reviewContext = ReviewContext(
taskStep=context.taskStep,
taskActions=[],
@ -822,21 +824,36 @@ class DynamicMode(BaseMode):
baseReviewContent = extractReviewContent(reviewContext)
placeholders = {"REVIEW_CONTENT": baseReviewContent}
# NEW: Add content validation to review content
enhancedReviewContent = placeholders.get("REVIEW_CONTENT", "")
# NEW: Add content validation to review content - extract separately for prominence
baseReviewContent = placeholders.get("REVIEW_CONTENT", "")
# Add observation title if there's content
if baseReviewContent.strip():
baseReviewContent = f"=== OBSERVATION ===\n{baseReviewContent}"
contentValidationSection = ""
if observation.contentValidation:
validation = observation.contentValidation
enhancedReviewContent += f"\n\nCONTENT VALIDATION:\n"
enhancedReviewContent += f"Overall Success: {validation.get('overallSuccess', False)}\n"
contentValidationSection += f"\n=== CONTENT VALIDATION ===\n"
gap_type = validation.get('gapType', '')
if gap_type:
contentValidationSection += f"Gap Type: {gap_type}\n"
contentValidationSection += f"Overall Success: {validation.get('overallSuccess', False)}\n"
quality_score = validation.get('qualityScore', 0.0)
if quality_score is None:
quality_score = 0.0
enhancedReviewContent += f"Quality Score: {quality_score:.2f}\n"
contentValidationSection += f"Quality Score: {quality_score:.2f}\n"
gap_analysis = validation.get('gapAnalysis', '')
if gap_analysis:
enhancedReviewContent += f"Gap Analysis: {gap_analysis}\n"
contentValidationSection += f"Gap Analysis: {gap_analysis}\n"
structure_comparison = validation.get('structureComparison', {})
if structure_comparison:
contentValidationSection += f"Structure Comparison: {json.dumps(structure_comparison, indent=2, ensure_ascii=False)}\n"
if validation.get('improvementSuggestions'):
enhancedReviewContent += f"Improvement Suggestions: {', '.join(validation['improvementSuggestions'])}\n"
suggestions = validation['improvementSuggestions']
contentValidationSection += f"Next Actions (in sequence):\n"
for i, suggestion in enumerate(suggestions):
contentValidationSection += f" [{i}] {suggestion}\n"
enhancedReviewContent = baseReviewContent + contentValidationSection
# NEW: Add content analysis to review content
if observation.contentAnalysis:
@ -854,9 +871,41 @@ class DynamicMode(BaseMode):
enhancedReviewContent += f"Partial Achievements: {len(progressState['partialAchievements'])}\n"
enhancedReviewContent += f"Failed Attempts: {len(progressState['failedAttempts'])}\n"
enhancedReviewContent += f"Current Phase: {progressState['currentPhase']}\n"
if progressState['nextActionsSuggested']:
# Use content validation priorities if available, otherwise fall back to progress tracker suggestions
if observation.contentValidation and observation.contentValidation.get('improvementSuggestions'):
# Content validation already shown above, no need to repeat
pass
elif progressState['nextActionsSuggested']:
enhancedReviewContent += f"Next Action Suggestions: {', '.join(progressState['nextActionsSuggested'])}\n"
# NEW: Add action history to review content
if hasattr(context, 'previousReviewResult') and context.previousReviewResult:
actionHistory = []
for i, prevDecision in enumerate(context.previousReviewResult, 1):
if prevDecision and hasattr(prevDecision, 'nextAction') and prevDecision.nextAction:
action = prevDecision.nextAction
params = getattr(prevDecision, 'nextActionParameters', {}) or {}
# Filter out documentList for clarity
relevantParams = {k: v for k, v in params.items() if k not in ['documentList', 'connections']}
paramsStr = json.dumps(relevantParams, ensure_ascii=False) if relevantParams else "{}"
quality = getattr(prevDecision, 'qualityScore', None)
qualityStr = f" (quality: {quality:.2f})" if quality is not None else ""
actionHistory.append(f"Round {i}: {action} {paramsStr}{qualityStr}")
if actionHistory:
enhancedReviewContent += f"\nACTION HISTORY:\n"
enhancedReviewContent += "\n".join(f"- {entry}" for entry in actionHistory)
# Detect repeated actions
actionCounts = {}
for entry in actionHistory:
# Extract action name (before first space or {)
actionName = entry.split()[1] if len(entry.split()) > 1 else "unknown"
actionCounts[actionName] = actionCounts.get(actionName, 0) + 1
repeatedActions = [action for action, count in actionCounts.items() if count >= 2]
if repeatedActions:
enhancedReviewContent += f"\nWARNING: Repeated actions detected: {', '.join(repeatedActions)}. Consider a fundamentally different approach.\n"
# Update placeholders with enhanced review content
placeholders["REVIEW_CONTENT"] = enhancedReviewContent

View file

@ -323,21 +323,22 @@ def generateDynamicRefinementPrompt(services, context: Any, reviewContent: str)
ACTIONS: {{KEY:AVAILABLE_METHODS}}
DOCUMENTS: {{KEY:AVAILABLE_DOCUMENTS_INDEX}}
=== OBSERVATION ===
{{KEY:REVIEW_CONTENT}}
=== NEXT ACTIONS ===
Follow the improvement suggestions from CONTENT VALIDATION in priority order. Each suggestion indicates what action to take next.
=== OUTPUT FORMAT ===
{{
"status": "continue",
"reason": "Brief reason",
"nextAction": "ai.convert",
"reason": "Brief reason explaining why continuing",
"nextAction": "Selected_action_from_ACTIONS",
"nextActionParameters": {{
"documentList": ["docItem:..."],
"inputFormat": "json",
"outputFormat": "csv",
"columnsPerRow": 10
"documentList": ["docItem:reference_from_DOCUMENTS"],
"parameter1": "value1",
"parameter2": "value2"
}},
"nextActionObjective": "Convert JSON to CSV with 10 columns per row"
"nextActionObjective": "Clear description of what this action will achieve based on improvement suggestions"
}}
=== RULES ===
@ -345,9 +346,10 @@ DOCUMENTS: {{KEY:AVAILABLE_DOCUMENTS_INDEX}}
- nextAction: SPECIFIC action from AVAILABLE_METHODS (do not invent)
- nextActionParameters: concrete parameters (check AVAILABLE_METHODS for valid names)
- documentList: ONLY exact references from AVAILABLE_DOCUMENTS_INDEX (do not invent)
- nextActionObjective: describe what this action will achieve
- nextActionObjective: describe what this action will achieve based on the FIRST improvement suggestion from CONTENT VALIDATION
- Do NOT repeat failed actions - suggest DIFFERENT approach
- Use improvement suggestions from content validation
- If ACTION HISTORY shows repeated actions, suggest a fundamentally different approach
- nextActionObjective must directly address the highest priority improvement suggestion from CONTENT VALIDATION
"""