From 3ccd284a587af03db896fc619aa86d4b8f2b61fa Mon Sep 17 00:00:00 2001 From: ValueOn AG Date: Sun, 30 Nov 2025 17:35:19 +0100 Subject: [PATCH] fixed json merging chain for cut-off mapping with full-dynamc json merger engine for any json structure and complexity --- modules/services/serviceAi/mainServiceAi.py | 533 +-------- .../serviceAi/subJsonResponseHandling.py | 1022 +++++++++++++++++ .../renderers/rendererDocx.py | 3 - .../renderers/rendererXlsx.py | 11 +- .../services/serviceUtils/mainServiceUtils.py | 6 - modules/shared/debugLogger.py | 1 - modules/shared/jsonUtils.py | 41 - modules/workflows/methods/methodAi.py | 30 +- .../processing/adaptive/contentValidator.py | 102 +- .../workflows/processing/modes/modeDynamic.py | 69 +- .../shared/promptGenerationActionsDynamic.py | 22 +- 11 files changed, 1263 insertions(+), 577 deletions(-) create mode 100644 modules/services/serviceAi/subJsonResponseHandling.py diff --git a/modules/services/serviceAi/mainServiceAi.py b/modules/services/serviceAi/mainServiceAi.py index 98f9d0ed..592099f3 100644 --- a/modules/services/serviceAi/mainServiceAi.py +++ b/modules/services/serviceAi/mainServiceAi.py @@ -16,6 +16,7 @@ from modules.shared.jsonUtils import ( buildContinuationContext, parseJsonWithModel ) +from modules.services.serviceAi.subJsonResponseHandling import JsonResponseHandler logger = logging.getLogger(__name__) @@ -304,7 +305,39 @@ Respond with ONLY a JSON object in this exact format: # Extract sections from response (handles both valid and broken JSON) # Only for document generation (JSON responses) - extractedSections, wasJsonComplete, parsedResult = self._extractSectionsFromResponse(result, iteration, debugPrefix) + # CRITICAL: Pass allSections to enable fragment detection and merging + extractedSections, wasJsonComplete, parsedResult = self._extractSectionsFromResponse( + result, iteration, debugPrefix, allSections + ) + + # CRITICAL: Handle JSON fragments (continuation content) + # Fragment merging happens 
inside _extractSectionsFromResponse and updates allSections in place + # If no sections extracted but fragment was merged, allSections was updated in place + # Check if fragment was merged by checking if allSections was modified + if not extractedSections and allSections: + # Fragment was detected and merged directly into allSections (side effect in _extractSectionsFromResponse) + logger.info(f"Iteration {iteration}: JSON fragment detected and merged, continuing") + # Don't break - fragment was merged, continue to get more content if needed + # Check if we should continue based on JSON completeness + shouldContinue = self._shouldContinueGeneration( + allSections, + iteration, + wasJsonComplete, + result + ) + if shouldContinue: + if iterationOperationId: + self.services.chat.progressLogUpdate(iterationOperationId, 0.8, "Fragment merged, continuing") + self.services.chat.progressLogFinish(iterationOperationId, True) + continue + else: + # Done - fragment was merged and JSON is complete + if iterationOperationId: + self.services.chat.progressLogFinish(iterationOperationId, True) + if operationId: + self.services.chat.progressLogUpdate(operationId, 0.95, f"Generation complete ({iteration} iterations, fragment merged)") + logger.info(f"Generation complete after {iteration} iterations: fragment merged") + break # Extract document metadata from first iteration if available if iteration == 1 and parsedResult and not documentMetadata: @@ -321,14 +354,15 @@ Respond with ONLY a JSON object in this exact format: if not wasJsonComplete: logger.warning(f"Iteration {iteration}: No sections extracted from broken JSON, continuing for another attempt") continue - # If JSON was complete but no sections extracted - this is an error, stop + # If JSON was complete but no sections extracted - check if it was a fragment + # Fragments are handled above, so if we get here and it's complete, it's an error logger.warning(f"Iteration {iteration}: No sections extracted from complete JSON, 
stopping") break # Merge new sections with existing sections intelligently # This handles the STANDARD CASE: broken JSON iterations must be merged together # The break can occur anywhere - in any section, at any depth - allSections = self._mergeSectionsIntelligently(allSections, extractedSections, iteration) + allSections = JsonResponseHandler.mergeSectionsIntelligently(allSections, extractedSections, iteration) # Check if we should continue (completion detection) # Simple logic: JSON completeness determines continuation @@ -370,484 +404,24 @@ Respond with ONLY a JSON object in this exact format: return final_result - def _mergeSectionsIntelligently( - self, - existingSections: List[Dict[str, Any]], - newSections: List[Dict[str, Any]], - iteration: int - ) -> List[Dict[str, Any]]: - """ - Intelligently merge sections from multiple iterations. - - This is a GENERIC merging strategy that handles broken JSON iterations. - The break can occur anywhere - in any section, at any depth. - - Merging strategies (in order of priority): - 1. Same Section ID: Merge sections with identical IDs - 2. Same Content-Type + Position: If last section is incomplete and new section continues it - 3. Same Order: Merge sections with same order value - 4. 
Structural Analysis: Detect continuation based on content structure - - Args: - existingSections: Sections accumulated from previous iterations - newSections: Sections extracted from current iteration - iteration: Current iteration number - - Returns: - Merged list of sections - """ - if not newSections: - return existingSections - - if not existingSections: - return newSections - - mergedSections = existingSections.copy() - - for newSection in newSections: - merged = False - - # Strategy 1: Same Section ID - merge directly - newSectionId = newSection.get("id") - if newSectionId: - for i, existingSection in enumerate(mergedSections): - if existingSection.get("id") == newSectionId: - # Merge sections with same ID - mergedSections[i] = self._mergeSectionContent(existingSection, newSection, iteration) - merged = True - logger.debug(f"Iteration {iteration}: Merged section by ID '{newSectionId}'") - break - - if merged: - continue - - # Strategy 2: Same Content-Type + Position (continuation detection) - # Check if last section is incomplete and new section continues it - if mergedSections: - lastSection = mergedSections[-1] - lastContentType = lastSection.get("content_type") - newContentType = newSection.get("content_type") - - if lastContentType == newContentType: - # Same content type - check if last section is incomplete - if self._isSectionIncomplete(lastSection): - # Last section is incomplete, merge with new section - mergedSections[-1] = self._mergeSectionContent(lastSection, newSection, iteration) - merged = True - logger.debug(f"Iteration {iteration}: Merged section by content-type continuation ({lastContentType})") - continue - - # Strategy 3: Same Order value - newOrder = newSection.get("order") - if newOrder is not None: - for i, existingSection in enumerate(mergedSections): - existingOrder = existingSection.get("order") - if existingOrder is not None and existingOrder == newOrder: - # Merge sections with same order - mergedSections[i] = 
self._mergeSectionContent(existingSection, newSection, iteration) - merged = True - logger.debug(f"Iteration {iteration}: Merged section by order {newOrder}") - break - - if merged: - continue - - # Strategy 4: Structural Analysis - detect continuation - # For code_block and table: if last section matches new section type, merge them - if mergedSections: - lastSection = mergedSections[-1] - lastContentType = lastSection.get("content_type") - newContentType = newSection.get("content_type") - - # Both are code blocks - merge them - if lastContentType == "code_block" and newContentType == "code_block": - mergedSections[-1] = self._mergeSectionContent(lastSection, newSection, iteration) - merged = True - logger.debug(f"Iteration {iteration}: Merged code_block sections by structural analysis") - continue - - # Both are tables - merge them (common case for broken JSON iterations) - if lastContentType == "table" and newContentType == "table": - mergedSections[-1] = self._mergeSectionContent(lastSection, newSection, iteration) - merged = True - logger.debug(f"Iteration {iteration}: Merged table sections by structural analysis") - continue - - # No merge strategy matched - add as new section - if not merged: - mergedSections.append(newSection) - logger.debug(f"Iteration {iteration}: Added new section '{newSection.get('id', 'no-id')}' ({newSection.get('content_type', 'unknown')})") - - return mergedSections - - def _isSectionIncomplete(self, section: Dict[str, Any]) -> bool: - """ - Check if a section is incomplete (broken at the end). 
- - This detects incomplete sections based on content analysis: - - Code blocks: ends mid-line, ends with comma, ends with incomplete structure - - Text sections: ends mid-sentence, ends with incomplete structure - - Other types: check for incomplete elements - """ - contentType = section.get("content_type", "") - elements = section.get("elements", []) - - if not elements: - return False - - # Handle list of elements - if isinstance(elements, list) and len(elements) > 0: - lastElement = elements[-1] - else: - lastElement = elements - - if not isinstance(lastElement, dict): - return False - - # Check code_block for incomplete code - if contentType == "code_block": - code = lastElement.get("code", "") - if code: - # Check if code ends incompletely: - # - Ends with comma (incomplete CSV line) - # - Ends with number but no newline (incomplete line) - # - Ends mid-token (e.g., "23431,23" - incomplete number) - codeStripped = code.rstrip() - if codeStripped: - # Check for incomplete patterns - if codeStripped.endswith(',') or (',' in codeStripped and not codeStripped.endswith('\n')): - # Ends with comma or has comma but no final newline - likely incomplete - return True - # Check if last line is incomplete (doesn't end with newline and has partial content) - if not code.endswith('\n') and codeStripped: - # No final newline - might be incomplete - # More sophisticated: check if last number is complete - lastLine = codeStripped.split('\n')[-1] - if lastLine and ',' in lastLine: - # Has commas but might be incomplete - parts = lastLine.split(',') - if parts and len(parts[-1]) < 5: # Last part is very short - might be incomplete - return True - - # Check table for incomplete rows - if contentType == "table": - rows = lastElement.get("rows", []) - if rows: - # Check if last row is incomplete (ends with incomplete data) - lastRow = rows[-1] if isinstance(rows, list) else [] - if isinstance(lastRow, list) and lastRow: - # Check if last row ends with incomplete data (e.g., 
incomplete string) - lastCell = lastRow[-1] if lastRow else "" - if isinstance(lastCell, str): - # If last cell is incomplete (ends with quote or is very short), section might be incomplete - if lastCell.endswith('"') or (len(lastCell) < 3 and lastCell): - return True - # Also check if last row doesn't have expected number of columns (if headers exist) - headers = lastElement.get("headers", []) - if headers and isinstance(headers, list): - expectedCols = len(headers) - if len(lastRow) < expectedCols: - return True - - # Check paragraph/text for incomplete sentences - if contentType in ["paragraph", "heading"]: - text = lastElement.get("text", "") - if text: - # Simple heuristic: if doesn't end with sentence-ending punctuation - textStripped = text.rstrip() - if textStripped and not textStripped[-1] in '.!?': - # Might be incomplete, but this is less reliable - # Only mark as incomplete if very short (likely cut off) - if len(textStripped) < 20: - return True - - # Check lists for incomplete items - if contentType in ["bullet_list", "numbered_list"]: - items = lastElement.get("items", []) - if items and isinstance(items, list): - # Check if last item is incomplete (very short or ends with incomplete string) - lastItem = items[-1] if items else None - if isinstance(lastItem, str) and len(lastItem) < 3: - return True - # Check if items array seems incomplete (e.g., expected count not reached) - # This is harder to detect without context, so we rely on other heuristics - - # Check image for incomplete base64 data - if contentType == "image": - imageData = lastElement.get("base64Data", "") - if imageData: - # Base64 strings should end with padding ('=' or '==') - # If it doesn't, it might be incomplete - stripped = imageData.rstrip() - if stripped and not stripped.endswith(('=', '==')): - # Check if it's a valid base64 character sequence that was cut off - # Base64 uses A-Z, a-z, 0-9, +, /, and = for padding - if len(stripped) > 0 and stripped[-1] not in 
'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=': - return True - # If length is not a multiple of 4 (base64 requirement), it might be incomplete - if len(stripped) % 4 != 0: - return True - - # GENERIC CHECK: Look for incomplete structures in any element - # Check if element has arrays/lists that might be incomplete - for key, value in lastElement.items(): - if isinstance(value, list) and len(value) > 0: - # Check last item in list - lastItem = value[-1] - if isinstance(lastItem, str): - # If last string item is very short, might be incomplete - if len(lastItem) < 3: - return True - elif isinstance(lastItem, dict): - # If last dict item has very few keys, might be incomplete - if len(lastItem) < 2: - return True - elif isinstance(value, str): - # Check if string ends abruptly (no punctuation, very short) - if len(value) > 0 and len(value) < 10 and not value[-1] in '.!?\n': - return True - - return False - - def _mergeSectionContent( - self, - existingSection: Dict[str, Any], - newSection: Dict[str, Any], - iteration: int - ) -> Dict[str, Any]: - """ - Merge content from two sections. 
- - Handles different content types: - - code_block: Append code, handle overlaps, merge incomplete lines - - paragraph/heading: Append text - - table: Merge rows - - list: Merge items - - Other: Merge elements - """ - contentType = existingSection.get("content_type", "") - existingElements = existingSection.get("elements", []) - newElements = newSection.get("elements", []) - - if not newElements: - return existingSection - - # Handle list of elements - if isinstance(existingElements, list): - existingElem = existingElements[-1] if existingElements else {} - else: - existingElem = existingElements - - if isinstance(newElements, list): - newElem = newElements[0] if newElements else {} - else: - newElem = newElements - - if not isinstance(existingElem, dict) or not isinstance(newElem, dict): - return existingSection - - # Merge based on content type - if contentType == "code_block": - existingCode = existingElem.get("code", "") - newCode = newElem.get("code", "") - - if existingCode and newCode: - mergedCode = self._mergeCodeBlocks(existingCode, newCode, iteration) - existingElem["code"] = mergedCode - # Preserve language from existing or new - if "language" not in existingElem and "language" in newElem: - existingElem["language"] = newElem["language"] - - elif contentType in ["paragraph", "heading"]: - existingText = existingElem.get("text", "") - newText = newElem.get("text", "") - - if existingText and newText: - # Append text with space if needed - if existingText.rstrip() and not existingText.rstrip()[-1] in '.!?\n': - mergedText = existingText.rstrip() + " " + newText.lstrip() - else: - mergedText = existingText.rstrip() + "\n" + newText.lstrip() - existingElem["text"] = mergedText - - elif contentType == "table": - # Merge table rows with overlap detection - existingRows = existingElem.get("rows", []) - newRows = newElem.get("rows", []) - if existingRows and newRows: - # CRITICAL: Detect and remove overlaps before merging - # Check if last existing row matches 
first new row (exact overlap) - if len(existingRows) > 0 and len(newRows) > 0: - lastExistingRow = existingRows[-1] - firstNewRow = newRows[0] - # Compare rows (handle both list and tuple formats) - if isinstance(lastExistingRow, (list, tuple)) and isinstance(firstNewRow, (list, tuple)): - if list(lastExistingRow) == list(firstNewRow): - # Exact duplicate - remove first new row - newRows = newRows[1:] - logger.debug(f"Iteration {iteration}: Removed duplicate table row (exact match)") - - # Combine rows from both sections (after removing overlaps) - existingElem["rows"] = existingRows + newRows - logger.debug(f"Iteration {iteration}: Merged table rows - existing: {len(existingRows)}, new: {len(newRows)}, total: {len(existingRows) + len(newRows)}") - elif newRows: - # If existing has no rows but new does, use new rows - existingElem["rows"] = newRows - # Preserve headers from existing (or use new if existing has none) - if not existingElem.get("headers") and newElem.get("headers"): - existingElem["headers"] = newElem["headers"] - # Preserve caption from existing (or use new if existing has none) - if not existingElem.get("caption") and newElem.get("caption"): - existingElem["caption"] = newElem["caption"] - - elif contentType in ["bullet_list", "numbered_list"]: - # Merge list items - existingItems = existingElem.get("items", []) - newItems = newElem.get("items", []) - if existingItems and newItems: - existingElem["items"] = existingItems + newItems - - elif contentType == "image": - # Images are typically complete - if new image is provided, replace existing - # But check if existing image data is incomplete (e.g., base64 string cut off) - existingImageData = existingElem.get("base64Data", "") - newImageData = newElem.get("base64Data", "") - if existingImageData and newImageData: - # If existing image data doesn't end with valid base64 padding, it might be incomplete - # Base64 padding is '=' or '==' at the end - if not existingImageData.rstrip().endswith(('=', 
'==')): - # Existing image might be incomplete - merge by appending new data - # This handles cases where base64 string was cut off - existingElem["base64Data"] = existingImageData + newImageData - logger.debug(f"Iteration {iteration}: Merged incomplete image base64 data") - else: - # Existing image is complete - replace with new (or keep existing if new is empty) - if newImageData: - existingElem["base64Data"] = newImageData - elif newImageData: - existingElem["base64Data"] = newImageData - # Preserve other image metadata - if not existingElem.get("altText") and newElem.get("altText"): - existingElem["altText"] = newElem["altText"] - if not existingElem.get("caption") and newElem.get("caption"): - existingElem["caption"] = newElem["caption"] - - else: - # GENERIC FALLBACK: Handle any other content types or unknown structures - # Try to merge common array/list fields generically - for key in ["items", "rows", "columns", "cells", "elements", "data", "content"]: - if key in existingElem and key in newElem: - existingValue = existingElem[key] - newValue = newElem[key] - if isinstance(existingValue, list) and isinstance(newValue, list): - # Merge lists by concatenation - existingElem[key] = existingValue + newValue - logger.debug(f"Iteration {iteration}: Merged generic list field '{key}' - existing: {len(existingValue)}, new: {len(newValue)}") - break - - # If no common list fields found, try to merge all fields from newElem into existingElem - # This handles cases where objects have different structures - for key, value in newElem.items(): - if key not in existingElem: - # New field - add it - existingElem[key] = value - elif isinstance(existingElem[key], list) and isinstance(value, list): - # Both are lists - merge them - existingElem[key] = existingElem[key] + value - elif isinstance(existingElem[key], dict) and isinstance(value, dict): - # Both are dicts - recursively merge (shallow merge) - existingElem[key].update(value) - elif isinstance(existingElem[key], str) 
and isinstance(value, str): - # Both are strings - append new to existing - existingElem[key] = existingElem[key] + "\n" + value - - # Update section with merged content - mergedSection = existingSection.copy() - if isinstance(existingElements, list): - # Update the last element in the list with merged content - if existingElements: - existingElements[-1] = existingElem - mergedSection["elements"] = existingElements - else: - mergedSection["elements"] = existingElem - - # Preserve metadata from new section if missing in existing - if "order" not in mergedSection and "order" in newSection: - mergedSection["order"] = newSection["order"] - - return mergedSection - - def _mergeCodeBlocks(self, existingCode: str, newCode: str, iteration: int) -> str: - """ - Merge two code blocks intelligently, handling overlaps and incomplete lines. - """ - if not existingCode: - return newCode - if not newCode: - return existingCode - - existingLines = existingCode.rstrip().split('\n') - newLines = newCode.strip().split('\n') - - if not existingLines or not newLines: - return existingCode + "\n" + newCode - - lastExistingLine = existingLines[-1].strip() - firstNewLine = newLines[0].strip() - - # Strategy 1: Exact overlap - remove duplicate line - if lastExistingLine == firstNewLine: - newLines = newLines[1:] - logger.debug(f"Iteration {iteration}: Removed exact duplicate line in code merge") - - # Strategy 2: Incomplete line merge - # If last existing line ends with comma or is incomplete, merge with first new line - elif lastExistingLine.endswith(',') or (',' in lastExistingLine and len(lastExistingLine.split(',')[-1]) < 5): - # Last line is incomplete - merge with first new line - # Remove trailing comma from existing line - mergedLine = lastExistingLine.rstrip(',') + ',' + firstNewLine.lstrip() - existingLines[-1] = mergedLine - newLines = newLines[1:] - logger.debug(f"Iteration {iteration}: Merged incomplete line with continuation") - - # Strategy 3: Partial overlap detection - # 
Check if first new line starts with the end of last existing line - elif ',' in lastExistingLine and ',' in firstNewLine: - lastExistingParts = lastExistingLine.split(',') - firstNewParts = firstNewLine.split(',') - - # Check for overlap: if last part of existing matches first part of new - if lastExistingParts and firstNewParts: - lastExistingPart = lastExistingParts[-1].strip() - firstNewPart = firstNewParts[0].strip() - - # If they match, there's overlap - if lastExistingPart == firstNewPart and len(lastExistingParts) > 1: - # Remove overlapping part from new line - newLines[0] = ','.join(firstNewParts[1:]) - logger.debug(f"Iteration {iteration}: Removed partial overlap in code merge") - - # Reconstruct merged code - mergedCode = '\n'.join(existingLines) - if newLines: - if mergedCode and not mergedCode.endswith('\n'): - mergedCode += '\n' - mergedCode += '\n'.join(newLines) - - return mergedCode + # JSON merging logic moved to subJsonResponseHandling.py def _extractSectionsFromResponse( self, result: str, iteration: int, - debugPrefix: str + debugPrefix: str, + allSections: List[Dict[str, Any]] = None ) -> Tuple[List[Dict[str, Any]], bool, Optional[Dict[str, Any]]]: """ Extract sections from AI response, handling both valid and broken JSON. Uses repair mechanism for broken JSON. + Handles JSON fragments (continuation content) that need to be merged into existing sections. Determines completion based on JSON structure (complete JSON = complete, broken/incomplete = incomplete). Returns (sections, wasJsonComplete, parsedResult) """ + if allSections is None: + allSections = [] # First, try to parse as valid JSON # CRITICAL: JSON completeness is determined by parsing, NOT by last character check! 
@@ -862,6 +436,20 @@ Respond with ONLY a JSON object in this exact format: # Extract sections from parsed JSON sections = extractSectionsFromDocument(parsed_result) + # CRITICAL: If no sections extracted but we have existing sections, check if it's a fragment + if not sections and allSections: + fragment = JsonResponseHandler.detectAndParseJsonFragment(result, allSections) + if fragment: + logger.info(f"Iteration {iteration}: Detected JSON fragment ({fragment.get('fragment_type')}), merging into existing sections") + # Merge fragment into existing sections + merged_sections = JsonResponseHandler.mergeFragmentIntoSection(fragment, allSections, iteration) + # Update allSections in place (this is a side effect, but necessary for continuation) + # Note: This modifies the caller's allSections list + allSections[:] = merged_sections + # Return empty list to indicate we merged directly (not new sections) + # But mark as incomplete so loop continues if needed + return [], False, parsed_result + # JSON parsed successfully = complete logger.info(f"Iteration {iteration}: JSON parsed successfully - marking as complete") return sections, True, parsed_result @@ -885,7 +473,7 @@ Respond with ONLY a JSON object in this exact format: # Repair failed - but we should still continue to allow AI to retry logger.warning(f"Iteration {iteration}: All repair strategies failed, but continuing to allow retry") return [], False, None # Mark as incomplete so loop continues - + except Exception as e: logger.error(f"Iteration {iteration}: Unexpected error during parsing: {str(e)}") return [], False, None @@ -1413,8 +1001,3 @@ Respond with ONLY a JSON object in this exact format: self.services.chat.progressLogFinish(aiOperationId, False) raise - # DEPRECATED METHODS REMOVED: - # - callAiDocuments() - replaced by callAiContent() - # - callAiText() - replaced by callAiContent() - # All call sites have been updated to use callAiContent() - diff --git 
a/modules/services/serviceAi/subJsonResponseHandling.py b/modules/services/serviceAi/subJsonResponseHandling.py new file mode 100644 index 00000000..5a6ec965 --- /dev/null +++ b/modules/services/serviceAi/subJsonResponseHandling.py @@ -0,0 +1,1022 @@ +""" +JSON Response Handling Module + +Handles merging of JSON responses from multiple AI iterations, including: +- Section merging with intelligent overlap detection +- JSON fragment detection and merging +- Deep recursive structure merging +- Overlap detection for complex nested structures +""" +import json +import logging +from typing import Dict, Any, List, Optional, Tuple + +from modules.shared.jsonUtils import extractJsonString + +logger = logging.getLogger(__name__) + + +class JsonResponseHandler: + """Handles JSON response merging and fragment detection for iterative AI generation.""" + + @staticmethod + def mergeSectionsIntelligently( + existingSections: List[Dict[str, Any]], + newSections: List[Dict[str, Any]], + iteration: int + ) -> List[Dict[str, Any]]: + """ + Intelligently merge sections from multiple iterations. + + This is a GENERIC merging strategy that handles broken JSON iterations. + The break can occur anywhere - in any section, at any depth. + + Merging strategies (in order of priority): + 1. Same Section ID: Merge sections with identical IDs + 2. Same Content-Type + Position: If last section is incomplete and new section continues it + 3. Same Order: Merge sections with same order value + 4. 
Structural Analysis: Detect continuation based on content structure + + Args: + existingSections: Sections accumulated from previous iterations + newSections: Sections extracted from current iteration + iteration: Current iteration number + + Returns: + Merged list of sections + """ + if not newSections: + return existingSections + + if not existingSections: + return newSections + + mergedSections = existingSections.copy() + + for newSection in newSections: + merged = False + + # Strategy 1: Same Section ID - merge directly + newSectionId = newSection.get("id") + if newSectionId: + for i, existingSection in enumerate(mergedSections): + if existingSection.get("id") == newSectionId: + # Merge sections with same ID + mergedSections[i] = JsonResponseHandler.mergeSectionContent( + existingSection, newSection, iteration + ) + merged = True + logger.debug(f"Iteration {iteration}: Merged section by ID '{newSectionId}'") + break + + if merged: + continue + + # Strategy 2: Same Content-Type + Position (continuation detection) + # Check if last section is incomplete and new section continues it + if mergedSections: + lastSection = mergedSections[-1] + lastContentType = lastSection.get("content_type") + newContentType = newSection.get("content_type") + + if lastContentType == newContentType: + # Same content type - check if last section is incomplete + if JsonResponseHandler.isSectionIncomplete(lastSection): + # Last section is incomplete, merge with new section + mergedSections[-1] = JsonResponseHandler.mergeSectionContent( + lastSection, newSection, iteration + ) + merged = True + logger.debug(f"Iteration {iteration}: Merged section by content-type continuation ({lastContentType})") + continue + + # Strategy 3: Same Order value + newOrder = newSection.get("order") + if newOrder is not None: + for i, existingSection in enumerate(mergedSections): + existingOrder = existingSection.get("order") + if existingOrder is not None and existingOrder == newOrder: + # Merge sections with 
# NOTE(review): this chunk began mid-method. The visible tail of a section-
# merging routine (order-based merge + "Strategy 4" structural merge of
# trailing code_block/table sections, ending in `return mergedSections`) is
# preserved below as a comment because its signature/head lies outside this
# chunk and must not be guessed:
#
#   ... Strategy 3 (merge by same order):
#           mergedSections[i] = JsonResponseHandler.mergeSectionContent(
#               existingSection, newSection, iteration)
#           merged = True; break
#       if merged: continue
#       Strategy 4 (structural continuation): if the last merged section and
#       the new section are both "code_block" or both "table", merge them via
#       mergeSectionContent; otherwise append newSection.
#   return mergedSections

# Module logger. The enclosing module defines this earlier; re-deriving it
# here is harmless because logging.getLogger returns the same instance.
logger = logging.getLogger(__name__)


class JsonResponseHandler:
    """Merge helpers for multi-iteration AI JSON responses.

    All methods are static, pure helpers that stitch together partial /
    truncated JSON document structures produced across generation iterations:
    detecting incomplete sections, merging continuation fragments, and
    deduplicating overlapping rows / items / lines.

    NOTE(review): reconstructed from a whitespace-mangled patch; the class
    statement continues a definition begun earlier in the original module.
    """

    @staticmethod
    def isSectionIncomplete(section: Dict[str, Any]) -> bool:
        """Heuristically check whether a section was cut off at the end.

        Inspects the last element of the section by content type:
        - code_block: trailing comma / missing final newline / short last CSV field
        - table: suspicious last cell or a last row with fewer columns than headers
        - paragraph/heading: very short text without sentence-ending punctuation
        - bullet_list/numbered_list: very short last item
        - image: base64 payload without '=' padding or length not a multiple of 4
        plus a generic check over any list/str values of the last element.

        Returns True if the section looks truncated, False otherwise.
        """
        contentType = section.get("content_type", "")
        elements = section.get("elements", [])

        if not elements:
            return False

        # "elements" may be a list of element dicts or a single element dict.
        if isinstance(elements, list) and len(elements) > 0:
            lastElement = elements[-1]
        else:
            lastElement = elements

        if not isinstance(lastElement, dict):
            return False

        # Check code_block for incomplete code.
        if contentType == "code_block":
            code = lastElement.get("code", "")
            if code:
                codeStripped = code.rstrip()
                if codeStripped:
                    # NOTE(review): after rstrip(), endswith('\n') is always
                    # False, so any comma in the code marks it incomplete.
                    # Preserved as-is from the original — confirm intent.
                    if codeStripped.endswith(',') or (',' in codeStripped and not codeStripped.endswith('\n')):
                        return True
                    if not code.endswith('\n') and codeStripped:
                        # No final newline: check whether the last CSV-like
                        # field on the last line looks truncated.
                        lastLine = codeStripped.split('\n')[-1]
                        if lastLine and ',' in lastLine:
                            parts = lastLine.split(',')
                            if parts and len(parts[-1]) < 5:
                                return True

        # Check table for incomplete rows.
        if contentType == "table":
            rows = lastElement.get("rows", [])
            if rows:
                lastRow = rows[-1] if isinstance(rows, list) else []
                if isinstance(lastRow, list) and lastRow:
                    lastCell = lastRow[-1] if lastRow else ""
                    if isinstance(lastCell, str):
                        # Last cell ending with a quote or very short: likely cut off.
                        if lastCell.endswith('"') or (len(lastCell) < 3 and lastCell):
                            return True
                    # Fewer cells than headers means the row was truncated.
                    headers = lastElement.get("headers", [])
                    if headers and isinstance(headers, list):
                        expectedCols = len(headers)
                        if len(lastRow) < expectedCols:
                            return True

        # Check paragraph/text for incomplete sentences.
        if contentType in ["paragraph", "heading"]:
            text = lastElement.get("text", "")
            if text:
                textStripped = text.rstrip()
                if textStripped and not textStripped[-1] in '.!?':
                    # Unterminated punctuation alone is weak evidence; only
                    # flag very short text as truncated.
                    if len(textStripped) < 20:
                        return True

        # Check lists for incomplete items.
        if contentType in ["bullet_list", "numbered_list"]:
            items = lastElement.get("items", [])
            if items and isinstance(items, list):
                lastItem = items[-1] if items else None
                if isinstance(lastItem, str) and len(lastItem) < 3:
                    return True

        # Check image for incomplete base64 data.
        if contentType == "image":
            imageData = lastElement.get("base64Data", "")
            if imageData:
                stripped = imageData.rstrip()
                # NOTE(review): valid base64 of an input length divisible by 3
                # legitimately has NO '=' padding (RFC 4648), so this is only
                # a heuristic, not a hard rule.
                if stripped and not stripped.endswith(('=', '==')):
                    # A trailing character outside the base64 alphabet
                    # suggests the string was cut mid-token.
                    if len(stripped) > 0 and stripped[-1] not in 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=':
                        return True
                    # Base64 length must be a multiple of 4.
                    # NOTE(review): nesting level under the no-padding branch
                    # was ambiguous in the mangled source — confirm.
                    if len(stripped) % 4 != 0:
                        return True

        # GENERIC CHECK: look for incomplete structures in any element value.
        for key, value in lastElement.items():
            if isinstance(value, list) and len(value) > 0:
                lastItem = value[-1]
                if isinstance(lastItem, str):
                    if len(lastItem) < 3:
                        return True
                elif isinstance(lastItem, dict):
                    if len(lastItem) < 2:
                        return True
            elif isinstance(value, str):
                if len(value) > 0 and len(value) < 10 and not value[-1] in '.!?\n':
                    return True

        return False

    @staticmethod
    def mergeSectionContent(
        existingSection: Dict[str, Any],
        newSection: Dict[str, Any],
        iteration: int
    ) -> Dict[str, Any]:
        """Merge the content of ``newSection`` into ``existingSection``.

        Dispatches on content_type:
        - code_block: append code via mergeCodeBlocks (overlap-aware)
        - paragraph/heading: append text with space/newline joining
        - table: merge rows via mergeRowsWithOverlap; preserve headers/caption
        - bullet_list/numbered_list: merge items via mergeItemsWithOverlap
        - image: append base64 data if the existing payload looks truncated
        - anything else: deep recursive merge via mergeDeepStructures

        Returns a shallow copy of ``existingSection`` with merged elements.
        Mutates the element dicts of ``existingSection`` in place.
        """
        contentType = existingSection.get("content_type", "")
        existingElements = existingSection.get("elements", [])
        newElements = newSection.get("elements", [])

        if not newElements:
            return existingSection

        # Normalize: merge the LAST existing element with the FIRST new one.
        if isinstance(existingElements, list):
            existingElem = existingElements[-1] if existingElements else {}
        else:
            existingElem = existingElements

        if isinstance(newElements, list):
            newElem = newElements[0] if newElements else {}
        else:
            newElem = newElements

        if not isinstance(existingElem, dict) or not isinstance(newElem, dict):
            return existingSection

        if contentType == "code_block":
            existingCode = existingElem.get("code", "")
            newCode = newElem.get("code", "")
            if existingCode and newCode:
                existingElem["code"] = JsonResponseHandler.mergeCodeBlocks(
                    existingCode, newCode, iteration
                )
            # Preserve language from existing, falling back to new.
            if "language" not in existingElem and "language" in newElem:
                existingElem["language"] = newElem["language"]

        elif contentType in ["paragraph", "heading"]:
            existingText = existingElem.get("text", "")
            newText = newElem.get("text", "")
            if existingText and newText:
                # Mid-sentence break -> join with a space; otherwise newline.
                if existingText.rstrip() and not existingText.rstrip()[-1] in '.!?\n':
                    mergedText = existingText.rstrip() + " " + newText.lstrip()
                else:
                    mergedText = existingText.rstrip() + "\n" + newText.lstrip()
                existingElem["text"] = mergedText

        elif contentType == "table":
            existingRows = existingElem.get("rows", [])
            newRows = newElem.get("rows", [])
            if existingRows and newRows:
                mergedRows = JsonResponseHandler.mergeRowsWithOverlap(
                    existingRows, newRows, iteration
                )
                existingElem["rows"] = mergedRows
                logger.debug(f"Iteration {iteration}: Merged table rows - existing: {len(existingRows)}, new: {len(newRows)}, total: {len(mergedRows)}")
            elif newRows:
                existingElem["rows"] = newRows
            # Preserve headers/caption from existing, falling back to new.
            if not existingElem.get("headers") and newElem.get("headers"):
                existingElem["headers"] = newElem["headers"]
            if not existingElem.get("caption") and newElem.get("caption"):
                existingElem["caption"] = newElem.get("caption")

        elif contentType in ["bullet_list", "numbered_list"]:
            existingItems = existingElem.get("items", [])
            newItems = newElem.get("items", [])
            if existingItems and newItems:
                existingElem["items"] = JsonResponseHandler.mergeItemsWithOverlap(
                    existingItems, newItems, iteration
                )
            elif newItems:
                existingElem["items"] = newItems

        elif contentType == "image":
            existingImageData = existingElem.get("base64Data", "")
            newImageData = newElem.get("base64Data", "")
            if existingImageData and newImageData:
                # Missing '=' padding -> existing payload may be truncated;
                # append the continuation. Otherwise treat new as replacement.
                if not existingImageData.rstrip().endswith(('=', '==')):
                    existingElem["base64Data"] = existingImageData + newImageData
                    logger.debug(f"Iteration {iteration}: Merged incomplete image base64 data")
                else:
                    if newImageData:
                        existingElem["base64Data"] = newImageData
            elif newImageData:
                existingElem["base64Data"] = newImageData
            if not existingElem.get("altText") and newElem.get("altText"):
                existingElem["altText"] = newElem["altText"]
            if not existingElem.get("caption") and newElem.get("caption"):
                existingElem["caption"] = newElem["caption"]

        else:
            # GENERIC FALLBACK: deep recursive merge for arbitrary structures.
            existingElem = JsonResponseHandler.mergeDeepStructures(
                existingElem,
                newElem,
                iteration,
                f"section.{contentType}"
            )

        # Write the merged element back into a shallow copy of the section.
        mergedSection = existingSection.copy()
        if isinstance(existingElements, list):
            if existingElements:
                existingElements[-1] = existingElem
            mergedSection["elements"] = existingElements
        else:
            mergedSection["elements"] = existingElem

        # Preserve ordering metadata from the new section if missing.
        if "order" not in mergedSection and "order" in newSection:
            mergedSection["order"] = newSection["order"]

        return mergedSection

    @staticmethod
    def mergeCodeBlocks(existingCode: str, newCode: str, iteration: int) -> str:
        """Merge two code blocks, handling duplicate and incomplete lines.

        Strategies (in order): drop an exact duplicate boundary line; join a
        trailing incomplete CSV-like line with its continuation; strip a
        partial field overlap between the boundary lines.
        """
        if not existingCode:
            return newCode
        if not newCode:
            return existingCode

        existingLines = existingCode.rstrip().split('\n')
        newLines = newCode.strip().split('\n')

        if not existingLines or not newLines:
            return existingCode + "\n" + newCode

        lastExistingLine = existingLines[-1].strip()
        firstNewLine = newLines[0].strip()

        # Strategy 1: exact overlap - remove duplicate line.
        if lastExistingLine == firstNewLine:
            newLines = newLines[1:]
            logger.debug(f"Iteration {iteration}: Removed exact duplicate line in code merge")

        # Strategy 2: incomplete line merge - trailing comma or short last field.
        elif lastExistingLine.endswith(',') or (',' in lastExistingLine and len(lastExistingLine.split(',')[-1]) < 5):
            mergedLine = lastExistingLine.rstrip(',') + ',' + firstNewLine.lstrip()
            existingLines[-1] = mergedLine
            newLines = newLines[1:]
            logger.debug(f"Iteration {iteration}: Merged incomplete line with continuation")

        # Strategy 3: partial overlap between comma-separated boundary lines.
        elif ',' in lastExistingLine and ',' in firstNewLine:
            lastExistingParts = lastExistingLine.split(',')
            firstNewParts = firstNewLine.split(',')
            if lastExistingParts and firstNewParts:
                lastExistingPart = lastExistingParts[-1].strip()
                firstNewPart = firstNewParts[0].strip()
                # Matching boundary field -> drop it from the new line.
                if lastExistingPart == firstNewPart and len(lastExistingParts) > 1:
                    newLines[0] = ','.join(firstNewParts[1:])
                    logger.debug(f"Iteration {iteration}: Removed partial overlap in code merge")

        # Reconstruct merged code.
        mergedCode = '\n'.join(existingLines)
        if newLines:
            if mergedCode and not mergedCode.endswith('\n'):
                mergedCode += '\n'
            mergedCode += '\n'.join(newLines)

        return mergedCode

    @staticmethod
    def detectAndParseJsonFragment(
        result: str,
        allSections: List[Dict[str, Any]]
    ) -> Optional[Dict[str, Any]]:
        """Detect whether a response is a continuation fragment.

        A fragment is continuation content (table rows, code lines, list
        items, or a partial element dict) rather than a full document
        structure with "documents"/"sections" keys.

        Returns a dict with "fragment_type", "fragment_data", and
        "target_section_id", or None if the response is not a fragment.
        """
        try:
            extracted = extractJsonString(result)
            parsed = json.loads(extracted)

            if isinstance(parsed, list):
                if len(parsed) > 0:
                    first_item = parsed[0]

                    # Array of arrays -> table rows.
                    if isinstance(first_item, list):
                        logger.debug("Detected JSON fragment: table rows array")
                        return {
                            "fragment_type": "table_rows",
                            "fragment_data": parsed,
                            "target_section_id": JsonResponseHandler.findTargetSectionId(allSections, "table")
                        }

                    # Array of strings -> code lines or list items, resolved
                    # by which section type exists in allSections.
                    elif isinstance(first_item, str):
                        target_section_id = JsonResponseHandler.findTargetSectionId(allSections, "code_block")
                        if target_section_id:
                            logger.debug("Detected JSON fragment: code lines array")
                            return {
                                "fragment_type": "code_lines",
                                "fragment_data": parsed,
                                "target_section_id": target_section_id
                            }

                        target_section_id = JsonResponseHandler.findTargetSectionId(allSections, "bullet_list")
                        if target_section_id:
                            logger.debug("Detected JSON fragment: list items array")
                            return {
                                "fragment_type": "list_items",
                                "fragment_data": parsed,
                                "target_section_id": target_section_id
                            }

                        # No context: default to code lines.
                        logger.debug("Detected JSON fragment: string array (assuming code lines)")
                        return {
                            "fragment_type": "code_lines",
                            "fragment_data": parsed,
                            "target_section_id": JsonResponseHandler.findTargetSectionId(allSections, "code_block")
                        }

            elif isinstance(parsed, dict):
                # Partial element dicts lacking document structure.
                if "rows" in parsed and "documents" not in parsed and "sections" not in parsed:
                    logger.debug("Detected JSON fragment: table element with rows")
                    return {
                        "fragment_type": "table_element",
                        "fragment_data": parsed,
                        "target_section_id": JsonResponseHandler.findTargetSectionId(allSections, "table")
                    }

                if "code" in parsed and "documents" not in parsed and "sections" not in parsed:
                    logger.debug("Detected JSON fragment: code element")
                    return {
                        "fragment_type": "code_element",
                        "fragment_data": parsed,
                        "target_section_id": JsonResponseHandler.findTargetSectionId(allSections, "code_block")
                    }

        except Exception as e:
            # Best-effort detection: parsing failures mean "not a fragment".
            logger.debug(f"Error detecting JSON fragment: {e}")

        return None

    @staticmethod
    def findTargetSectionId(
        allSections: List[Dict[str, Any]],
        contentType: str
    ) -> Optional[str]:
        """Return the id of the last section of ``contentType``.

        Prefers an incomplete section, but falls back to the last matching
        section regardless of completeness. Returns None if no section of
        that type exists.
        """
        for section in reversed(allSections):
            if section.get("content_type") == contentType:
                if JsonResponseHandler.isSectionIncomplete(section):
                    return section.get("id")
                # Right type even if complete - still the best candidate.
                return section.get("id")
        return None

    @staticmethod
    def mergeFragmentIntoSection(
        fragment: Dict[str, Any],
        allSections: List[Dict[str, Any]],
        iteration: int
    ) -> List[Dict[str, Any]]:
        """Merge a continuation fragment into the matching section.

        Locates the target section by id, then by incomplete-section type
        match, then by last section of the fragment's content type, and
        merges the fragment data into that section's last element.

        Returns a copy of ``allSections`` with the merged section replaced;
        returns ``allSections`` unchanged if no target can be found.
        """
        fragment_type = fragment.get("fragment_type")
        fragment_data = fragment.get("fragment_data")
        target_section_id = fragment.get("target_section_id")

        if not fragment_type or not fragment_data:
            return allSections

        # Pass 1: find the target section by id.
        target_section = None
        target_index = -1
        for i, section in enumerate(allSections):
            if section.get("id") == target_section_id:
                target_section = section
                target_index = i
                break

        # Pass 2: first incomplete section of the matching content type.
        if not target_section:
            for i, section in enumerate(allSections):
                if section.get("content_type") == JsonResponseHandler.getContentTypeForFragment(fragment_type):
                    if JsonResponseHandler.isSectionIncomplete(section):
                        target_section = section
                        target_index = i
                        break

        # Pass 3: last section of the matching content type.
        if not target_section:
            for i, section in enumerate(reversed(allSections)):
                if section.get("content_type") == JsonResponseHandler.getContentTypeForFragment(fragment_type):
                    target_section = section
                    target_index = len(allSections) - 1 - i
                    break

        if not target_section:
            logger.warning(f"Iteration {iteration}: No target section found for fragment type {fragment_type}")
            return allSections

        merged_section = target_section.copy()
        elements = merged_section.get("elements", [])

        if not isinstance(elements, list):
            elements = [elements] if elements else []
        if not elements:
            # Create a new element if none exists.
            elements = [{}]

        last_element = elements[-1] if elements else {}
        if not isinstance(last_element, dict):
            # NOTE(review): nesting of this append was ambiguous in the
            # mangled source; placing it inside the type check, since an
            # unconditional append would duplicate the last element.
            last_element = {}
            elements.append(last_element)

        if fragment_type == "table_rows":
            existing_rows = last_element.get("rows", [])
            if not isinstance(existing_rows, list):
                existing_rows = []

            new_rows = fragment_data
            merged_rows = JsonResponseHandler.mergeRowsWithOverlap(existing_rows, new_rows, iteration)
            last_element["rows"] = merged_rows

            # Infer headers from the first row when it looks like a header row.
            if not last_element.get("headers") and isinstance(fragment_data, list) and len(fragment_data) > 0:
                first_row = fragment_data[0]
                if isinstance(first_row, list) and len(first_row) > 0:
                    if all(isinstance(cell, str) for cell in first_row):
                        last_element["headers"] = first_row
                        merged_rows = merged_rows[1:]  # Remove header row
                        last_element["rows"] = merged_rows

        elif fragment_type == "code_lines":
            existing_code = last_element.get("code", "")
            new_lines = fragment_data

            # Convert array of strings to a code block.
            if isinstance(new_lines, list):
                new_code = "\n".join(str(line) for line in new_lines)
            else:
                new_code = str(new_lines)

            last_element["code"] = JsonResponseHandler.mergeCodeBlocks(existing_code, new_code, iteration)

        elif fragment_type == "list_items":
            existing_items = last_element.get("items", [])
            if not isinstance(existing_items, list):
                existing_items = []

            new_items = fragment_data if isinstance(fragment_data, list) else [fragment_data]
            last_element["items"] = JsonResponseHandler.mergeItemsWithOverlap(existing_items, new_items, iteration)

        elif fragment_type == "table_element":
            # Deep recursive merge for complex table structures.
            last_element = JsonResponseHandler.mergeDeepStructures(
                last_element,
                fragment_data,
                iteration,
                f"section.{target_section_id}.table_element"
            )

        elif fragment_type == "code_element":
            # Deep recursive merge for complex code structures.
            last_element = JsonResponseHandler.mergeDeepStructures(
                last_element,
                fragment_data,
                iteration,
                f"section.{target_section_id}.code_element"
            )

        else:
            # Generic fragment - deep recursive merge of any structure.
            last_element = JsonResponseHandler.mergeDeepStructures(
                last_element,
                fragment_data,
                iteration,
                f"section.{target_section_id}.{fragment_type}"
            )

        elements[-1] = last_element
        merged_section["elements"] = elements

        merged_sections = allSections.copy()
        merged_sections[target_index] = merged_section

        logger.info(f"Iteration {iteration}: Merged {fragment_type} fragment into section '{target_section_id}'")
        return merged_sections

    @staticmethod
    def getContentTypeForFragment(fragment_type: str) -> str:
        """Map a fragment type to its section content type ("paragraph" default)."""
        mapping = {
            "table_rows": "table",
            "table_element": "table",
            "code_lines": "code_block",
            "code_element": "code_block",
            "list_items": "bullet_list"
        }
        return mapping.get(fragment_type, "paragraph")

    @staticmethod
    def deepCompare(obj1: Any, obj2: Any, max_depth: int = 10) -> bool:
        """Deep recursive equality for JSON-like structures.

        Args:
            obj1: First object to compare.
            obj2: Second object to compare.
            max_depth: Recursion limit; comparisons past it report False.

        Returns:
            True if the objects are deeply equal, False otherwise.
        """
        if max_depth <= 0:
            return False

        # Strict type match (so 1 and 1.0, or True and 1, differ).
        if type(obj1) != type(obj2):
            return False

        if isinstance(obj1, (str, int, float, bool, type(None))):
            return obj1 == obj2

        if isinstance(obj1, list):
            if len(obj1) != len(obj2):
                return False
            return all(JsonResponseHandler.deepCompare(item1, item2, max_depth - 1)
                       for item1, item2 in zip(obj1, obj2))

        if isinstance(obj1, dict):
            if set(obj1.keys()) != set(obj2.keys()):
                return False
            return all(JsonResponseHandler.deepCompare(obj1[key], obj2[key], max_depth - 1)
                       for key in obj1.keys())

        # Fallback for other types.
        return obj1 == obj2

    @staticmethod
    def findLongestCommonSuffix(
        existing_list: List[Any],
        new_list: List[Any],
        min_overlap: int = 1
    ) -> int:
        """Longest suffix of ``existing_list`` equal to a prefix of ``new_list``.

        Example: existing [A, B, C, D] and new [C, D, E, F] overlap in
        [C, D], so 2 is returned. Returns 0 when no overlap of at least
        ``min_overlap`` elements exists.
        """
        if not existing_list or not new_list:
            return 0

        max_overlap = min(len(existing_list), len(new_list))

        # Try all possible overlap lengths, longest first.
        for overlap_len in range(max_overlap, min_overlap - 1, -1):
            existing_suffix = existing_list[-overlap_len:]
            new_prefix = new_list[:overlap_len]

            if all(JsonResponseHandler.deepCompare(existing_suffix[i], new_prefix[i])
                   for i in range(overlap_len)):
                return overlap_len

        return 0

    @staticmethod
    def findPartialOverlap(
        existing_item: Any,
        new_item: Any
    ) -> Tuple[bool, Optional[Any]]:
        """Detect whether ``new_item`` completes a truncated ``existing_item``.

        Handles a short truncated trailing string or sub-list inside list
        items, and a truncated string prefix of a longer string.

        Returns (True, merged_item) on a partial overlap, else (False, None).
        """
        if isinstance(existing_item, list) and isinstance(new_item, list):
            if not existing_item or not new_item:
                return False, None

            last_existing = existing_item[-1]
            first_new = new_item[0]

            # Truncated trailing string completed by the first new string.
            if isinstance(last_existing, str) and isinstance(first_new, str):
                if len(last_existing) < 10 and first_new.startswith(last_existing):
                    merged_last = last_existing + first_new[len(last_existing):]
                    merged_item = existing_item[:-1] + [merged_last] + new_item[1:]
                    return True, merged_item

            # Truncated trailing sub-list that is a prefix of the new one.
            if isinstance(last_existing, list) and isinstance(first_new, list):
                if len(last_existing) < len(first_new):
                    if first_new[:len(last_existing)] == last_existing:
                        merged_item = existing_item[:-1] + [first_new] + new_item[1:]
                        return True, merged_item

        # Truncated string completed by a longer string.
        if isinstance(existing_item, str) and isinstance(new_item, str):
            if len(existing_item) < 50 and new_item.startswith(existing_item):
                merged = existing_item + new_item[len(existing_item):]
                return True, merged

        return False, None

    @staticmethod
    def mergeRowsWithOverlap(
        existing_rows: List[List[str]],
        new_rows: List[List[str]],
        iteration: int
    ) -> List[List[str]]:
        """Merge table rows, dropping duplicated or partially-overlapping rows."""
        if not new_rows:
            return existing_rows
        if not existing_rows:
            return new_rows

        # Strategy 1: longest suffix/prefix overlap.
        overlap_len = JsonResponseHandler.findLongestCommonSuffix(existing_rows, new_rows, min_overlap=1)
        if overlap_len > 0:
            logger.debug(f"Iteration {iteration}: Found {overlap_len} overlapping table rows, removing duplicates")
            return existing_rows + new_rows[overlap_len:]

        # Strategy 2: partial overlap in the boundary row.
        if len(existing_rows) > 0 and len(new_rows) > 0:
            last_existing = existing_rows[-1]
            first_new = new_rows[0]

            is_partial, merged_row = JsonResponseHandler.findPartialOverlap(last_existing, first_new)
            if is_partial:
                logger.debug(f"Iteration {iteration}: Found partial overlap in table rows, merging")
                return existing_rows[:-1] + [merged_row] + new_rows[1:]

        # Strategy 3: exact boundary-row duplicate (fallback).
        if isinstance(existing_rows[-1], list) and isinstance(new_rows[0], list):
            if list(existing_rows[-1]) == list(new_rows[0]):
                logger.debug(f"Iteration {iteration}: Removed duplicate table row (exact match)")
                return existing_rows + new_rows[1:]

        # No overlap detected - append all new rows.
        return existing_rows + new_rows

    @staticmethod
    def mergeItemsWithOverlap(
        existing_items: List[str],
        new_items: List[str],
        iteration: int
    ) -> List[str]:
        """Merge list items, dropping duplicated or partially-overlapping items."""
        if not new_items:
            return existing_items
        if not existing_items:
            return new_items

        # Strategy 1: longest suffix/prefix overlap.
        overlap_len = JsonResponseHandler.findLongestCommonSuffix(existing_items, new_items, min_overlap=1)
        if overlap_len > 0:
            logger.debug(f"Iteration {iteration}: Found {overlap_len} overlapping list items, removing duplicates")
            return existing_items + new_items[overlap_len:]

        # Strategy 2: partial overlap in the boundary item.
        if len(existing_items) > 0 and len(new_items) > 0:
            is_partial, merged_item = JsonResponseHandler.findPartialOverlap(existing_items[-1], new_items[0])
            if is_partial:
                logger.debug(f"Iteration {iteration}: Found partial overlap in list items, merging")
                return existing_items[:-1] + [merged_item] + new_items[1:]

        # Strategy 3: exact boundary-item duplicate (fallback).
        if existing_items[-1] == new_items[0]:
            logger.debug(f"Iteration {iteration}: Removed duplicate list item (exact match)")
            return existing_items + new_items[1:]

        # No overlap detected - append all new items.
        return existing_items + new_items

    @staticmethod
    def mergeDeepStructures(
        existing: Any,
        new: Any,
        iteration: int,
        path: str = "root"
    ) -> Any:
        """Recursively merge two JSON structures of arbitrary depth.

        Lists are merged with overlap/partial-overlap detection; dicts are
        merged key by key; mismatched types or differing primitives are
        replaced by the new value.

        Args:
            existing: Existing structure to merge into.
            new: New structure to merge.
            iteration: Current iteration number, for logging.
            path: Dotted path within the structure, for debugging.

        Returns:
            The merged structure.
        """
        if type(existing) != type(new):
            # Types don't match - treat new as a replacement.
            logger.debug(f"Iteration {iteration}: Types don't match at {path}, replacing")
            return new

        if isinstance(existing, list) and isinstance(new, list):
            if not new:
                return existing
            if not existing:
                return new

            overlap_len = JsonResponseHandler.findLongestCommonSuffix(existing, new, min_overlap=1)
            if overlap_len > 0:
                logger.debug(f"Iteration {iteration}: Found {overlap_len} overlapping elements at {path}, removing duplicates")
                return existing + new[overlap_len:]

            if len(existing) > 0 and len(new) > 0:
                is_partial, merged_item = JsonResponseHandler.findPartialOverlap(existing[-1], new[0])
                if is_partial:
                    logger.debug(f"Iteration {iteration}: Found partial overlap at {path}, merging")
                    return existing[:-1] + [merged_item] + new[1:]

            # No overlap - append all.
            return existing + new

        if isinstance(existing, dict) and isinstance(new, dict):
            merged = existing.copy()
            for key, new_value in new.items():
                if key in merged:
                    merged[key] = JsonResponseHandler.mergeDeepStructures(
                        merged[key],
                        new_value,
                        iteration,
                        f"{path}.{key}"
                    )
                else:
                    merged[key] = new_value
            return merged

        # Primitives - keep existing if equal, otherwise take new.
        if existing == new:
            return existing
        return new

# NOTE(review): the original chunk ended with the header of an unrelated diff
# hunk ("diff --git a/modules/services/serviceGeneration/renderers/
# rendererDocx.py ... index 61a645a7..179cbe75 100644 ---"); that patch
# fragment belongs to another file and is intentionally not reproduced here.
a/modules/services/serviceGeneration/renderers/rendererDocx.py +++ b/modules/services/serviceGeneration/renderers/rendererDocx.py @@ -497,13 +497,11 @@ class RendererDocx(BaseRenderer): # Extract title from prompt if not provided if not title or title == "Generated Document": # Look for "create a ... document" or "generate a ... report" - import re title_match = re.search(r'(?:create|generate|make)\s+a\s+([^,]+?)(?:\s+document|\s+report|\s+summary)', userPrompt.lower()) if title_match: structure['title'] = title_match.group(1).strip().title() # Extract sections from numbered lists in prompt - import re section_pattern = r'(\d+)\)?\s*([^,]+?)(?:\s*[,:]|\s*$)' sections = re.findall(section_pattern, userPrompt) @@ -849,7 +847,6 @@ class RendererDocx(BaseRenderer): Returns the content with tables replaced by placeholders. """ import csv - import io lines = content.split('\n') processed_lines = [] diff --git a/modules/services/serviceGeneration/renderers/rendererXlsx.py b/modules/services/serviceGeneration/renderers/rendererXlsx.py index f90a0980..9fca82e9 100644 --- a/modules/services/serviceGeneration/renderers/rendererXlsx.py +++ b/modules/services/serviceGeneration/renderers/rendererXlsx.py @@ -95,7 +95,7 @@ class RendererXlsx(BaseRenderer): # Title sheet['A1'] = title sheet['A1'].font = Font(size=16, bold=True) - sheet['A1'].alignment = Alignment(horizontal='center') + sheet['A1'].alignment = Alignment(horizontal='left') # Generation info sheet['A3'] = "Generated:" @@ -325,7 +325,7 @@ class RendererXlsx(BaseRenderer): def _getDefaultStyleSet(self) -> Dict[str, Any]: """Default Excel style set - used when no style instructions present.""" return { - "title": {"font_size": 16, "color": "#FF1F4E79", "bold": True, "align": "center"}, + "title": {"font_size": 16, "color": "#FF1F4E79", "bold": True, "align": "left"}, "heading": {"font_size": 14, "color": "#FF2F2F2F", "bold": True, "align": "left"}, "table_header": {"background": "#FF4F4F4F", "text_color": "#FFFFFFFF", 
"bold": True, "align": "center"}, "table_cell": {"background": "#FFFFFFFF", "text_color": "#FF2F2F2F", "bold": False, "align": "left"}, @@ -543,8 +543,9 @@ class RendererXlsx(BaseRenderer): try: # Sheet title sheet['A1'] = sheetTitle - sheet['A1'].font = Font(size=16, bold=True, color=self._getSafeColor(styles.get("title", {}).get("color", "FF1F4E79"))) - sheet['A1'].alignment = Alignment(horizontal="center") + title_style = styles.get("title", {}) + sheet['A1'].font = Font(size=16, bold=True, color=self._getSafeColor(title_style.get("color", "FF1F4E79"))) + sheet['A1'].alignment = Alignment(horizontal=title_style.get("align", "left")) # Get table data from elements (canonical JSON format) elements = section.get("elements", []) @@ -592,7 +593,7 @@ class RendererXlsx(BaseRenderer): sheet['A1'] = documentTitle # Safety check for title style - title_style = styles.get("title", {"font_size": 16, "bold": True, "color": "#FF1F4E79", "align": "center"}) + title_style = styles.get("title", {"font_size": 16, "bold": True, "color": "#FF1F4E79", "align": "left"}) try: safe_color = self._getSafeColor(title_style["color"]) sheet['A1'].font = Font(size=title_style["font_size"], bold=title_style["bold"], color=safe_color) diff --git a/modules/services/serviceUtils/mainServiceUtils.py b/modules/services/serviceUtils/mainServiceUtils.py index 849cc3ef..bbee6540 100644 --- a/modules/services/serviceUtils/mainServiceUtils.py +++ b/modules/services/serviceUtils/mainServiceUtils.py @@ -271,12 +271,6 @@ class UtilsService: def jsonTryParse(self, text) -> tuple: return jsonUtils.tryParseJson(text) - def jsonParseOrRaise(self, text): - return jsonUtils.parseJsonOrRaise(text) - - def jsonMergeRootLists(self, parts): - return jsonUtils.mergeRootLists(parts) - # ===== Enum utility functions ===== def mapToEnum(self, enum_class, value_str, default_value): diff --git a/modules/shared/debugLogger.py b/modules/shared/debugLogger.py index c68546bf..6ee78bc7 100644 --- 
a/modules/shared/debugLogger.py +++ b/modules/shared/debugLogger.py @@ -159,7 +159,6 @@ def storeDebugMessageAndDocuments(message, currentUser) -> None: """ try: import json - from datetime import datetime, UTC # Create base debug directory (use base debug dir, not prompts subdirectory) baseDebugDir = _getBaseDebugDir() diff --git a/modules/shared/jsonUtils.py b/modules/shared/jsonUtils.py index dc51a349..3da04d21 100644 --- a/modules/shared/jsonUtils.py +++ b/modules/shared/jsonUtils.py @@ -97,47 +97,6 @@ def tryParseJson(text: Union[str, bytes]) -> Tuple[Optional[Union[Dict, List]], return None, e, cleaned -def parseJsonOrRaise(text: Union[str, bytes]) -> Union[Dict, List]: - obj, err, cleaned = tryParseJson(text) - if err is not None: - logger.error(f"parse_json_or_raise failed: {err}. Cleaned preview: {cleaned[:200]}...") - raise err - return obj - - -def mergeRootLists(jsonParts: List[Union[str, Dict, List]]) -> Dict[str, Any]: - """ - Generic merger for root-level lists: take first dict as base; for each subsequent part: - - if value is list and same key exists as list, extend it - - if key absent, add it - - for non-list keys, keep the original (from the first part) - Sets continuation=None if present in base. - """ - base: Optional[Dict[str, Any]] = None - parsed: List[Dict[str, Any]] = [] - for part in jsonParts: - if isinstance(part, (dict, list)): - obj = part - else: - obj, err, _ = tryParseJson(part) - if err is not None or not isinstance(obj, (dict, list)): - continue - if isinstance(obj, dict): - parsed.append(obj) - if not parsed: - return {} - base = dict(parsed[0]) - for obj in parsed[1:]: - for k, v in obj.items(): - if isinstance(v, list) and isinstance(base.get(k), list): - base[k].extend(v) - elif k not in base: - base[k] = v - if 'continuation' in base: - base['continuation'] = None - return base - - def repairBrokenJson(text: str) -> Optional[Dict[str, Any]]: """ Attempt to repair broken JSON using multiple strategies. 
diff --git a/modules/workflows/methods/methodAi.py b/modules/workflows/methods/methodAi.py index 183d4605..1e837f62 100644 --- a/modules/workflows/methods/methodAi.py +++ b/modules/workflows/methods/methodAi.py @@ -271,7 +271,20 @@ class MethodAi(MethodBase): # Prepare extraction options self.services.chat.progressLogUpdate(operationId, 0.3, "Preparing extraction options") - extractionOptions = parameters.extractionOptions + extractionOptionsParam = parameters.get("extractionOptions") + + # Convert dict to ExtractionOptions object if needed, or create defaults + if extractionOptionsParam: + if isinstance(extractionOptionsParam, dict): + # Convert dict to ExtractionOptions object + extractionOptions = ExtractionOptions(**extractionOptionsParam) + elif isinstance(extractionOptionsParam, ExtractionOptions): + extractionOptions = extractionOptionsParam + else: + # Invalid type, use defaults + extractionOptions = None + else: + extractionOptions = None # If extractionOptions not provided, create defaults if not extractionOptions: @@ -297,10 +310,21 @@ class MethodAi(MethodBase): # Build ActionDocuments from ContentExtracted results self.services.chat.progressLogUpdate(operationId, 0.8, "Building result documents") actionDocuments = [] - for extracted in extractedResults: + # Map extracted results back to original documents by index (results are in same order) + for i, extracted in enumerate(extractedResults): + # Get original document name if available + originalDoc = chatDocuments[i] if i < len(chatDocuments) else None + if originalDoc and hasattr(originalDoc, 'fileName') and originalDoc.fileName: + # Use original filename with "extracted_" prefix + baseName = originalDoc.fileName.rsplit('.', 1)[0] if '.' 
in originalDoc.fileName else originalDoc.fileName + documentName = f"{baseName}_extracted_{extracted.id}.json" + else: + # Fallback to generic name with index + documentName = f"document_{i+1:03d}_extracted_{extracted.id}.json" + # Store ContentExtracted object in ActionDocument.documentData actionDoc = ActionDocument( - documentName=f"extracted_{extracted.id}.json", + documentName=documentName, documentData=extracted, # ContentExtracted object mimeType="application/json" ) diff --git a/modules/workflows/processing/adaptive/contentValidator.py b/modules/workflows/processing/adaptive/contentValidator.py index be420b36..b24b4e52 100644 --- a/modules/workflows/processing/adaptive/contentValidator.py +++ b/modules/workflows/processing/adaptive/contentValidator.py @@ -22,7 +22,7 @@ class ContentValidator: self.services = services self.learningEngine = learningEngine - async def validateContent(self, documents: List[Any], intent: Dict[str, Any], taskStep: Optional[Any] = None, actionName: Optional[str] = None) -> Dict[str, Any]: + async def validateContent(self, documents: List[Any], intent: Dict[str, Any], taskStep: Optional[Any] = None, actionName: Optional[str] = None, actionParameters: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: """Validates delivered content against user intent using AI (single attempt; parse-or-fail) Args: @@ -30,8 +30,9 @@ class ContentValidator: intent: Workflow-level intent dict (for format requirements) taskStep: Optional TaskStep object (preferred source for objective) actionName: Optional action name (e.g., "ai.process", "ai.webResearch") that created the documents + actionParameters: Optional action parameters used during execution (e.g., {"columnsPerRow": 10, "researchDepth": "deep"}) """ - return await self._validateWithAI(documents, intent, taskStep, actionName) + return await self._validateWithAI(documents, intent, taskStep, actionName, actionParameters) def _analyzeDocuments(self, documents: List[Any]) -> List[Dict[str, Any]]: 
"""Generic document analysis - create simple summaries with metadata.""" @@ -368,7 +369,7 @@ class ContentValidator: return False - async def _validateWithAI(self, documents: List[Any], intent: Dict[str, Any], taskStep: Optional[Any] = None, actionName: Optional[str] = None) -> Dict[str, Any]: + async def _validateWithAI(self, documents: List[Any], intent: Dict[str, Any], taskStep: Optional[Any] = None, actionName: Optional[str] = None, actionParameters: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: """AI-based comprehensive validation - generic approach""" try: if not hasattr(self, 'services') or not self.services or not hasattr(self.services, 'ai'): @@ -430,48 +431,91 @@ class ContentValidator: actionDescription = "Content processing" actionContext = f"\nDOCUMENTS CREATED BY: {actionDescription} ({actionName})" - # Format success criteria for display - criteriaDisplay = json.dumps(successCriteria, ensure_ascii=False) if successCriteria else "[]" + # Build action parameters context + actionParamsContext = "" + if actionParameters and isinstance(actionParameters, dict) and len(actionParameters) > 0: + # Filter out documentList and other large/redundant parameters for clarity + relevantParams = {k: v for k, v in actionParameters.items() + if k not in ['documentList', 'connections'] and v is not None} + if relevantParams: + paramsJson = json.dumps(relevantParams, ensure_ascii=False, indent=2) + actionParamsContext = f"\nACTION PARAMETERS USED: {paramsJson}" - # Build successCriteriaMet example - show proper array format - criteriaMetExample = json.dumps([False] * criteriaCount) if criteriaCount > 0 else "[]" + # Format success criteria for display with index numbers + if successCriteria: + criteriaDisplay = "\n".join([f"[{i}] {criterion}" for i, criterion in enumerate(successCriteria)]) + else: + criteriaDisplay = "[]" promptBase = f"""TASK VALIDATION +=== TASK INFORMATION === {objectiveLabel}: '{objectiveText}' EXPECTED DATA TYPE: {dataType} -EXPECTED FORMATS: 
{expectedFormats if expectedFormats else ['any']} -SUCCESS CRITERIA ({criteriaCount} items): {criteriaDisplay}{actionContext} +EXPECTED FORMATS: {expectedFormats if expectedFormats else ['any']}{actionContext}{actionParamsContext} + +=== VALIDATION INSTRUCTIONS === VALIDATION CONTEXT: You have METADATA (filename, format, size, mimeType) and STRUCTURE SUMMARY (if available: sections, tables, captions, IDs, statistics). VALIDATION PRINCIPLES: -1. Format compatibility: Match delivered format to expected format -2. Structure validation: Use structure summary to verify requirements (section count, table captions, IDs, section types, etc.) -3. Filename appropriateness: Check if filename suggests correct content type -4. Document count: Verify number matches expectations -5. Size sanity: Only flag if clearly wrong (<1KB for complex content or suspiciously large) +1. EVIDENCE-BASED VALIDATION (CRITICAL): Claims must match structure evidence. If structure shows different values than claimed, trust the structure evidence, not claims. +2. INDEPENDENT CRITERIA EVALUATION (CRITICAL): For criteriaMapping reason field - address ONLY the specific criterion requirement. Do not mention other criteria or other issues. +3. PRIORITY: Missing data > Formatting issues. Always prioritize data completeness over format correctness. +4. Structure validation: Use structure summary (statistics, counts, structure metadata) as PRIMARY evidence. Compare with task requirements. +5. Discrepancy detection: If task requires specific quantities/amounts but structure shows different values, classify as missing_data or incomplete_data, not success. +6. Format compatibility: Match delivered format to expected format (secondary priority after data completeness) +7. Filename appropriateness: Check if filename suggests correct content type +8. 
Document count: Verify number matches expectations LIMITATIONS: - Cannot validate: Content accuracy, data correctness, formatting details, or requirements requiring full content reading - If structure summary unavailable, validate only metadata (format, filename, count, size) SCORING GUIDELINES: -- Format matches + reasonable structure → qualityScore: 0.8-1.0 -- Format matches but structure issues → qualityScore: 0.7-0.8 +- Data complete + format matches + structure matches requirements → qualityScore: 0.9-1.0 +- Data complete but format/structure issues → qualityScore: 0.7-0.9 +- Missing/incomplete data (even if format correct) → qualityScore: <0.7 +- Claims don't match structure evidence → qualityScore: <0.6 (trust structure, not claims) - Format mismatch → qualityScore: <0.7 - Only suggest improvements for CLEAR metadata/structure issues -OUTPUT FORMAT (JSON only): +VALIDATION LOGIC: +- If structure shows fewer quantities/amounts than required → gapType: missing_data or incomplete_data +- If structure shows wrong organization but correct quantity → gapType: wrong_structure +- If structure matches requirements but format wrong → gapType: wrong_format +- If claims say "X delivered" but structure shows "Y" (Y < X) → overallSuccess: false, gapType: missing_data +- Always trust structure statistics over any claims or descriptions + +IMPROVEMENT SUGGESTIONS PRIORITY (CRITICAL): +- Order by CRITERIA PRIORITY first, then gapType priority: missing_data > incomplete_data > wrong_structure > wrong_format +- [0] MUST address the HIGHEST PRIORITY unmet criterion (check criteriaMapping for which criteria are unmet) +- If multiple criteria are unmet, prioritize by: data completeness > structure > format +- gapType indicates the PRIMARY issue, but improvement suggestions must prioritize based on unmet criteria order + +=== OUTPUT FORMAT (JSON TEMPLATE) === {{ "overallSuccess": false, "qualityScore": 0.0, "dataTypeMatch": false, "formatMatch": false, "documentCount": 
{len(documents)}, - "successCriteriaMet": {criteriaMetExample}, + "criteriaMapping": [ + {{ + "index": 0, + "criterion": "exact_criterion_text_from_data_section", + "met": false, + "reason": "explanation_about_this_criterion_based_on_structure_evidence" + }} + ], "gapAnalysis": "Brief description of gaps based on metadata/structure only. If validation is limited, state this clearly.", + "gapType": "missing_data" | "wrong_structure" | "wrong_format" | "incomplete_data" | "no_gap", + "structureComparison": {{ + "required": {{}}, + "found": {{}}, + "gap": {{}} + }}, "improvementSuggestions": [], "validationDetails": [ {{ @@ -482,6 +526,15 @@ OUTPUT FORMAT (JSON only): ] }} +OUTPUT FORMAT NOTES: +- criteriaMapping reason: Address ONLY the specific criterion requirement. +- improvementSuggestions: [0] = highest priority unmet criterion from criteriaMapping. Order: unmet criteria by index first (data completeness > structure > format), then by gapType priority. + +=== DATA === + +SUCCESS CRITERIA TO VALIDATE in criteriaMapping array: +{criteriaDisplay} + DELIVERED DOCUMENTS ({len(documents)} items): """ @@ -522,7 +575,6 @@ DELIVERED DOCUMENTS ({len(documents)} items): # Proactively fix Python-style booleans (False/True -> false/true) BEFORE parsing # This handles booleans in any context: standalone, in lists, in dicts, etc. 
- import re # Use word boundaries but also handle cases where booleans are in brackets/arrays # Replace False/True regardless of context (word boundary handles string matching correctly) normalizedJson = re.sub(r'\bFalse\b', 'false', extractedJson) @@ -544,8 +596,10 @@ DELIVERED DOCUMENTS ({len(documents)} items): quality = aiResult.get("qualityScore") details = aiResult.get("validationDetails") gap = aiResult.get("gapAnalysis", "") - criteria = aiResult.get("successCriteriaMet") improvements = aiResult.get("improvementSuggestions", []) + gap_type = aiResult.get("gapType", "") + structure_comp = aiResult.get("structureComparison", {}) + criteria_mapping = aiResult.get("criteriaMapping", []) # Normalize while keeping failures explicit normalized = { @@ -553,10 +607,12 @@ DELIVERED DOCUMENTS ({len(documents)} items): "qualityScore": float(quality) if isinstance(quality, (int, float)) else None, "documentCount": len(documentSummaries), "gapAnalysis": gap if gap else "", + "gapType": gap_type if gap_type else "", + "structureComparison": structure_comp if structure_comp else {}, + "criteriaMapping": criteria_mapping if isinstance(criteria_mapping, list) else [], "validationDetails": details if isinstance(details, list) else [{ "documentName": "AI Validation", - "gapAnalysis": gap, - "successCriteriaMet": criteria if isinstance(criteria, list) else [] + "gapAnalysis": gap }], "improvementSuggestions": improvements, "schemaCompliant": True, @@ -585,7 +641,7 @@ DELIVERED DOCUMENTS ({len(documents)} items): "dataTypeMatch": False, "formatMatch": False, "documentCount": 0, - "successCriteriaMet": [], + "criteriaMapping": [], "gapAnalysis": errorMessage, "improvementSuggestions": [], "validationDetails": [], diff --git a/modules/workflows/processing/modes/modeDynamic.py b/modules/workflows/processing/modes/modeDynamic.py index 43d4f2b7..f91a4080 100644 --- a/modules/workflows/processing/modes/modeDynamic.py +++ b/modules/workflows/processing/modes/modeDynamic.py @@ -133,8 
+133,10 @@ class DynamicMode(BaseMode): # Pass ALL documents to validator - validator decides what to validate (generic approach) # Pass taskStep so validator can use task.objective and format fields # Pass action name so validator knows which action created the documents + # Pass action parameters so validator can verify parameter-specific requirements actionName = selection.get('action', 'unknown') - validationResult = await self.contentValidator.validateContent(result.documents, self.workflowIntent, taskStep, actionName) + actionParameters = selection.get('parameters', {}) + validationResult = await self.contentValidator.validateContent(result.documents, self.workflowIntent, taskStep, actionName, actionParameters) observation.contentValidation = validationResult quality_score = validationResult.get('qualityScore', 0.0) if quality_score is None: @@ -807,9 +809,9 @@ class DynamicMode(BaseMode): 'documentsCount': observation.documentsCount, 'previews': [p.model_dump(exclude_none=True) if hasattr(p, 'model_dump') else p.dict() for p in observation.previews] if observation.previews else [], 'notes': observation.notes, - 'contentValidation': observation.contentValidation if observation.contentValidation else {}, 'contentAnalysis': observation.contentAnalysis if observation.contentAnalysis else {} } + # Note: contentValidation is shown separately in CONTENT VALIDATION section, not duplicated here reviewContext = ReviewContext( taskStep=context.taskStep, taskActions=[], @@ -822,21 +824,36 @@ class DynamicMode(BaseMode): baseReviewContent = extractReviewContent(reviewContext) placeholders = {"REVIEW_CONTENT": baseReviewContent} - # NEW: Add content validation to review content - enhancedReviewContent = placeholders.get("REVIEW_CONTENT", "") + # NEW: Add content validation to review content - extract separately for prominence + baseReviewContent = placeholders.get("REVIEW_CONTENT", "") + # Add observation title if there's content + if baseReviewContent.strip(): + 
baseReviewContent = f"=== OBSERVATION ===\n{baseReviewContent}" + contentValidationSection = "" if observation.contentValidation: validation = observation.contentValidation - enhancedReviewContent += f"\n\nCONTENT VALIDATION:\n" - enhancedReviewContent += f"Overall Success: {validation.get('overallSuccess', False)}\n" + contentValidationSection += f"\n=== CONTENT VALIDATION ===\n" + gap_type = validation.get('gapType', '') + if gap_type: + contentValidationSection += f"Gap Type: {gap_type}\n" + contentValidationSection += f"Overall Success: {validation.get('overallSuccess', False)}\n" quality_score = validation.get('qualityScore', 0.0) if quality_score is None: quality_score = 0.0 - enhancedReviewContent += f"Quality Score: {quality_score:.2f}\n" + contentValidationSection += f"Quality Score: {quality_score:.2f}\n" gap_analysis = validation.get('gapAnalysis', '') if gap_analysis: - enhancedReviewContent += f"Gap Analysis: {gap_analysis}\n" + contentValidationSection += f"Gap Analysis: {gap_analysis}\n" + structure_comparison = validation.get('structureComparison', {}) + if structure_comparison: + contentValidationSection += f"Structure Comparison: {json.dumps(structure_comparison, indent=2, ensure_ascii=False)}\n" if validation.get('improvementSuggestions'): - enhancedReviewContent += f"Improvement Suggestions: {', '.join(validation['improvementSuggestions'])}\n" + suggestions = validation['improvementSuggestions'] + contentValidationSection += f"Next Actions (in sequence):\n" + for i, suggestion in enumerate(suggestions): + contentValidationSection += f" [{i}] {suggestion}\n" + + enhancedReviewContent = baseReviewContent + contentValidationSection # NEW: Add content analysis to review content if observation.contentAnalysis: @@ -854,9 +871,41 @@ class DynamicMode(BaseMode): enhancedReviewContent += f"Partial Achievements: {len(progressState['partialAchievements'])}\n" enhancedReviewContent += f"Failed Attempts: {len(progressState['failedAttempts'])}\n" 
enhancedReviewContent += f"Current Phase: {progressState['currentPhase']}\n" - if progressState['nextActionsSuggested']: + # Use content validation priorities if available, otherwise fall back to progress tracker suggestions + if observation.contentValidation and observation.contentValidation.get('improvementSuggestions'): + # Content validation already shown above, no need to repeat + pass + elif progressState['nextActionsSuggested']: enhancedReviewContent += f"Next Action Suggestions: {', '.join(progressState['nextActionsSuggested'])}\n" + # NEW: Add action history to review content + if hasattr(context, 'previousReviewResult') and context.previousReviewResult: + actionHistory = [] + for i, prevDecision in enumerate(context.previousReviewResult, 1): + if prevDecision and hasattr(prevDecision, 'nextAction') and prevDecision.nextAction: + action = prevDecision.nextAction + params = getattr(prevDecision, 'nextActionParameters', {}) or {} + # Filter out documentList for clarity + relevantParams = {k: v for k, v in params.items() if k not in ['documentList', 'connections']} + paramsStr = json.dumps(relevantParams, ensure_ascii=False) if relevantParams else "{}" + quality = getattr(prevDecision, 'qualityScore', None) + qualityStr = f" (quality: {quality:.2f})" if quality is not None else "" + actionHistory.append(f"Round {i}: {action} {paramsStr}{qualityStr}") + + if actionHistory: + enhancedReviewContent += f"\nACTION HISTORY:\n" + enhancedReviewContent += "\n".join(f"- {entry}" for entry in actionHistory) + # Detect repeated actions + actionCounts = {} + for entry in actionHistory: + # Extract action name (before first space or {) + actionName = entry.split()[1] if len(entry.split()) > 1 else "unknown" + actionCounts[actionName] = actionCounts.get(actionName, 0) + 1 + + repeatedActions = [action for action, count in actionCounts.items() if count >= 2] + if repeatedActions: + enhancedReviewContent += f"\nWARNING: Repeated actions detected: {', 
'.join(repeatedActions)}. Consider a fundamentally different approach.\n" + # Update placeholders with enhanced review content placeholders["REVIEW_CONTENT"] = enhancedReviewContent diff --git a/modules/workflows/processing/shared/promptGenerationActionsDynamic.py b/modules/workflows/processing/shared/promptGenerationActionsDynamic.py index d9a699a6..c0af7adf 100644 --- a/modules/workflows/processing/shared/promptGenerationActionsDynamic.py +++ b/modules/workflows/processing/shared/promptGenerationActionsDynamic.py @@ -323,21 +323,22 @@ def generateDynamicRefinementPrompt(services, context: Any, reviewContent: str) ACTIONS: {{KEY:AVAILABLE_METHODS}} DOCUMENTS: {{KEY:AVAILABLE_DOCUMENTS_INDEX}} -=== OBSERVATION === {{KEY:REVIEW_CONTENT}} +=== NEXT ACTIONS === +Follow the improvement suggestions from CONTENT VALIDATION in priority order. Each suggestion indicates what action to take next. + === OUTPUT FORMAT === {{ "status": "continue", - "reason": "Brief reason", - "nextAction": "ai.convert", + "reason": "Brief reason explaining why continuing", + "nextAction": "Selected_action_from_ACTIONS", "nextActionParameters": {{ - "documentList": ["docItem:..."], - "inputFormat": "json", - "outputFormat": "csv", - "columnsPerRow": 10 + "documentList": ["docItem:reference_from_DOCUMENTS"], + "parameter1": "value1", + "parameter2": "value2" }}, - "nextActionObjective": "Convert JSON to CSV with 10 columns per row" + "nextActionObjective": "Clear description of what this action will achieve based on improvement suggestions" }} === RULES === @@ -345,9 +346,10 @@ DOCUMENTS: {{KEY:AVAILABLE_DOCUMENTS_INDEX}} - nextAction: SPECIFIC action from AVAILABLE_METHODS (do not invent) - nextActionParameters: concrete parameters (check AVAILABLE_METHODS for valid names) - documentList: ONLY exact references from AVAILABLE_DOCUMENTS_INDEX (do not invent) -- nextActionObjective: describe what this action will achieve +- nextActionObjective: describe what this action will achieve based on the 
FIRST improvement suggestion from CONTENT VALIDATION - Do NOT repeat failed actions - suggest DIFFERENT approach -- Use improvement suggestions from content validation +- If ACTION HISTORY shows repeated actions, suggest a fundamentally different approach +- nextActionObjective must directly address the highest priority improvement suggestion from CONTENT VALIDATION """