diff --git a/modules/services/serviceAi/subDocumentProcessing.py b/modules/services/serviceAi/subDocumentProcessing.py index a0dc5088..e96a9394 100644 --- a/modules/services/serviceAi/subDocumentProcessing.py +++ b/modules/services/serviceAi/subDocumentProcessing.py @@ -176,22 +176,28 @@ class SubDocumentProcessing: # Merge with JSON mode mergedJsonDocument = self._mergeChunkResultsJson(chunkResults, options) - # Normalize merged JSON into a single canonical table + # Normalize merged JSON into a single canonical table (only if table content exists) try: from modules.services.serviceNormalization.mainServiceNormalization import NormalizationService normalizer = NormalizationService(self.services) inventory = normalizer.discoverStructures(mergedJsonDocument) - # Use workflow id as cache key - cacheKey = self.services.currentWorkflow.id - # Provide the extraction/merge prompt context when available to help mapping - mergePrompt = prompt - mapping = await normalizer.requestHeaderMapping(inventory, cacheKey, None, mergePrompt) - canonical = normalizer.applyMapping(mergedJsonDocument, mapping) - report = normalizer.validateCanonical(canonical) - if report.get('success'): - mergedJsonDocument = canonical + + # Check if any table content was discovered + tableHeaders = inventory.get("tableHeaders", []) + if not tableHeaders: + logger.info("No table content found in merged JSON, skipping normalization and returning original structure") else: - raise ValueError('Normalization produced zero rows') + # Use workflow id as cache key + cacheKey = self.services.currentWorkflow.id + # Provide the extraction/merge prompt context when available to help mapping + mergePrompt = prompt + mapping = await normalizer.requestHeaderMapping(inventory, cacheKey, None, mergePrompt) + canonical = normalizer.applyMapping(mergedJsonDocument, mapping) + report = normalizer.validateCanonical(canonical) + if report.get('success'): + mergedJsonDocument = canonical + else: + raise ValueError('Normalization produced zero rows') except Exception as e: # Surface normalization failure while leaving original merged JSON (single-path expectation is to fail) raise diff --git a/modules/services/serviceGeneration/renderers/rendererText.py b/modules/services/serviceGeneration/renderers/rendererText.py index 6ca1415b..33d648e8 100644 --- a/modules/services/serviceGeneration/renderers/rendererText.py +++ b/modules/services/serviceGeneration/renderers/rendererText.py @@ -108,16 +108,30 @@ class RendererText(BaseRenderer): elif section_type == "bullet_list": return self._render_json_bullet_list(section_data) elif section_type == "heading": - return self._render_json_heading(section_data) + # Render each heading element in the elements array + # section_data is already the elements array from _get_section_data + rendered_elements = [] + for element in section_data: + rendered_elements.append(self._render_json_heading(element)) + return "\n".join(rendered_elements) elif section_type == "paragraph": - return self._render_json_paragraph(section_data) + # Render each paragraph element in the elements array + # section_data is already the elements array from _get_section_data + rendered_elements = [] + for element in section_data: + rendered_elements.append(self._render_json_paragraph(element)) + return "\n".join(rendered_elements) elif section_type == "code_block": return self._render_json_code_block(section_data) elif section_type == "image": return self._render_json_image(section_data) else: - # Fallback to paragraph for unknown types - return self._render_json_paragraph(section_data) + # Fallback to paragraph for unknown types - render each element + # section_data is already the elements array from _get_section_data + rendered_elements = [] + for element in section_data: + rendered_elements.append(self._render_json_paragraph(element)) + return "\n".join(rendered_elements) except Exception as e: self.logger.warning(f"Error rendering section {self._get_section_id(section)}: {str(e)}") diff --git a/modules/services/serviceGeneration/subPromptBuilder.py b/modules/services/serviceGeneration/subPromptBuilder.py index c0139f45..cbcce375 100644 --- a/modules/services/serviceGeneration/subPromptBuilder.py +++ b/modules/services/serviceGeneration/subPromptBuilder.py @@ -45,7 +45,28 @@ async def buildAdaptiveExtractionPrompt( "filename": "section_1.xlsx", "sections": [ { - "id": "table_1", + "id": "section_1", + "content_type": "heading", + "elements": [ + { + "level": 1, + "text": "1. SECTION TITLE" + } + ], + "order": 1 + }, + { + "id": "section_2", + "content_type": "paragraph", + "elements": [ + { + "text": "This is the actual content that should be extracted from the document." + } + ], + "order": 2 + }, + { + "id": "section_3", "content_type": "table", "elements": [ { @@ -53,7 +74,7 @@ async def buildAdaptiveExtractionPrompt( "rows": [["Value 1", "Value 2"]] } ], - "order": 1 + "order": 3 } ] } @@ -69,7 +90,28 @@ async def buildAdaptiveExtractionPrompt( }, "sections": [ { - "id": "table_1", + "id": "section_1", + "content_type": "heading", + "elements": [ + { + "level": 1, + "text": "1. SECTION TITLE" + } + ], + "order": 1 + }, + { + "id": "section_2", + "content_type": "paragraph", + "elements": [ + { + "text": "This is the actual content that should be extracted from the document." + } + ], + "order": 2 + }, + { + "id": "section_3", "content_type": "table", "elements": [ { @@ -77,7 +119,7 @@ async def buildAdaptiveExtractionPrompt( "rows": [["Value 1", "Value 2"]] } ], - "order": 1 + "order": 3 } ] } @@ -253,14 +295,11 @@ Consider the user's intent and the most logical way to organize the extracted co "sections": [ { "id": "section_001", - "content_type": "table", + "content_type": "heading", "elements": [ { - "headers": ["Column 1", "Column 2", "Column 3"], - "rows": [ - ["Value 1", "Value 2", "Value 3"], - ["Value 4", "Value 5", "Value 6"] - ] + "level": 1, + "text": "1. SECTION TITLE" } ], "order": 1, @@ -340,7 +379,7 @@ async def buildExtractionPrompt( from .subJsonSchema import get_document_subJsonSchema jsonSchema = get_document_subJsonSchema() - # Generic block for JSON extraction - use example data instead of schema + # Generic block for JSON extraction - use mixed example data showing different content types example_data = { "metadata": { "title": "Example Document", @@ -351,6 +390,29 @@ async def buildExtractionPrompt( "sections": [ { "id": "section_001", + "content_type": "heading", + "elements": [ + { + "level": 1, + "text": "1. INTRODUCTION" + } + ], + "order": 1, + "metadata": {} + }, + { + "id": "section_002", + "content_type": "paragraph", + "elements": [ + { + "text": "This is a sample paragraph with actual content that should be extracted from the document." + } + ], + "order": 2, + "metadata": {} + }, + { + "id": "section_003", "content_type": "table", "elements": [ { @@ -361,7 +423,7 @@ async def buildExtractionPrompt( ] } ], - "order": 1, + "order": 3, "metadata": {} } ], @@ -486,17 +548,38 @@ CRITICAL: The AI MUST generate content using the CANONICAL JSON FORMAT with this "sections": [ {{ "id": "section_1", - "content_type": "table", + "content_type": "heading", "elements": [ {{ - "headers": ["Column1", "Column2", "Column3"], - "rows": [ - ["Value1", "Value2", "Value3"], - ["Value4", "Value5", "Value6"] - ] + "level": 1, + "text": "1. SECTION TITLE" }} ], "order": 1 + }}, + {{ + "id": "section_2", + "content_type": "paragraph", + "elements": [ + {{ + "text": "This is the actual content that should be extracted from the document." + }} + ], + "order": 2 + }}, + {{ + "id": "section_3", + "content_type": "table", + "elements": [ + {{ + "headers": ["Column 1", "Column 2", "Column 3"], + "rows": [ + ["Value 1", "Value 2", "Value 3"], + ["Value 4", "Value 5", "Value 6"] + ] + }} + ], + "order": 3 }} ] }} diff --git a/modules/services/serviceWorkflow/mainServiceWorkflow.py b/modules/services/serviceWorkflow/mainServiceWorkflow.py index d5f71cfd..afc4e3b5 100644 --- a/modules/services/serviceWorkflow/mainServiceWorkflow.py +++ b/modules/services/serviceWorkflow/mainServiceWorkflow.py @@ -597,12 +597,16 @@ class WorkflowService: if not workflow or not hasattr(workflow, 'messages'): return "No documents available" - # Reload workflow from database to ensure we have all messages - if hasattr(workflow, 'id'): - try: - workflow = self.getWorkflow(workflow.id) - except Exception as e: - logger.warning(f"Could not reload workflow from database: {str(e)}") + # Use the provided workflow object directly to avoid database reload issues + # that can cause filename truncation. The workflow object should already be up-to-date. + logger.debug(f"Using provided workflow object for getAvailableDocuments (ID: {workflow.id if hasattr(workflow, 'id') else 'unknown'})") + + # Debug: Check document filenames in the workflow object + if hasattr(workflow, 'messages') and workflow.messages: + for message in workflow.messages: + if hasattr(message, 'documents') and message.documents: + for doc in message.documents: + logger.debug(f"Workflow document {doc.id}: fileName='{doc.fileName}' (length: {len(doc.fileName)})") # Get document reference list using the exact same logic as old system document_list = self._getDocumentReferenceList(workflow) @@ -739,12 +743,22 @@ class WorkflowService: """Update file attributes (fileName, fileSize, mimeType) for documents""" for doc in documents: try: + # Debug: Log original filename before refresh + original_filename = doc.fileName + logger.debug(f"Before refresh - Document {doc.id}: fileName='{original_filename}' (length: {len(original_filename)})") + # Use the proper WorkflowService method to get file info file_info = self.getFileInfo(doc.fileId) if file_info: + db_filename = file_info.get("fileName", doc.fileName) + logger.debug(f"Database filename for {doc.id}: '{db_filename}' (length: {len(db_filename)})") + doc.fileName = file_info.get("fileName", doc.fileName) doc.fileSize = file_info.get("size", doc.fileSize) doc.mimeType = file_info.get("mimeType", doc.mimeType) + + # Debug: Log final filename after refresh + logger.debug(f"After refresh - Document {doc.id}: fileName='{doc.fileName}' (length: {len(doc.fileName)})") else: logger.warning(f"File not found for document {doc.id}, fileId: {doc.fileId}") except Exception as e: @@ -760,6 +774,8 @@ class WorkflowService: def _getDocumentReferenceFromChatDocument(self, document, message) -> str: """Get document reference using document ID and filename.""" try: + # Debug logging to track filename truncation + logger.debug(f"Creating document reference for {document.id}: fileName='{document.fileName}' (length: {len(document.fileName)})") # Use document ID and filename for simple reference return f"docItem:{document.id}:{document.fileName}" except Exception as e: diff --git a/modules/workflows/processing/adaptive/contentValidator.py b/modules/workflows/processing/adaptive/contentValidator.py index 48339cb4..5253ab5e 100644 --- a/modules/workflows/processing/adaptive/contentValidator.py +++ b/modules/workflows/processing/adaptive/contentValidator.py @@ -3,6 +3,7 @@ import re import logging +import json from typing import List, Dict, Any logger = logging.getLogger(__name__) @@ -14,8 +15,14 @@ class ContentValidator: pass def validateContent(self, documents: List[Any], intent: Dict[str, Any]) -> Dict[str, Any]: - """Validates delivered content against user intent""" + """Validates delivered content against user intent using AI""" try: + # First, try AI-based validation for intelligent gap analysis + aiValidation = self._validateWithAI(documents, intent) + if aiValidation: + return aiValidation + + # Fallback to rule-based validation if AI validation fails validationDetails = [] for doc in documents: @@ -306,3 +313,73 @@ class ContentValidator: "validationDetails": [], "improvementSuggestions": [f"Validation failed: {error}"] } + + def _validateWithAI(self, documents: List[Any], intent: Dict[str, Any]) -> Dict[str, Any]: + """AI-based validation to intelligently assess task completion""" + try: + # Extract content from all documents + documentContents = [] + for doc in documents: + content = self._extractContent(doc) + documentContents.append({ + "name": getattr(doc, 'documentName', 'Unknown'), + "content": content[:2000] # Limit content for AI processing + }) + + # Create AI validation prompt + validationPrompt = f""" +You are a task completion validator. Analyze if the delivered content actually fulfills the user's request. + +USER REQUEST: {intent.get('primaryGoal', 'Unknown')} + +DELIVERED CONTENT: +{json.dumps(documentContents, indent=2)} + +TASK: Determine if the user's request has been fully completed. + +Analyze the gap between what was requested and what was delivered. Consider any missing elements, incorrect formats, incomplete work, or other discrepancies. + +Respond with JSON only: +{{ + "overallSuccess": true/false, + "qualityScore": 0.0-1.0, + "gapAnalysis": "Detailed analysis of what's missing or incorrect", + "improvementSuggestions": ["specific action 1", "specific action 2"] +}} +""" + + # Call AI service for validation + from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType + request_options = AiCallOptions() + request_options.operationType = OperationType.GENERAL + + request = AiCallRequest(prompt=validationPrompt, context="", options=request_options) + + # Get AI service from the workflow context + if hasattr(self, 'services') and hasattr(self.services, 'ai'): + response = self.services.ai.aiObjects.call(request) + if response and response.content: + import re + result = response.content.strip() + json_match = re.search(r'\{.*\}', result, re.DOTALL) + if json_match: + result = json_match.group(0) + + aiResult = json.loads(result) + + return { + "overallSuccess": aiResult.get("overallSuccess", False), + "qualityScore": aiResult.get("qualityScore", 0.0), + "validationDetails": [{ + "documentName": "AI Validation", + "gapAnalysis": aiResult.get("gapAnalysis", ""), + "successCriteriaMet": [aiResult.get("overallSuccess", False)] + }], + "improvementSuggestions": aiResult.get("improvementSuggestions", []) + } + + return None # Fallback to rule-based validation + + except Exception as e: + logger.error(f"AI validation failed: {str(e)}") + return None # Fallback to rule-based validation \ No newline at end of file diff --git a/modules/workflows/processing/modes/modeReact.py b/modules/workflows/processing/modes/modeReact.py index 606d1123..1bc893a5 100644 --- a/modules/workflows/processing/modes/modeReact.py +++ b/modules/workflows/processing/modes/modeReact.py @@ -33,6 +33,7 @@ class ReactMode(BaseMode): # Initialize adaptive components self.intentAnalyzer = IntentAnalyzer() self.contentValidator = ContentValidator() + self.contentValidator.services = self.services # Pass services for AI validation self.learningEngine = LearningEngine() self.progressTracker = ProgressTracker() self.currentIntent = None @@ -235,8 +236,8 @@ class ReactMode(BaseMode): valid_refs = [] for line in available_docs.split('\n'): if 'docList:' in line or 'docItem:' in line: - # Extract reference from line like " - docList:msg_xxx:label" - ref_match = re.search(r'(docList:[^\s]+|docItem:[^\s]+)', line) + # Extract reference from line like " - docList:msg_xxx:label" or " - docItem:xxx:filename with spaces" + ref_match = re.search(r'(docList:[^\s]+|docItem:[^\s]+(?:\s+[^\s]+)*)', line) if ref_match: valid_refs.append(ref_match.group(1)) diff --git a/modules/workflows/processing/shared/promptGenerationTaskplan.py b/modules/workflows/processing/shared/promptGenerationTaskplan.py index 08f18493..f5fb3407 100644 --- a/modules/workflows/processing/shared/promptGenerationTaskplan.py +++ b/modules/workflows/processing/shared/promptGenerationTaskplan.py @@ -28,6 +28,8 @@ def generateTaskPlanningPrompt(services, context: Any) -> PromptBundle: Break down user requests into logical, executable task steps. +**IMPORTANT**: If the user asks for ONE complete business objective, create ONLY ONE task that accomplishes the entire objective. Do NOT split it into multiple micro-tasks. + ## 📋 Context ### User Request @@ -46,12 +48,20 @@ Break down user requests into logical, executable task steps. - **ONE TOPIC PER TASK** - Each task should handle one complete business objective - **HIGH-LEVEL FOCUS** - Plan strategic outcomes, not implementation steps - **AVOID MICRO-TASKS** - Don't create separate tasks for each small action +- **CRITICAL**: If the user asks for ONE thing (like "analyse document list and produce summary"), create ONLY ONE task that does the complete job ### Task Grouping Examples - **Research + Analysis + Report** → ONE task: "Web research report" - **Data Collection + Processing + Visualization** → ONE task: "Collect and present data" +- **Document splitting** (analyze + extract + create files) → ONE task: "Split document into separate files" - **Different topics** (email + flowers) → SEPARATE tasks: "Send formal email..." + "Order flowers from Fleurop for delivery to 123 Main St, include card message" +### Common Single-Task Scenarios +- **"Split document into sections"** → ONE task: "Split document into separate files" +- **"Extract data and create report"** → ONE task: "Extract data and create report" +- **"Analyze and summarize document"** → ONE task: "Analyze and summarize document" +- **"Convert file to different format"** → ONE task: "Convert file to different format" + ### Retry Handling - **If retry request**: Analyze previous rounds to understand what failed - **Learn from mistakes**: Improve the plan based on previous failures