content mapping and transformation validated

This commit is contained in:
ValueOn AG 2025-10-14 22:48:55 +02:00
parent e0afc72e13
commit f0733204fb
7 changed files with 249 additions and 42 deletions

View file

@@ -176,22 +176,28 @@ class SubDocumentProcessing:
# Merge with JSON mode
mergedJsonDocument = self._mergeChunkResultsJson(chunkResults, options)
# Normalize merged JSON into a single canonical table
# Normalize merged JSON into a single canonical table (only if table content exists)
try:
from modules.services.serviceNormalization.mainServiceNormalization import NormalizationService
normalizer = NormalizationService(self.services)
inventory = normalizer.discoverStructures(mergedJsonDocument)
# Use workflow id as cache key
cacheKey = self.services.currentWorkflow.id
# Provide the extraction/merge prompt context when available to help mapping
mergePrompt = prompt
mapping = await normalizer.requestHeaderMapping(inventory, cacheKey, None, mergePrompt)
canonical = normalizer.applyMapping(mergedJsonDocument, mapping)
report = normalizer.validateCanonical(canonical)
if report.get('success'):
mergedJsonDocument = canonical
# Check if any table content was discovered
tableHeaders = inventory.get("tableHeaders", [])
if not tableHeaders:
logger.info("No table content found in merged JSON, skipping normalization and returning original structure")
else:
raise ValueError('Normalization produced zero rows')
# Use workflow id as cache key
cacheKey = self.services.currentWorkflow.id
# Provide the extraction/merge prompt context when available to help mapping
mergePrompt = prompt
mapping = await normalizer.requestHeaderMapping(inventory, cacheKey, None, mergePrompt)
canonical = normalizer.applyMapping(mergedJsonDocument, mapping)
report = normalizer.validateCanonical(canonical)
if report.get('success'):
mergedJsonDocument = canonical
else:
raise ValueError('Normalization produced zero rows')
except Exception as e:
# Surface normalization failure while leaving original merged JSON (single-path expectation is to fail)
raise

View file

@@ -108,16 +108,30 @@ class RendererText(BaseRenderer):
elif section_type == "bullet_list":
return self._render_json_bullet_list(section_data)
elif section_type == "heading":
return self._render_json_heading(section_data)
# Render each heading element in the elements array
# section_data is already the elements array from _get_section_data
rendered_elements = []
for element in section_data:
rendered_elements.append(self._render_json_heading(element))
return "\n".join(rendered_elements)
elif section_type == "paragraph":
return self._render_json_paragraph(section_data)
# Render each paragraph element in the elements array
# section_data is already the elements array from _get_section_data
rendered_elements = []
for element in section_data:
rendered_elements.append(self._render_json_paragraph(element))
return "\n".join(rendered_elements)
elif section_type == "code_block":
return self._render_json_code_block(section_data)
elif section_type == "image":
return self._render_json_image(section_data)
else:
# Fallback to paragraph for unknown types
return self._render_json_paragraph(section_data)
# Fallback to paragraph for unknown types - render each element
# section_data is already the elements array from _get_section_data
rendered_elements = []
for element in section_data:
rendered_elements.append(self._render_json_paragraph(element))
return "\n".join(rendered_elements)
except Exception as e:
self.logger.warning(f"Error rendering section {self._get_section_id(section)}: {str(e)}")

View file

@@ -45,7 +45,28 @@ async def buildAdaptiveExtractionPrompt(
"filename": "section_1.xlsx",
"sections": [
{
"id": "table_1",
"id": "section_1",
"content_type": "heading",
"elements": [
{
"level": 1,
"text": "1. SECTION TITLE"
}
],
"order": 1
},
{
"id": "section_2",
"content_type": "paragraph",
"elements": [
{
"text": "This is the actual content that should be extracted from the document."
}
],
"order": 2
},
{
"id": "section_3",
"content_type": "table",
"elements": [
{
@@ -53,7 +74,7 @@ async def buildAdaptiveExtractionPrompt(
"rows": [["Value 1", "Value 2"]]
}
],
"order": 1
"order": 3
}
]
}
@@ -69,7 +90,28 @@ async def buildAdaptiveExtractionPrompt(
},
"sections": [
{
"id": "table_1",
"id": "section_1",
"content_type": "heading",
"elements": [
{
"level": 1,
"text": "1. SECTION TITLE"
}
],
"order": 1
},
{
"id": "section_2",
"content_type": "paragraph",
"elements": [
{
"text": "This is the actual content that should be extracted from the document."
}
],
"order": 2
},
{
"id": "section_3",
"content_type": "table",
"elements": [
{
@@ -77,7 +119,7 @@ async def buildAdaptiveExtractionPrompt(
"rows": [["Value 1", "Value 2"]]
}
],
"order": 1
"order": 3
}
]
}
@@ -253,14 +295,11 @@ Consider the user's intent and the most logical way to organize the extracted content
"sections": [
{
"id": "section_001",
"content_type": "table",
"content_type": "heading",
"elements": [
{
"headers": ["Column 1", "Column 2", "Column 3"],
"rows": [
["Value 1", "Value 2", "Value 3"],
["Value 4", "Value 5", "Value 6"]
]
"level": 1,
"text": "1. SECTION TITLE"
}
],
"order": 1,
@@ -340,7 +379,7 @@ async def buildExtractionPrompt(
from .subJsonSchema import get_document_subJsonSchema
jsonSchema = get_document_subJsonSchema()
# Generic block for JSON extraction - use example data instead of schema
# Generic block for JSON extraction - use mixed example data showing different content types
example_data = {
"metadata": {
"title": "Example Document",
@@ -351,6 +390,29 @@ async def buildExtractionPrompt(
"sections": [
{
"id": "section_001",
"content_type": "heading",
"elements": [
{
"level": 1,
"text": "1. INTRODUCTION"
}
],
"order": 1,
"metadata": {}
},
{
"id": "section_002",
"content_type": "paragraph",
"elements": [
{
"text": "This is a sample paragraph with actual content that should be extracted from the document."
}
],
"order": 2,
"metadata": {}
},
{
"id": "section_003",
"content_type": "table",
"elements": [
{
@@ -361,7 +423,7 @@ async def buildExtractionPrompt(
]
}
],
"order": 1,
"order": 3,
"metadata": {}
}
],
@@ -486,17 +548,38 @@ CRITICAL: The AI MUST generate content using the CANONICAL JSON FORMAT with this
"sections": [
{{
"id": "section_1",
"content_type": "table",
"content_type": "heading",
"elements": [
{{
"headers": ["Column1", "Column2", "Column3"],
"rows": [
["Value1", "Value2", "Value3"],
["Value4", "Value5", "Value6"]
]
"level": 1,
"text": "1. SECTION TITLE"
}}
],
"order": 1
}},
{{
"id": "section_2",
"content_type": "paragraph",
"elements": [
{{
"text": "This is the actual content that should be extracted from the document."
}}
],
"order": 2
}},
{{
"id": "section_3",
"content_type": "table",
"elements": [
{{
"headers": ["Column 1", "Column 2", "Column 3"],
"rows": [
["Value 1", "Value 2", "Value 3"],
["Value 4", "Value 5", "Value 6"]
]
}}
],
"order": 3
}}
]
}}

View file

@@ -597,12 +597,16 @@ class WorkflowService:
if not workflow or not hasattr(workflow, 'messages'):
return "No documents available"
# Reload workflow from database to ensure we have all messages
if hasattr(workflow, 'id'):
try:
workflow = self.getWorkflow(workflow.id)
except Exception as e:
logger.warning(f"Could not reload workflow from database: {str(e)}")
# Use the provided workflow object directly to avoid database reload issues
# that can cause filename truncation. The workflow object should already be up-to-date.
logger.debug(f"Using provided workflow object for getAvailableDocuments (ID: {workflow.id if hasattr(workflow, 'id') else 'unknown'})")
# Debug: Check document filenames in the workflow object
if hasattr(workflow, 'messages') and workflow.messages:
for message in workflow.messages:
if hasattr(message, 'documents') and message.documents:
for doc in message.documents:
logger.debug(f"Workflow document {doc.id}: fileName='{doc.fileName}' (length: {len(doc.fileName)})")
# Get document reference list using the exact same logic as old system
document_list = self._getDocumentReferenceList(workflow)
@@ -739,12 +743,22 @@ class WorkflowService:
"""Update file attributes (fileName, fileSize, mimeType) for documents"""
for doc in documents:
try:
# Debug: Log original filename before refresh
original_filename = doc.fileName
logger.debug(f"Before refresh - Document {doc.id}: fileName='{original_filename}' (length: {len(original_filename)})")
# Use the proper WorkflowService method to get file info
file_info = self.getFileInfo(doc.fileId)
if file_info:
db_filename = file_info.get("fileName", doc.fileName)
logger.debug(f"Database filename for {doc.id}: '{db_filename}' (length: {len(db_filename)})")
doc.fileName = file_info.get("fileName", doc.fileName)
doc.fileSize = file_info.get("size", doc.fileSize)
doc.mimeType = file_info.get("mimeType", doc.mimeType)
# Debug: Log final filename after refresh
logger.debug(f"After refresh - Document {doc.id}: fileName='{doc.fileName}' (length: {len(doc.fileName)})")
else:
logger.warning(f"File not found for document {doc.id}, fileId: {doc.fileId}")
except Exception as e:
@@ -760,6 +774,8 @@ class WorkflowService:
def _getDocumentReferenceFromChatDocument(self, document, message) -> str:
"""Get document reference using document ID and filename."""
try:
# Debug logging to track filename truncation
logger.debug(f"Creating document reference for {document.id}: fileName='{document.fileName}' (length: {len(document.fileName)})")
# Use document ID and filename for simple reference
return f"docItem:{document.id}:{document.fileName}"
except Exception as e:

View file

@@ -3,6 +3,7 @@
import re
import logging
import json
from typing import List, Dict, Any
logger = logging.getLogger(__name__)
@@ -14,8 +15,14 @@ class ContentValidator:
pass
def validateContent(self, documents: List[Any], intent: Dict[str, Any]) -> Dict[str, Any]:
"""Validates delivered content against user intent"""
"""Validates delivered content against user intent using AI"""
try:
# First, try AI-based validation for intelligent gap analysis
aiValidation = self._validateWithAI(documents, intent)
if aiValidation:
return aiValidation
# Fallback to rule-based validation if AI validation fails
validationDetails = []
for doc in documents:
@@ -306,3 +313,73 @@ class ContentValidator:
"validationDetails": [],
"improvementSuggestions": [f"Validation failed: {error}"]
}
def _validateWithAI(self, documents: List[Any], intent: Dict[str, Any]) -> Dict[str, Any]:
"""AI-based validation to intelligently assess task completion"""
try:
# Extract content from all documents
documentContents = []
for doc in documents:
content = self._extractContent(doc)
documentContents.append({
"name": getattr(doc, 'documentName', 'Unknown'),
"content": content[:2000] # Limit content for AI processing
})
# Create AI validation prompt
validationPrompt = f"""
You are a task completion validator. Analyze if the delivered content actually fulfills the user's request.
USER REQUEST: {intent.get('primaryGoal', 'Unknown')}
DELIVERED CONTENT:
{json.dumps(documentContents, indent=2)}
TASK: Determine if the user's request has been fully completed.
Analyze the gap between what was requested and what was delivered. Consider any missing elements, incorrect formats, incomplete work, or other discrepancies.
Respond with JSON only:
{{
"overallSuccess": true/false,
"qualityScore": 0.0-1.0,
"gapAnalysis": "Detailed analysis of what's missing or incorrect",
"improvementSuggestions": ["specific action 1", "specific action 2"]
}}
"""
# Call AI service for validation
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
request_options = AiCallOptions()
request_options.operationType = OperationType.GENERAL
request = AiCallRequest(prompt=validationPrompt, context="", options=request_options)
# Get AI service from the workflow context
if hasattr(self, 'services') and hasattr(self.services, 'ai'):
response = self.services.ai.aiObjects.call(request)
if response and response.content:
import re
result = response.content.strip()
json_match = re.search(r'\{.*\}', result, re.DOTALL)
if json_match:
result = json_match.group(0)
aiResult = json.loads(result)
return {
"overallSuccess": aiResult.get("overallSuccess", False),
"qualityScore": aiResult.get("qualityScore", 0.0),
"validationDetails": [{
"documentName": "AI Validation",
"gapAnalysis": aiResult.get("gapAnalysis", ""),
"successCriteriaMet": [aiResult.get("overallSuccess", False)]
}],
"improvementSuggestions": aiResult.get("improvementSuggestions", [])
}
return None # Fallback to rule-based validation
except Exception as e:
logger.error(f"AI validation failed: {str(e)}")
return None # Fallback to rule-based validation

View file

@@ -33,6 +33,7 @@ class ReactMode(BaseMode):
# Initialize adaptive components
self.intentAnalyzer = IntentAnalyzer()
self.contentValidator = ContentValidator()
self.contentValidator.services = self.services # Pass services for AI validation
self.learningEngine = LearningEngine()
self.progressTracker = ProgressTracker()
self.currentIntent = None
@@ -235,8 +236,8 @@ class ReactMode(BaseMode):
valid_refs = []
for line in available_docs.split('\n'):
if 'docList:' in line or 'docItem:' in line:
# Extract reference from line like " - docList:msg_xxx:label"
ref_match = re.search(r'(docList:[^\s]+|docItem:[^\s]+)', line)
# Extract reference from line like " - docList:msg_xxx:label" or " - docItem:xxx:filename with spaces"
ref_match = re.search(r'(docList:[^\s]+|docItem:[^\s]+(?:\s+[^\s]+)*)', line)
if ref_match:
valid_refs.append(ref_match.group(1))

View file

@@ -28,6 +28,8 @@ def generateTaskPlanningPrompt(services, context: Any) -> PromptBundle:
Break down user requests into logical, executable task steps.
**IMPORTANT**: If the user asks for ONE complete business objective, create ONLY ONE task that accomplishes the entire objective. Do NOT split it into multiple micro-tasks.
## 📋 Context
### User Request
@@ -46,12 +48,20 @@ Break down user requests into logical, executable task steps.
- **ONE TOPIC PER TASK** - Each task should handle one complete business objective
- **HIGH-LEVEL FOCUS** - Plan strategic outcomes, not implementation steps
- **AVOID MICRO-TASKS** - Don't create separate tasks for each small action
- **CRITICAL**: If the user asks for ONE thing (like "analyse document list and produce summary"), create ONLY ONE task that does the complete job
### Task Grouping Examples
- **Research + Analysis + Report** → ONE task: "Web research report"
- **Data Collection + Processing + Visualization** → ONE task: "Collect and present data"
- **Document splitting** (analyze + extract + create files) → ONE task: "Split document into separate files"
- **Different topics** (email + flowers) → SEPARATE tasks: "Send formal email..." + "Order flowers from Fleurop for delivery to 123 Main St, include card message"
### Common Single-Task Scenarios
- **"Split document into sections"** → ONE task: "Split document into separate files"
- **"Extract data and create report"** → ONE task: "Extract data and create report"
- **"Analyze and summarize document"** → ONE task: "Analyze and summarize document"
- **"Convert file to different format"** → ONE task: "Convert file to different format"
### Retry Handling
- **If retry request**: Analyze previous rounds to understand what failed
- **Learn from mistakes**: Improve the plan based on previous failures