content mapping and transformation validated
This commit is contained in:
parent e0afc72e13
commit f0733204fb
7 changed files with 249 additions and 42 deletions
@@ -176,22 +176,28 @@ class SubDocumentProcessing:
             # Merge with JSON mode
             mergedJsonDocument = self._mergeChunkResultsJson(chunkResults, options)
 
-            # Normalize merged JSON into a single canonical table
+            # Normalize merged JSON into a single canonical table (only if table content exists)
             try:
                 from modules.services.serviceNormalization.mainServiceNormalization import NormalizationService
                 normalizer = NormalizationService(self.services)
                 inventory = normalizer.discoverStructures(mergedJsonDocument)
-                # Use workflow id as cache key
-                cacheKey = self.services.currentWorkflow.id
-                # Provide the extraction/merge prompt context when available to help mapping
-                mergePrompt = prompt
-                mapping = await normalizer.requestHeaderMapping(inventory, cacheKey, None, mergePrompt)
-                canonical = normalizer.applyMapping(mergedJsonDocument, mapping)
-                report = normalizer.validateCanonical(canonical)
-                if report.get('success'):
-                    mergedJsonDocument = canonical
+                # Check if any table content was discovered
+                tableHeaders = inventory.get("tableHeaders", [])
+                if not tableHeaders:
+                    logger.info("No table content found in merged JSON, skipping normalization and returning original structure")
                 else:
-                    raise ValueError('Normalization produced zero rows')
+                    # Use workflow id as cache key
+                    cacheKey = self.services.currentWorkflow.id
+                    # Provide the extraction/merge prompt context when available to help mapping
+                    mergePrompt = prompt
+                    mapping = await normalizer.requestHeaderMapping(inventory, cacheKey, None, mergePrompt)
+                    canonical = normalizer.applyMapping(mergedJsonDocument, mapping)
+                    report = normalizer.validateCanonical(canonical)
+                    if report.get('success'):
+                        mergedJsonDocument = canonical
+                    else:
+                        raise ValueError('Normalization produced zero rows')
             except Exception as e:
                 # Surface normalization failure while leaving original merged JSON (single-path expectation is to fail)
                 raise
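
Read as a pipeline, the new branch is: discover structures, bail out early when no table headers exist, otherwise map, apply, and validate. A minimal sketch of that flow, assuming the NormalizationService API shown in the hunk; the wrapper name normalize_if_tabular is hypothetical:

async def normalize_if_tabular(normalizer, merged_doc, cache_key, merge_prompt, logger):
    # Discover tables/headers in the merged JSON document
    inventory = normalizer.discoverStructures(merged_doc)
    if not inventory.get("tableHeaders", []):
        # Non-tabular content: skip normalization, keep merged JSON as-is
        logger.info("No table content found; skipping normalization")
        return merged_doc
    mapping = await normalizer.requestHeaderMapping(inventory, cache_key, None, merge_prompt)
    canonical = normalizer.applyMapping(merged_doc, mapping)
    report = normalizer.validateCanonical(canonical)
    if not report.get("success"):
        raise ValueError("Normalization produced zero rows")
    return canonical
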
@@ -108,16 +108,30 @@ class RendererText(BaseRenderer):
             elif section_type == "bullet_list":
                 return self._render_json_bullet_list(section_data)
             elif section_type == "heading":
-                return self._render_json_heading(section_data)
+                # Render each heading element in the elements array
+                # section_data is already the elements array from _get_section_data
+                rendered_elements = []
+                for element in section_data:
+                    rendered_elements.append(self._render_json_heading(element))
+                return "\n".join(rendered_elements)
             elif section_type == "paragraph":
-                return self._render_json_paragraph(section_data)
+                # Render each paragraph element in the elements array
+                # section_data is already the elements array from _get_section_data
+                rendered_elements = []
+                for element in section_data:
+                    rendered_elements.append(self._render_json_paragraph(element))
+                return "\n".join(rendered_elements)
             elif section_type == "code_block":
                 return self._render_json_code_block(section_data)
             elif section_type == "image":
                 return self._render_json_image(section_data)
             else:
-                # Fallback to paragraph for unknown types
-                return self._render_json_paragraph(section_data)
+                # Fallback to paragraph for unknown types - render each element
+                # section_data is already the elements array from _get_section_data
+                rendered_elements = []
+                for element in section_data:
+                    rendered_elements.append(self._render_json_paragraph(element))
+                return "\n".join(rendered_elements)
 
         except Exception as e:
             self.logger.warning(f"Error rendering section {self._get_section_id(section)}: {str(e)}")
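
The switch to iteration matches _get_section_data now returning the elements array: a single heading or paragraph section can carry several elements, each needing its own rendered line. A self-contained illustration with a hypothetical stand-in for _render_json_heading:

section_data = [
    {"level": 1, "text": "1. SECTION TITLE"},
    {"level": 2, "text": "1.1 Subsection"},
]

def render_heading(element):
    # Stand-in renderer: markdown-style hashes by heading level
    return "#" * element.get("level", 1) + " " + element["text"]

print("\n".join(render_heading(e) for e in section_data))
# # 1. SECTION TITLE
# ## 1.1 Subsection
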
@@ -45,7 +45,28 @@ async def buildAdaptiveExtractionPrompt(
   "filename": "section_1.xlsx",
   "sections": [
     {
-      "id": "table_1",
+      "id": "section_1",
+      "content_type": "heading",
+      "elements": [
+        {
+          "level": 1,
+          "text": "1. SECTION TITLE"
+        }
+      ],
+      "order": 1
+    },
+    {
+      "id": "section_2",
+      "content_type": "paragraph",
+      "elements": [
+        {
+          "text": "This is the actual content that should be extracted from the document."
+        }
+      ],
+      "order": 2
+    },
+    {
+      "id": "section_3",
       "content_type": "table",
       "elements": [
         {
@@ -53,7 +74,7 @@ async def buildAdaptiveExtractionPrompt(
           "rows": [["Value 1", "Value 2"]]
         }
       ],
-      "order": 1
+      "order": 3
     }
   ]
 }
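
A detail worth noting in the updated example: order now runs 1-3 across mixed content types, suggesting consumers can reassemble sections by that field rather than by array position. A small sketch under that assumption:

sections = [
    {"id": "section_3", "order": 3, "content_type": "table"},
    {"id": "section_1", "order": 1, "content_type": "heading"},
    {"id": "section_2", "order": 2, "content_type": "paragraph"},
]
ordered = sorted(sections, key=lambda s: s.get("order", 0))
print([s["id"] for s in ordered])  # ['section_1', 'section_2', 'section_3']
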
@@ -69,7 +90,28 @@ async def buildAdaptiveExtractionPrompt(
   },
   "sections": [
     {
-      "id": "table_1",
+      "id": "section_1",
+      "content_type": "heading",
+      "elements": [
+        {
+          "level": 1,
+          "text": "1. SECTION TITLE"
+        }
+      ],
+      "order": 1
+    },
+    {
+      "id": "section_2",
+      "content_type": "paragraph",
+      "elements": [
+        {
+          "text": "This is the actual content that should be extracted from the document."
+        }
+      ],
+      "order": 2
+    },
+    {
+      "id": "section_3",
       "content_type": "table",
       "elements": [
         {
@@ -77,7 +119,7 @@ async def buildAdaptiveExtractionPrompt(
           "rows": [["Value 1", "Value 2"]]
         }
       ],
-      "order": 1
+      "order": 3
     }
   ]
 }
@@ -253,14 +295,11 @@ Consider the user's intent and the most logical way to organize the extracted co
   "sections": [
     {
       "id": "section_001",
-      "content_type": "table",
+      "content_type": "heading",
       "elements": [
         {
-          "headers": ["Column 1", "Column 2", "Column 3"],
-          "rows": [
-            ["Value 1", "Value 2", "Value 3"],
-            ["Value 4", "Value 5", "Value 6"]
-          ]
+          "level": 1,
+          "text": "1. SECTION TITLE"
         }
       ],
       "order": 1,
@@ -340,7 +379,7 @@ async def buildExtractionPrompt(
     from .subJsonSchema import get_document_subJsonSchema
     jsonSchema = get_document_subJsonSchema()
 
-    # Generic block for JSON extraction - use example data instead of schema
+    # Generic block for JSON extraction - use mixed example data showing different content types
     example_data = {
         "metadata": {
             "title": "Example Document",
@@ -351,6 +390,29 @@ async def buildExtractionPrompt(
         "sections": [
             {
                 "id": "section_001",
+                "content_type": "heading",
+                "elements": [
+                    {
+                        "level": 1,
+                        "text": "1. INTRODUCTION"
+                    }
+                ],
+                "order": 1,
+                "metadata": {}
+            },
+            {
+                "id": "section_002",
+                "content_type": "paragraph",
+                "elements": [
+                    {
+                        "text": "This is a sample paragraph with actual content that should be extracted from the document."
+                    }
+                ],
+                "order": 2,
+                "metadata": {}
+            },
+            {
+                "id": "section_003",
                 "content_type": "table",
                 "elements": [
                     {
@@ -361,7 +423,7 @@ async def buildExtractionPrompt(
                         ]
                     }
                 ],
-                "order": 1,
+                "order": 3,
                 "metadata": {}
             }
         ],
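
Several prompts now share this mixed-section example, so a consumer-side sanity check of the implied shape can be useful. check_section below is a hypothetical helper, with field names taken from the examples above:

def check_section(section: dict) -> bool:
    # Loose structural check for the canonical section shape in these examples
    required_by_type = {
        "heading": {"level", "text"},
        "paragraph": {"text"},
        "table": {"headers", "rows"},
    }
    expected = required_by_type.get(section.get("content_type"))
    if expected is None or not isinstance(section.get("elements"), list):
        return False
    return all(expected <= set(el) for el in section["elements"])

assert check_section({
    "id": "section_1", "content_type": "heading",
    "elements": [{"level": 1, "text": "1. SECTION TITLE"}], "order": 1,
})
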
@@ -486,17 +548,38 @@ CRITICAL: The AI MUST generate content using the CANONICAL JSON FORMAT with this
 "sections": [
     {{
         "id": "section_1",
-        "content_type": "table",
+        "content_type": "heading",
         "elements": [
             {{
-                "headers": ["Column1", "Column2", "Column3"],
-                "rows": [
-                    ["Value1", "Value2", "Value3"],
-                    ["Value4", "Value5", "Value6"]
-                ]
+                "level": 1,
+                "text": "1. SECTION TITLE"
             }}
         ],
         "order": 1
+    }},
+    {{
+        "id": "section_2",
+        "content_type": "paragraph",
+        "elements": [
+            {{
+                "text": "This is the actual content that should be extracted from the document."
+            }}
+        ],
+        "order": 2
+    }},
+    {{
+        "id": "section_3",
+        "content_type": "table",
+        "elements": [
+            {{
+                "headers": ["Column 1", "Column 2", "Column 3"],
+                "rows": [
+                    ["Value 1", "Value 2", "Value 3"],
+                    ["Value 4", "Value 5", "Value 6"]
+                ]
+            }}
+        ],
+        "order": 3
     }}
 ]
 }}
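
The doubled braces in this hunk are template escapes, not JSON syntax: the prompt text is evidently passed through Python str.format (or an f-string), where {{ and }} render as literal braces. A one-line demonstration:

template = '{{"id": "section_1", "order": {order}}}'
print(template.format(order=1))  # {"id": "section_1", "order": 1}
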
@@ -597,12 +597,16 @@ class WorkflowService:
         if not workflow or not hasattr(workflow, 'messages'):
             return "No documents available"
 
-        # Reload workflow from database to ensure we have all messages
-        if hasattr(workflow, 'id'):
-            try:
-                workflow = self.getWorkflow(workflow.id)
-            except Exception as e:
-                logger.warning(f"Could not reload workflow from database: {str(e)}")
+        # Use the provided workflow object directly to avoid database reload issues
+        # that can cause filename truncation. The workflow object should already be up-to-date.
+        logger.debug(f"Using provided workflow object for getAvailableDocuments (ID: {workflow.id if hasattr(workflow, 'id') else 'unknown'})")
+
+        # Debug: Check document filenames in the workflow object
+        if hasattr(workflow, 'messages') and workflow.messages:
+            for message in workflow.messages:
+                if hasattr(message, 'documents') and message.documents:
+                    for doc in message.documents:
+                        logger.debug(f"Workflow document {doc.id}: fileName='{doc.fileName}' (length: {len(doc.fileName)})")
 
         # Get document reference list using the exact same logic as old system
         document_list = self._getDocumentReferenceList(workflow)
@@ -739,12 +743,22 @@ class WorkflowService:
         """Update file attributes (fileName, fileSize, mimeType) for documents"""
         for doc in documents:
             try:
+                # Debug: Log original filename before refresh
+                original_filename = doc.fileName
+                logger.debug(f"Before refresh - Document {doc.id}: fileName='{original_filename}' (length: {len(original_filename)})")
+
                 # Use the proper WorkflowService method to get file info
                 file_info = self.getFileInfo(doc.fileId)
                 if file_info:
+                    db_filename = file_info.get("fileName", doc.fileName)
+                    logger.debug(f"Database filename for {doc.id}: '{db_filename}' (length: {len(db_filename)})")
+
                     doc.fileName = file_info.get("fileName", doc.fileName)
                     doc.fileSize = file_info.get("size", doc.fileSize)
                     doc.mimeType = file_info.get("mimeType", doc.mimeType)
+
+                    # Debug: Log final filename after refresh
+                    logger.debug(f"After refresh - Document {doc.id}: fileName='{doc.fileName}' (length: {len(doc.fileName)})")
                 else:
                     logger.warning(f"File not found for document {doc.id}, fileId: {doc.fileId}")
             except Exception as e:
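
The refresh lines use dict.get with the current value as the default, so a key missing from file_info leaves the attribute untouched. A tiny self-contained illustration of that pattern:

class Doc:
    def __init__(self):
        self.fileName = "old_name.xlsx"
        self.fileSize = 1024

doc = Doc()
file_info = {"size": 2048}  # lookup result without a "fileName" key

doc.fileName = file_info.get("fileName", doc.fileName)  # unchanged
doc.fileSize = file_info.get("size", doc.fileSize)      # updated
print(doc.fileName, doc.fileSize)  # old_name.xlsx 2048
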
@@ -760,6 +774,8 @@ class WorkflowService:
     def _getDocumentReferenceFromChatDocument(self, document, message) -> str:
         """Get document reference using document ID and filename."""
         try:
+            # Debug logging to track filename truncation
+            logger.debug(f"Creating document reference for {document.id}: fileName='{document.fileName}' (length: {len(document.fileName)})")
             # Use document ID and filename for simple reference
             return f"docItem:{document.id}:{document.fileName}"
         except Exception as e:
@@ -3,6 +3,7 @@
 
 import re
 import logging
+import json
 from typing import List, Dict, Any
 
 logger = logging.getLogger(__name__)
@@ -14,8 +15,14 @@ class ContentValidator:
         pass
 
     def validateContent(self, documents: List[Any], intent: Dict[str, Any]) -> Dict[str, Any]:
-        """Validates delivered content against user intent"""
+        """Validates delivered content against user intent using AI"""
         try:
+            # First, try AI-based validation for intelligent gap analysis
+            aiValidation = self._validateWithAI(documents, intent)
+            if aiValidation:
+                return aiValidation
+
+            # Fallback to rule-based validation if AI validation fails
             validationDetails = []
 
             for doc in documents:
|
||||||
"validationDetails": [],
|
"validationDetails": [],
|
||||||
"improvementSuggestions": [f"Validation failed: {error}"]
|
"improvementSuggestions": [f"Validation failed: {error}"]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def _validateWithAI(self, documents: List[Any], intent: Dict[str, Any]) -> Dict[str, Any]:
|
||||||
|
"""AI-based validation to intelligently assess task completion"""
|
||||||
|
try:
|
||||||
|
# Extract content from all documents
|
||||||
|
documentContents = []
|
||||||
|
for doc in documents:
|
||||||
|
content = self._extractContent(doc)
|
||||||
|
documentContents.append({
|
||||||
|
"name": getattr(doc, 'documentName', 'Unknown'),
|
||||||
|
"content": content[:2000] # Limit content for AI processing
|
||||||
|
})
|
||||||
|
|
||||||
|
# Create AI validation prompt
|
||||||
|
validationPrompt = f"""
|
||||||
|
You are a task completion validator. Analyze if the delivered content actually fulfills the user's request.
|
||||||
|
|
||||||
|
USER REQUEST: {intent.get('primaryGoal', 'Unknown')}
|
||||||
|
|
||||||
|
DELIVERED CONTENT:
|
||||||
|
{json.dumps(documentContents, indent=2)}
|
||||||
|
|
||||||
|
TASK: Determine if the user's request has been fully completed.
|
||||||
|
|
||||||
|
Analyze the gap between what was requested and what was delivered. Consider any missing elements, incorrect formats, incomplete work, or other discrepancies.
|
||||||
|
|
||||||
|
Respond with JSON only:
|
||||||
|
{{
|
||||||
|
"overallSuccess": true/false,
|
||||||
|
"qualityScore": 0.0-1.0,
|
||||||
|
"gapAnalysis": "Detailed analysis of what's missing or incorrect",
|
||||||
|
"improvementSuggestions": ["specific action 1", "specific action 2"]
|
||||||
|
}}
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Call AI service for validation
|
||||||
|
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
|
||||||
|
request_options = AiCallOptions()
|
||||||
|
request_options.operationType = OperationType.GENERAL
|
||||||
|
|
||||||
|
request = AiCallRequest(prompt=validationPrompt, context="", options=request_options)
|
||||||
|
|
||||||
|
# Get AI service from the workflow context
|
||||||
|
if hasattr(self, 'services') and hasattr(self.services, 'ai'):
|
||||||
|
response = self.services.ai.aiObjects.call(request)
|
||||||
|
if response and response.content:
|
||||||
|
import re
|
||||||
|
result = response.content.strip()
|
||||||
|
json_match = re.search(r'\{.*\}', result, re.DOTALL)
|
||||||
|
if json_match:
|
||||||
|
result = json_match.group(0)
|
||||||
|
|
||||||
|
aiResult = json.loads(result)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"overallSuccess": aiResult.get("overallSuccess", False),
|
||||||
|
"qualityScore": aiResult.get("qualityScore", 0.0),
|
||||||
|
"validationDetails": [{
|
||||||
|
"documentName": "AI Validation",
|
||||||
|
"gapAnalysis": aiResult.get("gapAnalysis", ""),
|
||||||
|
"successCriteriaMet": [aiResult.get("overallSuccess", False)]
|
||||||
|
}],
|
||||||
|
"improvementSuggestions": aiResult.get("improvementSuggestions", [])
|
||||||
|
}
|
||||||
|
|
||||||
|
return None # Fallback to rule-based validation
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"AI validation failed: {str(e)}")
|
||||||
|
return None # Fallback to rule-based validation
|
||||||
|
|
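
_validateWithAI relies on a common extract-then-parse idiom for model output that may wrap JSON in prose. A standalone sketch of just that step; the sample reply text is invented:

import json
import re

reply = 'Sure! Here is the verdict:\n{"overallSuccess": true, "qualityScore": 0.9}'
match = re.search(r'\{.*\}', reply, re.DOTALL)  # outermost {...} span, greedy
verdict = json.loads(match.group(0)) if match else None
print(verdict["qualityScore"])  # 0.9
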
@@ -33,6 +33,7 @@ class ReactMode(BaseMode):
         # Initialize adaptive components
         self.intentAnalyzer = IntentAnalyzer()
         self.contentValidator = ContentValidator()
+        self.contentValidator.services = self.services  # Pass services for AI validation
         self.learningEngine = LearningEngine()
         self.progressTracker = ProgressTracker()
         self.currentIntent = None
@@ -235,8 +236,8 @@ class ReactMode(BaseMode):
         valid_refs = []
         for line in available_docs.split('\n'):
             if 'docList:' in line or 'docItem:' in line:
-                # Extract reference from line like " - docList:msg_xxx:label"
-                ref_match = re.search(r'(docList:[^\s]+|docItem:[^\s]+)', line)
+                # Extract reference from line like " - docList:msg_xxx:label" or " - docItem:xxx:filename with spaces"
+                ref_match = re.search(r'(docList:[^\s]+|docItem:[^\s]+(?:\s+[^\s]+)*)', line)
                 if ref_match:
                     valid_refs.append(ref_match.group(1))
 
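
Worth spelling out what the widened pattern does: the (?:\s+[^\s]+)* tail lets a docItem reference continue across spaces, which in practice extends the match to the last token on the line; that is safe here only because the filename ends the line. A quick check:

import re

pattern = r'(docList:[^\s]+|docItem:[^\s]+(?:\s+[^\s]+)*)'
line = " - docItem:doc_42:Quarterly Report 2024.xlsx"
print(re.search(pattern, line).group(1))
# docItem:doc_42:Quarterly Report 2024.xlsx
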
@@ -28,6 +28,8 @@ def generateTaskPlanningPrompt(services, context: Any) -> PromptBundle:
 
 Break down user requests into logical, executable task steps.
 
+**IMPORTANT**: If the user asks for ONE complete business objective, create ONLY ONE task that accomplishes the entire objective. Do NOT split it into multiple micro-tasks.
+
 ## 📋 Context
 
 ### User Request
@@ -46,12 +48,20 @@ Break down user requests into logical, executable task steps.
 - **ONE TOPIC PER TASK** - Each task should handle one complete business objective
 - **HIGH-LEVEL FOCUS** - Plan strategic outcomes, not implementation steps
 - **AVOID MICRO-TASKS** - Don't create separate tasks for each small action
+- **CRITICAL**: If the user asks for ONE thing (like "analyse document list and produce summary"), create ONLY ONE task that does the complete job
 
 ### Task Grouping Examples
 - **Research + Analysis + Report** → ONE task: "Web research report"
 - **Data Collection + Processing + Visualization** → ONE task: "Collect and present data"
+- **Document splitting** (analyze + extract + create files) → ONE task: "Split document into separate files"
 - **Different topics** (email + flowers) → SEPARATE tasks: "Send formal email..." + "Order flowers from Fleurop for delivery to 123 Main St, include card message"
 
+### Common Single-Task Scenarios
+- **"Split document into sections"** → ONE task: "Split document into separate files"
+- **"Extract data and create report"** → ONE task: "Extract data and create report"
+- **"Analyze and summarize document"** → ONE task: "Analyze and summarize document"
+- **"Convert file to different format"** → ONE task: "Convert file to different format"
+
 ### Retry Handling
 - **If retry request**: Analyze previous rounds to understand what failed
 - **Learn from mistakes**: Improve the plan based on previous failures