added normalization service to merge structured data of any format
This commit is contained in:
parent
f83786b3a7
commit
bdc87eb5c6
12 changed files with 585 additions and 164 deletions
|
|
@ -121,5 +121,10 @@ class JsonMergeResult(BaseModel):
|
|||
metadata: Dict[str, Any] = Field(default_factory=dict, description="Merge process metadata")
|
||||
|
||||
|
||||
# Update forward references
|
||||
ListItem.model_rebuild()
|
||||
# Update forward references (compatible with Pydantic v1 and v2)
|
||||
try:
|
||||
# Pydantic v2
|
||||
ListItem.model_rebuild()
|
||||
except AttributeError:
|
||||
# Pydantic v1
|
||||
ListItem.update_forward_refs()
|
||||
|
|
|
|||
|
|
@ -105,6 +105,12 @@ class SubDocumentGeneration:
|
|||
if not isinstance(aiResponseJson, dict) or "sections" not in aiResponseJson:
|
||||
raise Exception("AI response is not valid JSON document structure")
|
||||
|
||||
# Emit raw extracted data as a chat message attachment before rendering
|
||||
try:
|
||||
await self._postRawDataChatMessage(aiResponseJson, label="raw_extraction_single")
|
||||
except Exception:
|
||||
logger.warning("Failed to emit raw extraction chat message (single-file)")
|
||||
|
||||
# Generate filename from document metadata
|
||||
parsedFilename = None
|
||||
try:
|
||||
|
|
@ -211,6 +217,12 @@ class SubDocumentGeneration:
|
|||
prompt, documents, options, outputFormat, title
|
||||
)
|
||||
|
||||
# Emit raw extracted data as a chat message attachment before transformation/rendering
|
||||
try:
|
||||
await self._postRawDataChatMessage(ai_response, label="raw_extraction_multi")
|
||||
except Exception:
|
||||
logger.warning("Failed to emit raw extraction chat message (multi-file)")
|
||||
|
||||
# Process multiple documents
|
||||
generated_documents = []
|
||||
for i, doc_data in enumerate(ai_response.get("documents", [])):
|
||||
|
|
@ -457,3 +469,71 @@ Return only the JSON response.
|
|||
except Exception as e:
|
||||
logger.warning(f"Response validation failed with exception: {str(e)}")
|
||||
return False
|
||||
|
||||
async def _postRawDataChatMessage(self, payload: Any, label: str = "raw_extraction") -> None:
|
||||
"""
|
||||
Create a ChatMessage with the extracted raw JSON attached as a file so the user
|
||||
has access to the data even if downstream processing fails.
|
||||
"""
|
||||
try:
|
||||
services = self.services
|
||||
workflow = getattr(services, 'currentWorkflow', None)
|
||||
if not workflow:
|
||||
return
|
||||
|
||||
# Serialize payload
|
||||
import json as _json
|
||||
from datetime import datetime, UTC
|
||||
ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
|
||||
content_text = _json.dumps(payload, ensure_ascii=False, indent=2)
|
||||
content_bytes = content_text.encode('utf-8')
|
||||
|
||||
# Store as file via component storage
|
||||
file_name = f"{label}_{ts}.json"
|
||||
file_item = services.interfaceDbComponent.createFile(
|
||||
name=file_name,
|
||||
mimeType="application/json",
|
||||
content=content_bytes
|
||||
)
|
||||
services.interfaceDbComponent.createFileData(file_item.id, content_bytes)
|
||||
|
||||
# Lookup file info for ChatDocument
|
||||
file_info = services.workflow.getFileInfo(file_item.id)
|
||||
doc = ChatDocument(
|
||||
messageId="", # set after message creation
|
||||
fileId=file_item.id,
|
||||
fileName=file_info.get("fileName", file_name) if file_info else file_name,
|
||||
fileSize=file_info.get("size", len(content_bytes)) if file_info else len(content_bytes),
|
||||
mimeType=file_info.get("mimeType", "application/json") if file_info else "application/json"
|
||||
)
|
||||
|
||||
# Create message referencing the file
|
||||
messageData = {
|
||||
"workflowId": workflow.id,
|
||||
"role": "assistant",
|
||||
"message": "Raw extraction data saved",
|
||||
"status": "data",
|
||||
"sequenceNr": len(getattr(workflow, 'messages', []) or []) + 1,
|
||||
"publishedAt": services.utils.getUtcTimestamp(),
|
||||
"documentsLabel": label,
|
||||
"documents": []
|
||||
}
|
||||
message = services.workflow.createMessage(messageData)
|
||||
if not message:
|
||||
return
|
||||
|
||||
# Persist ChatDocument with messageId
|
||||
doc.messageId = message.id
|
||||
services.interfaceDbChat.createDocument(doc.to_dict())
|
||||
|
||||
# Update message to include document
|
||||
try:
|
||||
if not message.documents:
|
||||
message.documents = []
|
||||
message.documents.append(doc)
|
||||
services.workflow.updateMessage(message.id, {"documents": [d.to_dict() for d in message.documents]})
|
||||
except Exception:
|
||||
pass
|
||||
except Exception:
|
||||
# Non-fatal; ignore if storage or chat creation fails
|
||||
return
|
||||
|
|
|
|||
|
|
@ -175,6 +175,27 @@ class SubDocumentProcessing:
|
|||
|
||||
# Merge with JSON mode
|
||||
mergedJsonDocument = self._mergeChunkResultsJson(chunkResults, options)
|
||||
|
||||
# Normalize merged JSON into a single canonical table
|
||||
try:
|
||||
from modules.services.serviceNormalization.mainServiceNormalization import NormalizationService
|
||||
normalizer = NormalizationService(self.services)
|
||||
inventory = normalizer.discoverStructures(mergedJsonDocument)
|
||||
# Use workflow id if available as cache key, else default
|
||||
cacheKey = getattr(self.services, 'currentWorkflow', None)
|
||||
cacheKey = getattr(cacheKey, 'id', 'workflow_run') if cacheKey else 'workflow_run'
|
||||
# Provide the extraction/merge prompt context when available to help mapping
|
||||
mergePrompt = prompt
|
||||
mapping = await normalizer.requestHeaderMapping(inventory, cacheKey, None, mergePrompt)
|
||||
canonical = normalizer.applyMapping(mergedJsonDocument, mapping)
|
||||
report = normalizer.validateCanonical(canonical)
|
||||
if report.get('success'):
|
||||
mergedJsonDocument = canonical
|
||||
else:
|
||||
raise ValueError('Normalization produced zero rows')
|
||||
except Exception as e:
|
||||
# Surface normalization failure while leaving original merged JSON (single-path expectation is to fail)
|
||||
raise
|
||||
|
||||
# Save merged JSON extraction content to debug file - only if debug enabled
|
||||
try:
|
||||
|
|
@ -481,8 +502,16 @@ class SubDocumentProcessing:
|
|||
self.services.utils.debugLogToFile(f"EXTRACTION PROMPT: {prompt}", "AI_SERVICE")
|
||||
self.services.utils.debugLogToFile(f"EXTRACTION CONTEXT LENGTH: {len(part.data) if part.data else 0} characters", "AI_SERVICE")
|
||||
|
||||
# Strengthen prompt to forbid fabrication for text/container extraction
|
||||
augmented_prompt = (
|
||||
f"{prompt}\n\n"
|
||||
"CRITICAL RULES (NO FABRICATION):\n"
|
||||
"- Use ONLY content present in the provided CONTEXT.\n"
|
||||
"- Do NOT create, infer, or guess values not explicitly in the context.\n"
|
||||
"- If a value is missing, leave the cell empty or omit the row.\n"
|
||||
)
|
||||
request = AiCallRequest(
|
||||
prompt=prompt,
|
||||
prompt=augmented_prompt,
|
||||
context=part.data,
|
||||
options=request_options
|
||||
)
|
||||
|
|
@ -579,8 +608,16 @@ class SubDocumentProcessing:
|
|||
logger.debug(f"AI PROMPT PREVIEW: {prompt[:300]}...")
|
||||
logger.debug(f"AI CONTEXT PREVIEW: {part.data[:200] if part.data else 'None'}...")
|
||||
|
||||
# Strengthen prompt to forbid fabrication for text extraction
|
||||
augmented_prompt_text = (
|
||||
f"{prompt}\n\n"
|
||||
"CRITICAL RULES (NO FABRICATION):\n"
|
||||
"- Use ONLY content present in the provided CONTEXT.\n"
|
||||
"- Do NOT create, infer, or guess values not explicitly in the context.\n"
|
||||
"- If a value is missing, leave the cell empty or omit the row.\n"
|
||||
)
|
||||
request = AiCallRequest(
|
||||
prompt=prompt,
|
||||
prompt=augmented_prompt_text,
|
||||
context=part.data,
|
||||
options=request_options
|
||||
)
|
||||
|
|
|
|||
|
|
@ -92,11 +92,11 @@ class BaseRenderer(ABC):
|
|||
|
||||
def _get_section_type(self, section: Dict[str, Any]) -> str:
|
||||
"""Get the type of a section."""
|
||||
return section.get("type", "paragraph")
|
||||
return section.get("content_type", "paragraph")
|
||||
|
||||
def _get_section_data(self, section: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Get the data of a section."""
|
||||
return section.get("data", {})
|
||||
def _get_section_data(self, section: Dict[str, Any]) -> List[Dict[str, Any]]:
|
||||
"""Get the elements of a section."""
|
||||
return section.get("elements", [])
|
||||
|
||||
def _get_section_id(self, section: Dict[str, Any]) -> str:
|
||||
"""Get the ID of a section (if available)."""
|
||||
|
|
|
|||
|
|
@ -212,24 +212,26 @@ class RendererDocx(BaseRenderer):
|
|||
def _render_json_section(self, doc: Document, section: Dict[str, Any], styles: Dict[str, Any]) -> None:
|
||||
"""Render a single JSON section to DOCX using AI-generated styles."""
|
||||
try:
|
||||
section_type = section.get("type", "paragraph")
|
||||
section_data = section.get("data", {})
|
||||
section_type = section.get("content_type", "paragraph")
|
||||
elements = section.get("elements", [])
|
||||
|
||||
if section_type == "table":
|
||||
self._render_json_table(doc, section_data, styles)
|
||||
elif section_type == "bullet_list":
|
||||
self._render_json_bullet_list(doc, section_data, styles)
|
||||
elif section_type == "heading":
|
||||
self._render_json_heading(doc, section_data, styles)
|
||||
elif section_type == "paragraph":
|
||||
self._render_json_paragraph(doc, section_data, styles)
|
||||
elif section_type == "code_block":
|
||||
self._render_json_code_block(doc, section_data, styles)
|
||||
elif section_type == "image":
|
||||
self._render_json_image(doc, section_data, styles)
|
||||
else:
|
||||
# Fallback to paragraph for unknown types
|
||||
self._render_json_paragraph(doc, section_data, styles)
|
||||
# Process each element in the section
|
||||
for element in elements:
|
||||
if section_type == "table":
|
||||
self._render_json_table(doc, element, styles)
|
||||
elif section_type == "bullet_list":
|
||||
self._render_json_bullet_list(doc, element, styles)
|
||||
elif section_type == "heading":
|
||||
self._render_json_heading(doc, element, styles)
|
||||
elif section_type == "paragraph":
|
||||
self._render_json_paragraph(doc, element, styles)
|
||||
elif section_type == "code_block":
|
||||
self._render_json_code_block(doc, element, styles)
|
||||
elif section_type == "image":
|
||||
self._render_json_image(doc, element, styles)
|
||||
else:
|
||||
# Fallback to paragraph for unknown types
|
||||
self._render_json_paragraph(doc, element, styles)
|
||||
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Error rendering section {section.get('id', 'unknown')}: {str(e)}")
|
||||
|
|
|
|||
|
|
@ -105,7 +105,7 @@ class RendererPdf(BaseRenderer):
|
|||
sections = json_content.get("sections", [])
|
||||
self.services.utils.debugLogToFile(f"PDF SECTIONS TO PROCESS: {len(sections)} sections", "PDF_RENDERER")
|
||||
for i, section in enumerate(sections):
|
||||
self.services.utils.debugLogToFile(f"PDF SECTION {i}: type={section.get('type', 'unknown')}, id={section.get('id', 'unknown')}", "PDF_RENDERER")
|
||||
self.services.utils.debugLogToFile(f"PDF SECTION {i}: content_type={section.get('content_type', 'unknown')}, id={section.get('id', 'unknown')}", "PDF_RENDERER")
|
||||
section_elements = self._render_json_section(section, styles)
|
||||
self.services.utils.debugLogToFile(f"PDF SECTION {i} ELEMENTS: {len(section_elements)} elements", "PDF_RENDERER")
|
||||
story.extend(section_elements)
|
||||
|
|
@ -469,23 +469,28 @@ class RendererPdf(BaseRenderer):
|
|||
"""Render a single JSON section to PDF elements using AI-generated styles."""
|
||||
try:
|
||||
section_type = self._get_section_type(section)
|
||||
section_data = self._get_section_data(section)
|
||||
elements = self._get_section_data(section)
|
||||
|
||||
if section_type == "table":
|
||||
return self._render_json_table(section_data, styles)
|
||||
elif section_type == "bullet_list":
|
||||
return self._render_json_bullet_list(section_data, styles)
|
||||
elif section_type == "heading":
|
||||
return self._render_json_heading(section_data, styles)
|
||||
elif section_type == "paragraph":
|
||||
return self._render_json_paragraph(section_data, styles)
|
||||
elif section_type == "code_block":
|
||||
return self._render_json_code_block(section_data, styles)
|
||||
elif section_type == "image":
|
||||
return self._render_json_image(section_data, styles)
|
||||
else:
|
||||
# Fallback to paragraph for unknown types
|
||||
return self._render_json_paragraph(section_data, styles)
|
||||
# Process each element in the section
|
||||
all_elements = []
|
||||
for element in elements:
|
||||
if section_type == "table":
|
||||
all_elements.extend(self._render_json_table(element, styles))
|
||||
elif section_type == "bullet_list":
|
||||
all_elements.extend(self._render_json_bullet_list(element, styles))
|
||||
elif section_type == "heading":
|
||||
all_elements.extend(self._render_json_heading(element, styles))
|
||||
elif section_type == "paragraph":
|
||||
all_elements.extend(self._render_json_paragraph(element, styles))
|
||||
elif section_type == "code_block":
|
||||
all_elements.extend(self._render_json_code_block(element, styles))
|
||||
elif section_type == "image":
|
||||
all_elements.extend(self._render_json_image(element, styles))
|
||||
else:
|
||||
# Fallback to paragraph for unknown types
|
||||
all_elements.extend(self._render_json_paragraph(element, styles))
|
||||
|
||||
return all_elements
|
||||
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Error rendering section {self._get_section_id(section)}: {str(e)}")
|
||||
|
|
|
|||
|
|
@ -600,29 +600,33 @@ JSON ONLY. NO OTHER TEXT."""
|
|||
try:
|
||||
# Get section title from data or use default
|
||||
section_title = "Untitled Section"
|
||||
if section.get("type") == "heading":
|
||||
section_title = section.get("data", {}).get("text", "Untitled Section")
|
||||
if section.get("content_type") == "heading":
|
||||
# Extract text from elements array
|
||||
for element in section.get("elements", []):
|
||||
if isinstance(element, dict) and "text" in element:
|
||||
section_title = element.get("text", "Untitled Section")
|
||||
break
|
||||
elif section.get("title"):
|
||||
section_title = section.get("title")
|
||||
|
||||
content_type = section.get("type", "paragraph")
|
||||
section_data = section.get("data", {})
|
||||
content_type = section.get("content_type", "paragraph")
|
||||
elements = section.get("elements", [])
|
||||
|
||||
# Build slide content based on section type
|
||||
content_parts = []
|
||||
|
||||
if content_type == "table":
|
||||
content_parts.append(self._format_table_for_slide(section_data))
|
||||
content_parts.append(self._format_table_for_slide(elements))
|
||||
elif content_type == "list":
|
||||
content_parts.append(self._format_list_for_slide(section_data))
|
||||
content_parts.append(self._format_list_for_slide(elements))
|
||||
elif content_type == "heading":
|
||||
content_parts.append(self._format_heading_for_slide(section_data))
|
||||
content_parts.append(self._format_heading_for_slide(elements))
|
||||
elif content_type == "paragraph":
|
||||
content_parts.append(self._format_paragraph_for_slide(section_data))
|
||||
content_parts.append(self._format_paragraph_for_slide(elements))
|
||||
elif content_type == "code":
|
||||
content_parts.append(self._format_code_for_slide(section_data))
|
||||
content_parts.append(self._format_code_for_slide(elements))
|
||||
else:
|
||||
content_parts.append(self._format_paragraph_for_slide(section_data))
|
||||
content_parts.append(self._format_paragraph_for_slide(elements))
|
||||
|
||||
# Combine content parts
|
||||
slide_content = "\n\n".join(filter(None, content_parts))
|
||||
|
|
@ -636,11 +640,17 @@ JSON ONLY. NO OTHER TEXT."""
|
|||
logger.warning(f"Error creating slide from section: {str(e)}")
|
||||
return None
|
||||
|
||||
def _format_table_for_slide(self, table_data: Dict[str, Any]) -> str:
|
||||
def _format_table_for_slide(self, elements: List[Dict[str, Any]]) -> str:
|
||||
"""Format table data for slide presentation."""
|
||||
try:
|
||||
headers = table_data.get("headers", [])
|
||||
rows = table_data.get("rows", [])
|
||||
# Extract table data from elements array
|
||||
headers = []
|
||||
rows = []
|
||||
for element in elements:
|
||||
if isinstance(element, dict) and "headers" in element and "rows" in element:
|
||||
headers = element.get("headers", [])
|
||||
rows = element.get("rows", [])
|
||||
break
|
||||
|
||||
if not headers:
|
||||
return ""
|
||||
|
|
@ -805,8 +815,8 @@ JSON ONLY. NO OTHER TEXT."""
|
|||
current_slide_title = "Content Overview"
|
||||
|
||||
for section in sections:
|
||||
section_type = section.get("type", "paragraph")
|
||||
section_data = section.get("data", {})
|
||||
section_type = section.get("content_type", "paragraph")
|
||||
elements = section.get("elements", [])
|
||||
|
||||
if section_type == "heading":
|
||||
# If we have accumulated content, create a slide
|
||||
|
|
@ -818,7 +828,10 @@ JSON ONLY. NO OTHER TEXT."""
|
|||
current_slide_content = []
|
||||
|
||||
# Start new slide with heading as title
|
||||
current_slide_title = section_data.get("text", "Untitled Section")
|
||||
for element in elements:
|
||||
if isinstance(element, dict) and "text" in element:
|
||||
current_slide_title = element.get("text", "Untitled Section")
|
||||
break
|
||||
else:
|
||||
# Add content to current slide
|
||||
formatted_content = self._format_section_content(section)
|
||||
|
|
@ -841,21 +854,26 @@ JSON ONLY. NO OTHER TEXT."""
|
|||
def _format_section_content(self, section: Dict[str, Any]) -> str:
|
||||
"""Format section content for slide presentation."""
|
||||
try:
|
||||
content_type = section.get("type", "paragraph")
|
||||
section_data = section.get("data", {})
|
||||
content_type = section.get("content_type", "paragraph")
|
||||
elements = section.get("elements", [])
|
||||
|
||||
if content_type == "table":
|
||||
return self._format_table_for_slide(section_data)
|
||||
elif content_type == "list":
|
||||
return self._format_list_for_slide(section_data)
|
||||
elif content_type == "heading":
|
||||
return self._format_heading_for_slide(section_data)
|
||||
elif content_type == "paragraph":
|
||||
return self._format_paragraph_for_slide(section_data)
|
||||
elif content_type == "code":
|
||||
return self._format_code_for_slide(section_data)
|
||||
else:
|
||||
return self._format_paragraph_for_slide(section_data)
|
||||
# Process each element in the section
|
||||
content_parts = []
|
||||
for element in elements:
|
||||
if content_type == "table":
|
||||
content_parts.append(self._format_table_for_slide([element]))
|
||||
elif content_type == "list":
|
||||
content_parts.append(self._format_list_for_slide([element]))
|
||||
elif content_type == "heading":
|
||||
content_parts.append(self._format_heading_for_slide([element]))
|
||||
elif content_type == "paragraph":
|
||||
content_parts.append(self._format_paragraph_for_slide([element]))
|
||||
elif content_type == "code":
|
||||
content_parts.append(self._format_code_for_slide([element]))
|
||||
else:
|
||||
content_parts.append(self._format_paragraph_for_slide([element]))
|
||||
|
||||
return "\n\n".join(filter(None, content_parts))
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Error formatting section content: {str(e)}")
|
||||
|
|
|
|||
|
|
@ -457,7 +457,7 @@ class RendererXlsx(BaseRenderer):
|
|||
# Add additional sheets for other content types
|
||||
content_types = set()
|
||||
for section in sections:
|
||||
content_type = section.get("type", "paragraph")
|
||||
content_type = section.get("content_type", "paragraph")
|
||||
content_types.add(content_type)
|
||||
|
||||
if "table" in content_types and len(table_sections) == 1:
|
||||
|
|
@ -658,25 +658,21 @@ class RendererXlsx(BaseRenderer):
|
|||
start_row += 1
|
||||
|
||||
# Process section based on type
|
||||
section_type = section.get("type", "paragraph")
|
||||
section_type = section.get("content_type", "paragraph")
|
||||
|
||||
if section_type == "table":
|
||||
# Handle table section directly
|
||||
table_data = section.get("data", {})
|
||||
if table_data:
|
||||
start_row = self._add_table_to_excel(sheet, table_data, styles, start_row)
|
||||
else:
|
||||
# Handle other section types
|
||||
elements = section.get("elements", [])
|
||||
for element in elements:
|
||||
if section_type == "list":
|
||||
start_row = self._add_list_to_excel(sheet, element, styles, start_row)
|
||||
elif section_type == "paragraph":
|
||||
start_row = self._add_paragraph_to_excel(sheet, element, styles, start_row)
|
||||
elif section_type == "heading":
|
||||
start_row = self._add_heading_to_excel(sheet, element, styles, start_row)
|
||||
else:
|
||||
start_row = self._add_paragraph_to_excel(sheet, element, styles, start_row)
|
||||
# Handle all section types using elements array
|
||||
elements = section.get("elements", [])
|
||||
for element in elements:
|
||||
if section_type == "table":
|
||||
start_row = self._add_table_to_excel(sheet, element, styles, start_row)
|
||||
elif section_type == "list":
|
||||
start_row = self._add_list_to_excel(sheet, element, styles, start_row)
|
||||
elif section_type == "paragraph":
|
||||
start_row = self._add_paragraph_to_excel(sheet, element, styles, start_row)
|
||||
elif section_type == "heading":
|
||||
start_row = self._add_heading_to_excel(sheet, element, styles, start_row)
|
||||
else:
|
||||
start_row = self._add_paragraph_to_excel(sheet, element, styles, start_row)
|
||||
|
||||
return start_row
|
||||
|
||||
|
|
|
|||
|
|
@ -286,7 +286,7 @@ async def buildExtractionPrompt(
|
|||
from .subJsonSchema import get_document_subJsonSchema
|
||||
jsonSchema = get_document_subJsonSchema()
|
||||
|
||||
# Generic block for JSON extraction
|
||||
# Generic block for JSON extraction - use proper schema instead of hardcoded template
|
||||
genericIntro = f"""
|
||||
{extractionIntent}
|
||||
|
||||
|
|
@ -295,44 +295,7 @@ You are extracting structured content from documents and must respond with valid
|
|||
CRITICAL: You must respond with valid JSON only. No additional text, explanations, markdown formatting, code blocks, or any other content outside the JSON structure. Do not use ``` markers or any other formatting.
|
||||
|
||||
Extract the actual data from the source documents and structure it as JSON with this format:
|
||||
{{
|
||||
"metadata": {{
|
||||
"title": "Document Title",
|
||||
"version": "1.0"
|
||||
}},
|
||||
"sections": [
|
||||
{{
|
||||
"id": "section_1",
|
||||
"type": "heading",
|
||||
"data": {{
|
||||
"level": 1,
|
||||
"text": "Heading Text"
|
||||
}}
|
||||
}},
|
||||
{{
|
||||
"id": "section_2",
|
||||
"type": "table",
|
||||
"data": {{
|
||||
"headers": ["Column1", "Column2"],
|
||||
"rows": [["Data1", "Data2"], ["Data3", "Data4"]]
|
||||
}}
|
||||
}},
|
||||
{{
|
||||
"id": "section_3",
|
||||
"type": "bullet_list",
|
||||
"data": {{
|
||||
"items": ["Item 1", "Item 2", "Item 3"]
|
||||
}}
|
||||
}},
|
||||
{{
|
||||
"id": "section_4",
|
||||
"type": "paragraph",
|
||||
"data": {{
|
||||
"text": "Paragraph content here"
|
||||
}}
|
||||
}}
|
||||
]
|
||||
}}
|
||||
{json.dumps(jsonSchema, indent=2)}
|
||||
|
||||
Content Types to Extract:
|
||||
1. Tables: Extract all rows and columns with proper headers
|
||||
|
|
@ -485,8 +448,8 @@ def _getFormatRules(outputFormat: str) -> str:
|
|||
|
||||
async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None, services=None) -> str:
|
||||
"""
|
||||
Use AI to extract the core content intention from the user prompt.
|
||||
Focus on WHAT the user wants to extract, not HOW to format it.
|
||||
Use AI to extract a rich, structured extraction intent from the user prompt.
|
||||
Include language, normalization, structure needs, headers, formats, row strategy, and multi-file guidance.
|
||||
"""
|
||||
if not aiService:
|
||||
# Fallback if no AI service available
|
||||
|
|
@ -496,18 +459,46 @@ async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=N
|
|||
# Protect userPrompt from injection by escaping quotes and newlines
|
||||
safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ')
|
||||
|
||||
# Simple AI call to extract the intention
|
||||
# Rich analysis to derive a complete extraction intent and structure guidance
|
||||
extractionPrompt = f"""
|
||||
Extract the core content intention from this user request. Focus on WHAT raw data/content they want extracted.
|
||||
Analyze the user's request and produce a RICH extraction intent. Return ONLY JSON.
|
||||
|
||||
Goals:
|
||||
- Detect language and normalize the request into a full, explicit instruction (no summary; preserve all constraints and details).
|
||||
- Decide if structured data is required; if so, define the target structure precisely (headers, order, formats, row strategy).
|
||||
- Identify if multi-file output is appropriate and how to split/files name.
|
||||
|
||||
User request: "{safeUserPrompt}"
|
||||
|
||||
Return only the content intention in a simple format like "Extract: [content description]"
|
||||
Focus on extracting raw data, tables, lists, and factual content - NOT summaries or analysis.
|
||||
If the user mentions a table, extract the actual table data with rows and columns.
|
||||
If the user mentions a list, extract the actual list items.
|
||||
IMPORTANT: Preserve any language requirements in your response.
|
||||
Do not include formatting instructions, file types, or output methods.
|
||||
Return JSON in this exact shape:
|
||||
{{
|
||||
"detectedLanguage": "de|en|fr|it|...",
|
||||
"normalizedRequest": "Full explicit instruction in detected language",
|
||||
"requiresStructuredData": true|false,
|
||||
"targetStructure": "table|list|mixed|unstructured",
|
||||
"table": {{
|
||||
"headers": ["Header1", "Header2", "..."],
|
||||
"headerOrderStrict": true|false,
|
||||
"rowStrategy": "one_row_per_document|one_row_per_entity|one_row_per_vat_rate|custom",
|
||||
"formats": {{
|
||||
"dateFormat": "DD.MM.YYYY|YYYY-MM-DD|...",
|
||||
"amountDecimals": 2,
|
||||
"currencyFormat": "code|symbol",
|
||||
"idMasking": "none|last4|custom"
|
||||
}}
|
||||
}},
|
||||
"multiFile": true|false,
|
||||
"fileSplitStrategy": "single|per_entity|by_section|by_criteria|custom",
|
||||
"fileNamingPattern": "suggested pattern for filenames",
|
||||
"constraints": ["List of critical constraints to enforce"],
|
||||
"reasoning": "Brief justification (one sentence)"
|
||||
}}
|
||||
|
||||
Rules:
|
||||
- Preserve user terminology and language in normalizedRequest.
|
||||
- If the user listed columns/fields, copy them exactly into table.headers and set headerOrderStrict=true.
|
||||
- If the user implies separate rows for rates/entities, set an appropriate rowStrategy (e.g., one_row_per_vat_rate).
|
||||
- If no structure is required, set requiresStructuredData=false and targetStructure="unstructured".
|
||||
"""
|
||||
|
||||
# Call AI service to extract intention
|
||||
|
|
@ -523,12 +514,24 @@ Do not include formatting instructions, file types, or output methods.
|
|||
result = response.content if response else ""
|
||||
services.utils.debugLogToFile(f"DEBUG: Extraction intent processed", "PROMPT_BUILDER")
|
||||
|
||||
return result if result else f"Extract all relevant content from the document according to the user's requirements: {userPrompt}"
|
||||
# Try to extract and pretty print JSON
|
||||
if result:
|
||||
import re, json as _json
|
||||
match = re.search(r'\{[\s\S]*\}', result)
|
||||
if match:
|
||||
try:
|
||||
obj = _json.loads(match.group(0))
|
||||
return _json.dumps(obj, ensure_ascii=False, indent=2)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Fallback to previous simple format
|
||||
return f"Extract: {safeUserPrompt}"
|
||||
|
||||
except Exception as e:
|
||||
# Fallback on any error - preserve user prompt for language instructions
|
||||
services.utils.debugLogToFile(f"DEBUG: AI extraction intent failed: {str(e)}", "PROMPT_BUILDER")
|
||||
return f"Extract all relevant content from the document according to the user's requirements: {userPrompt}"
|
||||
return f"Extract: {userPrompt}"
|
||||
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,243 @@
|
|||
import json
|
||||
import os
|
||||
from typing import Any, Dict, List, Set
|
||||
from datetime import datetime, UTC
|
||||
|
||||
|
||||
class NormalizationService:
|
||||
"""
|
||||
Produces a single canonical table in merged JSON using an AI-provided header mapping
|
||||
and deterministic, in-code value normalization. No language heuristics in code.
|
||||
"""
|
||||
|
||||
def __init__(self, services):
|
||||
self.services = services
|
||||
|
||||
# Public API
|
||||
def discoverStructures(self, mergedJson: Dict[str, Any]) -> Dict[str, Any]:
|
||||
headers: Set[str] = set()
|
||||
samples: Dict[str, List[str]] = {}
|
||||
|
||||
sections = mergedJson.get("sections", []) if isinstance(mergedJson, dict) else []
|
||||
for section in sections:
|
||||
if not isinstance(section, dict):
|
||||
continue
|
||||
|
||||
# Use only the fundamental agreed JSON structure: content_type/elements
|
||||
if section.get("content_type") != "table":
|
||||
continue
|
||||
|
||||
# Extract table data from elements array
|
||||
for element in section.get("elements", []):
|
||||
if isinstance(element, dict) and "headers" in element and "rows" in element:
|
||||
hdrs = element.get("headers") or []
|
||||
rows = element.get("rows") or []
|
||||
break
|
||||
else:
|
||||
continue
|
||||
for h in hdrs:
|
||||
if not isinstance(h, str):
|
||||
continue
|
||||
headers.add(h)
|
||||
# collect small value samples by column index
|
||||
for row in rows[:5]:
|
||||
if not isinstance(row, list):
|
||||
continue
|
||||
for i, value in enumerate(row):
|
||||
headerName = hdrs[i] if i < len(hdrs) else f"col_{i}"
|
||||
if headerName not in samples:
|
||||
samples[headerName] = []
|
||||
if len(samples[headerName]) < 5:
|
||||
samples[headerName].append(str(value))
|
||||
|
||||
return {
|
||||
"tableHeaders": sorted(list(headers)),
|
||||
"headerSamples": samples,
|
||||
}
|
||||
|
||||
async def requestHeaderMapping(self, inventory: Dict[str, Any], cacheKey: str, canonicalSpec: Dict[str, Any] | None = None, mergePrompt: str | None = None) -> Dict[str, Any]:
|
||||
|
||||
# Allow caller to specify any canonical schema. If none provided, default to discovered headers.
|
||||
if canonicalSpec is None:
|
||||
canonicalSpec = {
|
||||
"canonicalHeaders": inventory.get("tableHeaders", []),
|
||||
"constraints": {}
|
||||
}
|
||||
|
||||
# Protect merge prompt context by wrapping in single quotes and escaping internal quotes
|
||||
protectedMerge = None
|
||||
if mergePrompt:
|
||||
try:
|
||||
protectedMerge = str(mergePrompt).replace("'", "\\'")
|
||||
except Exception:
|
||||
protectedMerge = str(mergePrompt)
|
||||
|
||||
prompt = (
|
||||
"You are a mapping generator. Return ONLY JSON.\n\n"
|
||||
"Given discovered headers and sample values, map them to the canonical headers.\n"
|
||||
"Do not invent fields. Use null if no mapping. Provide normalization policy.\n\n"
|
||||
f"CANONICAL_SPEC:\n{json.dumps(canonicalSpec, ensure_ascii=False, indent=2)}\n\n"
|
||||
f"HEADERS_DISCOVERED:\n{json.dumps(inventory, ensure_ascii=False, indent=2)}\n\n"
|
||||
+ (f"MERGE_PROMPT_CONTEXT (protected):\n'{protectedMerge}'\n\n" if protectedMerge is not None else "") +
|
||||
"REPLY JSON SHAPE:\n(Example)\n"
|
||||
"{\n \"mappings\": {\"<sourceHeader>\": \"<Canonical>|null\"},\n"
|
||||
" \"normalizationPolicy\": {\n \"TotalAmount\": {\"decimalSeparator\": \",\"|\".\"},\n"
|
||||
" \"Currency\": {\"stripSymbols\": true},\n"
|
||||
" \"Date\": {\"formats\": [\"DD.MM.YYYY\",\"YYYY-MM-DD\"]}\n }\n}\n"
|
||||
)
|
||||
|
||||
response = await self.services.ai.callAi(prompt=prompt)
|
||||
js = response[response.find('{'):response.rfind('}') + 1] if response else '{}'
|
||||
mapping = json.loads(js)
|
||||
# Normalize key naming from AI: prefer single key "mapping"
|
||||
if "mapping" not in mapping and "mappings" in mapping and isinstance(mapping["mappings"], dict):
|
||||
mapping["mapping"] = mapping["mappings"]
|
||||
try:
|
||||
del mapping["mappings"]
|
||||
except Exception:
|
||||
pass
|
||||
# Ensure canonicalHeaders present in mapping for downstream use
|
||||
if "canonicalHeaders" not in mapping:
|
||||
mapping["canonicalHeaders"] = canonicalSpec.get("canonicalHeaders", [])
|
||||
|
||||
# debug artifact
|
||||
self._writeDebugArtifact("mapping.json", mapping)
|
||||
return mapping
|
||||
|
||||
def applyMapping(self, mergedJson: Dict[str, Any], mappingSpec: Dict[str, Any]) -> Dict[str, Any]:
    """Project merged table sections onto the canonical header schema.

    Walks every ``content_type == "table"`` section of *mergedJson*, picks the
    first element that carries both ``headers`` and ``rows``, remaps each row's
    columns to the canonical header order defined by *mappingSpec*, and
    normalizes every cell via ``self._normalizeValue``.

    Args:
        mergedJson: Merged document in the agreed structure (``metadata`` +
            ``sections`` with ``elements``).
        mappingSpec: Spec produced by the AI mapper; keys used here are
            ``mapping`` (source header -> canonical header or None),
            ``normalizationPolicy`` and ``canonicalHeaders``.

    Returns:
        A new document dict with a single canonical table section containing
        only rows that have at least one non-empty cell.
    """
    spec = mappingSpec or {}
    headerMap = spec.get("mapping", {})
    normPolicy = spec.get("normalizationPolicy", {})

    # Canonical column order: prefer the explicit list from the mapping spec
    # (generic across domains); otherwise fall back to the sorted union of
    # all mapped targets.
    targetHeaders = spec.get("canonicalHeaders") or []
    if not targetHeaders:
        targetHeaders = sorted(list({t for t in headerMap.values() if t}))

    collectedRows: List[List[str]] = []
    allSections = mergedJson.get("sections", []) if isinstance(mergedJson, dict) else []
    for sec in allSections:
        # Only the fundamental agreed JSON structure is consumed: content_type/elements.
        if sec.get("content_type") != "table":
            continue

        # First element that carries both headers and rows wins for this section.
        tableElement = None
        for el in sec.get("elements", []):
            if isinstance(el, dict) and "headers" in el and "rows" in el:
                tableElement = el
                break
        if tableElement is None:
            continue

        srcHeaders = tableElement.get("headers") or []
        srcRows = tableElement.get("rows") or []
        if not srcHeaders or not srcRows:
            continue

        # Resolve, per canonical header, the source column index (or None).
        # Explicit mapping targets win; identical names act as an identity
        # fallback when no mapping target is given.
        columnIndex: Dict[str, Any] = {}
        for canonName in targetHeaders:
            found = None
            for pos, srcName in enumerate(srcHeaders):
                mapped = headerMap.get(srcName)
                if mapped is None and srcName == canonName:
                    mapped = canonName
                if mapped == canonName:
                    found = pos
                    break
            columnIndex[canonName] = found

        # Re-shape every source row into canonical column order.
        for srcRow in srcRows:
            outRow: List[str] = []
            for canonName in targetHeaders:
                pos = columnIndex.get(canonName)
                rawCell = srcRow[pos] if (pos is not None and pos < len(srcRow)) else ""
                outRow.append(self._normalizeValue(canonName, rawCell, normPolicy))
            # Keep the row only if it carries at least one meaningful value.
            if any(cell.strip() for cell in outRow):
                collectedRows.append(outRow)

    canonical = {
        "metadata": {
            "title": mergedJson.get("metadata", {}).get("title", "Merged Document"),
            "source_documents": mergedJson.get("metadata", {}).get("source_documents", [])
        },
        "sections": [
            {
                "id": "canonical_table_1",
                "content_type": "table",
                "elements": [
                    {
                        "headers": targetHeaders,
                        "rows": collectedRows
                    }
                ],
                "order": 1
            }
        ]
    }

    # debug artifact
    self._writeDebugArtifact("canonical_merged.json", canonical)
    return canonical
|
||||
|
||||
def validateCanonical(self, canonicalJson: Dict[str, Any]) -> Dict[str, Any]:
    """Validate a canonical document by counting its table rows.

    Best-effort: any malformed input simply yields a zero-row report rather
    than raising.

    Args:
        canonicalJson: Document produced by ``applyMapping``.

    Returns:
        ``{"rowCount": <int>, "success": <bool>}`` where success means at
        least one row was found.
    """
    collected: List[Any] = []
    try:
        for sec in canonicalJson.get("sections", []):
            if sec.get("content_type") != "table":
                continue
            # Rows live inside the section's elements array.
            for el in sec.get("elements", []):
                if isinstance(el, dict) and "rows" in el:
                    collected.extend(el.get("rows", []))
    except Exception:
        # Malformed structure: report as empty instead of failing.
        collected = []
    report = {
        "rowCount": len(collected),
        "success": len(collected) > 0
    }
    self._writeDebugArtifact("normalization_report.json", report)
    return report
|
||||
|
||||
# Internal helpers
|
||||
def _normalizeValue(self, canonicalHeader: str, value: Any, policy: Dict[str, Any]) -> str:
|
||||
if value is None:
|
||||
return ""
|
||||
text = str(value).strip()
|
||||
# Generic normalization guided by policy; avoid domain specifics
|
||||
if canonicalHeader in (policy.get("numericFields", []) or []):
|
||||
dec = ((policy.get(canonicalHeader) or {}).get("decimalSeparator")
|
||||
or (policy.get("numeric") or {}).get("decimalSeparator")
|
||||
or ".")
|
||||
if dec == ",":
|
||||
text = text.replace(".", "").replace(",", ".") if "," in text else text
|
||||
text = ''.join(ch for ch in text if ch.isdigit() or ch in ['.', '-', '+'])
|
||||
elif (policy.get("text") or {}).get("stripSymbols") and canonicalHeader in (policy.get("text", {}).get("applyTo", []) or []):
|
||||
text = ''.join(ch for ch in text if ch.isalpha())
|
||||
text = text.upper()
|
||||
return text
|
||||
|
||||
def _writeDebugArtifact(self, fileName: str, obj: Any) -> None:
    """Best-effort dump of a debug artifact under ``./test-chat/ai``.

    Only active when the APP_DEBUG_CHAT_WORKFLOW_ENABLED config flag is set.
    Dicts and lists are serialized as pretty-printed JSON; everything else is
    written via ``str``. Never raises — debug output must not break the
    workflow.

    Args:
        fileName: Target file name; frequently-overwritten artifacts
            (mapping.json, canonical_merged.json) get a UTC timestamp prefix.
        obj: Payload to write.
    """
    try:
        if not self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False):
            return
        artifactDir = "./test-chat/ai"
        os.makedirs(artifactDir, exist_ok=True)

        # Timestamp-prefix the artifacts that would otherwise be clobbered
        # on every run.
        outName = fileName
        if fileName in ("mapping.json", "canonical_merged.json"):
            stamp = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
            outName = f"{stamp}_{fileName}"

        payload = (json.dumps(obj, ensure_ascii=False, indent=2)
                   if isinstance(obj, (dict, list))
                   else str(obj))
        with open(os.path.join(artifactDir, outName), "w", encoding="utf-8") as fh:
            fh.write(payload)
    except Exception:
        # Swallow everything: artifact writing is strictly best-effort.
        pass
|
||||
|
||||
|
||||
|
|
@ -42,16 +42,27 @@ def extractUserPrompt(context: Any) -> str:
|
|||
Fallback to the task_step objective.
|
||||
"""
|
||||
try:
|
||||
# Prefer services.currentUserPrompt when accessible through context
|
||||
services = getattr(context, 'services', None)
|
||||
if services and getattr(services, 'currentUserPrompt', None):
|
||||
return services.currentUserPrompt
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if hasattr(context, 'task_step') and context.task_step:
|
||||
return context.task_step.objective or 'No request specified'
|
||||
return 'No request specified'
|
||||
# Determine raw user prompt from services or task_step
|
||||
rawPrompt = None
|
||||
if services and getattr(services, 'currentUserPrompt', None):
|
||||
rawPrompt = services.currentUserPrompt
|
||||
elif hasattr(context, 'task_step') and context.task_step:
|
||||
rawPrompt = context.task_step.objective or 'No request specified'
|
||||
else:
|
||||
rawPrompt = 'No request specified'
|
||||
|
||||
# Prefer values computed at workflow start by WorkflowManager analyzer
|
||||
normalized = getattr(services, 'currentUserPromptNormalized', None) if services else None
|
||||
if normalized:
|
||||
return normalized
|
||||
return rawPrompt
|
||||
except Exception:
|
||||
# Robust fallback behavior
|
||||
if hasattr(context, 'task_step') and context.task_step:
|
||||
return context.task_step.objective or 'No request specified'
|
||||
return 'No request specified'
|
||||
|
||||
def extractWorkflowHistory(service: Any, context: Any) -> str:
|
||||
"""Extract workflow history from context. Maps to {{KEY:WORKFLOW_HISTORY}}
|
||||
|
|
@ -99,7 +110,15 @@ def extractAvailableMethods(service: Any) -> str:
|
|||
|
||||
def extractUserLanguage(service: Any) -> str:
    """Extract the user's language code. Maps to {{KEY:USER_LANGUAGE}}.

    Preference order:
      1. ``service.currentUserLanguage`` — language detected at workflow
         start (set by the WorkflowManager analyzer), when present.
      2. The user's profile language (``service.user.language``).
      3. ``'en'`` as the final fallback (also on any attribute error).

    Args:
        service: Service container; may be None.

    Returns:
        An ISO 639-1 language code string.
    """
    # NOTE: the source chunk contained a diff-viewer artifact — the
    # superseded pre-change return line interleaved with the new try/except
    # body. This is the reconstructed post-change function.
    try:
        # Prefer detected language if available
        if service and getattr(service, 'currentUserLanguage', None):
            return service.currentUserLanguage
        return service.user.language if service and service.user else 'en'
    except Exception:
        # Missing attributes (e.g. no `user`) fall back to the default.
        return 'en'
|
||||
|
||||
# Normalization now happens centrally in WorkflowManager._sendFirstMessage; no AI call here.
|
||||
|
||||
|
||||
def _computeMessageSummary(msg) -> str:
|
||||
|
|
|
|||
|
|
@ -216,23 +216,23 @@ class WorkflowManager:
|
|||
# Update the message with documents in database
|
||||
self.services.workflow.updateMessage(message.id, {"documents": [doc.to_dict() for doc in documents]})
|
||||
|
||||
# Analyze the user's input to extract intent and offload bulky context into documents
|
||||
# Analyze the user's input to detect language, normalize request, extract intent, and offload bulky context into documents
|
||||
try:
|
||||
analyzerPrompt = (
|
||||
"You are an input analyzer. Split the user's message into:\n"
|
||||
"1) intent: the user's core request in one concise paragraph, normalized to the user's language.\n"
|
||||
"2) contextItems: supportive data to attach as separate documents if significantly larger than the intent. "
|
||||
"Include large literal data blocks, long lists/tables, code/JSON blocks, quoted transcripts, CSV fragments, or detailed specs. "
|
||||
"Keep URLs in the intent unless they include large pasted content.\n\n"
|
||||
"You are an input analyzer. From the user's message, perform ALL of the following in one pass:\n"
|
||||
"1) detectedLanguage: detect ISO 639-1 language code (e.g., de, en).\n"
|
||||
"2) normalizedRequest: full, explicit restatement of the user's request in the detected language; do NOT summarize; preserve ALL constraints and details.\n"
|
||||
"3) intent: concise single-paragraph core request in the detected language for high-level routing.\n"
|
||||
"4) contextItems: supportive data blocks to attach as separate documents if significantly larger than the intent (large literal content, long lists/tables, code/JSON blocks, transcripts, CSV fragments, detailed specs). Keep URLs in the intent unless they embed large pasted content.\n\n"
|
||||
"Rules:\n"
|
||||
"- If total content length (intent + data) is less than 10% of the model's max tokens, do not extract; "
|
||||
"return an empty contextItems and keep a compact, self-contained intent.\n"
|
||||
"- If content exceeds that, move bulky parts into contextItems, keeping the intent short and clear.\n"
|
||||
"- Preserve critical references (URLs, filenames) in the intent.\n"
|
||||
"- Normalize the intent to the detected language. If mixed-language, use the primary detected language and normalize.\n\n"
|
||||
"Output JSON only (no markdown):\n"
|
||||
"- If total content (intent + data) is < 10% of model max tokens, do not extract; return empty contextItems and keep intent compact and self-contained.\n"
|
||||
"- If content exceeds that threshold, move bulky parts into contextItems; keep intent short and clear.\n"
|
||||
"- Preserve critical references (URLs, filenames) in intent.\n"
|
||||
"- Normalize to the primary detected language if mixed-language.\n\n"
|
||||
"Return ONLY JSON (no markdown) with this shape:\n"
|
||||
"{\n"
|
||||
" \"detectedLanguage\": \"en\",\n"
|
||||
" \"detectedLanguage\": \"de|en|fr|it|...\",\n"
|
||||
" \"normalizedRequest\": \"Full explicit instruction in detected language\",\n"
|
||||
" \"intent\": \"Concise normalized request...\",\n"
|
||||
" \"contextItems\": [\n"
|
||||
" {\n"
|
||||
|
|
@ -249,6 +249,7 @@ class WorkflowManager:
|
|||
aiResponse = await self.services.ai.callAi(prompt=analyzerPrompt)
|
||||
|
||||
detectedLanguage = None
|
||||
normalizedRequest = None
|
||||
intentText = userInput.prompt
|
||||
contextItems = []
|
||||
|
||||
|
|
@ -260,6 +261,7 @@ class WorkflowManager:
|
|||
if jsonStart != -1 and jsonEnd > jsonStart:
|
||||
parsed = json.loads(aiResponse[jsonStart:jsonEnd])
|
||||
detectedLanguage = parsed.get('detectedLanguage') or None
|
||||
normalizedRequest = parsed.get('normalizedRequest') or None
|
||||
if parsed.get('intent'):
|
||||
intentText = parsed.get('intent')
|
||||
contextItems = parsed.get('contextItems') or []
|
||||
|
|
@ -269,7 +271,18 @@ class WorkflowManager:
|
|||
# Update services state
|
||||
if detectedLanguage and isinstance(detectedLanguage, str):
|
||||
self._setUserLanguage(detectedLanguage)
|
||||
try:
|
||||
setattr(self.services, 'currentUserLanguage', detectedLanguage)
|
||||
except Exception:
|
||||
pass
|
||||
self.services.currentUserPrompt = intentText or userInput.prompt
|
||||
try:
|
||||
if normalizedRequest:
|
||||
setattr(self.services, 'currentUserPromptNormalized', normalizedRequest)
|
||||
if contextItems is not None:
|
||||
setattr(self.services, 'currentUserContextItems', contextItems)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Telemetry (sizes and counts)
|
||||
try:
|
||||
|
|
|
|||
Loading…
Reference in a new issue