added normalization service to merge structured data of any format

This commit is contained in:
ValueOn AG 2025-10-14 17:50:27 +02:00
parent f83786b3a7
commit bdc87eb5c6
12 changed files with 585 additions and 164 deletions

View file

@@ -121,5 +121,10 @@ class JsonMergeResult(BaseModel):
metadata: Dict[str, Any] = Field(default_factory=dict, description="Merge process metadata") metadata: Dict[str, Any] = Field(default_factory=dict, description="Merge process metadata")
# Update forward references # Update forward references (compatible with Pydantic v1 and v2)
ListItem.model_rebuild() try:
# Pydantic v2
ListItem.model_rebuild()
except AttributeError:
# Pydantic v1
ListItem.update_forward_refs()

View file

@ -105,6 +105,12 @@ class SubDocumentGeneration:
if not isinstance(aiResponseJson, dict) or "sections" not in aiResponseJson: if not isinstance(aiResponseJson, dict) or "sections" not in aiResponseJson:
raise Exception("AI response is not valid JSON document structure") raise Exception("AI response is not valid JSON document structure")
# Emit raw extracted data as a chat message attachment before rendering
try:
await self._postRawDataChatMessage(aiResponseJson, label="raw_extraction_single")
except Exception:
logger.warning("Failed to emit raw extraction chat message (single-file)")
# Generate filename from document metadata # Generate filename from document metadata
parsedFilename = None parsedFilename = None
try: try:
@ -211,6 +217,12 @@ class SubDocumentGeneration:
prompt, documents, options, outputFormat, title prompt, documents, options, outputFormat, title
) )
# Emit raw extracted data as a chat message attachment before transformation/rendering
try:
await self._postRawDataChatMessage(ai_response, label="raw_extraction_multi")
except Exception:
logger.warning("Failed to emit raw extraction chat message (multi-file)")
# Process multiple documents # Process multiple documents
generated_documents = [] generated_documents = []
for i, doc_data in enumerate(ai_response.get("documents", [])): for i, doc_data in enumerate(ai_response.get("documents", [])):
@ -457,3 +469,71 @@ Return only the JSON response.
except Exception as e: except Exception as e:
logger.warning(f"Response validation failed with exception: {str(e)}") logger.warning(f"Response validation failed with exception: {str(e)}")
return False return False
async def _postRawDataChatMessage(self, payload: Any, label: str = "raw_extraction") -> None:
"""
Create a ChatMessage with the extracted raw JSON attached as a file so the user
has access to the data even if downstream processing fails.
"""
try:
services = self.services
workflow = getattr(services, 'currentWorkflow', None)
if not workflow:
return
# Serialize payload
import json as _json
from datetime import datetime, UTC
ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
content_text = _json.dumps(payload, ensure_ascii=False, indent=2)
content_bytes = content_text.encode('utf-8')
# Store as file via component storage
file_name = f"{label}_{ts}.json"
file_item = services.interfaceDbComponent.createFile(
name=file_name,
mimeType="application/json",
content=content_bytes
)
services.interfaceDbComponent.createFileData(file_item.id, content_bytes)
# Lookup file info for ChatDocument
file_info = services.workflow.getFileInfo(file_item.id)
doc = ChatDocument(
messageId="", # set after message creation
fileId=file_item.id,
fileName=file_info.get("fileName", file_name) if file_info else file_name,
fileSize=file_info.get("size", len(content_bytes)) if file_info else len(content_bytes),
mimeType=file_info.get("mimeType", "application/json") if file_info else "application/json"
)
# Create message referencing the file
messageData = {
"workflowId": workflow.id,
"role": "assistant",
"message": "Raw extraction data saved",
"status": "data",
"sequenceNr": len(getattr(workflow, 'messages', []) or []) + 1,
"publishedAt": services.utils.getUtcTimestamp(),
"documentsLabel": label,
"documents": []
}
message = services.workflow.createMessage(messageData)
if not message:
return
# Persist ChatDocument with messageId
doc.messageId = message.id
services.interfaceDbChat.createDocument(doc.to_dict())
# Update message to include document
try:
if not message.documents:
message.documents = []
message.documents.append(doc)
services.workflow.updateMessage(message.id, {"documents": [d.to_dict() for d in message.documents]})
except Exception:
pass
except Exception:
# Non-fatal; ignore if storage or chat creation fails
return

View file

@ -175,6 +175,27 @@ class SubDocumentProcessing:
# Merge with JSON mode # Merge with JSON mode
mergedJsonDocument = self._mergeChunkResultsJson(chunkResults, options) mergedJsonDocument = self._mergeChunkResultsJson(chunkResults, options)
# Normalize merged JSON into a single canonical table
try:
from modules.services.serviceNormalization.mainServiceNormalization import NormalizationService
normalizer = NormalizationService(self.services)
inventory = normalizer.discoverStructures(mergedJsonDocument)
# Use workflow id if available as cache key, else default
cacheKey = getattr(self.services, 'currentWorkflow', None)
cacheKey = getattr(cacheKey, 'id', 'workflow_run') if cacheKey else 'workflow_run'
# Provide the extraction/merge prompt context when available to help mapping
mergePrompt = prompt
mapping = await normalizer.requestHeaderMapping(inventory, cacheKey, None, mergePrompt)
canonical = normalizer.applyMapping(mergedJsonDocument, mapping)
report = normalizer.validateCanonical(canonical)
if report.get('success'):
mergedJsonDocument = canonical
else:
raise ValueError('Normalization produced zero rows')
except Exception as e:
# Surface normalization failure while leaving original merged JSON (single-path expectation is to fail)
raise
# Save merged JSON extraction content to debug file - only if debug enabled # Save merged JSON extraction content to debug file - only if debug enabled
try: try:
@ -481,8 +502,16 @@ class SubDocumentProcessing:
self.services.utils.debugLogToFile(f"EXTRACTION PROMPT: {prompt}", "AI_SERVICE") self.services.utils.debugLogToFile(f"EXTRACTION PROMPT: {prompt}", "AI_SERVICE")
self.services.utils.debugLogToFile(f"EXTRACTION CONTEXT LENGTH: {len(part.data) if part.data else 0} characters", "AI_SERVICE") self.services.utils.debugLogToFile(f"EXTRACTION CONTEXT LENGTH: {len(part.data) if part.data else 0} characters", "AI_SERVICE")
# Strengthen prompt to forbid fabrication for text/container extraction
augmented_prompt = (
f"{prompt}\n\n"
"CRITICAL RULES (NO FABRICATION):\n"
"- Use ONLY content present in the provided CONTEXT.\n"
"- Do NOT create, infer, or guess values not explicitly in the context.\n"
"- If a value is missing, leave the cell empty or omit the row.\n"
)
request = AiCallRequest( request = AiCallRequest(
prompt=prompt, prompt=augmented_prompt,
context=part.data, context=part.data,
options=request_options options=request_options
) )
@ -579,8 +608,16 @@ class SubDocumentProcessing:
logger.debug(f"AI PROMPT PREVIEW: {prompt[:300]}...") logger.debug(f"AI PROMPT PREVIEW: {prompt[:300]}...")
logger.debug(f"AI CONTEXT PREVIEW: {part.data[:200] if part.data else 'None'}...") logger.debug(f"AI CONTEXT PREVIEW: {part.data[:200] if part.data else 'None'}...")
# Strengthen prompt to forbid fabrication for text extraction
augmented_prompt_text = (
f"{prompt}\n\n"
"CRITICAL RULES (NO FABRICATION):\n"
"- Use ONLY content present in the provided CONTEXT.\n"
"- Do NOT create, infer, or guess values not explicitly in the context.\n"
"- If a value is missing, leave the cell empty or omit the row.\n"
)
request = AiCallRequest( request = AiCallRequest(
prompt=prompt, prompt=augmented_prompt_text,
context=part.data, context=part.data,
options=request_options options=request_options
) )

View file

@ -92,11 +92,11 @@ class BaseRenderer(ABC):
def _get_section_type(self, section: Dict[str, Any]) -> str: def _get_section_type(self, section: Dict[str, Any]) -> str:
"""Get the type of a section.""" """Get the type of a section."""
return section.get("type", "paragraph") return section.get("content_type", "paragraph")
def _get_section_data(self, section: Dict[str, Any]) -> Dict[str, Any]: def _get_section_data(self, section: Dict[str, Any]) -> List[Dict[str, Any]]:
"""Get the data of a section.""" """Get the elements of a section."""
return section.get("data", {}) return section.get("elements", [])
def _get_section_id(self, section: Dict[str, Any]) -> str: def _get_section_id(self, section: Dict[str, Any]) -> str:
"""Get the ID of a section (if available).""" """Get the ID of a section (if available)."""

View file

@ -212,24 +212,26 @@ class RendererDocx(BaseRenderer):
def _render_json_section(self, doc: Document, section: Dict[str, Any], styles: Dict[str, Any]) -> None: def _render_json_section(self, doc: Document, section: Dict[str, Any], styles: Dict[str, Any]) -> None:
"""Render a single JSON section to DOCX using AI-generated styles.""" """Render a single JSON section to DOCX using AI-generated styles."""
try: try:
section_type = section.get("type", "paragraph") section_type = section.get("content_type", "paragraph")
section_data = section.get("data", {}) elements = section.get("elements", [])
if section_type == "table": # Process each element in the section
self._render_json_table(doc, section_data, styles) for element in elements:
elif section_type == "bullet_list": if section_type == "table":
self._render_json_bullet_list(doc, section_data, styles) self._render_json_table(doc, element, styles)
elif section_type == "heading": elif section_type == "bullet_list":
self._render_json_heading(doc, section_data, styles) self._render_json_bullet_list(doc, element, styles)
elif section_type == "paragraph": elif section_type == "heading":
self._render_json_paragraph(doc, section_data, styles) self._render_json_heading(doc, element, styles)
elif section_type == "code_block": elif section_type == "paragraph":
self._render_json_code_block(doc, section_data, styles) self._render_json_paragraph(doc, element, styles)
elif section_type == "image": elif section_type == "code_block":
self._render_json_image(doc, section_data, styles) self._render_json_code_block(doc, element, styles)
else: elif section_type == "image":
# Fallback to paragraph for unknown types self._render_json_image(doc, element, styles)
self._render_json_paragraph(doc, section_data, styles) else:
# Fallback to paragraph for unknown types
self._render_json_paragraph(doc, element, styles)
except Exception as e: except Exception as e:
self.logger.warning(f"Error rendering section {section.get('id', 'unknown')}: {str(e)}") self.logger.warning(f"Error rendering section {section.get('id', 'unknown')}: {str(e)}")

View file

@ -105,7 +105,7 @@ class RendererPdf(BaseRenderer):
sections = json_content.get("sections", []) sections = json_content.get("sections", [])
self.services.utils.debugLogToFile(f"PDF SECTIONS TO PROCESS: {len(sections)} sections", "PDF_RENDERER") self.services.utils.debugLogToFile(f"PDF SECTIONS TO PROCESS: {len(sections)} sections", "PDF_RENDERER")
for i, section in enumerate(sections): for i, section in enumerate(sections):
self.services.utils.debugLogToFile(f"PDF SECTION {i}: type={section.get('type', 'unknown')}, id={section.get('id', 'unknown')}", "PDF_RENDERER") self.services.utils.debugLogToFile(f"PDF SECTION {i}: content_type={section.get('content_type', 'unknown')}, id={section.get('id', 'unknown')}", "PDF_RENDERER")
section_elements = self._render_json_section(section, styles) section_elements = self._render_json_section(section, styles)
self.services.utils.debugLogToFile(f"PDF SECTION {i} ELEMENTS: {len(section_elements)} elements", "PDF_RENDERER") self.services.utils.debugLogToFile(f"PDF SECTION {i} ELEMENTS: {len(section_elements)} elements", "PDF_RENDERER")
story.extend(section_elements) story.extend(section_elements)
@ -469,23 +469,28 @@ class RendererPdf(BaseRenderer):
"""Render a single JSON section to PDF elements using AI-generated styles.""" """Render a single JSON section to PDF elements using AI-generated styles."""
try: try:
section_type = self._get_section_type(section) section_type = self._get_section_type(section)
section_data = self._get_section_data(section) elements = self._get_section_data(section)
if section_type == "table": # Process each element in the section
return self._render_json_table(section_data, styles) all_elements = []
elif section_type == "bullet_list": for element in elements:
return self._render_json_bullet_list(section_data, styles) if section_type == "table":
elif section_type == "heading": all_elements.extend(self._render_json_table(element, styles))
return self._render_json_heading(section_data, styles) elif section_type == "bullet_list":
elif section_type == "paragraph": all_elements.extend(self._render_json_bullet_list(element, styles))
return self._render_json_paragraph(section_data, styles) elif section_type == "heading":
elif section_type == "code_block": all_elements.extend(self._render_json_heading(element, styles))
return self._render_json_code_block(section_data, styles) elif section_type == "paragraph":
elif section_type == "image": all_elements.extend(self._render_json_paragraph(element, styles))
return self._render_json_image(section_data, styles) elif section_type == "code_block":
else: all_elements.extend(self._render_json_code_block(element, styles))
# Fallback to paragraph for unknown types elif section_type == "image":
return self._render_json_paragraph(section_data, styles) all_elements.extend(self._render_json_image(element, styles))
else:
# Fallback to paragraph for unknown types
all_elements.extend(self._render_json_paragraph(element, styles))
return all_elements
except Exception as e: except Exception as e:
self.logger.warning(f"Error rendering section {self._get_section_id(section)}: {str(e)}") self.logger.warning(f"Error rendering section {self._get_section_id(section)}: {str(e)}")

View file

@ -600,29 +600,33 @@ JSON ONLY. NO OTHER TEXT."""
try: try:
# Get section title from data or use default # Get section title from data or use default
section_title = "Untitled Section" section_title = "Untitled Section"
if section.get("type") == "heading": if section.get("content_type") == "heading":
section_title = section.get("data", {}).get("text", "Untitled Section") # Extract text from elements array
for element in section.get("elements", []):
if isinstance(element, dict) and "text" in element:
section_title = element.get("text", "Untitled Section")
break
elif section.get("title"): elif section.get("title"):
section_title = section.get("title") section_title = section.get("title")
content_type = section.get("type", "paragraph") content_type = section.get("content_type", "paragraph")
section_data = section.get("data", {}) elements = section.get("elements", [])
# Build slide content based on section type # Build slide content based on section type
content_parts = [] content_parts = []
if content_type == "table": if content_type == "table":
content_parts.append(self._format_table_for_slide(section_data)) content_parts.append(self._format_table_for_slide(elements))
elif content_type == "list": elif content_type == "list":
content_parts.append(self._format_list_for_slide(section_data)) content_parts.append(self._format_list_for_slide(elements))
elif content_type == "heading": elif content_type == "heading":
content_parts.append(self._format_heading_for_slide(section_data)) content_parts.append(self._format_heading_for_slide(elements))
elif content_type == "paragraph": elif content_type == "paragraph":
content_parts.append(self._format_paragraph_for_slide(section_data)) content_parts.append(self._format_paragraph_for_slide(elements))
elif content_type == "code": elif content_type == "code":
content_parts.append(self._format_code_for_slide(section_data)) content_parts.append(self._format_code_for_slide(elements))
else: else:
content_parts.append(self._format_paragraph_for_slide(section_data)) content_parts.append(self._format_paragraph_for_slide(elements))
# Combine content parts # Combine content parts
slide_content = "\n\n".join(filter(None, content_parts)) slide_content = "\n\n".join(filter(None, content_parts))
@ -636,11 +640,17 @@ JSON ONLY. NO OTHER TEXT."""
logger.warning(f"Error creating slide from section: {str(e)}") logger.warning(f"Error creating slide from section: {str(e)}")
return None return None
def _format_table_for_slide(self, table_data: Dict[str, Any]) -> str: def _format_table_for_slide(self, elements: List[Dict[str, Any]]) -> str:
"""Format table data for slide presentation.""" """Format table data for slide presentation."""
try: try:
headers = table_data.get("headers", []) # Extract table data from elements array
rows = table_data.get("rows", []) headers = []
rows = []
for element in elements:
if isinstance(element, dict) and "headers" in element and "rows" in element:
headers = element.get("headers", [])
rows = element.get("rows", [])
break
if not headers: if not headers:
return "" return ""
@ -805,8 +815,8 @@ JSON ONLY. NO OTHER TEXT."""
current_slide_title = "Content Overview" current_slide_title = "Content Overview"
for section in sections: for section in sections:
section_type = section.get("type", "paragraph") section_type = section.get("content_type", "paragraph")
section_data = section.get("data", {}) elements = section.get("elements", [])
if section_type == "heading": if section_type == "heading":
# If we have accumulated content, create a slide # If we have accumulated content, create a slide
@ -818,7 +828,10 @@ JSON ONLY. NO OTHER TEXT."""
current_slide_content = [] current_slide_content = []
# Start new slide with heading as title # Start new slide with heading as title
current_slide_title = section_data.get("text", "Untitled Section") for element in elements:
if isinstance(element, dict) and "text" in element:
current_slide_title = element.get("text", "Untitled Section")
break
else: else:
# Add content to current slide # Add content to current slide
formatted_content = self._format_section_content(section) formatted_content = self._format_section_content(section)
@ -841,21 +854,26 @@ JSON ONLY. NO OTHER TEXT."""
def _format_section_content(self, section: Dict[str, Any]) -> str: def _format_section_content(self, section: Dict[str, Any]) -> str:
"""Format section content for slide presentation.""" """Format section content for slide presentation."""
try: try:
content_type = section.get("type", "paragraph") content_type = section.get("content_type", "paragraph")
section_data = section.get("data", {}) elements = section.get("elements", [])
if content_type == "table": # Process each element in the section
return self._format_table_for_slide(section_data) content_parts = []
elif content_type == "list": for element in elements:
return self._format_list_for_slide(section_data) if content_type == "table":
elif content_type == "heading": content_parts.append(self._format_table_for_slide([element]))
return self._format_heading_for_slide(section_data) elif content_type == "list":
elif content_type == "paragraph": content_parts.append(self._format_list_for_slide([element]))
return self._format_paragraph_for_slide(section_data) elif content_type == "heading":
elif content_type == "code": content_parts.append(self._format_heading_for_slide([element]))
return self._format_code_for_slide(section_data) elif content_type == "paragraph":
else: content_parts.append(self._format_paragraph_for_slide([element]))
return self._format_paragraph_for_slide(section_data) elif content_type == "code":
content_parts.append(self._format_code_for_slide([element]))
else:
content_parts.append(self._format_paragraph_for_slide([element]))
return "\n\n".join(filter(None, content_parts))
except Exception as e: except Exception as e:
logger.warning(f"Error formatting section content: {str(e)}") logger.warning(f"Error formatting section content: {str(e)}")

View file

@ -457,7 +457,7 @@ class RendererXlsx(BaseRenderer):
# Add additional sheets for other content types # Add additional sheets for other content types
content_types = set() content_types = set()
for section in sections: for section in sections:
content_type = section.get("type", "paragraph") content_type = section.get("content_type", "paragraph")
content_types.add(content_type) content_types.add(content_type)
if "table" in content_types and len(table_sections) == 1: if "table" in content_types and len(table_sections) == 1:
@ -658,25 +658,21 @@ class RendererXlsx(BaseRenderer):
start_row += 1 start_row += 1
# Process section based on type # Process section based on type
section_type = section.get("type", "paragraph") section_type = section.get("content_type", "paragraph")
if section_type == "table": # Handle all section types using elements array
# Handle table section directly elements = section.get("elements", [])
table_data = section.get("data", {}) for element in elements:
if table_data: if section_type == "table":
start_row = self._add_table_to_excel(sheet, table_data, styles, start_row) start_row = self._add_table_to_excel(sheet, element, styles, start_row)
else: elif section_type == "list":
# Handle other section types start_row = self._add_list_to_excel(sheet, element, styles, start_row)
elements = section.get("elements", []) elif section_type == "paragraph":
for element in elements: start_row = self._add_paragraph_to_excel(sheet, element, styles, start_row)
if section_type == "list": elif section_type == "heading":
start_row = self._add_list_to_excel(sheet, element, styles, start_row) start_row = self._add_heading_to_excel(sheet, element, styles, start_row)
elif section_type == "paragraph": else:
start_row = self._add_paragraph_to_excel(sheet, element, styles, start_row) start_row = self._add_paragraph_to_excel(sheet, element, styles, start_row)
elif section_type == "heading":
start_row = self._add_heading_to_excel(sheet, element, styles, start_row)
else:
start_row = self._add_paragraph_to_excel(sheet, element, styles, start_row)
return start_row return start_row

View file

@ -286,7 +286,7 @@ async def buildExtractionPrompt(
from .subJsonSchema import get_document_subJsonSchema from .subJsonSchema import get_document_subJsonSchema
jsonSchema = get_document_subJsonSchema() jsonSchema = get_document_subJsonSchema()
# Generic block for JSON extraction # Generic block for JSON extraction - use proper schema instead of hardcoded template
genericIntro = f""" genericIntro = f"""
{extractionIntent} {extractionIntent}
@ -295,44 +295,7 @@ You are extracting structured content from documents and must respond with valid
CRITICAL: You must respond with valid JSON only. No additional text, explanations, markdown formatting, code blocks, or any other content outside the JSON structure. Do not use ``` markers or any other formatting. CRITICAL: You must respond with valid JSON only. No additional text, explanations, markdown formatting, code blocks, or any other content outside the JSON structure. Do not use ``` markers or any other formatting.
Extract the actual data from the source documents and structure it as JSON with this format: Extract the actual data from the source documents and structure it as JSON with this format:
{{ {json.dumps(jsonSchema, indent=2)}
"metadata": {{
"title": "Document Title",
"version": "1.0"
}},
"sections": [
{{
"id": "section_1",
"type": "heading",
"data": {{
"level": 1,
"text": "Heading Text"
}}
}},
{{
"id": "section_2",
"type": "table",
"data": {{
"headers": ["Column1", "Column2"],
"rows": [["Data1", "Data2"], ["Data3", "Data4"]]
}}
}},
{{
"id": "section_3",
"type": "bullet_list",
"data": {{
"items": ["Item 1", "Item 2", "Item 3"]
}}
}},
{{
"id": "section_4",
"type": "paragraph",
"data": {{
"text": "Paragraph content here"
}}
}}
]
}}
Content Types to Extract: Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers 1. Tables: Extract all rows and columns with proper headers
@ -485,8 +448,8 @@ def _getFormatRules(outputFormat: str) -> str:
async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None, services=None) -> str: async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None, services=None) -> str:
""" """
Use AI to extract the core content intention from the user prompt. Use AI to extract a rich, structured extraction intent from the user prompt.
Focus on WHAT the user wants to extract, not HOW to format it. Include language, normalization, structure needs, headers, formats, row strategy, and multi-file guidance.
""" """
if not aiService: if not aiService:
# Fallback if no AI service available # Fallback if no AI service available
@ -496,18 +459,46 @@ async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=N
# Protect userPrompt from injection by escaping quotes and newlines # Protect userPrompt from injection by escaping quotes and newlines
safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ') safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ')
# Simple AI call to extract the intention # Rich analysis to derive a complete extraction intent and structure guidance
extractionPrompt = f""" extractionPrompt = f"""
Extract the core content intention from this user request. Focus on WHAT raw data/content they want extracted. Analyze the user's request and produce a RICH extraction intent. Return ONLY JSON.
Goals:
- Detect language and normalize the request into a full, explicit instruction (no summary; preserve all constraints and details).
- Decide if structured data is required; if so, define the target structure precisely (headers, order, formats, row strategy).
- Identify if multi-file output is appropriate and how to split/files name.
User request: "{safeUserPrompt}" User request: "{safeUserPrompt}"
Return only the content intention in a simple format like "Extract: [content description]" Return JSON in this exact shape:
Focus on extracting raw data, tables, lists, and factual content - NOT summaries or analysis. {{
If the user mentions a table, extract the actual table data with rows and columns. "detectedLanguage": "de|en|fr|it|...",
If the user mentions a list, extract the actual list items. "normalizedRequest": "Full explicit instruction in detected language",
IMPORTANT: Preserve any language requirements in your response. "requiresStructuredData": true|false,
Do not include formatting instructions, file types, or output methods. "targetStructure": "table|list|mixed|unstructured",
"table": {{
"headers": ["Header1", "Header2", "..."],
"headerOrderStrict": true|false,
"rowStrategy": "one_row_per_document|one_row_per_entity|one_row_per_vat_rate|custom",
"formats": {{
"dateFormat": "DD.MM.YYYY|YYYY-MM-DD|...",
"amountDecimals": 2,
"currencyFormat": "code|symbol",
"idMasking": "none|last4|custom"
}}
}},
"multiFile": true|false,
"fileSplitStrategy": "single|per_entity|by_section|by_criteria|custom",
"fileNamingPattern": "suggested pattern for filenames",
"constraints": ["List of critical constraints to enforce"],
"reasoning": "Brief justification (one sentence)"
}}
Rules:
- Preserve user terminology and language in normalizedRequest.
- If the user listed columns/fields, copy them exactly into table.headers and set headerOrderStrict=true.
- If the user implies separate rows for rates/entities, set an appropriate rowStrategy (e.g., one_row_per_vat_rate).
- If no structure is required, set requiresStructuredData=false and targetStructure="unstructured".
""" """
# Call AI service to extract intention # Call AI service to extract intention
@ -523,12 +514,24 @@ Do not include formatting instructions, file types, or output methods.
result = response.content if response else "" result = response.content if response else ""
services.utils.debugLogToFile(f"DEBUG: Extraction intent processed", "PROMPT_BUILDER") services.utils.debugLogToFile(f"DEBUG: Extraction intent processed", "PROMPT_BUILDER")
return result if result else f"Extract all relevant content from the document according to the user's requirements: {userPrompt}" # Try to extract and pretty print JSON
if result:
import re, json as _json
match = re.search(r'\{[\s\S]*\}', result)
if match:
try:
obj = _json.loads(match.group(0))
return _json.dumps(obj, ensure_ascii=False, indent=2)
except Exception:
pass
# Fallback to previous simple format
return f"Extract: {safeUserPrompt}"
except Exception as e: except Exception as e:
# Fallback on any error - preserve user prompt for language instructions # Fallback on any error - preserve user prompt for language instructions
services.utils.debugLogToFile(f"DEBUG: AI extraction intent failed: {str(e)}", "PROMPT_BUILDER") services.utils.debugLogToFile(f"DEBUG: AI extraction intent failed: {str(e)}", "PROMPT_BUILDER")
return f"Extract all relevant content from the document according to the user's requirements: {userPrompt}" return f"Extract: {userPrompt}"

View file

@ -0,0 +1,243 @@
import json
import os
from typing import Any, Dict, List, Set
from datetime import datetime, UTC
class NormalizationService:
    """
    Produces a single canonical table in merged JSON using an AI-provided header mapping
    and deterministic, in-code value normalization. No language heuristics in code.

    Typical flow: discoverStructures() -> requestHeaderMapping() -> applyMapping().
    """

    def __init__(self, services):
        # services: application service container; this class uses services.ai
        # for the mapping call (see requestHeaderMapping).
        self.services = services
# Public API
def discoverStructures(self, mergedJson: Dict[str, Any]) -> Dict[str, Any]:
headers: Set[str] = set()
samples: Dict[str, List[str]] = {}
sections = mergedJson.get("sections", []) if isinstance(mergedJson, dict) else []
for section in sections:
if not isinstance(section, dict):
continue
# Use only the fundamental agreed JSON structure: content_type/elements
if section.get("content_type") != "table":
continue
# Extract table data from elements array
for element in section.get("elements", []):
if isinstance(element, dict) and "headers" in element and "rows" in element:
hdrs = element.get("headers") or []
rows = element.get("rows") or []
break
else:
continue
for h in hdrs:
if not isinstance(h, str):
continue
headers.add(h)
# collect small value samples by column index
for row in rows[:5]:
if not isinstance(row, list):
continue
for i, value in enumerate(row):
headerName = hdrs[i] if i < len(hdrs) else f"col_{i}"
if headerName not in samples:
samples[headerName] = []
if len(samples[headerName]) < 5:
samples[headerName].append(str(value))
return {
"tableHeaders": sorted(list(headers)),
"headerSamples": samples,
}
async def requestHeaderMapping(self, inventory: Dict[str, Any], cacheKey: str, canonicalSpec: Dict[str, Any] | None = None, mergePrompt: str | None = None) -> Dict[str, Any]:
# Allow caller to specify any canonical schema. If none provided, default to discovered headers.
if canonicalSpec is None:
canonicalSpec = {
"canonicalHeaders": inventory.get("tableHeaders", []),
"constraints": {}
}
# Protect merge prompt context by wrapping in single quotes and escaping internal quotes
protectedMerge = None
if mergePrompt:
try:
protectedMerge = str(mergePrompt).replace("'", "\\'")
except Exception:
protectedMerge = str(mergePrompt)
prompt = (
"You are a mapping generator. Return ONLY JSON.\n\n"
"Given discovered headers and sample values, map them to the canonical headers.\n"
"Do not invent fields. Use null if no mapping. Provide normalization policy.\n\n"
f"CANONICAL_SPEC:\n{json.dumps(canonicalSpec, ensure_ascii=False, indent=2)}\n\n"
f"HEADERS_DISCOVERED:\n{json.dumps(inventory, ensure_ascii=False, indent=2)}\n\n"
+ (f"MERGE_PROMPT_CONTEXT (protected):\n'{protectedMerge}'\n\n" if protectedMerge is not None else "") +
"REPLY JSON SHAPE:\n(Example)\n"
"{\n \"mappings\": {\"<sourceHeader>\": \"<Canonical>|null\"},\n"
" \"normalizationPolicy\": {\n \"TotalAmount\": {\"decimalSeparator\": \",\"|\".\"},\n"
" \"Currency\": {\"stripSymbols\": true},\n"
" \"Date\": {\"formats\": [\"DD.MM.YYYY\",\"YYYY-MM-DD\"]}\n }\n}\n"
)
response = await self.services.ai.callAi(prompt=prompt)
js = response[response.find('{'):response.rfind('}') + 1] if response else '{}'
mapping = json.loads(js)
# Normalize key naming from AI: prefer single key "mapping"
if "mapping" not in mapping and "mappings" in mapping and isinstance(mapping["mappings"], dict):
mapping["mapping"] = mapping["mappings"]
try:
del mapping["mappings"]
except Exception:
pass
# Ensure canonicalHeaders present in mapping for downstream use
if "canonicalHeaders" not in mapping:
mapping["canonicalHeaders"] = canonicalSpec.get("canonicalHeaders", [])
# debug artifact
self._writeDebugArtifact("mapping.json", mapping)
return mapping
def applyMapping(self, mergedJson: Dict[str, Any], mappingSpec: Dict[str, Any]) -> Dict[str, Any]:
    """Project merged table sections onto the canonical header set.

    For every table section, each canonical header is resolved to a source
    column via the mapping spec (identity fallback when a source header has no
    explicit mapping and matches the canonical name), values are normalized per
    the normalization policy, and rows that are entirely empty are dropped.

    Args:
        mergedJson: Merged document in the agreed sections/elements structure.
        mappingSpec: Output of requestHeaderMapping ("mapping",
            "normalizationPolicy", "canonicalHeaders").

    Returns:
        Canonical document dict with a single canonical table section.
    """
    mappings = (mappingSpec or {}).get("mapping", {})
    policy = (mappingSpec or {}).get("normalizationPolicy", {})
    # Prefer headers provided by mapping (generic across domains)
    canonicalHeaders = (mappingSpec or {}).get("canonicalHeaders") or []
    if not canonicalHeaders:
        # Fallback to union of mapped targets
        canonicalHeaders = sorted({t for t in mappings.values() if t})
    rows: List[List[str]] = []
    sections = mergedJson.get("sections", []) if isinstance(mergedJson, dict) else []
    for section in sections:
        # Use only the fundamental agreed JSON structure: content_type/elements
        if section.get("content_type") != "table":
            continue
        # First element carrying both headers and rows defines this section's table
        sourceHeaders: List[Any] = []
        sourceRows: List[Any] = []
        for element in section.get("elements", []):
            if isinstance(element, dict) and "headers" in element and "rows" in element:
                sourceHeaders = element.get("headers") or []
                sourceRows = element.get("rows") or []
                break
        if not sourceHeaders or not sourceRows:
            continue
        # Single pass over source headers (the previous nested scan re-ran
        # mappings.get for every canonical header). A header's effective target
        # is its explicit mapping, or itself when unmapped (identity fallback);
        # the first source column claiming a target wins, matching the original
        # left-to-right scan order.
        targetIndex: Dict[str, int] = {}
        for si, sh in enumerate(sourceHeaders):
            target = mappings.get(sh)
            if target is None:
                target = sh
            targetIndex.setdefault(target, si)
        # canonical -> source column index, or None when no column maps to it
        # (previous annotation Dict[str, int] was wrong: None is a valid value)
        indexMap: Dict[str, int | None] = {ch: targetIndex.get(ch) for ch in canonicalHeaders}
        # Transform rows
        for r in sourceRows:
            canonicalRow: List[str] = []
            for ch in canonicalHeaders:
                idx = indexMap.get(ch)
                value = r[idx] if (idx is not None and idx < len(r)) else ""
                canonicalRow.append(self._normalizeValue(ch, value, policy))
            # consider as row if at least one non-empty meaningful field
            if any(v.strip() for v in canonicalRow):
                rows.append(canonicalRow)
    canonical = {
        "metadata": {
            "title": mergedJson.get("metadata", {}).get("title", "Merged Document"),
            "source_documents": mergedJson.get("metadata", {}).get("source_documents", [])
        },
        "sections": [
            {
                "id": "canonical_table_1",
                "content_type": "table",
                "elements": [
                    {
                        "headers": canonicalHeaders,
                        "rows": rows
                    }
                ],
                "order": 1
            }
        ]
    }
    # debug artifact
    self._writeDebugArtifact("canonical_merged.json", canonical)
    return canonical
def validateCanonical(self, canonicalJson: Dict[str, Any]) -> Dict[str, Any]:
    """Count rows across all canonical table sections and report success.

    Success simply means at least one data row is present. The report is also
    written as a debug artifact ("normalization_report.json") before returning.
    """
    collected: List[Any] = []
    try:
        for section in canonicalJson.get("sections", []):
            if section.get("content_type") != "table":
                continue
            # Gather row lists from every element that carries them
            for element in section.get("elements", []):
                if isinstance(element, dict) and "rows" in element:
                    collected.extend(element.get("rows", []))
    except Exception:
        # Best-effort validation: malformed input counts as zero rows
        collected = []
    report = {
        "rowCount": len(collected),
        "success": len(collected) > 0
    }
    self._writeDebugArtifact("normalization_report.json", report)
    return report
# Internal helpers
def _normalizeValue(self, canonicalHeader: str, value: Any, policy: Dict[str, Any]) -> str:
if value is None:
return ""
text = str(value).strip()
# Generic normalization guided by policy; avoid domain specifics
if canonicalHeader in (policy.get("numericFields", []) or []):
dec = ((policy.get(canonicalHeader) or {}).get("decimalSeparator")
or (policy.get("numeric") or {}).get("decimalSeparator")
or ".")
if dec == ",":
text = text.replace(".", "").replace(",", ".") if "," in text else text
text = ''.join(ch for ch in text if ch.isdigit() or ch in ['.', '-', '+'])
elif (policy.get("text") or {}).get("stripSymbols") and canonicalHeader in (policy.get("text", {}).get("applyTo", []) or []):
text = ''.join(ch for ch in text if ch.isalpha())
text = text.upper()
return text
def _writeDebugArtifact(self, fileName: str, obj: Any) -> None:
try:
debugEnabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
if not debugEnabled:
return
root = "./test-chat/ai"
os.makedirs(root, exist_ok=True)
# Prefix timestamp for files that are frequently overwritten
ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
if fileName in ("mapping.json", "canonical_merged.json"):
outName = f"{ts}_{fileName}"
else:
outName = fileName
path = os.path.join(root, outName)
with open(path, "w", encoding="utf-8") as f:
if isinstance(obj, (dict, list)):
f.write(json.dumps(obj, ensure_ascii=False, indent=2))
else:
f.write(str(obj))
except Exception:
pass

View file

@ -42,16 +42,27 @@ def extractUserPrompt(context: Any) -> str:
Fallback to the task_step objective. Fallback to the task_step objective.
""" """
try: try:
# Prefer services.currentUserPrompt when accessible through context
services = getattr(context, 'services', None) services = getattr(context, 'services', None)
if services and getattr(services, 'currentUserPrompt', None):
return services.currentUserPrompt
except Exception:
pass
if hasattr(context, 'task_step') and context.task_step: # Determine raw user prompt from services or task_step
return context.task_step.objective or 'No request specified' rawPrompt = None
return 'No request specified' if services and getattr(services, 'currentUserPrompt', None):
rawPrompt = services.currentUserPrompt
elif hasattr(context, 'task_step') and context.task_step:
rawPrompt = context.task_step.objective or 'No request specified'
else:
rawPrompt = 'No request specified'
# Prefer values computed at workflow start by WorkflowManager analyzer
normalized = getattr(services, 'currentUserPromptNormalized', None) if services else None
if normalized:
return normalized
return rawPrompt
except Exception:
# Robust fallback behavior
if hasattr(context, 'task_step') and context.task_step:
return context.task_step.objective or 'No request specified'
return 'No request specified'
def extractWorkflowHistory(service: Any, context: Any) -> str: def extractWorkflowHistory(service: Any, context: Any) -> str:
"""Extract workflow history from context. Maps to {{KEY:WORKFLOW_HISTORY}} """Extract workflow history from context. Maps to {{KEY:WORKFLOW_HISTORY}}
@ -99,7 +110,15 @@ def extractAvailableMethods(service: Any) -> str:
def extractUserLanguage(service: Any) -> str: def extractUserLanguage(service: Any) -> str:
"""Extract user language from service. Maps to {{KEY:USER_LANGUAGE}}""" """Extract user language from service. Maps to {{KEY:USER_LANGUAGE}}"""
return service.user.language if service and service.user else 'en' try:
# Prefer detected language if available
if service and getattr(service, 'currentUserLanguage', None):
return service.currentUserLanguage
return service.user.language if service and service.user else 'en'
except Exception:
return 'en'
# Normalization now happens centrally in WorkflowManager._sendFirstMessage; no AI call here.
def _computeMessageSummary(msg) -> str: def _computeMessageSummary(msg) -> str:

View file

@ -216,23 +216,23 @@ class WorkflowManager:
# Update the message with documents in database # Update the message with documents in database
self.services.workflow.updateMessage(message.id, {"documents": [doc.to_dict() for doc in documents]}) self.services.workflow.updateMessage(message.id, {"documents": [doc.to_dict() for doc in documents]})
# Analyze the user's input to extract intent and offload bulky context into documents # Analyze the user's input to detect language, normalize request, extract intent, and offload bulky context into documents
try: try:
analyzerPrompt = ( analyzerPrompt = (
"You are an input analyzer. Split the user's message into:\n" "You are an input analyzer. From the user's message, perform ALL of the following in one pass:\n"
"1) intent: the user's core request in one concise paragraph, normalized to the user's language.\n" "1) detectedLanguage: detect ISO 639-1 language code (e.g., de, en).\n"
"2) contextItems: supportive data to attach as separate documents if significantly larger than the intent. " "2) normalizedRequest: full, explicit restatement of the user's request in the detected language; do NOT summarize; preserve ALL constraints and details.\n"
"Include large literal data blocks, long lists/tables, code/JSON blocks, quoted transcripts, CSV fragments, or detailed specs. " "3) intent: concise single-paragraph core request in the detected language for high-level routing.\n"
"Keep URLs in the intent unless they include large pasted content.\n\n" "4) contextItems: supportive data blocks to attach as separate documents if significantly larger than the intent (large literal content, long lists/tables, code/JSON blocks, transcripts, CSV fragments, detailed specs). Keep URLs in the intent unless they embed large pasted content.\n\n"
"Rules:\n" "Rules:\n"
"- If total content length (intent + data) is less than 10% of the model's max tokens, do not extract; " "- If total content (intent + data) is < 10% of model max tokens, do not extract; return empty contextItems and keep intent compact and self-contained.\n"
"return an empty contextItems and keep a compact, self-contained intent.\n" "- If content exceeds that threshold, move bulky parts into contextItems; keep intent short and clear.\n"
"- If content exceeds that, move bulky parts into contextItems, keeping the intent short and clear.\n" "- Preserve critical references (URLs, filenames) in intent.\n"
"- Preserve critical references (URLs, filenames) in the intent.\n" "- Normalize to the primary detected language if mixed-language.\n\n"
"- Normalize the intent to the detected language. If mixed-language, use the primary detected language and normalize.\n\n" "Return ONLY JSON (no markdown) with this shape:\n"
"Output JSON only (no markdown):\n"
"{\n" "{\n"
" \"detectedLanguage\": \"en\",\n" " \"detectedLanguage\": \"de|en|fr|it|...\",\n"
" \"normalizedRequest\": \"Full explicit instruction in detected language\",\n"
" \"intent\": \"Concise normalized request...\",\n" " \"intent\": \"Concise normalized request...\",\n"
" \"contextItems\": [\n" " \"contextItems\": [\n"
" {\n" " {\n"
@ -249,6 +249,7 @@ class WorkflowManager:
aiResponse = await self.services.ai.callAi(prompt=analyzerPrompt) aiResponse = await self.services.ai.callAi(prompt=analyzerPrompt)
detectedLanguage = None detectedLanguage = None
normalizedRequest = None
intentText = userInput.prompt intentText = userInput.prompt
contextItems = [] contextItems = []
@ -260,6 +261,7 @@ class WorkflowManager:
if jsonStart != -1 and jsonEnd > jsonStart: if jsonStart != -1 and jsonEnd > jsonStart:
parsed = json.loads(aiResponse[jsonStart:jsonEnd]) parsed = json.loads(aiResponse[jsonStart:jsonEnd])
detectedLanguage = parsed.get('detectedLanguage') or None detectedLanguage = parsed.get('detectedLanguage') or None
normalizedRequest = parsed.get('normalizedRequest') or None
if parsed.get('intent'): if parsed.get('intent'):
intentText = parsed.get('intent') intentText = parsed.get('intent')
contextItems = parsed.get('contextItems') or [] contextItems = parsed.get('contextItems') or []
@ -269,7 +271,18 @@ class WorkflowManager:
# Update services state # Update services state
if detectedLanguage and isinstance(detectedLanguage, str): if detectedLanguage and isinstance(detectedLanguage, str):
self._setUserLanguage(detectedLanguage) self._setUserLanguage(detectedLanguage)
try:
setattr(self.services, 'currentUserLanguage', detectedLanguage)
except Exception:
pass
self.services.currentUserPrompt = intentText or userInput.prompt self.services.currentUserPrompt = intentText or userInput.prompt
try:
if normalizedRequest:
setattr(self.services, 'currentUserPromptNormalized', normalizedRequest)
if contextItems is not None:
setattr(self.services, 'currentUserContextItems', contextItems)
except Exception:
pass
# Telemetry (sizes and counts) # Telemetry (sizes and counts)
try: try: