added normalization service to merge structured data of any format
This commit is contained in:
parent
f83786b3a7
commit
bdc87eb5c6
12 changed files with 585 additions and 164 deletions
|
|
@ -121,5 +121,10 @@ class JsonMergeResult(BaseModel):
|
||||||
metadata: Dict[str, Any] = Field(default_factory=dict, description="Merge process metadata")
|
metadata: Dict[str, Any] = Field(default_factory=dict, description="Merge process metadata")
|
||||||
|
|
||||||
|
|
||||||
# Update forward references
|
# Update forward references (compatible with Pydantic v1 and v2)
|
||||||
ListItem.model_rebuild()
|
try:
|
||||||
|
# Pydantic v2
|
||||||
|
ListItem.model_rebuild()
|
||||||
|
except AttributeError:
|
||||||
|
# Pydantic v1
|
||||||
|
ListItem.update_forward_refs()
|
||||||
|
|
|
||||||
|
|
@ -105,6 +105,12 @@ class SubDocumentGeneration:
|
||||||
if not isinstance(aiResponseJson, dict) or "sections" not in aiResponseJson:
|
if not isinstance(aiResponseJson, dict) or "sections" not in aiResponseJson:
|
||||||
raise Exception("AI response is not valid JSON document structure")
|
raise Exception("AI response is not valid JSON document structure")
|
||||||
|
|
||||||
|
# Emit raw extracted data as a chat message attachment before rendering
|
||||||
|
try:
|
||||||
|
await self._postRawDataChatMessage(aiResponseJson, label="raw_extraction_single")
|
||||||
|
except Exception:
|
||||||
|
logger.warning("Failed to emit raw extraction chat message (single-file)")
|
||||||
|
|
||||||
# Generate filename from document metadata
|
# Generate filename from document metadata
|
||||||
parsedFilename = None
|
parsedFilename = None
|
||||||
try:
|
try:
|
||||||
|
|
@ -211,6 +217,12 @@ class SubDocumentGeneration:
|
||||||
prompt, documents, options, outputFormat, title
|
prompt, documents, options, outputFormat, title
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Emit raw extracted data as a chat message attachment before transformation/rendering
|
||||||
|
try:
|
||||||
|
await self._postRawDataChatMessage(ai_response, label="raw_extraction_multi")
|
||||||
|
except Exception:
|
||||||
|
logger.warning("Failed to emit raw extraction chat message (multi-file)")
|
||||||
|
|
||||||
# Process multiple documents
|
# Process multiple documents
|
||||||
generated_documents = []
|
generated_documents = []
|
||||||
for i, doc_data in enumerate(ai_response.get("documents", [])):
|
for i, doc_data in enumerate(ai_response.get("documents", [])):
|
||||||
|
|
@ -457,3 +469,71 @@ Return only the JSON response.
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Response validation failed with exception: {str(e)}")
|
logger.warning(f"Response validation failed with exception: {str(e)}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
async def _postRawDataChatMessage(self, payload: Any, label: str = "raw_extraction") -> None:
|
||||||
|
"""
|
||||||
|
Create a ChatMessage with the extracted raw JSON attached as a file so the user
|
||||||
|
has access to the data even if downstream processing fails.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
services = self.services
|
||||||
|
workflow = getattr(services, 'currentWorkflow', None)
|
||||||
|
if not workflow:
|
||||||
|
return
|
||||||
|
|
||||||
|
# Serialize payload
|
||||||
|
import json as _json
|
||||||
|
from datetime import datetime, UTC
|
||||||
|
ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
|
||||||
|
content_text = _json.dumps(payload, ensure_ascii=False, indent=2)
|
||||||
|
content_bytes = content_text.encode('utf-8')
|
||||||
|
|
||||||
|
# Store as file via component storage
|
||||||
|
file_name = f"{label}_{ts}.json"
|
||||||
|
file_item = services.interfaceDbComponent.createFile(
|
||||||
|
name=file_name,
|
||||||
|
mimeType="application/json",
|
||||||
|
content=content_bytes
|
||||||
|
)
|
||||||
|
services.interfaceDbComponent.createFileData(file_item.id, content_bytes)
|
||||||
|
|
||||||
|
# Lookup file info for ChatDocument
|
||||||
|
file_info = services.workflow.getFileInfo(file_item.id)
|
||||||
|
doc = ChatDocument(
|
||||||
|
messageId="", # set after message creation
|
||||||
|
fileId=file_item.id,
|
||||||
|
fileName=file_info.get("fileName", file_name) if file_info else file_name,
|
||||||
|
fileSize=file_info.get("size", len(content_bytes)) if file_info else len(content_bytes),
|
||||||
|
mimeType=file_info.get("mimeType", "application/json") if file_info else "application/json"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create message referencing the file
|
||||||
|
messageData = {
|
||||||
|
"workflowId": workflow.id,
|
||||||
|
"role": "assistant",
|
||||||
|
"message": "Raw extraction data saved",
|
||||||
|
"status": "data",
|
||||||
|
"sequenceNr": len(getattr(workflow, 'messages', []) or []) + 1,
|
||||||
|
"publishedAt": services.utils.getUtcTimestamp(),
|
||||||
|
"documentsLabel": label,
|
||||||
|
"documents": []
|
||||||
|
}
|
||||||
|
message = services.workflow.createMessage(messageData)
|
||||||
|
if not message:
|
||||||
|
return
|
||||||
|
|
||||||
|
# Persist ChatDocument with messageId
|
||||||
|
doc.messageId = message.id
|
||||||
|
services.interfaceDbChat.createDocument(doc.to_dict())
|
||||||
|
|
||||||
|
# Update message to include document
|
||||||
|
try:
|
||||||
|
if not message.documents:
|
||||||
|
message.documents = []
|
||||||
|
message.documents.append(doc)
|
||||||
|
services.workflow.updateMessage(message.id, {"documents": [d.to_dict() for d in message.documents]})
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
except Exception:
|
||||||
|
# Non-fatal; ignore if storage or chat creation fails
|
||||||
|
return
|
||||||
|
|
|
||||||
|
|
@ -176,6 +176,27 @@ class SubDocumentProcessing:
|
||||||
# Merge with JSON mode
|
# Merge with JSON mode
|
||||||
mergedJsonDocument = self._mergeChunkResultsJson(chunkResults, options)
|
mergedJsonDocument = self._mergeChunkResultsJson(chunkResults, options)
|
||||||
|
|
||||||
|
# Normalize merged JSON into a single canonical table
|
||||||
|
try:
|
||||||
|
from modules.services.serviceNormalization.mainServiceNormalization import NormalizationService
|
||||||
|
normalizer = NormalizationService(self.services)
|
||||||
|
inventory = normalizer.discoverStructures(mergedJsonDocument)
|
||||||
|
# Use workflow id if available as cache key, else default
|
||||||
|
cacheKey = getattr(self.services, 'currentWorkflow', None)
|
||||||
|
cacheKey = getattr(cacheKey, 'id', 'workflow_run') if cacheKey else 'workflow_run'
|
||||||
|
# Provide the extraction/merge prompt context when available to help mapping
|
||||||
|
mergePrompt = prompt
|
||||||
|
mapping = await normalizer.requestHeaderMapping(inventory, cacheKey, None, mergePrompt)
|
||||||
|
canonical = normalizer.applyMapping(mergedJsonDocument, mapping)
|
||||||
|
report = normalizer.validateCanonical(canonical)
|
||||||
|
if report.get('success'):
|
||||||
|
mergedJsonDocument = canonical
|
||||||
|
else:
|
||||||
|
raise ValueError('Normalization produced zero rows')
|
||||||
|
except Exception as e:
|
||||||
|
# Surface normalization failure while leaving original merged JSON (single-path expectation is to fail)
|
||||||
|
raise
|
||||||
|
|
||||||
# Save merged JSON extraction content to debug file - only if debug enabled
|
# Save merged JSON extraction content to debug file - only if debug enabled
|
||||||
try:
|
try:
|
||||||
debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
|
debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
|
||||||
|
|
@ -481,8 +502,16 @@ class SubDocumentProcessing:
|
||||||
self.services.utils.debugLogToFile(f"EXTRACTION PROMPT: {prompt}", "AI_SERVICE")
|
self.services.utils.debugLogToFile(f"EXTRACTION PROMPT: {prompt}", "AI_SERVICE")
|
||||||
self.services.utils.debugLogToFile(f"EXTRACTION CONTEXT LENGTH: {len(part.data) if part.data else 0} characters", "AI_SERVICE")
|
self.services.utils.debugLogToFile(f"EXTRACTION CONTEXT LENGTH: {len(part.data) if part.data else 0} characters", "AI_SERVICE")
|
||||||
|
|
||||||
|
# Strengthen prompt to forbid fabrication for text/container extraction
|
||||||
|
augmented_prompt = (
|
||||||
|
f"{prompt}\n\n"
|
||||||
|
"CRITICAL RULES (NO FABRICATION):\n"
|
||||||
|
"- Use ONLY content present in the provided CONTEXT.\n"
|
||||||
|
"- Do NOT create, infer, or guess values not explicitly in the context.\n"
|
||||||
|
"- If a value is missing, leave the cell empty or omit the row.\n"
|
||||||
|
)
|
||||||
request = AiCallRequest(
|
request = AiCallRequest(
|
||||||
prompt=prompt,
|
prompt=augmented_prompt,
|
||||||
context=part.data,
|
context=part.data,
|
||||||
options=request_options
|
options=request_options
|
||||||
)
|
)
|
||||||
|
|
@ -579,8 +608,16 @@ class SubDocumentProcessing:
|
||||||
logger.debug(f"AI PROMPT PREVIEW: {prompt[:300]}...")
|
logger.debug(f"AI PROMPT PREVIEW: {prompt[:300]}...")
|
||||||
logger.debug(f"AI CONTEXT PREVIEW: {part.data[:200] if part.data else 'None'}...")
|
logger.debug(f"AI CONTEXT PREVIEW: {part.data[:200] if part.data else 'None'}...")
|
||||||
|
|
||||||
|
# Strengthen prompt to forbid fabrication for text extraction
|
||||||
|
augmented_prompt_text = (
|
||||||
|
f"{prompt}\n\n"
|
||||||
|
"CRITICAL RULES (NO FABRICATION):\n"
|
||||||
|
"- Use ONLY content present in the provided CONTEXT.\n"
|
||||||
|
"- Do NOT create, infer, or guess values not explicitly in the context.\n"
|
||||||
|
"- If a value is missing, leave the cell empty or omit the row.\n"
|
||||||
|
)
|
||||||
request = AiCallRequest(
|
request = AiCallRequest(
|
||||||
prompt=prompt,
|
prompt=augmented_prompt_text,
|
||||||
context=part.data,
|
context=part.data,
|
||||||
options=request_options
|
options=request_options
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -92,11 +92,11 @@ class BaseRenderer(ABC):
|
||||||
|
|
||||||
def _get_section_type(self, section: Dict[str, Any]) -> str:
|
def _get_section_type(self, section: Dict[str, Any]) -> str:
|
||||||
"""Get the type of a section."""
|
"""Get the type of a section."""
|
||||||
return section.get("type", "paragraph")
|
return section.get("content_type", "paragraph")
|
||||||
|
|
||||||
def _get_section_data(self, section: Dict[str, Any]) -> Dict[str, Any]:
|
def _get_section_data(self, section: Dict[str, Any]) -> List[Dict[str, Any]]:
|
||||||
"""Get the data of a section."""
|
"""Get the elements of a section."""
|
||||||
return section.get("data", {})
|
return section.get("elements", [])
|
||||||
|
|
||||||
def _get_section_id(self, section: Dict[str, Any]) -> str:
|
def _get_section_id(self, section: Dict[str, Any]) -> str:
|
||||||
"""Get the ID of a section (if available)."""
|
"""Get the ID of a section (if available)."""
|
||||||
|
|
|
||||||
|
|
@ -212,24 +212,26 @@ class RendererDocx(BaseRenderer):
|
||||||
def _render_json_section(self, doc: Document, section: Dict[str, Any], styles: Dict[str, Any]) -> None:
|
def _render_json_section(self, doc: Document, section: Dict[str, Any], styles: Dict[str, Any]) -> None:
|
||||||
"""Render a single JSON section to DOCX using AI-generated styles."""
|
"""Render a single JSON section to DOCX using AI-generated styles."""
|
||||||
try:
|
try:
|
||||||
section_type = section.get("type", "paragraph")
|
section_type = section.get("content_type", "paragraph")
|
||||||
section_data = section.get("data", {})
|
elements = section.get("elements", [])
|
||||||
|
|
||||||
if section_type == "table":
|
# Process each element in the section
|
||||||
self._render_json_table(doc, section_data, styles)
|
for element in elements:
|
||||||
elif section_type == "bullet_list":
|
if section_type == "table":
|
||||||
self._render_json_bullet_list(doc, section_data, styles)
|
self._render_json_table(doc, element, styles)
|
||||||
elif section_type == "heading":
|
elif section_type == "bullet_list":
|
||||||
self._render_json_heading(doc, section_data, styles)
|
self._render_json_bullet_list(doc, element, styles)
|
||||||
elif section_type == "paragraph":
|
elif section_type == "heading":
|
||||||
self._render_json_paragraph(doc, section_data, styles)
|
self._render_json_heading(doc, element, styles)
|
||||||
elif section_type == "code_block":
|
elif section_type == "paragraph":
|
||||||
self._render_json_code_block(doc, section_data, styles)
|
self._render_json_paragraph(doc, element, styles)
|
||||||
elif section_type == "image":
|
elif section_type == "code_block":
|
||||||
self._render_json_image(doc, section_data, styles)
|
self._render_json_code_block(doc, element, styles)
|
||||||
else:
|
elif section_type == "image":
|
||||||
# Fallback to paragraph for unknown types
|
self._render_json_image(doc, element, styles)
|
||||||
self._render_json_paragraph(doc, section_data, styles)
|
else:
|
||||||
|
# Fallback to paragraph for unknown types
|
||||||
|
self._render_json_paragraph(doc, element, styles)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.warning(f"Error rendering section {section.get('id', 'unknown')}: {str(e)}")
|
self.logger.warning(f"Error rendering section {section.get('id', 'unknown')}: {str(e)}")
|
||||||
|
|
|
||||||
|
|
@ -105,7 +105,7 @@ class RendererPdf(BaseRenderer):
|
||||||
sections = json_content.get("sections", [])
|
sections = json_content.get("sections", [])
|
||||||
self.services.utils.debugLogToFile(f"PDF SECTIONS TO PROCESS: {len(sections)} sections", "PDF_RENDERER")
|
self.services.utils.debugLogToFile(f"PDF SECTIONS TO PROCESS: {len(sections)} sections", "PDF_RENDERER")
|
||||||
for i, section in enumerate(sections):
|
for i, section in enumerate(sections):
|
||||||
self.services.utils.debugLogToFile(f"PDF SECTION {i}: type={section.get('type', 'unknown')}, id={section.get('id', 'unknown')}", "PDF_RENDERER")
|
self.services.utils.debugLogToFile(f"PDF SECTION {i}: content_type={section.get('content_type', 'unknown')}, id={section.get('id', 'unknown')}", "PDF_RENDERER")
|
||||||
section_elements = self._render_json_section(section, styles)
|
section_elements = self._render_json_section(section, styles)
|
||||||
self.services.utils.debugLogToFile(f"PDF SECTION {i} ELEMENTS: {len(section_elements)} elements", "PDF_RENDERER")
|
self.services.utils.debugLogToFile(f"PDF SECTION {i} ELEMENTS: {len(section_elements)} elements", "PDF_RENDERER")
|
||||||
story.extend(section_elements)
|
story.extend(section_elements)
|
||||||
|
|
@ -469,23 +469,28 @@ class RendererPdf(BaseRenderer):
|
||||||
"""Render a single JSON section to PDF elements using AI-generated styles."""
|
"""Render a single JSON section to PDF elements using AI-generated styles."""
|
||||||
try:
|
try:
|
||||||
section_type = self._get_section_type(section)
|
section_type = self._get_section_type(section)
|
||||||
section_data = self._get_section_data(section)
|
elements = self._get_section_data(section)
|
||||||
|
|
||||||
if section_type == "table":
|
# Process each element in the section
|
||||||
return self._render_json_table(section_data, styles)
|
all_elements = []
|
||||||
elif section_type == "bullet_list":
|
for element in elements:
|
||||||
return self._render_json_bullet_list(section_data, styles)
|
if section_type == "table":
|
||||||
elif section_type == "heading":
|
all_elements.extend(self._render_json_table(element, styles))
|
||||||
return self._render_json_heading(section_data, styles)
|
elif section_type == "bullet_list":
|
||||||
elif section_type == "paragraph":
|
all_elements.extend(self._render_json_bullet_list(element, styles))
|
||||||
return self._render_json_paragraph(section_data, styles)
|
elif section_type == "heading":
|
||||||
elif section_type == "code_block":
|
all_elements.extend(self._render_json_heading(element, styles))
|
||||||
return self._render_json_code_block(section_data, styles)
|
elif section_type == "paragraph":
|
||||||
elif section_type == "image":
|
all_elements.extend(self._render_json_paragraph(element, styles))
|
||||||
return self._render_json_image(section_data, styles)
|
elif section_type == "code_block":
|
||||||
else:
|
all_elements.extend(self._render_json_code_block(element, styles))
|
||||||
# Fallback to paragraph for unknown types
|
elif section_type == "image":
|
||||||
return self._render_json_paragraph(section_data, styles)
|
all_elements.extend(self._render_json_image(element, styles))
|
||||||
|
else:
|
||||||
|
# Fallback to paragraph for unknown types
|
||||||
|
all_elements.extend(self._render_json_paragraph(element, styles))
|
||||||
|
|
||||||
|
return all_elements
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.warning(f"Error rendering section {self._get_section_id(section)}: {str(e)}")
|
self.logger.warning(f"Error rendering section {self._get_section_id(section)}: {str(e)}")
|
||||||
|
|
|
||||||
|
|
@ -600,29 +600,33 @@ JSON ONLY. NO OTHER TEXT."""
|
||||||
try:
|
try:
|
||||||
# Get section title from data or use default
|
# Get section title from data or use default
|
||||||
section_title = "Untitled Section"
|
section_title = "Untitled Section"
|
||||||
if section.get("type") == "heading":
|
if section.get("content_type") == "heading":
|
||||||
section_title = section.get("data", {}).get("text", "Untitled Section")
|
# Extract text from elements array
|
||||||
|
for element in section.get("elements", []):
|
||||||
|
if isinstance(element, dict) and "text" in element:
|
||||||
|
section_title = element.get("text", "Untitled Section")
|
||||||
|
break
|
||||||
elif section.get("title"):
|
elif section.get("title"):
|
||||||
section_title = section.get("title")
|
section_title = section.get("title")
|
||||||
|
|
||||||
content_type = section.get("type", "paragraph")
|
content_type = section.get("content_type", "paragraph")
|
||||||
section_data = section.get("data", {})
|
elements = section.get("elements", [])
|
||||||
|
|
||||||
# Build slide content based on section type
|
# Build slide content based on section type
|
||||||
content_parts = []
|
content_parts = []
|
||||||
|
|
||||||
if content_type == "table":
|
if content_type == "table":
|
||||||
content_parts.append(self._format_table_for_slide(section_data))
|
content_parts.append(self._format_table_for_slide(elements))
|
||||||
elif content_type == "list":
|
elif content_type == "list":
|
||||||
content_parts.append(self._format_list_for_slide(section_data))
|
content_parts.append(self._format_list_for_slide(elements))
|
||||||
elif content_type == "heading":
|
elif content_type == "heading":
|
||||||
content_parts.append(self._format_heading_for_slide(section_data))
|
content_parts.append(self._format_heading_for_slide(elements))
|
||||||
elif content_type == "paragraph":
|
elif content_type == "paragraph":
|
||||||
content_parts.append(self._format_paragraph_for_slide(section_data))
|
content_parts.append(self._format_paragraph_for_slide(elements))
|
||||||
elif content_type == "code":
|
elif content_type == "code":
|
||||||
content_parts.append(self._format_code_for_slide(section_data))
|
content_parts.append(self._format_code_for_slide(elements))
|
||||||
else:
|
else:
|
||||||
content_parts.append(self._format_paragraph_for_slide(section_data))
|
content_parts.append(self._format_paragraph_for_slide(elements))
|
||||||
|
|
||||||
# Combine content parts
|
# Combine content parts
|
||||||
slide_content = "\n\n".join(filter(None, content_parts))
|
slide_content = "\n\n".join(filter(None, content_parts))
|
||||||
|
|
@ -636,11 +640,17 @@ JSON ONLY. NO OTHER TEXT."""
|
||||||
logger.warning(f"Error creating slide from section: {str(e)}")
|
logger.warning(f"Error creating slide from section: {str(e)}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _format_table_for_slide(self, table_data: Dict[str, Any]) -> str:
|
def _format_table_for_slide(self, elements: List[Dict[str, Any]]) -> str:
|
||||||
"""Format table data for slide presentation."""
|
"""Format table data for slide presentation."""
|
||||||
try:
|
try:
|
||||||
headers = table_data.get("headers", [])
|
# Extract table data from elements array
|
||||||
rows = table_data.get("rows", [])
|
headers = []
|
||||||
|
rows = []
|
||||||
|
for element in elements:
|
||||||
|
if isinstance(element, dict) and "headers" in element and "rows" in element:
|
||||||
|
headers = element.get("headers", [])
|
||||||
|
rows = element.get("rows", [])
|
||||||
|
break
|
||||||
|
|
||||||
if not headers:
|
if not headers:
|
||||||
return ""
|
return ""
|
||||||
|
|
@ -805,8 +815,8 @@ JSON ONLY. NO OTHER TEXT."""
|
||||||
current_slide_title = "Content Overview"
|
current_slide_title = "Content Overview"
|
||||||
|
|
||||||
for section in sections:
|
for section in sections:
|
||||||
section_type = section.get("type", "paragraph")
|
section_type = section.get("content_type", "paragraph")
|
||||||
section_data = section.get("data", {})
|
elements = section.get("elements", [])
|
||||||
|
|
||||||
if section_type == "heading":
|
if section_type == "heading":
|
||||||
# If we have accumulated content, create a slide
|
# If we have accumulated content, create a slide
|
||||||
|
|
@ -818,7 +828,10 @@ JSON ONLY. NO OTHER TEXT."""
|
||||||
current_slide_content = []
|
current_slide_content = []
|
||||||
|
|
||||||
# Start new slide with heading as title
|
# Start new slide with heading as title
|
||||||
current_slide_title = section_data.get("text", "Untitled Section")
|
for element in elements:
|
||||||
|
if isinstance(element, dict) and "text" in element:
|
||||||
|
current_slide_title = element.get("text", "Untitled Section")
|
||||||
|
break
|
||||||
else:
|
else:
|
||||||
# Add content to current slide
|
# Add content to current slide
|
||||||
formatted_content = self._format_section_content(section)
|
formatted_content = self._format_section_content(section)
|
||||||
|
|
@ -841,21 +854,26 @@ JSON ONLY. NO OTHER TEXT."""
|
||||||
def _format_section_content(self, section: Dict[str, Any]) -> str:
|
def _format_section_content(self, section: Dict[str, Any]) -> str:
|
||||||
"""Format section content for slide presentation."""
|
"""Format section content for slide presentation."""
|
||||||
try:
|
try:
|
||||||
content_type = section.get("type", "paragraph")
|
content_type = section.get("content_type", "paragraph")
|
||||||
section_data = section.get("data", {})
|
elements = section.get("elements", [])
|
||||||
|
|
||||||
if content_type == "table":
|
# Process each element in the section
|
||||||
return self._format_table_for_slide(section_data)
|
content_parts = []
|
||||||
elif content_type == "list":
|
for element in elements:
|
||||||
return self._format_list_for_slide(section_data)
|
if content_type == "table":
|
||||||
elif content_type == "heading":
|
content_parts.append(self._format_table_for_slide([element]))
|
||||||
return self._format_heading_for_slide(section_data)
|
elif content_type == "list":
|
||||||
elif content_type == "paragraph":
|
content_parts.append(self._format_list_for_slide([element]))
|
||||||
return self._format_paragraph_for_slide(section_data)
|
elif content_type == "heading":
|
||||||
elif content_type == "code":
|
content_parts.append(self._format_heading_for_slide([element]))
|
||||||
return self._format_code_for_slide(section_data)
|
elif content_type == "paragraph":
|
||||||
else:
|
content_parts.append(self._format_paragraph_for_slide([element]))
|
||||||
return self._format_paragraph_for_slide(section_data)
|
elif content_type == "code":
|
||||||
|
content_parts.append(self._format_code_for_slide([element]))
|
||||||
|
else:
|
||||||
|
content_parts.append(self._format_paragraph_for_slide([element]))
|
||||||
|
|
||||||
|
return "\n\n".join(filter(None, content_parts))
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Error formatting section content: {str(e)}")
|
logger.warning(f"Error formatting section content: {str(e)}")
|
||||||
|
|
|
||||||
|
|
@ -457,7 +457,7 @@ class RendererXlsx(BaseRenderer):
|
||||||
# Add additional sheets for other content types
|
# Add additional sheets for other content types
|
||||||
content_types = set()
|
content_types = set()
|
||||||
for section in sections:
|
for section in sections:
|
||||||
content_type = section.get("type", "paragraph")
|
content_type = section.get("content_type", "paragraph")
|
||||||
content_types.add(content_type)
|
content_types.add(content_type)
|
||||||
|
|
||||||
if "table" in content_types and len(table_sections) == 1:
|
if "table" in content_types and len(table_sections) == 1:
|
||||||
|
|
@ -658,25 +658,21 @@ class RendererXlsx(BaseRenderer):
|
||||||
start_row += 1
|
start_row += 1
|
||||||
|
|
||||||
# Process section based on type
|
# Process section based on type
|
||||||
section_type = section.get("type", "paragraph")
|
section_type = section.get("content_type", "paragraph")
|
||||||
|
|
||||||
if section_type == "table":
|
# Handle all section types using elements array
|
||||||
# Handle table section directly
|
elements = section.get("elements", [])
|
||||||
table_data = section.get("data", {})
|
for element in elements:
|
||||||
if table_data:
|
if section_type == "table":
|
||||||
start_row = self._add_table_to_excel(sheet, table_data, styles, start_row)
|
start_row = self._add_table_to_excel(sheet, element, styles, start_row)
|
||||||
else:
|
elif section_type == "list":
|
||||||
# Handle other section types
|
start_row = self._add_list_to_excel(sheet, element, styles, start_row)
|
||||||
elements = section.get("elements", [])
|
elif section_type == "paragraph":
|
||||||
for element in elements:
|
start_row = self._add_paragraph_to_excel(sheet, element, styles, start_row)
|
||||||
if section_type == "list":
|
elif section_type == "heading":
|
||||||
start_row = self._add_list_to_excel(sheet, element, styles, start_row)
|
start_row = self._add_heading_to_excel(sheet, element, styles, start_row)
|
||||||
elif section_type == "paragraph":
|
else:
|
||||||
start_row = self._add_paragraph_to_excel(sheet, element, styles, start_row)
|
start_row = self._add_paragraph_to_excel(sheet, element, styles, start_row)
|
||||||
elif section_type == "heading":
|
|
||||||
start_row = self._add_heading_to_excel(sheet, element, styles, start_row)
|
|
||||||
else:
|
|
||||||
start_row = self._add_paragraph_to_excel(sheet, element, styles, start_row)
|
|
||||||
|
|
||||||
return start_row
|
return start_row
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -286,7 +286,7 @@ async def buildExtractionPrompt(
|
||||||
from .subJsonSchema import get_document_subJsonSchema
|
from .subJsonSchema import get_document_subJsonSchema
|
||||||
jsonSchema = get_document_subJsonSchema()
|
jsonSchema = get_document_subJsonSchema()
|
||||||
|
|
||||||
# Generic block for JSON extraction
|
# Generic block for JSON extraction - use proper schema instead of hardcoded template
|
||||||
genericIntro = f"""
|
genericIntro = f"""
|
||||||
{extractionIntent}
|
{extractionIntent}
|
||||||
|
|
||||||
|
|
@ -295,44 +295,7 @@ You are extracting structured content from documents and must respond with valid
|
||||||
CRITICAL: You must respond with valid JSON only. No additional text, explanations, markdown formatting, code blocks, or any other content outside the JSON structure. Do not use ``` markers or any other formatting.
|
CRITICAL: You must respond with valid JSON only. No additional text, explanations, markdown formatting, code blocks, or any other content outside the JSON structure. Do not use ``` markers or any other formatting.
|
||||||
|
|
||||||
Extract the actual data from the source documents and structure it as JSON with this format:
|
Extract the actual data from the source documents and structure it as JSON with this format:
|
||||||
{{
|
{json.dumps(jsonSchema, indent=2)}
|
||||||
"metadata": {{
|
|
||||||
"title": "Document Title",
|
|
||||||
"version": "1.0"
|
|
||||||
}},
|
|
||||||
"sections": [
|
|
||||||
{{
|
|
||||||
"id": "section_1",
|
|
||||||
"type": "heading",
|
|
||||||
"data": {{
|
|
||||||
"level": 1,
|
|
||||||
"text": "Heading Text"
|
|
||||||
}}
|
|
||||||
}},
|
|
||||||
{{
|
|
||||||
"id": "section_2",
|
|
||||||
"type": "table",
|
|
||||||
"data": {{
|
|
||||||
"headers": ["Column1", "Column2"],
|
|
||||||
"rows": [["Data1", "Data2"], ["Data3", "Data4"]]
|
|
||||||
}}
|
|
||||||
}},
|
|
||||||
{{
|
|
||||||
"id": "section_3",
|
|
||||||
"type": "bullet_list",
|
|
||||||
"data": {{
|
|
||||||
"items": ["Item 1", "Item 2", "Item 3"]
|
|
||||||
}}
|
|
||||||
}},
|
|
||||||
{{
|
|
||||||
"id": "section_4",
|
|
||||||
"type": "paragraph",
|
|
||||||
"data": {{
|
|
||||||
"text": "Paragraph content here"
|
|
||||||
}}
|
|
||||||
}}
|
|
||||||
]
|
|
||||||
}}
|
|
||||||
|
|
||||||
Content Types to Extract:
|
Content Types to Extract:
|
||||||
1. Tables: Extract all rows and columns with proper headers
|
1. Tables: Extract all rows and columns with proper headers
|
||||||
|
|
@ -485,8 +448,8 @@ def _getFormatRules(outputFormat: str) -> str:
|
||||||
|
|
||||||
async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None, services=None) -> str:
|
async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None, services=None) -> str:
|
||||||
"""
|
"""
|
||||||
Use AI to extract the core content intention from the user prompt.
|
Use AI to extract a rich, structured extraction intent from the user prompt.
|
||||||
Focus on WHAT the user wants to extract, not HOW to format it.
|
Include language, normalization, structure needs, headers, formats, row strategy, and multi-file guidance.
|
||||||
"""
|
"""
|
||||||
if not aiService:
|
if not aiService:
|
||||||
# Fallback if no AI service available
|
# Fallback if no AI service available
|
||||||
|
|
@ -496,18 +459,46 @@ async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=N
|
||||||
# Protect userPrompt from injection by escaping quotes and newlines
|
# Protect userPrompt from injection by escaping quotes and newlines
|
||||||
safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ')
|
safeUserPrompt = userPrompt.replace('"', '\\"').replace("'", "\\'").replace('\n', ' ').replace('\r', ' ')
|
||||||
|
|
||||||
# Simple AI call to extract the intention
|
# Rich analysis to derive a complete extraction intent and structure guidance
|
||||||
extractionPrompt = f"""
|
extractionPrompt = f"""
|
||||||
Extract the core content intention from this user request. Focus on WHAT raw data/content they want extracted.
|
Analyze the user's request and produce a RICH extraction intent. Return ONLY JSON.
|
||||||
|
|
||||||
|
Goals:
|
||||||
|
- Detect language and normalize the request into a full, explicit instruction (no summary; preserve all constraints and details).
|
||||||
|
- Decide if structured data is required; if so, define the target structure precisely (headers, order, formats, row strategy).
|
||||||
|
- Identify if multi-file output is appropriate and how to split/files name.
|
||||||
|
|
||||||
User request: "{safeUserPrompt}"
|
User request: "{safeUserPrompt}"
|
||||||
|
|
||||||
Return only the content intention in a simple format like "Extract: [content description]"
|
Return JSON in this exact shape:
|
||||||
Focus on extracting raw data, tables, lists, and factual content - NOT summaries or analysis.
|
{{
|
||||||
If the user mentions a table, extract the actual table data with rows and columns.
|
"detectedLanguage": "de|en|fr|it|...",
|
||||||
If the user mentions a list, extract the actual list items.
|
"normalizedRequest": "Full explicit instruction in detected language",
|
||||||
IMPORTANT: Preserve any language requirements in your response.
|
"requiresStructuredData": true|false,
|
||||||
Do not include formatting instructions, file types, or output methods.
|
"targetStructure": "table|list|mixed|unstructured",
|
||||||
|
"table": {{
|
||||||
|
"headers": ["Header1", "Header2", "..."],
|
||||||
|
"headerOrderStrict": true|false,
|
||||||
|
"rowStrategy": "one_row_per_document|one_row_per_entity|one_row_per_vat_rate|custom",
|
||||||
|
"formats": {{
|
||||||
|
"dateFormat": "DD.MM.YYYY|YYYY-MM-DD|...",
|
||||||
|
"amountDecimals": 2,
|
||||||
|
"currencyFormat": "code|symbol",
|
||||||
|
"idMasking": "none|last4|custom"
|
||||||
|
}}
|
||||||
|
}},
|
||||||
|
"multiFile": true|false,
|
||||||
|
"fileSplitStrategy": "single|per_entity|by_section|by_criteria|custom",
|
||||||
|
"fileNamingPattern": "suggested pattern for filenames",
|
||||||
|
"constraints": ["List of critical constraints to enforce"],
|
||||||
|
"reasoning": "Brief justification (one sentence)"
|
||||||
|
}}
|
||||||
|
|
||||||
|
Rules:
|
||||||
|
- Preserve user terminology and language in normalizedRequest.
|
||||||
|
- If the user listed columns/fields, copy them exactly into table.headers and set headerOrderStrict=true.
|
||||||
|
- If the user implies separate rows for rates/entities, set an appropriate rowStrategy (e.g., one_row_per_vat_rate).
|
||||||
|
- If no structure is required, set requiresStructuredData=false and targetStructure="unstructured".
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Call AI service to extract intention
|
# Call AI service to extract intention
|
||||||
|
|
@ -523,12 +514,24 @@ Do not include formatting instructions, file types, or output methods.
|
||||||
result = response.content if response else ""
|
result = response.content if response else ""
|
||||||
services.utils.debugLogToFile(f"DEBUG: Extraction intent processed", "PROMPT_BUILDER")
|
services.utils.debugLogToFile(f"DEBUG: Extraction intent processed", "PROMPT_BUILDER")
|
||||||
|
|
||||||
return result if result else f"Extract all relevant content from the document according to the user's requirements: {userPrompt}"
|
# Try to extract and pretty print JSON
|
||||||
|
if result:
|
||||||
|
import re, json as _json
|
||||||
|
match = re.search(r'\{[\s\S]*\}', result)
|
||||||
|
if match:
|
||||||
|
try:
|
||||||
|
obj = _json.loads(match.group(0))
|
||||||
|
return _json.dumps(obj, ensure_ascii=False, indent=2)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Fallback to previous simple format
|
||||||
|
return f"Extract: {safeUserPrompt}"
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Fallback on any error - preserve user prompt for language instructions
|
# Fallback on any error - preserve user prompt for language instructions
|
||||||
services.utils.debugLogToFile(f"DEBUG: AI extraction intent failed: {str(e)}", "PROMPT_BUILDER")
|
services.utils.debugLogToFile(f"DEBUG: AI extraction intent failed: {str(e)}", "PROMPT_BUILDER")
|
||||||
return f"Extract all relevant content from the document according to the user's requirements: {userPrompt}"
|
return f"Extract: {userPrompt}"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,243 @@
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from typing import Any, Dict, List, Set
|
||||||
|
from datetime import datetime, UTC
|
||||||
|
|
||||||
|
|
||||||
|
class NormalizationService:
|
||||||
|
"""
|
||||||
|
Produces a single canonical table in merged JSON using an AI-provided header mapping
|
||||||
|
and deterministic, in-code value normalization. No language heuristics in code.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, services):
|
||||||
|
self.services = services
|
||||||
|
|
||||||
|
# Public API
|
||||||
|
def discoverStructures(self, mergedJson: Dict[str, Any]) -> Dict[str, Any]:
|
||||||
|
headers: Set[str] = set()
|
||||||
|
samples: Dict[str, List[str]] = {}
|
||||||
|
|
||||||
|
sections = mergedJson.get("sections", []) if isinstance(mergedJson, dict) else []
|
||||||
|
for section in sections:
|
||||||
|
if not isinstance(section, dict):
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Use only the fundamental agreed JSON structure: content_type/elements
|
||||||
|
if section.get("content_type") != "table":
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Extract table data from elements array
|
||||||
|
for element in section.get("elements", []):
|
||||||
|
if isinstance(element, dict) and "headers" in element and "rows" in element:
|
||||||
|
hdrs = element.get("headers") or []
|
||||||
|
rows = element.get("rows") or []
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
continue
|
||||||
|
for h in hdrs:
|
||||||
|
if not isinstance(h, str):
|
||||||
|
continue
|
||||||
|
headers.add(h)
|
||||||
|
# collect small value samples by column index
|
||||||
|
for row in rows[:5]:
|
||||||
|
if not isinstance(row, list):
|
||||||
|
continue
|
||||||
|
for i, value in enumerate(row):
|
||||||
|
headerName = hdrs[i] if i < len(hdrs) else f"col_{i}"
|
||||||
|
if headerName not in samples:
|
||||||
|
samples[headerName] = []
|
||||||
|
if len(samples[headerName]) < 5:
|
||||||
|
samples[headerName].append(str(value))
|
||||||
|
|
||||||
|
return {
|
||||||
|
"tableHeaders": sorted(list(headers)),
|
||||||
|
"headerSamples": samples,
|
||||||
|
}
|
||||||
|
|
||||||
|
async def requestHeaderMapping(self, inventory: Dict[str, Any], cacheKey: str, canonicalSpec: Dict[str, Any] | None = None, mergePrompt: str | None = None) -> Dict[str, Any]:
|
||||||
|
|
||||||
|
# Allow caller to specify any canonical schema. If none provided, default to discovered headers.
|
||||||
|
if canonicalSpec is None:
|
||||||
|
canonicalSpec = {
|
||||||
|
"canonicalHeaders": inventory.get("tableHeaders", []),
|
||||||
|
"constraints": {}
|
||||||
|
}
|
||||||
|
|
||||||
|
# Protect merge prompt context by wrapping in single quotes and escaping internal quotes
|
||||||
|
protectedMerge = None
|
||||||
|
if mergePrompt:
|
||||||
|
try:
|
||||||
|
protectedMerge = str(mergePrompt).replace("'", "\\'")
|
||||||
|
except Exception:
|
||||||
|
protectedMerge = str(mergePrompt)
|
||||||
|
|
||||||
|
prompt = (
|
||||||
|
"You are a mapping generator. Return ONLY JSON.\n\n"
|
||||||
|
"Given discovered headers and sample values, map them to the canonical headers.\n"
|
||||||
|
"Do not invent fields. Use null if no mapping. Provide normalization policy.\n\n"
|
||||||
|
f"CANONICAL_SPEC:\n{json.dumps(canonicalSpec, ensure_ascii=False, indent=2)}\n\n"
|
||||||
|
f"HEADERS_DISCOVERED:\n{json.dumps(inventory, ensure_ascii=False, indent=2)}\n\n"
|
||||||
|
+ (f"MERGE_PROMPT_CONTEXT (protected):\n'{protectedMerge}'\n\n" if protectedMerge is not None else "") +
|
||||||
|
"REPLY JSON SHAPE:\n(Example)\n"
|
||||||
|
"{\n \"mappings\": {\"<sourceHeader>\": \"<Canonical>|null\"},\n"
|
||||||
|
" \"normalizationPolicy\": {\n \"TotalAmount\": {\"decimalSeparator\": \",\"|\".\"},\n"
|
||||||
|
" \"Currency\": {\"stripSymbols\": true},\n"
|
||||||
|
" \"Date\": {\"formats\": [\"DD.MM.YYYY\",\"YYYY-MM-DD\"]}\n }\n}\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
response = await self.services.ai.callAi(prompt=prompt)
|
||||||
|
js = response[response.find('{'):response.rfind('}') + 1] if response else '{}'
|
||||||
|
mapping = json.loads(js)
|
||||||
|
# Normalize key naming from AI: prefer single key "mapping"
|
||||||
|
if "mapping" not in mapping and "mappings" in mapping and isinstance(mapping["mappings"], dict):
|
||||||
|
mapping["mapping"] = mapping["mappings"]
|
||||||
|
try:
|
||||||
|
del mapping["mappings"]
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
# Ensure canonicalHeaders present in mapping for downstream use
|
||||||
|
if "canonicalHeaders" not in mapping:
|
||||||
|
mapping["canonicalHeaders"] = canonicalSpec.get("canonicalHeaders", [])
|
||||||
|
|
||||||
|
# debug artifact
|
||||||
|
self._writeDebugArtifact("mapping.json", mapping)
|
||||||
|
return mapping
|
||||||
|
|
||||||
|
def applyMapping(self, mergedJson: Dict[str, Any], mappingSpec: Dict[str, Any]) -> Dict[str, Any]:
|
||||||
|
mappings = (mappingSpec or {}).get("mapping", {})
|
||||||
|
policy = (mappingSpec or {}).get("normalizationPolicy", {})
|
||||||
|
|
||||||
|
# Prefer headers provided by mapping (generic across domains)
|
||||||
|
canonicalHeaders = (mappingSpec or {}).get("canonicalHeaders") or []
|
||||||
|
if not canonicalHeaders:
|
||||||
|
# Fallback to union of mapped targets
|
||||||
|
canonicalHeaders = sorted(list({t for t in mappings.values() if t}))
|
||||||
|
|
||||||
|
rows: List[List[str]] = []
|
||||||
|
sections = mergedJson.get("sections", []) if isinstance(mergedJson, dict) else []
|
||||||
|
for section in sections:
|
||||||
|
# Use only the fundamental agreed JSON structure: content_type/elements
|
||||||
|
if section.get("content_type") != "table":
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Extract table data from elements array
|
||||||
|
for element in section.get("elements", []):
|
||||||
|
if isinstance(element, dict) and "headers" in element and "rows" in element:
|
||||||
|
sourceHeaders = element.get("headers") or []
|
||||||
|
sourceRows = element.get("rows") or []
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
continue
|
||||||
|
if not sourceHeaders or not sourceRows:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Build index map: canonical -> source index or None
|
||||||
|
indexMap: Dict[str, int] = {}
|
||||||
|
for ci, ch in enumerate(canonicalHeaders):
|
||||||
|
srcIndex = None
|
||||||
|
for si, sh in enumerate(sourceHeaders):
|
||||||
|
# Prefer explicit mapping target; fallback to identity when names match
|
||||||
|
target = mappings.get(sh)
|
||||||
|
if target is None and sh == ch:
|
||||||
|
target = ch
|
||||||
|
if target == ch:
|
||||||
|
srcIndex = si
|
||||||
|
break
|
||||||
|
indexMap[ch] = srcIndex
|
||||||
|
|
||||||
|
# Transform rows
|
||||||
|
for r in sourceRows:
|
||||||
|
canonicalRow: List[str] = []
|
||||||
|
for ch in canonicalHeaders:
|
||||||
|
idx = indexMap.get(ch)
|
||||||
|
value = r[idx] if (idx is not None and idx < len(r)) else ""
|
||||||
|
canonicalRow.append(self._normalizeValue(ch, value, policy))
|
||||||
|
# consider as row if at least one non-empty meaningful field
|
||||||
|
if any(v.strip() for v in canonicalRow):
|
||||||
|
rows.append(canonicalRow)
|
||||||
|
|
||||||
|
canonical = {
|
||||||
|
"metadata": {
|
||||||
|
"title": mergedJson.get("metadata", {}).get("title", "Merged Document"),
|
||||||
|
"source_documents": mergedJson.get("metadata", {}).get("source_documents", [])
|
||||||
|
},
|
||||||
|
"sections": [
|
||||||
|
{
|
||||||
|
"id": "canonical_table_1",
|
||||||
|
"content_type": "table",
|
||||||
|
"elements": [
|
||||||
|
{
|
||||||
|
"headers": canonicalHeaders,
|
||||||
|
"rows": rows
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"order": 1
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
# debug artifact
|
||||||
|
self._writeDebugArtifact("canonical_merged.json", canonical)
|
||||||
|
return canonical
|
||||||
|
|
||||||
|
def validateCanonical(self, canonicalJson: Dict[str, Any]) -> Dict[str, Any]:
|
||||||
|
rows = []
|
||||||
|
try:
|
||||||
|
sections = canonicalJson.get("sections", [])
|
||||||
|
for s in sections:
|
||||||
|
if s.get("content_type") == "table":
|
||||||
|
# Extract rows from elements array
|
||||||
|
for element in s.get("elements", []):
|
||||||
|
if isinstance(element, dict) and "rows" in element:
|
||||||
|
rows.extend(element.get("rows", []))
|
||||||
|
except Exception:
|
||||||
|
rows = []
|
||||||
|
report = {
|
||||||
|
"rowCount": len(rows),
|
||||||
|
"success": len(rows) > 0
|
||||||
|
}
|
||||||
|
self._writeDebugArtifact("normalization_report.json", report)
|
||||||
|
return report
|
||||||
|
|
||||||
|
# Internal helpers
|
||||||
|
def _normalizeValue(self, canonicalHeader: str, value: Any, policy: Dict[str, Any]) -> str:
|
||||||
|
if value is None:
|
||||||
|
return ""
|
||||||
|
text = str(value).strip()
|
||||||
|
# Generic normalization guided by policy; avoid domain specifics
|
||||||
|
if canonicalHeader in (policy.get("numericFields", []) or []):
|
||||||
|
dec = ((policy.get(canonicalHeader) or {}).get("decimalSeparator")
|
||||||
|
or (policy.get("numeric") or {}).get("decimalSeparator")
|
||||||
|
or ".")
|
||||||
|
if dec == ",":
|
||||||
|
text = text.replace(".", "").replace(",", ".") if "," in text else text
|
||||||
|
text = ''.join(ch for ch in text if ch.isdigit() or ch in ['.', '-', '+'])
|
||||||
|
elif (policy.get("text") or {}).get("stripSymbols") and canonicalHeader in (policy.get("text", {}).get("applyTo", []) or []):
|
||||||
|
text = ''.join(ch for ch in text if ch.isalpha())
|
||||||
|
text = text.upper()
|
||||||
|
return text
|
||||||
|
|
||||||
|
def _writeDebugArtifact(self, fileName: str, obj: Any) -> None:
|
||||||
|
try:
|
||||||
|
debugEnabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
|
||||||
|
if not debugEnabled:
|
||||||
|
return
|
||||||
|
root = "./test-chat/ai"
|
||||||
|
os.makedirs(root, exist_ok=True)
|
||||||
|
# Prefix timestamp for files that are frequently overwritten
|
||||||
|
ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
|
||||||
|
if fileName in ("mapping.json", "canonical_merged.json"):
|
||||||
|
outName = f"{ts}_{fileName}"
|
||||||
|
else:
|
||||||
|
outName = fileName
|
||||||
|
path = os.path.join(root, outName)
|
||||||
|
with open(path, "w", encoding="utf-8") as f:
|
||||||
|
if isinstance(obj, (dict, list)):
|
||||||
|
f.write(json.dumps(obj, ensure_ascii=False, indent=2))
|
||||||
|
else:
|
||||||
|
f.write(str(obj))
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -42,16 +42,27 @@ def extractUserPrompt(context: Any) -> str:
|
||||||
Fallback to the task_step objective.
|
Fallback to the task_step objective.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
# Prefer services.currentUserPrompt when accessible through context
|
|
||||||
services = getattr(context, 'services', None)
|
services = getattr(context, 'services', None)
|
||||||
if services and getattr(services, 'currentUserPrompt', None):
|
|
||||||
return services.currentUserPrompt
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
if hasattr(context, 'task_step') and context.task_step:
|
# Determine raw user prompt from services or task_step
|
||||||
return context.task_step.objective or 'No request specified'
|
rawPrompt = None
|
||||||
return 'No request specified'
|
if services and getattr(services, 'currentUserPrompt', None):
|
||||||
|
rawPrompt = services.currentUserPrompt
|
||||||
|
elif hasattr(context, 'task_step') and context.task_step:
|
||||||
|
rawPrompt = context.task_step.objective or 'No request specified'
|
||||||
|
else:
|
||||||
|
rawPrompt = 'No request specified'
|
||||||
|
|
||||||
|
# Prefer values computed at workflow start by WorkflowManager analyzer
|
||||||
|
normalized = getattr(services, 'currentUserPromptNormalized', None) if services else None
|
||||||
|
if normalized:
|
||||||
|
return normalized
|
||||||
|
return rawPrompt
|
||||||
|
except Exception:
|
||||||
|
# Robust fallback behavior
|
||||||
|
if hasattr(context, 'task_step') and context.task_step:
|
||||||
|
return context.task_step.objective or 'No request specified'
|
||||||
|
return 'No request specified'
|
||||||
|
|
||||||
def extractWorkflowHistory(service: Any, context: Any) -> str:
|
def extractWorkflowHistory(service: Any, context: Any) -> str:
|
||||||
"""Extract workflow history from context. Maps to {{KEY:WORKFLOW_HISTORY}}
|
"""Extract workflow history from context. Maps to {{KEY:WORKFLOW_HISTORY}}
|
||||||
|
|
@ -99,7 +110,15 @@ def extractAvailableMethods(service: Any) -> str:
|
||||||
|
|
||||||
def extractUserLanguage(service: Any) -> str:
|
def extractUserLanguage(service: Any) -> str:
|
||||||
"""Extract user language from service. Maps to {{KEY:USER_LANGUAGE}}"""
|
"""Extract user language from service. Maps to {{KEY:USER_LANGUAGE}}"""
|
||||||
return service.user.language if service and service.user else 'en'
|
try:
|
||||||
|
# Prefer detected language if available
|
||||||
|
if service and getattr(service, 'currentUserLanguage', None):
|
||||||
|
return service.currentUserLanguage
|
||||||
|
return service.user.language if service and service.user else 'en'
|
||||||
|
except Exception:
|
||||||
|
return 'en'
|
||||||
|
|
||||||
|
# Normalization now happens centrally in WorkflowManager._sendFirstMessage; no AI call here.
|
||||||
|
|
||||||
|
|
||||||
def _computeMessageSummary(msg) -> str:
|
def _computeMessageSummary(msg) -> str:
|
||||||
|
|
|
||||||
|
|
@ -216,23 +216,23 @@ class WorkflowManager:
|
||||||
# Update the message with documents in database
|
# Update the message with documents in database
|
||||||
self.services.workflow.updateMessage(message.id, {"documents": [doc.to_dict() for doc in documents]})
|
self.services.workflow.updateMessage(message.id, {"documents": [doc.to_dict() for doc in documents]})
|
||||||
|
|
||||||
# Analyze the user's input to extract intent and offload bulky context into documents
|
# Analyze the user's input to detect language, normalize request, extract intent, and offload bulky context into documents
|
||||||
try:
|
try:
|
||||||
analyzerPrompt = (
|
analyzerPrompt = (
|
||||||
"You are an input analyzer. Split the user's message into:\n"
|
"You are an input analyzer. From the user's message, perform ALL of the following in one pass:\n"
|
||||||
"1) intent: the user's core request in one concise paragraph, normalized to the user's language.\n"
|
"1) detectedLanguage: detect ISO 639-1 language code (e.g., de, en).\n"
|
||||||
"2) contextItems: supportive data to attach as separate documents if significantly larger than the intent. "
|
"2) normalizedRequest: full, explicit restatement of the user's request in the detected language; do NOT summarize; preserve ALL constraints and details.\n"
|
||||||
"Include large literal data blocks, long lists/tables, code/JSON blocks, quoted transcripts, CSV fragments, or detailed specs. "
|
"3) intent: concise single-paragraph core request in the detected language for high-level routing.\n"
|
||||||
"Keep URLs in the intent unless they include large pasted content.\n\n"
|
"4) contextItems: supportive data blocks to attach as separate documents if significantly larger than the intent (large literal content, long lists/tables, code/JSON blocks, transcripts, CSV fragments, detailed specs). Keep URLs in the intent unless they embed large pasted content.\n\n"
|
||||||
"Rules:\n"
|
"Rules:\n"
|
||||||
"- If total content length (intent + data) is less than 10% of the model's max tokens, do not extract; "
|
"- If total content (intent + data) is < 10% of model max tokens, do not extract; return empty contextItems and keep intent compact and self-contained.\n"
|
||||||
"return an empty contextItems and keep a compact, self-contained intent.\n"
|
"- If content exceeds that threshold, move bulky parts into contextItems; keep intent short and clear.\n"
|
||||||
"- If content exceeds that, move bulky parts into contextItems, keeping the intent short and clear.\n"
|
"- Preserve critical references (URLs, filenames) in intent.\n"
|
||||||
"- Preserve critical references (URLs, filenames) in the intent.\n"
|
"- Normalize to the primary detected language if mixed-language.\n\n"
|
||||||
"- Normalize the intent to the detected language. If mixed-language, use the primary detected language and normalize.\n\n"
|
"Return ONLY JSON (no markdown) with this shape:\n"
|
||||||
"Output JSON only (no markdown):\n"
|
|
||||||
"{\n"
|
"{\n"
|
||||||
" \"detectedLanguage\": \"en\",\n"
|
" \"detectedLanguage\": \"de|en|fr|it|...\",\n"
|
||||||
|
" \"normalizedRequest\": \"Full explicit instruction in detected language\",\n"
|
||||||
" \"intent\": \"Concise normalized request...\",\n"
|
" \"intent\": \"Concise normalized request...\",\n"
|
||||||
" \"contextItems\": [\n"
|
" \"contextItems\": [\n"
|
||||||
" {\n"
|
" {\n"
|
||||||
|
|
@ -249,6 +249,7 @@ class WorkflowManager:
|
||||||
aiResponse = await self.services.ai.callAi(prompt=analyzerPrompt)
|
aiResponse = await self.services.ai.callAi(prompt=analyzerPrompt)
|
||||||
|
|
||||||
detectedLanguage = None
|
detectedLanguage = None
|
||||||
|
normalizedRequest = None
|
||||||
intentText = userInput.prompt
|
intentText = userInput.prompt
|
||||||
contextItems = []
|
contextItems = []
|
||||||
|
|
||||||
|
|
@ -260,6 +261,7 @@ class WorkflowManager:
|
||||||
if jsonStart != -1 and jsonEnd > jsonStart:
|
if jsonStart != -1 and jsonEnd > jsonStart:
|
||||||
parsed = json.loads(aiResponse[jsonStart:jsonEnd])
|
parsed = json.loads(aiResponse[jsonStart:jsonEnd])
|
||||||
detectedLanguage = parsed.get('detectedLanguage') or None
|
detectedLanguage = parsed.get('detectedLanguage') or None
|
||||||
|
normalizedRequest = parsed.get('normalizedRequest') or None
|
||||||
if parsed.get('intent'):
|
if parsed.get('intent'):
|
||||||
intentText = parsed.get('intent')
|
intentText = parsed.get('intent')
|
||||||
contextItems = parsed.get('contextItems') or []
|
contextItems = parsed.get('contextItems') or []
|
||||||
|
|
@ -269,7 +271,18 @@ class WorkflowManager:
|
||||||
# Update services state
|
# Update services state
|
||||||
if detectedLanguage and isinstance(detectedLanguage, str):
|
if detectedLanguage and isinstance(detectedLanguage, str):
|
||||||
self._setUserLanguage(detectedLanguage)
|
self._setUserLanguage(detectedLanguage)
|
||||||
|
try:
|
||||||
|
setattr(self.services, 'currentUserLanguage', detectedLanguage)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
self.services.currentUserPrompt = intentText or userInput.prompt
|
self.services.currentUserPrompt = intentText or userInput.prompt
|
||||||
|
try:
|
||||||
|
if normalizedRequest:
|
||||||
|
setattr(self.services, 'currentUserPromptNormalized', normalizedRequest)
|
||||||
|
if contextItems is not None:
|
||||||
|
setattr(self.services, 'currentUserContextItems', contextItems)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
# Telemetry (sizes and counts)
|
# Telemetry (sizes and counts)
|
||||||
try:
|
try:
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue