content mapping and transformation validated

This commit is contained in:
ValueOn AG 2025-10-14 22:48:55 +02:00
parent e0afc72e13
commit f0733204fb
7 changed files with 249 additions and 42 deletions

View file

@@ -176,22 +176,28 @@ class SubDocumentProcessing:
# Merge with JSON mode
mergedJsonDocument = self._mergeChunkResultsJson(chunkResults, options)
# Normalize merged JSON into a single canonical table
# Normalize merged JSON into a single canonical table (only if table content exists)
try:
from modules.services.serviceNormalization.mainServiceNormalization import NormalizationService
normalizer = NormalizationService(self.services)
inventory = normalizer.discoverStructures(mergedJsonDocument)
# Use workflow id as cache key
cacheKey = self.services.currentWorkflow.id
# Provide the extraction/merge prompt context when available to help mapping
mergePrompt = prompt
mapping = await normalizer.requestHeaderMapping(inventory, cacheKey, None, mergePrompt)
canonical = normalizer.applyMapping(mergedJsonDocument, mapping)
report = normalizer.validateCanonical(canonical)
if report.get('success'):
mergedJsonDocument = canonical
# Check if any table content was discovered
tableHeaders = inventory.get("tableHeaders", [])
if not tableHeaders:
logger.info("No table content found in merged JSON, skipping normalization and returning original structure")
else:
raise ValueError('Normalization produced zero rows')
# Use workflow id as cache key
cacheKey = self.services.currentWorkflow.id
# Provide the extraction/merge prompt context when available to help mapping
mergePrompt = prompt
mapping = await normalizer.requestHeaderMapping(inventory, cacheKey, None, mergePrompt)
canonical = normalizer.applyMapping(mergedJsonDocument, mapping)
report = normalizer.validateCanonical(canonical)
if report.get('success'):
mergedJsonDocument = canonical
else:
raise ValueError('Normalization produced zero rows')
except Exception as e:
# Surface normalization failure while leaving original merged JSON (single-path expectation is to fail)
raise

View file

@@ -108,16 +108,30 @@ class RendererText(BaseRenderer):
elif section_type == "bullet_list":
return self._render_json_bullet_list(section_data)
elif section_type == "heading":
return self._render_json_heading(section_data)
# Render each heading element in the elements array
# section_data is already the elements array from _get_section_data
rendered_elements = []
for element in section_data:
rendered_elements.append(self._render_json_heading(element))
return "\n".join(rendered_elements)
elif section_type == "paragraph":
return self._render_json_paragraph(section_data)
# Render each paragraph element in the elements array
# section_data is already the elements array from _get_section_data
rendered_elements = []
for element in section_data:
rendered_elements.append(self._render_json_paragraph(element))
return "\n".join(rendered_elements)
elif section_type == "code_block":
return self._render_json_code_block(section_data)
elif section_type == "image":
return self._render_json_image(section_data)
else:
# Fallback to paragraph for unknown types
return self._render_json_paragraph(section_data)
# Fallback to paragraph for unknown types - render each element
# section_data is already the elements array from _get_section_data
rendered_elements = []
for element in section_data:
rendered_elements.append(self._render_json_paragraph(element))
return "\n".join(rendered_elements)
except Exception as e:
self.logger.warning(f"Error rendering section {self._get_section_id(section)}: {str(e)}")

View file

@@ -45,7 +45,28 @@ async def buildAdaptiveExtractionPrompt(
"filename": "section_1.xlsx",
"sections": [
{
"id": "table_1",
"id": "section_1",
"content_type": "heading",
"elements": [
{
"level": 1,
"text": "1. SECTION TITLE"
}
],
"order": 1
},
{
"id": "section_2",
"content_type": "paragraph",
"elements": [
{
"text": "This is the actual content that should be extracted from the document."
}
],
"order": 2
},
{
"id": "section_3",
"content_type": "table",
"elements": [
{
@@ -53,7 +74,7 @@ async def buildAdaptiveExtractionPrompt(
"rows": [["Value 1", "Value 2"]]
}
],
"order": 1
"order": 3
}
]
}
@@ -69,7 +90,28 @@ async def buildAdaptiveExtractionPrompt(
},
"sections": [
{
"id": "table_1",
"id": "section_1",
"content_type": "heading",
"elements": [
{
"level": 1,
"text": "1. SECTION TITLE"
}
],
"order": 1
},
{
"id": "section_2",
"content_type": "paragraph",
"elements": [
{
"text": "This is the actual content that should be extracted from the document."
}
],
"order": 2
},
{
"id": "section_3",
"content_type": "table",
"elements": [
{
@@ -77,7 +119,7 @@ async def buildAdaptiveExtractionPrompt(
"rows": [["Value 1", "Value 2"]]
}
],
"order": 1
"order": 3
}
]
}
@@ -253,14 +295,11 @@ Consider the user's intent and the most logical way to organize the extracted content
"sections": [
{
"id": "section_001",
"content_type": "table",
"content_type": "heading",
"elements": [
{
"headers": ["Column 1", "Column 2", "Column 3"],
"rows": [
["Value 1", "Value 2", "Value 3"],
["Value 4", "Value 5", "Value 6"]
]
"level": 1,
"text": "1. SECTION TITLE"
}
],
"order": 1,
@@ -340,7 +379,7 @@ async def buildExtractionPrompt(
from .subJsonSchema import get_document_subJsonSchema
jsonSchema = get_document_subJsonSchema()
# Generic block for JSON extraction - use example data instead of schema
# Generic block for JSON extraction - use mixed example data showing different content types
example_data = {
"metadata": {
"title": "Example Document",
@@ -351,6 +390,29 @@ async def buildExtractionPrompt(
"sections": [
{
"id": "section_001",
"content_type": "heading",
"elements": [
{
"level": 1,
"text": "1. INTRODUCTION"
}
],
"order": 1,
"metadata": {}
},
{
"id": "section_002",
"content_type": "paragraph",
"elements": [
{
"text": "This is a sample paragraph with actual content that should be extracted from the document."
}
],
"order": 2,
"metadata": {}
},
{
"id": "section_003",
"content_type": "table",
"elements": [
{
@@ -361,7 +423,7 @@ async def buildExtractionPrompt(
]
}
],
"order": 1,
"order": 3,
"metadata": {}
}
],
@@ -486,17 +548,38 @@ CRITICAL: The AI MUST generate content using the CANONICAL JSON FORMAT with this
"sections": [
{{
"id": "section_1",
"content_type": "table",
"content_type": "heading",
"elements": [
{{
"headers": ["Column1", "Column2", "Column3"],
"rows": [
["Value1", "Value2", "Value3"],
["Value4", "Value5", "Value6"]
]
"level": 1,
"text": "1. SECTION TITLE"
}}
],
"order": 1
}},
{{
"id": "section_2",
"content_type": "paragraph",
"elements": [
{{
"text": "This is the actual content that should be extracted from the document."
}}
],
"order": 2
}},
{{
"id": "section_3",
"content_type": "table",
"elements": [
{{
"headers": ["Column 1", "Column 2", "Column 3"],
"rows": [
["Value 1", "Value 2", "Value 3"],
["Value 4", "Value 5", "Value 6"]
]
}}
],
"order": 3
}}
]
}}

View file

@@ -597,12 +597,16 @@ class WorkflowService:
if not workflow or not hasattr(workflow, 'messages'):
return "No documents available"
# Reload workflow from database to ensure we have all messages
if hasattr(workflow, 'id'):
try:
workflow = self.getWorkflow(workflow.id)
except Exception as e:
logger.warning(f"Could not reload workflow from database: {str(e)}")
# Use the provided workflow object directly to avoid database reload issues
# that can cause filename truncation. The workflow object should already be up-to-date.
logger.debug(f"Using provided workflow object for getAvailableDocuments (ID: {workflow.id if hasattr(workflow, 'id') else 'unknown'})")
# Debug: Check document filenames in the workflow object
if hasattr(workflow, 'messages') and workflow.messages:
for message in workflow.messages:
if hasattr(message, 'documents') and message.documents:
for doc in message.documents:
logger.debug(f"Workflow document {doc.id}: fileName='{doc.fileName}' (length: {len(doc.fileName)})")
# Get document reference list using the exact same logic as old system
document_list = self._getDocumentReferenceList(workflow)
@@ -739,12 +743,22 @@ class WorkflowService:
"""Update file attributes (fileName, fileSize, mimeType) for documents"""
for doc in documents:
try:
# Debug: Log original filename before refresh
original_filename = doc.fileName
logger.debug(f"Before refresh - Document {doc.id}: fileName='{original_filename}' (length: {len(original_filename)})")
# Use the proper WorkflowService method to get file info
file_info = self.getFileInfo(doc.fileId)
if file_info:
db_filename = file_info.get("fileName", doc.fileName)
logger.debug(f"Database filename for {doc.id}: '{db_filename}' (length: {len(db_filename)})")
doc.fileName = file_info.get("fileName", doc.fileName)
doc.fileSize = file_info.get("size", doc.fileSize)
doc.mimeType = file_info.get("mimeType", doc.mimeType)
# Debug: Log final filename after refresh
logger.debug(f"After refresh - Document {doc.id}: fileName='{doc.fileName}' (length: {len(doc.fileName)})")
else:
logger.warning(f"File not found for document {doc.id}, fileId: {doc.fileId}")
except Exception as e:
@@ -760,6 +774,8 @@ class WorkflowService:
def _getDocumentReferenceFromChatDocument(self, document, message) -> str:
"""Get document reference using document ID and filename."""
try:
# Debug logging to track filename truncation
logger.debug(f"Creating document reference for {document.id}: fileName='{document.fileName}' (length: {len(document.fileName)})")
# Use document ID and filename for simple reference
return f"docItem:{document.id}:{document.fileName}"
except Exception as e:

View file

@@ -3,6 +3,7 @@
import re
import logging
import json
from typing import List, Dict, Any
logger = logging.getLogger(__name__)
@@ -14,8 +15,14 @@ class ContentValidator:
pass
def validateContent(self, documents: List[Any], intent: Dict[str, Any]) -> Dict[str, Any]:
"""Validates delivered content against user intent"""
"""Validates delivered content against user intent using AI"""
try:
# First, try AI-based validation for intelligent gap analysis
aiValidation = self._validateWithAI(documents, intent)
if aiValidation:
return aiValidation
# Fallback to rule-based validation if AI validation fails
validationDetails = []
for doc in documents:
@@ -306,3 +313,73 @@ class ContentValidator:
"validationDetails": [],
"improvementSuggestions": [f"Validation failed: {error}"]
}
def _validateWithAI(self, documents: List[Any], intent: Dict[str, Any]) -> Dict[str, Any]:
"""AI-based validation to intelligently assess task completion"""
try:
# Extract content from all documents
documentContents = []
for doc in documents:
content = self._extractContent(doc)
documentContents.append({
"name": getattr(doc, 'documentName', 'Unknown'),
"content": content[:2000] # Limit content for AI processing
})
# Create AI validation prompt
validationPrompt = f"""
You are a task completion validator. Analyze if the delivered content actually fulfills the user's request.
USER REQUEST: {intent.get('primaryGoal', 'Unknown')}
DELIVERED CONTENT:
{json.dumps(documentContents, indent=2)}
TASK: Determine if the user's request has been fully completed.
Analyze the gap between what was requested and what was delivered. Consider any missing elements, incorrect formats, incomplete work, or other discrepancies.
Respond with JSON only:
{{
"overallSuccess": true/false,
"qualityScore": 0.0-1.0,
"gapAnalysis": "Detailed analysis of what's missing or incorrect",
"improvementSuggestions": ["specific action 1", "specific action 2"]
}}
"""
# Call AI service for validation
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
request_options = AiCallOptions()
request_options.operationType = OperationType.GENERAL
request = AiCallRequest(prompt=validationPrompt, context="", options=request_options)
# Get AI service from the workflow context
if hasattr(self, 'services') and hasattr(self.services, 'ai'):
response = self.services.ai.aiObjects.call(request)
if response and response.content:
import re
result = response.content.strip()
json_match = re.search(r'\{.*\}', result, re.DOTALL)
if json_match:
result = json_match.group(0)
aiResult = json.loads(result)
return {
"overallSuccess": aiResult.get("overallSuccess", False),
"qualityScore": aiResult.get("qualityScore", 0.0),
"validationDetails": [{
"documentName": "AI Validation",
"gapAnalysis": aiResult.get("gapAnalysis", ""),
"successCriteriaMet": [aiResult.get("overallSuccess", False)]
}],
"improvementSuggestions": aiResult.get("improvementSuggestions", [])
}
return None # Fallback to rule-based validation
except Exception as e:
logger.error(f"AI validation failed: {str(e)}")
return None # Fallback to rule-based validation

View file

@@ -33,6 +33,7 @@ class ReactMode(BaseMode):
# Initialize adaptive components
self.intentAnalyzer = IntentAnalyzer()
self.contentValidator = ContentValidator()
self.contentValidator.services = self.services # Pass services for AI validation
self.learningEngine = LearningEngine()
self.progressTracker = ProgressTracker()
self.currentIntent = None
@@ -235,8 +236,8 @@ class ReactMode(BaseMode):
valid_refs = []
for line in available_docs.split('\n'):
if 'docList:' in line or 'docItem:' in line:
# Extract reference from line like " - docList:msg_xxx:label"
ref_match = re.search(r'(docList:[^\s]+|docItem:[^\s]+)', line)
# Extract reference from line like " - docList:msg_xxx:label" or " - docItem:xxx:filename with spaces"
ref_match = re.search(r'(docList:[^\s]+|docItem:[^\s]+(?:\s+[^\s]+)*)', line)
if ref_match:
valid_refs.append(ref_match.group(1))

View file

@@ -28,6 +28,8 @@ def generateTaskPlanningPrompt(services, context: Any) -> PromptBundle:
Break down user requests into logical, executable task steps.
**IMPORTANT**: If the user asks for ONE complete business objective, create ONLY ONE task that accomplishes the entire objective. Do NOT split it into multiple micro-tasks.
## 📋 Context
### User Request
@@ -46,12 +48,20 @@ Break down user requests into logical, executable task steps.
- **ONE TOPIC PER TASK** - Each task should handle one complete business objective
- **HIGH-LEVEL FOCUS** - Plan strategic outcomes, not implementation steps
- **AVOID MICRO-TASKS** - Don't create separate tasks for each small action
- **CRITICAL**: If the user asks for ONE thing (like "analyse document list and produce summary"), create ONLY ONE task that does the complete job
### Task Grouping Examples
- **Research + Analysis + Report** → ONE task: "Web research report"
- **Data Collection + Processing + Visualization** → ONE task: "Collect and present data"
- **Document splitting** (analyze + extract + create files) → ONE task: "Split document into separate files"
- **Different topics** (email + flowers) → SEPARATE tasks: "Send formal email..." + "Order flowers from Fleurop for delivery to 123 Main St, include card message"
### Common Single-Task Scenarios
- **"Split document into sections"** → ONE task: "Split document into separate files"
- **"Extract data and create report"** → ONE task: "Extract data and create report"
- **"Analyze and summarize document"** → ONE task: "Analyze and summarize document"
- **"Convert file to different format"** → ONE task: "Convert file to different format"
### Retry Handling
- **If retry request**: Analyze previous rounds to understand what failed
- **Learn from mistakes**: Improve the plan based on previous failures