Fixed critical bug in models that used the previous workflow object instead of the properly instantiated current object

This commit is contained in:
ValueOn AG 2025-11-04 14:30:48 +01:00
parent 55fb23f7c0
commit 2255c9009d
6 changed files with 402 additions and 371 deletions

View file

@ -1,5 +1,6 @@
import json
import logging
import re
import time
from typing import Dict, Any, List, Optional, Tuple, Union
from modules.datamodels.datamodelChat import PromptPlaceholder, ChatDocument
@ -731,28 +732,68 @@ Respond with ONLY a JSON object in this exact format:
self.services.chat.progressLogFinish(aiOperationId, False)
return {"success": False, "error": f"Generated content is not valid JSON: {str(e)}"}
# Extract title and filename from generated document structure
extractedTitle = title # Default to user-provided title
extractedFilename = None
if isinstance(generated_data, dict) and "documents" in generated_data:
documents = generated_data["documents"]
if isinstance(documents, list) and len(documents) > 0:
firstDoc = documents[0]
if isinstance(firstDoc, dict):
# Extract title from document (preferred over user-provided title)
if firstDoc.get("title"):
extractedTitle = firstDoc["title"]
# Extract filename from document
if firstDoc.get("filename"):
extractedFilename = firstDoc["filename"]
# Ensure metadata contains the extracted title for renderers
if "metadata" not in generated_data:
generated_data["metadata"] = {}
if extractedTitle:
generated_data["metadata"]["title"] = extractedTitle
self.services.chat.progressLogUpdate(aiOperationId, 0.8, f"Rendering to {outputFormat} format")
# Render to final format using the existing renderer
try:
from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
generationService = GenerationService(self.services)
# Pass extracted title to renderer (will use metadata.title if available)
rendered_content, mime_type = await generationService.renderReport(
generated_data, outputFormat, title or "Generated Document", prompt, self
generated_data, outputFormat, extractedTitle or "Generated Document", prompt, self
)
# Use extracted filename if available, otherwise generate from title or use generic
if extractedFilename:
documentName = extractedFilename
elif extractedTitle and extractedTitle != "Generated Document":
# Sanitize title for filename
sanitized = re.sub(r"[^a-zA-Z0-9._-]", "_", extractedTitle)
sanitized = re.sub(r"_+", "_", sanitized).strip("_")
if sanitized:
# Ensure correct extension
if not sanitized.lower().endswith(f".{outputFormat}"):
documentName = f"{sanitized}.{outputFormat}"
else:
documentName = sanitized
else:
documentName = f"generated.{outputFormat}"
else:
documentName = f"generated.{outputFormat}"
# Build result in the expected format
result = {
"success": True,
"content": generated_data,
"documents": [{
"documentName": f"generated.{outputFormat}",
"documentName": documentName,
"documentData": rendered_content,
"mimeType": mime_type,
"title": title or "Generated Document"
"title": extractedTitle or "Generated Document"
}],
"is_multi_file": False,
"format": outputFormat,
"title": title,
"title": extractedTitle or title,
"split_strategy": "single",
"total_documents": 1,
"processed_documents": 1

View file

@ -29,153 +29,232 @@ class ChatService:
logger.error("getChatDocumentsFromDocumentList: No workflow available (self.services.workflow is not set)")
return []
workflow_id = workflow.id if hasattr(workflow, 'id') else 'NO_ID'
workflow_obj_id = id(workflow)
workflowId = workflow.id if hasattr(workflow, 'id') else 'NO_ID'
workflowObjId = id(workflow)
logger.debug(f"getChatDocumentsFromDocumentList: input documentList = {documentList}")
logger.debug(f"getChatDocumentsFromDocumentList: using workflow.id = {workflow_id}, workflow object id = {workflow_obj_id}")
logger.debug(f"getChatDocumentsFromDocumentList: using workflow.id = {workflowId}, workflow object id = {workflowObjId}")
# Root cause analysis: Verify workflow.messages integrity and detect workflow changes
self._verifyWorkflowMessagesIntegrity(workflow, workflowId)
# Debug: list available messages with their labels and document names
# Debug: list available messages with their labels and document names (filtered by workflowId)
try:
if workflow and hasattr(workflow, 'messages') and workflow.messages:
msg_lines = []
msgLines = []
messagesFromOtherWorkflows = []
for message in workflow.messages:
msgWorkflowId = getattr(message, 'workflowId', None)
# Only include messages that belong to this workflow
if msgWorkflowId and msgWorkflowId != workflowId:
messagesFromOtherWorkflows.append(f"id={getattr(message, 'id', None)}, label={getattr(message, 'documentsLabel', None)}, workflowId={msgWorkflowId}")
continue
# Also skip messages without workflowId (shouldn't happen, but be safe)
if not msgWorkflowId:
messagesFromOtherWorkflows.append(f"id={getattr(message, 'id', None)}, label={getattr(message, 'documentsLabel', None)}, workflowId=Missing")
continue
label = getattr(message, 'documentsLabel', None)
doc_names = []
docNames = []
if getattr(message, 'documents', None):
for doc in message.documents:
name = getattr(doc, 'fileName', None) or getattr(doc, 'documentName', None) or 'Unnamed'
doc_names.append(name)
msg_lines.append(
f"- id={getattr(message, 'id', None)}, label={label}, docs={doc_names}"
docNames.append(name)
msgLines.append(
f"- id={getattr(message, 'id', None)}, label={label}, workflowId={msgWorkflowId}, docs={docNames}"
)
if msg_lines:
logger.debug("getChatDocumentsFromDocumentList: available messages:\n" + "\n".join(msg_lines))
if msgLines:
logger.debug("getChatDocumentsFromDocumentList: available messages (filtered for workflow):\n" + "\n".join(msgLines))
if messagesFromOtherWorkflows:
logger.warning(f"getChatDocumentsFromDocumentList: Found {len(messagesFromOtherWorkflows)} messages from other workflows in workflow.messages list:\n" + "\n".join(messagesFromOtherWorkflows))
else:
logger.debug("getChatDocumentsFromDocumentList: no messages available on current workflow")
except Exception as e:
logger.debug(f"getChatDocumentsFromDocumentList: unable to enumerate messages for debug: {e}")
all_documents = []
for doc_ref in documentList:
if doc_ref.startswith("docItem:"):
allDocuments = []
for docRef in documentList:
if docRef.startswith("docItem:"):
# docItem:<id>:<filename> - extract ID and find document
parts = doc_ref.split(':')
parts = docRef.split(':')
if len(parts) >= 2:
doc_id = parts[1]
docId = parts[1]
# Find the document by ID
for message in workflow.messages:
# Validate message belongs to this workflow
msg_workflow_id = getattr(message, 'workflowId', None)
if msg_workflow_id and msg_workflow_id != workflow_id:
msgWorkflowId = getattr(message, 'workflowId', None)
if not msgWorkflowId or msgWorkflowId != workflowId:
continue
if message.documents:
for doc in message.documents:
if doc.id == doc_id:
doc_name = getattr(doc, 'fileName', 'unknown')
all_documents.append(doc)
if doc.id == docId:
docName = getattr(doc, 'fileName', 'unknown')
allDocuments.append(doc)
break
elif doc_ref.startswith("docList:"):
elif docRef.startswith("docList:"):
# docList:<messageId>:<label> or docList:<label> - extract message ID and find document list
parts = doc_ref.split(':')
parts = docRef.split(':')
if len(parts) >= 3:
# Format: docList:<messageId>:<label>
message_id = parts[1]
messageId = parts[1]
label = parts[2]
# First try to find the message by ID in the current workflow
message_found = None
messageFound = None
for message in workflow.messages:
# Validate message belongs to this workflow
msg_workflow_id = getattr(message, 'workflowId', None)
if msg_workflow_id and msg_workflow_id != workflow_id:
msgWorkflowId = getattr(message, 'workflowId', None)
if not msgWorkflowId or msgWorkflowId != workflowId:
continue
if str(message.id) == message_id:
message_found = message
if str(message.id) == messageId:
messageFound = message
break
# If message ID not found in current workflow, this is a stale reference
# Log warning and return empty list (don't fall back to label - it might match wrong message)
if not message_found:
available_ids = [str(msg.id) for msg in workflow.messages]
logger.warning(f"Document reference contains stale message ID {message_id} not found in current workflow {workflow.id}. Label: {label}. Available message IDs: {available_ids}")
if not messageFound:
availableIds = [str(msg.id) for msg in workflow.messages]
logger.warning(f"Document reference contains stale message ID {messageId} not found in current workflow {workflow.id}. Label: {label}. Available message IDs: {availableIds}")
logger.warning(f"This indicates the document reference was created in a different workflow state. Returning empty list.")
# Return empty list - don't fall back to label matching which could match wrong message
continue
# If found, add documents
if message_found and message_found.documents:
all_documents.extend(message_found.documents)
if messageFound and messageFound.documents:
allDocuments.extend(messageFound.documents)
elif len(parts) >= 2:
# Format: docList:<label> - find message by documentsLabel
label = parts[1]
message_found = None
messageFound = None
for message in workflow.messages:
# Validate message belongs to this workflow
msg_workflow_id = getattr(message, 'workflowId', None)
if msg_workflow_id and msg_workflow_id != workflow_id:
logger.warning(f"Message {message.id} has workflowId {msg_workflow_id} but belongs to workflow {workflow_id}. Skipping.")
msgWorkflowId = getattr(message, 'workflowId', None)
if not msgWorkflowId or msgWorkflowId != workflowId:
if msgWorkflowId:
logger.warning(f"Message {message.id} has workflowId {msgWorkflowId} but belongs to workflow {workflowId}. Skipping.")
else:
logger.warning(f"Message {message.id} has no workflowId. Skipping.")
continue
msg_label = getattr(message, 'documentsLabel', None)
if msg_label == label:
message_found = message
msgLabel = getattr(message, 'documentsLabel', None)
if msgLabel == label:
messageFound = message
break
# If found, add documents
if message_found and message_found.documents:
all_documents.extend(message_found.documents)
if messageFound and messageFound.documents:
allDocuments.extend(messageFound.documents)
else:
# Direct label reference (round1_task2_action3_contextinfo)
# Search for messages with matching documentsLabel to find the actual documents
if doc_ref.startswith("round"):
if docRef.startswith("round"):
# Parse round/task/action to find the corresponding document list
label_parts = doc_ref.split('_', 3)
if len(label_parts) >= 4:
round_num = int(label_parts[0].replace('round', ''))
task_num = int(label_parts[1].replace('task', ''))
action_num = int(label_parts[2].replace('action', ''))
context_info = label_parts[3]
labelParts = docRef.split('_', 3)
if len(labelParts) >= 4:
roundNum = int(labelParts[0].replace('round', ''))
taskNum = int(labelParts[1].replace('task', ''))
actionNum = int(labelParts[2].replace('action', ''))
contextInfo = labelParts[3]
# Find messages with matching documentsLabel (this is the correct way!)
# In case of retries, we want the NEWEST message (most recent publishedAt)
matching_messages = []
matchingMessages = []
for message in workflow.messages:
# Validate message belongs to this workflow
msg_workflow_id = getattr(message, 'workflowId', None)
if msg_workflow_id and msg_workflow_id != workflow_id:
logger.debug(f"Skipping message {message.id} with workflowId {msg_workflow_id} (expected {workflow_id})")
msgWorkflowId = getattr(message, 'workflowId', None)
if not msgWorkflowId or msgWorkflowId != workflowId:
if msgWorkflowId:
logger.debug(f"Skipping message {message.id} with workflowId {msgWorkflowId} (expected {workflowId})")
else:
logger.debug(f"Skipping message {message.id} with no workflowId (expected {workflowId})")
continue
msg_documents_label = getattr(message, 'documentsLabel', '')
msgDocumentsLabel = getattr(message, 'documentsLabel', '')
# Check if this message's documentsLabel matches our reference
if msg_documents_label == doc_ref:
if msgDocumentsLabel == docRef:
# Found a matching message, collect it for comparison
matching_messages.append(message)
matchingMessages.append(message)
# If we found matching messages, take the newest one (highest publishedAt)
if matching_messages:
if matchingMessages:
# Sort by publishedAt descending (newest first)
matching_messages.sort(key=lambda msg: getattr(msg, 'publishedAt', 0), reverse=True)
newest_message = matching_messages[0]
matchingMessages.sort(key=lambda msg: getattr(msg, 'publishedAt', 0), reverse=True)
newestMessage = matchingMessages[0]
if newest_message.documents:
doc_names = [doc.fileName for doc in newest_message.documents if hasattr(doc, 'fileName')]
logger.debug(f"Added {len(newest_message.documents)} documents from newest message {newest_message.id}: {doc_names}")
all_documents.extend(newest_message.documents)
if newestMessage.documents:
docNames = [doc.fileName for doc in newestMessage.documents if hasattr(doc, 'fileName')]
logger.debug(f"Added {len(newestMessage.documents)} documents from newest message {newestMessage.id}: {docNames}")
allDocuments.extend(newestMessage.documents)
else:
logger.debug(f"No documents found in newest message {newest_message.id}")
logger.debug(f"No documents found in newest message {newestMessage.id}")
else:
logger.error(f"No messages found with documentsLabel: {doc_ref}")
raise ValueError(f"Document reference not found: {doc_ref}")
logger.error(f"No messages found with documentsLabel: {docRef}")
raise ValueError(f"Document reference not found: {docRef}")
logger.debug(f"Resolved {len(all_documents)} documents from document list: {documentList}")
return all_documents
logger.debug(f"Resolved {len(allDocuments)} documents from document list: {documentList}")
return allDocuments
except Exception as e:
logger.error(f"Error getting documents from document list: {str(e)}")
return []
def _verifyWorkflowMessagesIntegrity(self, workflow, expectedWorkflowId: str) -> None:
"""
Verify that all messages in workflow.messages belong to the expected workflow.
This helps detect when workflow objects are being mixed up or when messages from
other workflows are incorrectly included.
"""
try:
if not workflow or not hasattr(workflow, 'messages') or not workflow.messages:
return
messagesFromOtherWorkflows = []
messagesWithoutWorkflowId = []
totalMessages = len(workflow.messages)
for message in workflow.messages:
msgWorkflowId = getattr(message, 'workflowId', None)
if not msgWorkflowId:
messagesWithoutWorkflowId.append({
'id': getattr(message, 'id', 'unknown'),
'label': getattr(message, 'documentsLabel', None)
})
elif msgWorkflowId != expectedWorkflowId:
messagesFromOtherWorkflows.append({
'id': getattr(message, 'id', 'unknown'),
'label': getattr(message, 'documentsLabel', None),
'workflowId': msgWorkflowId,
'expectedWorkflowId': expectedWorkflowId
})
if messagesFromOtherWorkflows:
logger.error(
f"CRITICAL: Workflow integrity violation detected! "
f"Workflow {expectedWorkflowId} contains {len(messagesFromOtherWorkflows)} messages from other workflows. "
f"Total messages: {totalMessages}. "
f"Foreign messages: {messagesFromOtherWorkflows}"
)
if messagesWithoutWorkflowId:
logger.warning(
f"Workflow integrity issue: Workflow {expectedWorkflowId} contains {len(messagesWithoutWorkflowId)} messages without workflowId. "
f"Messages: {messagesWithoutWorkflowId}"
)
# Also check if self.services.workflow has changed (workflow object ID mismatch)
currentWorkflow = self.services.workflow
if currentWorkflow and hasattr(currentWorkflow, 'id'):
currentWorkflowId = currentWorkflow.id
if currentWorkflowId != expectedWorkflowId:
logger.error(
f"CRITICAL: Workflow object changed during execution! "
f"Expected workflow {expectedWorkflowId}, but self.services.workflow now points to {currentWorkflowId}. "
f"This indicates the workflow object was swapped mid-execution."
)
except Exception as e:
logger.debug(f"Error during workflow integrity verification: {e}")
def getConnectionReferenceFromUserConnection(self, connection: UserConnection) -> str:
"""Get connection reference from UserConnection with enhanced state information"""
# Get token information to check if it's expired
@ -640,9 +719,12 @@ class ChatService:
if not workflow or not hasattr(workflow, 'messages'):
return "No documents available"
workflow_id = workflow.id if hasattr(workflow, 'id') else 'NO_ID'
workflow_obj_id = id(workflow)
logger.debug(f"getAvailableDocuments: workflow.id = {workflow_id}, workflow object id = {workflow_obj_id}")
workflowId = workflow.id if hasattr(workflow, 'id') else 'NO_ID'
workflowObjId = id(workflow)
logger.debug(f"getAvailableDocuments: workflow.id = {workflowId}, workflow object id = {workflowObjId}")
# Root cause analysis: Verify workflow.messages integrity and detect workflow changes
self._verifyWorkflowMessagesIntegrity(workflow, workflowId)
# Use the provided workflow object directly to avoid database reload issues
# that can cause filename truncation. The workflow object should already be up-to-date.

View file

@ -1083,12 +1083,12 @@ class MethodOutlook(MethodBase):
return ActionResult.isFailure(error=str(e))
@action
async def composeAndSendEmailWithContext(self, parameters: Dict[str, Any]) -> ActionResult:
async def composeAndDraftEmailWithContext(self, parameters: Dict[str, Any]) -> ActionResult:
"""
GENERAL:
- Purpose: Compose email content using AI from context and optional documents, then create a draft/send.
- Purpose: Compose email content using AI from context and optional documents, then create a draft.
- Input requirements: connectionReference (required); to (required); context (required); optional documentList, cc, bcc, emailStyle, maxLength.
- Output format: JSON confirmation with AI-generated draft/send metadata.
- Output format: JSON confirmation with AI-generated draft metadata.
Parameters:
- connectionReference (str, required): Microsoft connection label.
@ -1336,8 +1336,9 @@ Return JSON:
draft_data = response.json()
draft_id = draft_data.get("id", "Unknown")
result_data = {
"status": "success",
# Create draft result data with full draft information
draft_result_data = {
"status": "draft",
"message": "Email draft created successfully with AI-generated content",
"draftId": draft_id,
"folder": "Drafts (Entwürfe)",
@ -1352,14 +1353,15 @@ Return JSON:
"aiGenerated": True,
"context": context,
"emailStyle": emailStyle,
"timestamp": self.services.utils.timestampGetUtc()
"timestamp": self.services.utils.timestampGetUtc(),
"draftData": draft_data
}
return ActionResult(
success=True,
documents=[ActionDocument(
documentName=f"ai_generated_email_draft_{self._format_timestamp_for_filename()}.json",
documentData=json.dumps(result_data, indent=2),
documentData=json.dumps(draft_result_data, indent=2),
mimeType="application/json"
)]
)
@ -1372,7 +1374,142 @@ Return JSON:
return ActionResult.isFailure(error=f"Failed to create email: {str(e)}")
except Exception as e:
logger.error(f"Error in composeAndSendEmailWithContext: {str(e)}")
logger.error(f"Error in composeAndDraftEmailWithContext: {str(e)}")
return ActionResult.isFailure(error=str(e))
@action
async def sendDraftEmail(self, parameters: Dict[str, Any]) -> ActionResult:
    """
    GENERAL:
    - Purpose: Send a draft email using the draft email JSON data from action outlook.composeAndDraftEmailWithContext. This action is used to send the email after the email has been composed and drafted.
    - Input requirements: connectionReference (required); draftEmailJson (required).
    - Output format: JSON confirmation with sent mail metadata.
    Parameters:
    - connectionReference (str, required): Microsoft connection label.
    - draftEmailJson (str or dict, required): Draft email JSON data containing draftId or draftData with id field.
    """
    try:
        connectionReference = parameters.get("connectionReference")
        draftEmailJson = parameters.get("draftEmailJson")
        if not connectionReference:
            return ActionResult.isFailure(error="Connection reference is required")
        if not draftEmailJson:
            return ActionResult.isFailure(error="Draft email JSON is required")
        # Parse draft email JSON if it's a string
        if isinstance(draftEmailJson, str):
            try:
                draftEmailJson = json.loads(draftEmailJson)
            except json.JSONDecodeError:
                return ActionResult.isFailure(error="Invalid JSON format in draftEmailJson parameter")
        # Extract the draft ID, preferring 'draftId', then 'draftData.id', then a top-level 'id'
        draftId = None
        if isinstance(draftEmailJson, dict):
            draftId = draftEmailJson.get("draftId")
            if not draftId and "draftData" in draftEmailJson:
                draftData = draftEmailJson.get("draftData")
                if isinstance(draftData, dict):
                    draftId = draftData.get("id")
            if not draftId:
                draftId = draftEmailJson.get("id")
        if not draftId:
            return ActionResult.isFailure(error="Could not extract draft ID from draftEmailJson. Ensure it contains 'draftId' or 'draftData.id' field")
        # Get Microsoft connection
        connection = self._getMicrosoftConnection(connectionReference)
        if not connection:
            return ActionResult.isFailure(error="No valid Microsoft connection found for the provided connection reference")
        # Check permissions
        permissionsOk = await self._checkPermissions(connection)
        if not permissionsOk:
            return ActionResult.isFailure(error="Connection lacks necessary permissions for Outlook operations")
        # Send the draft email via Microsoft Graph
        try:
            # Local import so the ImportError guard below is actually reachable
            import requests
            graphUrl = "https://graph.microsoft.com/v1.0"
            headers = {
                "Authorization": f"Bearer {connection['accessToken']}",
                "Content-Type": "application/json"
            }
            sendUrl = f"{graphUrl}/me/messages/{draftId}/send"
            # Timeout prevents the workflow from hanging indefinitely on a stalled Graph call
            sendResponse = requests.post(sendUrl, headers=headers, timeout=30)
            # Extract email details from draft JSON for confirmation
            subject = draftEmailJson.get("subject", "Unknown")
            recipients = draftEmailJson.get("recipients", [])
            cc = draftEmailJson.get("cc", [])
            bcc = draftEmailJson.get("bcc", [])
            attachmentsCount = draftEmailJson.get("attachments", 0)
            # Graph returns 202 Accepted for /send; 200/204 tolerated for safety
            if sendResponse.status_code in [200, 202, 204]:
                sentConfirmationData = {
                    "status": "sent",
                    "message": "Email sent successfully",
                    "draftId": draftId,
                    "subject": subject,
                    "recipients": recipients,
                    "cc": cc,
                    "bcc": bcc,
                    "attachments": attachmentsCount,
                    "sentTimestamp": self.services.utils.timestampGetUtc(),
                    "confirmation": "Email has been successfully sent to recipients"
                }
                logger.info(f"Email sent successfully. Draft ID: {draftId}")
                return ActionResult(
                    success=True,
                    documents=[ActionDocument(
                        documentName=f"sent_mail_confirmation_{self._format_timestamp_for_filename()}.json",
                        documentData=json.dumps(sentConfirmationData, indent=2),
                        mimeType="application/json"
                    )]
                )
            else:
                logger.error(f"Failed to send email. Status: {sendResponse.status_code}, Response: {sendResponse.text}")
                sentConfirmationData = {
                    "status": "error",
                    "message": "Failed to send draft email",
                    "draftId": draftId,
                    "subject": subject,
                    "recipients": recipients,
                    "sendError": {
                        "statusCode": sendResponse.status_code,
                        "response": sendResponse.text
                    },
                    "sentTimestamp": self.services.utils.timestampGetUtc(),
                    "confirmation": "Email draft sending failed"
                }
                return ActionResult.isFailure(
                    error=f"Failed to send email: {sendResponse.status_code} - {sendResponse.text}",
                    documents=[ActionDocument(
                        documentName=f"sent_mail_confirmation_{self._format_timestamp_for_filename()}.json",
                        documentData=json.dumps(sentConfirmationData, indent=2),
                        mimeType="application/json"
                    )]
                )
        except ImportError:
            logger.error("requests module not available")
            return ActionResult.isFailure(error="requests module not available")
        except Exception as e:
            logger.error(f"Error sending draft email via Microsoft Graph API: {str(e)}")
            return ActionResult.isFailure(error=f"Failed to send draft email: {str(e)}")
    except Exception as e:
        logger.error(f"Error in sendDraftEmail: {str(e)}")
        return ActionResult.isFailure(error=str(e))
async def checkPermissions(self, parameters: Dict[str, Any]) -> ActionResult:

View file

@ -1,266 +0,0 @@
# Content Validator - Deep Analysis & Target Design
## CURRENT STATE ANALYSIS
### How Validator Currently Works
#### 1. **Document Input Flow**
```
ActionResult.documents (List[ActionDocument])
→ modeReact.py extracts "structured content" with hardcoded checks
→ Creates SimpleNamespace objects with wrapped documentData
→ Passes to ContentValidator.validateContent()
```
#### 2. **Current Problems in modeReact.py (Lines 99-136)**
- ❌ **Hardcoded document name checks**: `docName == "structured_content.json"`
- ❌ **Hardcoded mimeType checks**: `mimeType == "application/json"`
- ❌ **Hardcoded structure checks**: `'content' in docData or 'documents' in docData or 'sections' in docData`
- ❌ **Single document selection**: `break` after first match - ignores other documents
- ❌ **Non-generic logic**: Specific to certain document structures
- ❌ **Workaround approach**: Trying to find structured content in various ways
#### 3. **Current Problems in contentValidator.py**
**`_extractContent()` method (Lines 21-41)**:
- ❌ **Inconsistent handling**: Checks for `dict with 'content'` but then also handles raw `data`
- ❌ **Silent failures**: Returns empty string on any exception
- ❌ **Size limit hardcoded**: 10KB threshold is arbitrary
- ❌ **No format awareness**: Doesn't check if document is binary/base64 before extracting
- ❌ **No document type detection**: Doesn't distinguish text vs binary vs structured data
**`_validateWithAI()` method (Lines 60-200)**:
- ❌ **Forces all content to string**: `content[:2000]` truncation assumes text
- ❌ **No document metadata passed**: Only name and content, no size, format, mimeType info
- ❌ **No binary/base64 handling**: Will fail or show garbage for binary documents
- ❌ **Multiple JSON extraction strategies**: Indicates unreliable AI response parsing
- ❌ **Size limits inconsistent**: 10KB in extraction, 2KB in prompt - why different?
#### 4. **Missing Capabilities**
- ❌ No document size reporting to validator
- ❌ No format validation (txt vs md vs pdf vs docx)
- ❌ No binary data handling (images, PDFs, etc.)
- ❌ No document count/summary statistics
- ❌ No distinction between document types for validation
---
## TARGET DESIGN
### Core Principles
1. **GENERIC**: No hardcoded document names, types, or structures
2. **DOCUMENT-AWARE**: Handle all document types (text, binary, base64, structured)
3. **SIZE-CONSCIOUS**: Never pass full large documents to AI
4. **METADATA-RICH**: Pass document metadata (name, size, format, mimeType) to validator
5. **FORMAT-FLEXIBLE**: Allow format flexibility (md ≈ text, but pdf ≠ docx)
### Target Architecture
```
Documents Input (List[ActionDocument])
Document Analyzer (generic)
- Extract metadata (name, size, mimeType, format)
- Determine content type (text/binary/base64/structured)
- Create preview/summary for large documents
Document Summary (for AI validation)
- Metadata only for binary/base64
- Preview/sample for large text documents
- Full content for small text/structured documents
Validation Prompt Builder (generic)
- Include document summaries (not full content)
- Include document metadata
- Include format validation rules (generic)
AI Validator
- Validates against task objective (generic)
- Validates format compliance (flexible)
- Validates document count/size
```
---
## REQUIRED CHANGES
### 1. **Remove All Hardcoded Checks from modeReact.py**
- ❌ Remove document name checks
- ❌ Remove mimeType-specific logic
- ❌ Remove structure-specific checks
- ✅ Pass ALL documents to validator (let validator decide what to validate)
- ✅ Keep it simple: `validationDocs = result.documents`
### 2. **Redesign contentValidator.py - New Structure**
#### New Method: `_analyzeDocuments(documents)`
```python
def _analyzeDocuments(self, documents: List[Any]) -> List[Dict[str, Any]]:
"""
Generic document analysis - extract metadata and create summaries.
Returns list of document summaries ready for validation prompt.
"""
summaries = []
for doc in documents:
summary = {
"name": getattr(doc, 'documentName', 'Unknown'),
"mimeType": getattr(doc, 'mimeType', 'unknown'),
"format": self._detectFormat(doc),
"size": self._calculateSize(doc),
"type": self._detectContentType(doc), # text/binary/base64/structured
"preview": self._createPreview(doc), # None for binary, sample for large text
"isAccessible": self._isContentAccessible(doc) # Can we read content?
}
summaries.append(summary)
return summaries
```
#### New Method: `_detectFormat(doc)`
- Extract from filename extension or mimeType
- Generic mapping: `text/plain` → `txt`, `text/markdown` → `md`, etc.
- Return format string (txt, md, pdf, docx, json, etc.)
#### New Method: `_calculateSize(doc)`
- Calculate document size in bytes
- Handle string, dict, list, bytes, base64
- Return: `{"bytes": int, "readable": "1.5 MB"}`
#### New Method: `_detectContentType(doc)`
- `text`: Readable text content
- `structured`: JSON/dict/list structures
- `binary`: Binary data (PDF, images, etc.)
- `base64`: Base64-encoded data
- Return content type string
#### New Method: `_createPreview(doc)`
- **Binary/Base64**: Return `None` (metadata only)
- **Large text (>50KB)**: Return first 1KB + size indicator
- **Small text (≤50KB)**: Return full content
- **Structured data**: Return JSON string (truncated if large)
#### New Method: `_isContentAccessible(doc)`
- Check if document content can be extracted for validation
- Binary/base64 documents: `False` (validate by metadata only)
- Text/structured documents: `True`
### 3. **Redesign Validation Prompt (Generic)**
```python
validationPrompt = f"""TASK VALIDATION
USER REQUEST: '{intent.get('primaryGoal', 'Unknown')}'
EXPECTED DATA TYPE: {intent.get('dataType', 'unknown')}
EXPECTED FORMAT: {intent.get('expectedFormat', 'unknown')}
SUCCESS CRITERIA ({criteriaCount} items): {successCriteria}
DELIVERED DOCUMENTS ({len(documentSummaries)} items):
{json.dumps(documentSummaries, indent=2)}
VALIDATION RULES:
1. Check if delivered documents match expected data type
2. Check if delivered formats are compatible with expected format
(Note: text formats like txt/md are compatible; pdf ≠ docx but both are documents)
3. Verify each success criterion is met based on document content/metadata
4. Check document sizes are reasonable for the task
5. Rate overall quality (0.0-1.0)
6. Identify specific gaps
7. Suggest next steps
OUTPUT FORMAT - JSON ONLY (no prose):
{{
"overallSuccess": false,
"qualityScore": 0.0,
"dataTypeMatch": false,
"formatMatch": false,
"documentCount": {len(documentSummaries)},
"successCriteriaMet": {[False] * criteriaCount},
"gapAnalysis": "Specific gaps found",
"improvementSuggestions": ["NEXT STEP: Action 1"],
"validationDetails": [
{{
"documentName": "document.ext",
"issues": ["Issue 1"],
"suggestions": ["NEXT STEP: Fix 1"]
}}
]
}}
"""
```
### 4. **Format Validation Logic (Generic & Flexible)**
```python
def _isFormatCompatible(self, deliveredFormat: str, expectedFormat: str) -> bool:
"""
Generic format compatibility check.
- txt/md/html are text formats (compatible with each other)
- pdf/docx/xlsx are document formats (not compatible with each other)
- json/xml are structured formats
- images are image formats
"""
# Text formats are interchangeable
textFormats = ['txt', 'md', 'html', 'text', 'plain']
if deliveredFormat.lower() in textFormats and expectedFormat.lower() in textFormats:
return True
# Exact match
if deliveredFormat.lower() == expectedFormat.lower():
return True
# Structured formats
if deliveredFormat.lower() in ['json', 'xml'] and expectedFormat.lower() in ['json', 'xml']:
return True # Could be made more flexible
return False
```
---
## IMPLEMENTATION PLAN
### Phase 1: Clean Up modeReact.py
- Remove all hardcoded checks
- Simply pass `result.documents` to validator
### Phase 2: Redesign Document Analysis
- Implement `_analyzeDocuments()`
- Implement helper methods: `_detectFormat()`, `_calculateSize()`, `_detectContentType()`, `_createPreview()`
### Phase 3: Redesign Validation Prompt
- Generic prompt with document summaries
- Include metadata, not full content
- Size-aware handling
### Phase 4: Implement Format Validation
- Generic format compatibility logic
- Flexible matching (text formats, document formats, etc.)
### Phase 5: Testing
- Test with text documents (small & large)
- Test with binary documents (PDF, images)
- Test with base64 documents
- Test with structured data (JSON)
---
## KEY DESIGN DECISIONS
1. **Pass ALL documents**: Validator decides what to validate, not the caller
2. **Metadata over content**: For large/binary documents, pass metadata only
3. **Preview samples**: For large text documents, pass preview + size info
4. **Generic prompts**: No task-specific or format-specific logic
5. **Flexible format matching**: Text formats compatible, document formats strict
6. **Size limits**: 50KB threshold for full content (configurable)
7. **Content type detection**: Explicit type detection (text/binary/base64/structured)
---
## BENEFITS OF TARGET DESIGN
**Generic**: Works with any document type without hardcoding
**Scalable**: Handles large documents without issues
**Flexible**: Format validation is flexible where appropriate
**Maintainable**: Clear separation of concerns
**Robust**: Handles edge cases (binary, base64, large files)
**Testable**: Each component can be tested independently

View file

@ -15,7 +15,12 @@ logger = logging.getLogger(__name__)
methods = {}
def discoverMethods(serviceCenter):
"""Dynamically discover all method classes and their actions in modules methods package"""
"""Dynamically discover all method classes and their actions in modules methods package.
CRITICAL: If methods are already discovered, updates their Services reference to ensure
they use the current workflow (self.services.workflow). This prevents stale workflow IDs
from being used when a new workflow starts.
"""
try:
# Import the methods package
methodsPackage = importlib.import_module('modules.workflows.methods')
@ -32,33 +37,54 @@ def discoverMethods(serviceCenter):
if (inspect.isclass(item) and
issubclass(item, MethodBase) and
item != MethodBase):
# Instantiate the method
methodInstance = item(serviceCenter)
# Use the actions property from MethodBase which handles @action decorator
actions = methodInstance.actions
# Create method info
methodInfo = {
'instance': methodInstance,
'actions': actions,
'description': item.__doc__ or f"Method {itemName}"
}
# Store the method with full class name
methods[itemName] = methodInfo
# Also store with short name for action executor access
# Check if method already exists in cache
shortName = itemName.replace('Method', '').lower()
methods[shortName] = methodInfo
logger.info(f"Discovered method {itemName} (short: {shortName}) with {len(actions)} actions")
if itemName in methods or shortName in methods:
# Method already discovered - update Services reference to use current workflow
existingMethodInfo = methods.get(itemName) or methods.get(shortName)
if existingMethodInfo and existingMethodInfo.get('instance'):
existingMethodInfo['instance'].services = serviceCenter
logger.debug(f"Updated Services reference for cached method {itemName} to use current workflow")
else:
# Method exists but instance is missing - recreate it
methodInstance = item(serviceCenter)
actions = methodInstance.actions
methodInfo = {
'instance': methodInstance,
'actions': actions,
'description': item.__doc__ or f"Method {itemName}"
}
methods[itemName] = methodInfo
methods[shortName] = methodInfo
logger.info(f"Recreated method {itemName} (short: {shortName}) with {len(actions)} actions")
else:
# Method not discovered yet - create new instance
methodInstance = item(serviceCenter)
# Use the actions property from MethodBase which handles @action decorator
actions = methodInstance.actions
# Create method info
methodInfo = {
'instance': methodInstance,
'actions': actions,
'description': item.__doc__ or f"Method {itemName}"
}
# Store the method with full class name
methods[itemName] = methodInfo
# Also store with short name for action executor access
methods[shortName] = methodInfo
logger.info(f"Discovered method {itemName} (short: {shortName}) with {len(actions)} actions")
except Exception as e:
logger.error(f"Error discovering method {name}: {str(e)}")
continue
logger.info(f"Discovered {len(methods)} method entries total")
logger.info(f"Discovered/updated {len(methods)} method entries total")
except Exception as e:
logger.error(f"Error discovering methods: {str(e)}")

View file

@ -41,6 +41,11 @@ class WorkflowManager:
# Store workflow in services for reference (this is the ChatWorkflow object)
self.services.workflow = workflow
# CRITICAL: Update all method instances to use the current Services object with the correct workflow
from modules.workflows.processing.shared.methodDiscovery import discoverMethods
discoverMethods(self.services)
logger.debug(f"Updated method instances to use workflow {self.services.workflow.id}")
if workflow.status == "running":
logger.info(f"Stopping running workflow {workflowId} before processing new prompt")
@ -102,6 +107,12 @@ class WorkflowManager:
# Store workflow in services (this is the ChatWorkflow object)
self.services.workflow = workflow
# CRITICAL: Update all method instances to use the current Services object with the correct workflow
# This ensures cached method instances don't use stale workflow IDs from previous workflows
from modules.workflows.processing.shared.methodDiscovery import discoverMethods
discoverMethods(self.services)
logger.debug(f"Updated method instances to use workflow {self.services.workflow.id}")
# Start workflow processing asynchronously
asyncio.create_task(self._workflowProcess(userInput))