stable workflow

ValueOn AG 2025-08-16 23:32:36 +02:00
parent a0219181e9
commit 8592cdd790
20 changed files with 3701 additions and 3581 deletions

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@@ -22,13 +22,11 @@ class DocumentGenerator:
"""
try:
documents = action_result.data.get("documents", [])
logger.debug(f"Processing {len(documents)} documents from action result")
processed_documents = []
for doc in documents:
processed_doc = self.processSingleDocument(doc, action)
if processed_doc:
processed_documents.append(processed_doc)
logger.debug(f"Successfully processed {len(processed_documents)} documents")
return processed_documents
except Exception as e:
logger.error(f"Error processing action result documents: {str(e)}")
@@ -54,19 +52,29 @@ class DocumentGenerator:
# Dictionary format document - handle both 'documentName' and 'filename' keys
filename = doc.get('documentName', doc.get('filename', \
f"{action.execMethod}_{action.execAction}_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}"))
fileSize = doc.get('fileSize', len(str(doc.get('documentData', ''))))
mimeType = doc.get('mimeType', 'application/octet-stream')
if mimeType == "application/octet-stream":
document_data = doc.get('documentData', '')
mimeType = detectMimeTypeFromContent(document_data, filename, self.service)
# Handle documentData structure - it might be a dict with 'content' key or direct content
document_data = doc.get('documentData', '')
if isinstance(document_data, dict) and 'content' in document_data:
# This is the structure returned by extract action: documentData.content
content = document_data['content']
# Also check for other potential content fields
if not content and 'data' in document_data:
content = document_data['data']
else:
# Direct content (fallback)
content = document_data
# Calculate file size from actual content
fileSize = len(str(content)) if content else 0
# Detect mime type if not specified
if mimeType == "application/octet-stream":
mimeType = detectMimeTypeFromContent(content, filename, self.service)
logger.info(f"Processed document: {filename}, content length: {len(str(content))}, mimeType: {mimeType}")
return {
'filename': filename,
'fileSize': fileSize,
@@ -96,20 +104,32 @@ class DocumentGenerator:
Returns a list of created document objects.
"""
try:
logger.info(f"Creating documents from action result for {action.execMethod}.{action.execAction}")
logger.info(f"Action result data keys: {list(action_result.data.keys())}")
processed_docs = self.processActionResultDocuments(action_result, action, workflow)
logger.info(f"Processed {len(processed_docs)} documents")
created_documents = []
for doc_data in processed_docs:
for i, doc_data in enumerate(processed_docs):
try:
document_name = doc_data['filename']
document_data = doc_data['content']
mime_type = doc_data['mimeType']
logger.info(f"Creating document {i+1}: {document_name} (mime: {mime_type}, content length: {len(str(document_data))})")
# Convert document data to string content
content = convertDocumentDataToString(document_data, getFileExtension(document_name))
# Skip empty or minimal content
minimal_content_patterns = ['{}', '[]', 'null', '""', "''"]
if not content or content.strip() == "" or content.strip() in minimal_content_patterns:
logger.warning(f"Empty or minimal content for document {document_name}, skipping")
continue
logger.info(f"Document {document_name} has content: {len(content)} characters")
# Create file in system
file_id = self.service.createFile(
fileName=document_name,
@@ -120,22 +140,27 @@ class DocumentGenerator:
if not file_id:
logger.error(f"Failed to create file for document {document_name}")
continue
# Create document object
logger.info(f"Created file with ID: {file_id}")
# Create document object using existing file ID
document = self.service.createDocument(
fileName=document_name,
mimeType=mime_type,
content=content,
base64encoded=False
base64encoded=False,
existing_file_id=file_id
)
if document:
created_documents.append(document)
logger.debug(f"Created document: {document_name} ({len(content)} bytes, {mime_type})")
logger.info(f"Successfully created ChatDocument: {document_name} (ID: {getattr(document, 'id', 'N/A')}, fileId: {getattr(document, 'fileId', 'N/A')})")
else:
logger.error(f"Failed to create ChatDocument object for {document_name}")
except Exception as e:
logger.error(f"Error creating document {doc_data.get('filename', 'unknown')}: {str(e)}")
continue
logger.info(f"Created {len(created_documents)} documents from action result")
logger.info(f"Successfully created {len(created_documents)} documents")
return created_documents
except Exception as e:
logger.error(f"Error creating documents from action result: {str(e)}")

View file

@@ -16,6 +16,10 @@ from modules.chat.documents.documentGeneration import DocumentGenerator
logger = logging.getLogger(__name__)
class WorkflowStoppedException(Exception):
"""Exception raised when a workflow is stopped by the user."""
pass
class HandlingTasks:
def __init__(self, chatInterface, service, workflow=None):
self.chatInterface = chatInterface
@@ -23,15 +27,42 @@ class HandlingTasks:
self.workflow = workflow
self.documentGenerator = DocumentGenerator(service)
def _checkWorkflowStopped(self):
"""
Check if workflow has been stopped by user and raise exception if so.
This function centralizes all workflow stop checking logic to avoid code duplication.
"""
try:
# Get the current workflow status from the database to avoid stale data
current_workflow = self.chatInterface.getWorkflow(self.service.workflow.id)
if current_workflow and current_workflow.status == "stopped":
logger.info("Workflow stopped by user, aborting execution")
raise WorkflowStoppedException("Workflow was stopped by user")
except WorkflowStoppedException:
# Re-raise the WorkflowStoppedException immediately
raise
except Exception as e:
# If we can't get the current status due to other database issues, fall back to the in-memory object
logger.warning(f"Could not check current workflow status from database: {str(e)}")
if self.service.workflow.status == "stopped":
logger.info("Workflow stopped by user (from in-memory object), aborting execution")
raise WorkflowStoppedException("Workflow was stopped by user")
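The pattern throughout this commit is to call _checkWorkflowStopped() before each expensive step and let the exception bubble up. A minimal caller sketch, assuming only the HandlingTasks API shown in this diff:

async def run_step(handler, step, workflow, context):
    # Illustrative only: a stopped workflow surfaces as WorkflowStoppedException,
    # not as a failed TaskResult, so callers can tell "stopped" apart from "failed".
    try:
        return await handler.executeTask(step, workflow, context)
    except WorkflowStoppedException:
        return None  # managerChat.py maps this case to WorkflowResult(status="stopped")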
async def generateTaskPlan(self, userInput: str, workflow) -> TaskPlan:
"""Generate a high-level task plan for the workflow."""
try:
# Check workflow status before generating task plan
self._checkWorkflowStopped()
logger.info(f"Generating task plan for workflow {workflow.id}")
available_docs = self.service.getAvailableDocuments(workflow)
logger.debug(f"Available documents: {available_docs}")
# Check workflow status before calling AI service
self._checkWorkflowStopped()
prompt = await self.service.callAiTextAdvanced(
createTaskPlanningPrompt(self, {
createTaskPlanningPrompt({
'user_request': userInput,
'available_documents': available_docs,
'workflow_id': workflow.id
@@ -53,16 +84,38 @@ class HandlingTasks:
if not self._validateTaskPlan(task_plan_dict):
logger.error("Generated task plan failed validation")
logger.error(f"AI Response: {prompt}")
logger.error(f"Parsed Task Plan: {json.dumps(task_plan_dict, indent=2)}")
raise Exception("AI-generated task plan failed validation - AI is required for task planning")
tasks = [TaskStep(**task_dict) for task_dict in task_plan_dict.get('tasks', [])]
tasks = []
for task_dict in task_plan_dict.get('tasks', []):
# Map old 'description' field to new 'objective' field
if 'description' in task_dict and 'objective' not in task_dict:
task_dict['objective'] = task_dict.pop('description')
tasks.append(TaskStep(**task_dict))
task_plan = TaskPlan(
overview=task_plan_dict.get('overview', ''),
tasks=tasks
)
logger.info(f"Task plan generated successfully with {len(tasks)} tasks")
logger.debug(f"Task plan: {json.dumps(task_plan_dict, indent=2)}")
# Log the generated tasks
for i, task in enumerate(tasks):
logger.info(f" Task {i+1}: {task.objective}")
if hasattr(task, 'success_criteria') and task.success_criteria:
logger.info(f" Success criteria: {task.success_criteria}")
# Log the complete task plan
logger.info("=== GENERATED TASK PLAN ===")
logger.info(f"Overview: {task_plan.overview}")
logger.info(f"Total tasks: {len(tasks)}")
# Log the RAW AI-generated task plan JSON for debugging
logger.info("=== RAW AI TASK PLAN JSON ===")
logger.info(f"AI Response with task plan: {prompt}")
logger.info("=== END RAW AI TASK PLAN JSON ===")
return task_plan
except Exception as e:
@@ -72,12 +125,14 @@ class HandlingTasks:
async def generateTaskActions(self, task_step, workflow, previous_results=None, enhanced_context=None) -> List[TaskAction]:
"""Generate actions for a given task step."""
try:
logger.info(f"Generating actions for task: {task_step.description}")
# Check workflow status before generating actions
self._checkWorkflowStopped()
logger.info(f"Generating actions for task: {task_step.objective}")
available_docs = self.service.getAvailableDocuments(workflow)
available_connections = self.service.getConnectionReferenceList()
logger.debug(f"Available documents: {available_docs}")
logger.debug(f"Available connections: {available_connections}")
context = enhanced_context or TaskContext(
task_step=task_step,
@@ -94,8 +149,11 @@ class HandlingTasks:
failed_actions=[],
successful_actions=[]
)
# Check workflow status before calling AI service
self._checkWorkflowStopped()
prompt = await self.service.callAiTextAdvanced(
await createActionDefinitionPrompt(self, context)
await createActionDefinitionPrompt(context, self.service)
)
# Inline parseActionResponse logic here
json_start = prompt.find('{')
@@ -126,40 +184,169 @@ class HandlingTasks:
}) for a in actions]
valid_actions = [ta for ta in task_actions if ta]
logger.info(f"Generated {len(valid_actions)} actions for task: {task_step.description}")
logger.debug(f"Task actions plan: {json.dumps(action_data, indent=2)}")
logger.info(f"Generated {len(valid_actions)} actions for task: {task_step.objective}")
# Log the generated actions
for i, action in enumerate(valid_actions):
logger.info(f" Action {i+1}: {action.execMethod}.{action.execAction}")
if action.expectedDocumentFormats:
logger.info(f" Expected formats: {action.expectedDocumentFormats}")
if action.execParameters.get('documentList'):
logger.info(f" Input documents: {action.execParameters['documentList']}")
# Log the complete action plan
logger.info("=== GENERATED ACTION PLAN ===")
logger.info(f"Task: {task_step.objective}")
logger.info(f"Total actions: {len(valid_actions)}")
# Log the RAW AI-generated action plan JSON for debugging
logger.info("=== RAW AI ACTION PLAN JSON ===")
logger.info(f"AI Response with parsed actions: {prompt}")
logger.info("=== END RAW AI ACTION PLAN JSON ===")
return valid_actions
except Exception as e:
logger.error(f"Error in generateTaskActions: {str(e)}")
return []
async def executeTask(self, task_step, workflow, context) -> TaskResult:
async def executeTask(self, task_step, workflow, context, task_index=None, total_tasks=None) -> TaskResult:
"""Execute all actions for a task step, with state management and retries."""
logger.info(f"Executing task: {task_step.description}")
logger.info(f"=== STARTING TASK {task_index or '?'}: {task_step.objective} ===")
# Create database log entry for task start in format expected by frontend
if task_index is not None:
if total_tasks is not None:
self.chatInterface.createWorkflowLog({
"workflowId": workflow.id,
"message": f"Executing task {task_index}/{total_tasks}",
"type": "info"
})
else:
self.chatInterface.createWorkflowLog({
"workflowId": workflow.id,
"message": f"Executing task {task_index}/?",
"type": "info"
})
# Create a task start message for the user
task_progress = f"{task_index}/{total_tasks}" if total_tasks is not None else str(task_index)
task_start_message = {
"workflowId": workflow.id,
"role": "assistant",
"message": f"🚀 Starting Task {task_progress}\n\nObjective: {task_step.objective}",
"status": "step",
"sequenceNr": len(workflow.messages) + 1,
"publishedAt": datetime.now(UTC).isoformat(),
"documentsLabel": f"task_{task_index}_start",
"documents": []
}
message = self.chatInterface.createWorkflowMessage(task_start_message)
if message:
workflow.messages.append(message)
logger.info(f"Task start message created for task {task_index}")
state = TaskExecutionState(task_step)
retry_context = context
max_retries = state.max_retries
for attempt in range(max_retries):
logger.info(f"Task execution attempt {attempt+1}/{max_retries}")
# Check workflow status before starting task execution
self._checkWorkflowStopped()
actions = await self.generateTaskActions(task_step, workflow, previous_results=retry_context.previous_results, enhanced_context=retry_context)
if not actions:
logger.error("No actions defined for task step, aborting task execution")
break
# Log total actions count for this task
total_actions = len(actions)
logger.info(f"Task {task_index or '?'} has {total_actions} actions")
action_results = []
for action in actions:
result = await self.executeSingleAction(action, workflow)
for action_idx, action in enumerate(actions):
# Check workflow status before each action execution
self._checkWorkflowStopped()
# Log action start in format expected by frontend
action_number = action_idx + 1
logger.info(f"Task {task_index} - Starting action {action_number}/{total_actions}")
# Create database log entry for action start
self.chatInterface.createWorkflowLog({
"workflowId": workflow.id,
"message": f"Task {task_index} - Starting action {action_number}/{total_actions}",
"type": "info"
})
# Create an action start message for the user
action_start_message = {
"workflowId": workflow.id,
"role": "assistant",
"message": f"⚡ Task {task_index} - Action {action_number}/{total_actions}\n\nMethod: {action.execMethod}.{action.execAction}",
"status": "step",
"sequenceNr": len(workflow.messages) + 1,
"publishedAt": datetime.now(UTC).isoformat(),
"documentsLabel": f"action_{action_number}_start",
"documents": []
}
message = self.chatInterface.createWorkflowMessage(action_start_message)
if message:
workflow.messages.append(message)
logger.info(f"Action start message created for action {action_number}")
# Pass action index to executeSingleAction with task context
result = await self.executeSingleAction(action, workflow, task_step, task_index, action_number, total_actions)
action_results.append(result)
if result.success:
state.addSuccessfulAction(result)
else:
state.addFailedAction(result)
# Check workflow status before review
self._checkWorkflowStopped()
review_result = await self.reviewTaskCompletion(task_step, actions, action_results, workflow)
success = review_result.status == 'success'
feedback = review_result.reason
error = None if success else review_result.reason
if success:
logger.info(f"Task step '{task_step.description}' completed successfully")
logger.info(f"=== TASK {task_index or '?'} COMPLETED SUCCESSFULLY: {task_step.objective} ===")
# Create database log entry for task completion
if total_tasks is not None:
self.chatInterface.createWorkflowLog({
"workflowId": workflow.id,
"message": f"🎯 Task {task_index}/{total_tasks} completed",
"type": "success"
})
else:
self.chatInterface.createWorkflowLog({
"workflowId": workflow.id,
"message": f"🎯 Task {task_index}/? completed",
"type": "success"
})
# Create a task completion message for the user
task_progress = f"{task_index}/{total_tasks}" if total_tasks is not None else str(task_index)
task_completion_message = {
"workflowId": workflow.id,
"role": "assistant",
"message": f"🎯 Task {task_progress} Completed Successfully!\n\nObjective: {task_step.objective}\n\nFeedback: {feedback or 'Task completed successfully'}",
"status": "step",
"sequenceNr": len(workflow.messages) + 1,
"publishedAt": datetime.now(UTC).isoformat(),
"documentsLabel": f"task_{task_index}_completion",
"documents": []
}
message = self.chatInterface.createWorkflowMessage(task_completion_message)
if message:
workflow.messages.append(message)
logger.info(f"Task completion message created for task {task_index}")
return TaskResult(
taskId=task_step.id,
status=TaskStatus.COMPLETED,
@@ -168,7 +355,7 @@ class HandlingTasks:
error=None
)
elif review_result.status == 'retry' and state.canRetry():
logger.warning(f"Task step '{task_step.description}' requires retry: {review_result.improvements}")
logger.warning(f"Task step '{task_step.objective}' requires retry: {review_result.improvements}")
state.incrementRetryCount()
retry_context.retry_count = state.retry_count
retry_context.improvements = review_result.improvements
@@ -180,7 +367,7 @@ class HandlingTasks:
retry_context.successful_actions = state.successful_actions
continue
else:
logger.error(f"Task step '{task_step.description}' failed after {attempt+1} attempts")
logger.error(f"=== TASK {task_index or '?'} FAILED: {task_step.objective} after {attempt+1} attempts ===")
return TaskResult(
taskId=task_step.id,
status=TaskStatus.FAILED,
@@ -188,7 +375,7 @@ class HandlingTasks:
feedback=feedback,
error=error
)
logger.error(f"Task step '{task_step.description}' failed after all retries")
logger.error(f"=== TASK {task_index or '?'} FAILED AFTER ALL RETRIES: {task_step.objective} ===")
return TaskResult(
taskId=task_step.id,
status=TaskStatus.FAILED,
@@ -199,6 +386,9 @@ class HandlingTasks:
async def reviewTaskCompletion(self, task_step, task_actions, action_results, workflow):
try:
# Check workflow status before reviewing task completion
self._checkWorkflowStopped()
review_context = ReviewContext(
task_step=task_step,
action_results=action_results,
@@ -210,8 +400,11 @@ class HandlingTasks:
'errors': [result.error for result in action_results if not result.success]
}
)
# Check workflow status before calling AI service
self._checkWorkflowStopped()
# Use promptFactory for review prompt
prompt = await createResultReviewPrompt(self, review_context)
prompt = await createResultReviewPrompt(review_context)
response = await self.service.callAiTextAdvanced(prompt)
# Inline parseReviewResponse logic here
json_start = response.find('{')
@@ -239,10 +432,6 @@ class HandlingTasks:
improvements = []
# Ensure all list fields are properly typed
missing_outputs = review.get('missing_outputs', [])
if not isinstance(missing_outputs, list):
missing_outputs = []
met_criteria = review.get('met_criteria', [])
if not isinstance(met_criteria, list):
met_criteria = []
@@ -256,14 +445,14 @@ class HandlingTasks:
reason=review.get('reason', 'No reason provided'),
improvements=improvements,
quality_score=review.get('quality_score', 5),
missing_outputs=missing_outputs,
missing_outputs=[],
met_criteria=met_criteria,
unmet_criteria=unmet_criteria,
confidence=review.get('confidence', 0.5)
)
# Enhanced validation logging
logger.info(f"VALIDATION RESULT - Task: '{task_step.description}' - Status: {review_result.status.upper()}, Quality: {review_result.quality_score}/10")
logger.info(f"VALIDATION RESULT - Task: '{task_step.objective}' - Status: {review_result.status.upper()}, Quality: {review_result.quality_score}/10")
if review_result.status == 'success':
logger.info(f"VALIDATION SUCCESS - Task completed successfully")
if review_result.met_criteria:
@@ -274,8 +463,6 @@ class HandlingTasks:
logger.warning(f"Unmet criteria: {', '.join(review_result.unmet_criteria)}")
else:
logger.error(f"VALIDATION FAILED - Task failed: {review_result.reason}")
if review_result.missing_outputs:
logger.error(f"Missing outputs: {', '.join(review_result.missing_outputs)}")
return review_result
except Exception as e:
@@ -288,26 +475,22 @@ class HandlingTasks:
async def prepareTaskHandover(self, task_step, task_actions, review_result, workflow):
try:
# Check workflow status before preparing task handover
self._checkWorkflowStopped()
# Log handover status summary
if hasattr(review_result, 'status'):
status = review_result.status
if hasattr(review_result, 'missing_outputs'):
missing = review_result.missing_outputs
else:
missing = []
if hasattr(review_result, 'met_criteria'):
met = review_result.met_criteria
else:
met = []
logger.debug(f"Task handover status: {status}")
logger.debug(f"Promised documents: {task_step.expected_outputs}")
logger.debug(f"Delivered documents: {met}")
logger.debug(f"Missing documents: {missing}")
handover_data = {
'task_id': task_step.id,
'task_description': task_step.description,
'task_description': task_step.objective,
'actions': [action.to_dict() for action in task_actions],
'review_result': review_result.to_dict() if hasattr(review_result, 'to_dict') else review_result,
'workflow_id': workflow.id,
@@ -321,20 +504,35 @@ class HandlingTasks:
# --- Helper action handling methods ---
async def executeSingleAction(self, action, workflow):
async def executeSingleAction(self, action, workflow, task_step, task_index=None, action_index=None, total_actions=None):
"""Execute a single action and return ActionResult with enhanced document processing"""
try:
logger.info(f"Executing action: {action.execMethod}.{action.execAction}")
# Check workflow status before executing action
self._checkWorkflowStopped()
# Log input documents and connections
# Use passed indices or fallback to '?'
task_num = task_index if task_index is not None else '?'
action_num = action_index if action_index is not None else '?'
logger.info(f"=== TASK {task_num} ACTION {action_num}: {action.execMethod}.{action.execAction} ===")
# Log input parameters
input_docs = action.execParameters.get('documentList', [])
logger.debug(f"Input documents: {input_docs}")
logger.debug(f"Input connections: {action.execParameters.get('connections', [])}")
input_connections = action.execParameters.get('connections', [])
logger.info(f"Input documents: {input_docs} (type: {type(input_docs)})")
if input_connections:
logger.info(f"Input connections: {input_connections}")
# Log all action parameters for debugging
logger.info(f"All action parameters: {action.execParameters}")
enhanced_parameters = action.execParameters.copy()
if action.expectedDocumentFormats:
enhanced_parameters['expectedDocumentFormats'] = action.expectedDocumentFormats
logger.debug(f"Expected document formats: {action.expectedDocumentFormats}")
logger.info(f"Expected formats: {action.expectedDocumentFormats}")
# Check workflow status before executing the action
self._checkWorkflowStopped()
result = await self.service.executeAction(
methodName=action.execMethod,
@@ -350,11 +548,51 @@ class HandlingTasks:
action.setSuccess()
action.result = result.data.get("result", "")
action.execResultLabel = result_label
await self.createActionMessage(action, result, workflow, result_label, created_documents)
logger.info(f"Action {action.execMethod}.{action.execAction} executed successfully")
await self.createActionMessage(action, result, workflow, result_label, created_documents, task_step, task_index)
# Log action results
logger.info(f"✓ Action completed successfully")
# Create database log entry for action completion
if total_actions is not None:
self.chatInterface.createWorkflowLog({
"workflowId": workflow.id,
"message": f"✅ Task {task_num} - Action {action_num}/{total_actions} completed",
"type": "success"
})
else:
self.chatInterface.createWorkflowLog({
"workflowId": workflow.id,
"message": f"✅ Task {task_num} - Action {action_num}/? completed",
"type": "success"
})
if created_documents:
logger.info(f"Output documents ({len(created_documents)}):")
for i, doc in enumerate(created_documents):
if hasattr(doc, 'filename'):
logger.info(f" {i+1}. {doc.filename}")
elif isinstance(doc, dict) and 'filename' in doc:
logger.info(f" {i+1}. {doc['filename']}")
else:
logger.info(f" {i+1}. {type(doc).__name__}")
# Log document details for debugging
logger.info("Document details:")
for i, doc in enumerate(created_documents):
if hasattr(doc, 'filename'):
logger.info(f" Doc {i+1}: filename={doc.filename}, type={type(doc)}")
if hasattr(doc, 'id'):
logger.info(f" ID: {doc.id}")
if hasattr(doc, 'fileId'):
logger.info(f" File ID: {doc.fileId}")
elif isinstance(doc, dict):
logger.info(f" Doc {i+1}: dict with keys: {list(doc.keys())}")
else:
logger.info("Output: No documents created")
else:
action.setError(result.error or "Action execution failed")
logger.error(f"Action {action.execMethod}.{action.execAction} failed: {result.error}")
logger.error(f"✗ Action failed: {result.error}")
# Extract document filenames for the ActionResult
document_filenames = []
@@ -367,6 +605,9 @@ class HandlingTasks:
# Also include the original documents from the service result for validation
original_documents = result.data.get("documents", [])
# Log action summary
logger.info(f"=== TASK {task_num} ACTION {action_num} COMPLETED ===")
return ActionResult(
success=result.success,
data={
@@ -407,9 +648,12 @@ class HandlingTasks:
error=str(e)
)
async def createActionMessage(self, action, result, workflow, result_label=None, created_documents=None):
async def createActionMessage(self, action, result, workflow, result_label=None, created_documents=None, task_step=None, task_index=None):
"""Create and store a message for the action result in the workflow with enhanced document processing"""
try:
# Check workflow status before creating action message
self._checkWorkflowStopped()
if result_label is None:
result_label = action.execResultLabel
@@ -417,25 +661,29 @@ class HandlingTasks:
if created_documents is None:
created_documents = self.documentGenerator.createDocumentsFromActionResult(result, action, workflow)
# Log delivered documents with sizes
# Log delivered documents
if created_documents:
doc_info = []
for doc in created_documents:
if hasattr(doc, 'filename') and hasattr(doc, 'fileSize'):
doc_info.append(f"{doc.filename} ({doc.fileSize} bytes)")
elif hasattr(doc, 'filename'):
doc_info.append(f"{doc.filename}")
logger.info(f"Result label: {result_label} - {len(created_documents)} documents")
else:
doc_info.append("unknown document")
logger.debug(f"Produced result label: {result_label}")
logger.debug(f"Delivered documents: {doc_info}")
logger.info(f"Result label: {result_label} - No documents")
# Create a more meaningful message that includes task context
task_objective = task_step.objective if task_step else 'Unknown task'
# Build a user-friendly message
if created_documents and len(created_documents) > 0:
doc_names = [doc.filename if hasattr(doc, 'filename') else str(doc) for doc in created_documents[:3]]
if len(created_documents) > 3:
doc_names.append(f"... and {len(created_documents) - 3} more")
message_text = f"✅ Task {task_index or '?'} - Action {action.execMethod}.{action.execAction} completed\n\nObjective: {task_objective}\n\nGenerated {len(created_documents)} document(s): {', '.join(doc_names)}"
else:
logger.debug(f"Produced result label: {result_label} (no documents)")
message_text = f"✅ Task {task_index or '?'} - Action {action.execMethod}.{action.execAction} completed\n\nObjective: {task_objective}\n\nAction executed successfully"
message_data = {
"workflowId": workflow.id,
"role": "assistant",
"message": f"Executed action {action.execMethod}.{action.execAction}",
"message": message_text,
"status": "step",
"sequenceNr": len(workflow.messages) + 1,
"publishedAt": datetime.now(UTC).isoformat(),
@@ -449,7 +697,7 @@ class HandlingTasks:
message = self.chatInterface.createWorkflowMessage(message_data)
if message:
workflow.messages.append(message)
logger.info(f"Created action message for {action.execMethod}.{action.execAction} with {len(created_documents)} documents")
logger.info(f"Message created: {action.execMethod}.{action.execAction}")
else:
logger.error(f"Failed to create workflow message for action {action.execMethod}.{action.execAction}")
except Exception as e:
@@ -459,29 +707,59 @@ class HandlingTasks:
def _validateTaskPlan(self, task_plan: Dict[str, Any]) -> bool:
try:
if not isinstance(task_plan, dict):
logger.error("Task plan is not a dictionary")
return False
if 'tasks' not in task_plan or not isinstance(task_plan['tasks'], list):
logger.error(f"Task plan missing 'tasks' field or not a list. Found: {type(task_plan.get('tasks', 'MISSING'))}")
return False
# First pass: collect all task IDs to validate dependencies
task_ids = set()
for task in task_plan['tasks']:
if not isinstance(task, dict):
logger.error(f"Task is not a dictionary: {type(task)}")
return False
required_fields = ['id', 'description', 'expected_outputs', 'success_criteria']
if not all(field in task for field in required_fields):
return False
if task['id'] in task_ids:
if 'id' not in task:
logger.error(f"Task missing 'id' field: {task}")
return False
task_ids.add(task['id'])
# Second pass: validate each task
for i, task in enumerate(task_plan['tasks']):
if not isinstance(task, dict):
logger.error(f"Task {i} is not a dictionary: {type(task)}")
return False
required_fields = ['id', 'objective', 'success_criteria']
missing_fields = [field for field in required_fields if field not in task]
if missing_fields:
logger.error(f"Task {i} missing required fields: {missing_fields}")
return False
# Check for duplicate IDs (shouldn't happen after first pass, but safety check)
if task['id'] in task_ids and list(task_plan['tasks']).count(task['id']) > 1:
logger.error(f"Task {i} has duplicate ID: {task['id']}")
return False
dependencies = task.get('dependencies', [])
if not isinstance(dependencies, list):
logger.error(f"Task {i} dependencies is not a list: {type(dependencies)}")
return False
for dep in dependencies:
if dep not in task_ids and dep != 'task_0':
logger.error(f"Task {i} has invalid dependency: {dep} (available: {list(task_ids) + ['task_0']})")
return False
if 'ai_prompt' in task and not isinstance(task['ai_prompt'], str):
return False
logger.info(f"Task plan validation successful with {len(task_ids)} tasks")
return True
except Exception as e:
logger.error(f"Error validating task plan: {str(e)}")
return False
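For reference, a minimal plan dict that passes the revised validation (values invented; 'objective' and 'success_criteria' replace 'description' and 'expected_outputs' in the required fields):

valid_plan = {
    "overview": "Summarize the uploaded report",
    "tasks": [
        {
            "id": "task_1",
            "objective": "Extract key findings from the report",
            "success_criteria": ["Key findings identified"],
            "dependencies": ["task_0"],  # 'task_0' is always accepted as a root dependency
        }
    ],
}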

View file

@@ -7,7 +7,7 @@ from typing import Any, Dict
# Prompt creation helpers extracted from managerChat.py
def createTaskPlanningPrompt(self, context: Dict[str, Any]) -> str:
def createTaskPlanningPrompt(context: Dict[str, Any]) -> str:
"""Create prompt for task planning"""
return f"""You are a task planning AI that analyzes user requests and creates structured task plans.
@@ -19,17 +19,16 @@ INSTRUCTIONS:
1. Analyze the user request and available documents
2. Break down the request into 2-4 meaningful high-level task steps
3. Focus on business outcomes, not technical operations
4. For document processing, create ONE task with a comprehensive AI prompt rather than multiple granular tasks
5. Each task should produce meaningful, usable outputs
6. Ensure proper handover between tasks using result labels
7. Return a JSON object with the exact structure shown below
4. Each task should produce meaningful, usable outputs
5. Ensure proper handover between tasks using result labels
6. Return a JSON object with the exact structure shown below
TASK PLANNING PRINCIPLES:
- Combine related operations into single tasks (e.g., \"Extract and analyze all candidate profiles\" instead of separate \"read file\" and \"analyze content\" tasks)
- Use comprehensive AI prompts for document processing rather than multiple small tasks
- Break down complex requests into logical, sequential steps
- Focus on business value and outcomes
- Keep tasks at a meaningful level of abstraction
- Each task should produce results that can be used by subsequent tasks
- Ensure clear dependencies and handovers between tasks
REQUIRED JSON STRUCTURE:
{{
@@ -37,31 +36,34 @@ REQUIRED JSON STRUCTURE:
\"tasks\": [
{{
\"id\": \"task_1\",
\"description\": \"Clear description of what this task accomplishes (business outcome)\",
\"objective\": \"Clear business objective this task accomplishes\",
\"dependencies\": [\"task_0\"], // IDs of tasks that must complete first
\"expected_outputs\": [\"output1\", \"output2\"],
\"success_criteria\": [\"criteria1\", \"criteria2\"],
\"required_documents\": [\"doc1\", \"doc2\"],
\"estimated_complexity\": \"low|medium|high\",
\"ai_prompt\": \"Comprehensive AI prompt for document processing tasks (if applicable)\"
\"estimated_complexity\": \"low|medium|high\"
}}
]
}}
EXAMPLES OF GOOD TASK DESCRIPTIONS:
- \"Extract and analyze all candidate profiles to identify key qualifications and experience\"
- \"Create evaluation matrix and rate candidates against product designer criteria\"
- \"Generate comprehensive PowerPoint presentation for management decision\"
- \"Store final presentation in SharePoint for specified account\"
EXAMPLES OF GOOD TASK OBJECTIVES:
- \"Extract key information from documents for email preparation\"
- \"Draft professional email incorporating analyzed information\"
- \"Send email using specified email account\"
- \"Store email draft and confirmation in system\"
EXAMPLES OF BAD TASK DESCRIPTIONS:
EXAMPLES OF GOOD SUCCESS CRITERIA:
- \"Document analysis completed with key points identified\"
- \"Email draft created with professional tone and clear structure\"
- \"Email successfully sent with delivery confirmation\"
- \"All outputs properly stored and accessible for future use\"
EXAMPLES OF BAD TASK OBJECTIVES:
- \"Open and read the PDF file\" (too granular)
- \"Identify table structure\" (technical detail)
- \"Convert data to CSV format\" (implementation detail)
NOTE: Respond with ONLY the JSON object. Do not include any explanatory text."""
async def createActionDefinitionPrompt(self, context) -> str:
async def createActionDefinitionPrompt(context, service) -> str:
"""Create prompt for action generation with enhanced document extraction guidance and retry context"""
task_step = context.task_step
workflow = context.workflow
@@ -71,23 +73,32 @@ async def createActionDefinitionPrompt(self, context) -> str:
retry_count = context.retry_count or 0
previous_action_results = context.previous_action_results or []
previous_review_result = context.previous_review_result
methodList = self.service.getMethodsList()
methodList = service.getMethodsList()
method_actions = {}
for sig in methodList:
if '.' in sig:
method, rest = sig.split('.', 1)
action = rest.split('(')[0]
method_actions.setdefault(method, []).append((action, sig))
messageSummary = await self.service.summarizeChat(workflow.messages)
docRefs = self.service.getDocumentReferenceList()
connRefs = self.service.getConnectionReferenceList()
all_doc_refs = docRefs.get('chat', []) + docRefs.get('history', [])
messageSummary = await service.summarizeChat(workflow.messages)
# Get ALL documents from the entire workflow, not just current round
docRefs = service.getDocumentReferenceList()
connRefs = service.getConnectionReferenceList()
# Get documents from current round (chat) and entire workflow history
current_round_docs = docRefs.get('chat', [])
workflow_history_docs = docRefs.get('history', [])
# Combine all documents, prioritizing current round first, then workflow history
all_doc_refs = current_round_docs + workflow_history_docs
# Log document availability for debugging
logging.debug(f"Document references - Current round: {len(current_round_docs)}, Workflow history: {len(workflow_history_docs)}, Total: {len(all_doc_refs)}")
available_methods_str = ''
for method, actions in method_actions.items():
available_methods_str += f"- {method}:\n"
for action, sig in actions:
available_methods_str += f" - {action}: {sig}\n"
task_ai_prompt = task_step.ai_prompt or ''
retry_context = ""
if retry_count > 0:
retry_context = f"""
@@ -105,17 +116,36 @@ Previous review feedback:
- Status: {previous_review_result.status or 'unknown'}
- Reason: {previous_review_result.reason or 'No reason provided'}
- Quality Score: {previous_review_result.quality_score or 0}/10
- Missing Outputs: {', '.join(previous_review_result.missing_outputs or [])}
- Unmet Criteria: {', '.join(previous_review_result.unmet_criteria or [])}
"""
expected_outputs_str = ', '.join(task_step.expected_outputs or [])
success_criteria_str = ', '.join(task_step.success_criteria or [])
previous_results_str = ', '.join(previous_results) if previous_results else 'None'
improvements_str = str(improvements) if improvements else 'None'
available_connections_str = '\n'.join(f"- {conn}" for conn in connRefs)
available_documents_str = '\n'.join(
f"- {doc.documentsLabel} contains {', '.join(doc.documents)}" for doc in all_doc_refs
)
# Build comprehensive document list showing both current round and workflow history
if all_doc_refs:
available_documents_str = "CURRENT ROUND DOCUMENTS:\n"
if current_round_docs:
for doc in current_round_docs:
available_documents_str += f"- {doc.documentsLabel} contains {', '.join(doc.documents)}\n"
else:
available_documents_str += "- No documents in current round\n"
available_documents_str += "\nWORKFLOW HISTORY DOCUMENTS:\n"
if workflow_history_docs:
for doc in workflow_history_docs:
available_documents_str += f"- {doc.documentsLabel} contains {', '.join(doc.documents)}\n"
else:
available_documents_str += "- No documents in workflow history\n"
else:
available_documents_str = "NO DOCUMENTS AVAILABLE - This workflow has no documents to process."
# Debug logging for document availability
logging.debug(f"Available documents string length: {len(available_documents_str)}")
logging.debug(f"Current round docs count: {len(current_round_docs)}")
logging.debug(f"Workflow history docs count: {len(workflow_history_docs)}")
logging.debug(f"Total doc refs: {len(all_doc_refs)}")
prompt = f"""
You are an action generation AI that creates specific actions to accomplish a task step.
@@ -130,12 +160,11 @@ CRITICAL DOCUMENT REFERENCE RULES:
- NEVER invent new labels or use message IDs
- NEVER use formats like "msg_xxx:documents" or "task_X_results" (these will fail)
- ONLY use the exact labels shown in AVAILABLE DOCUMENTS
- **When generating multiple actions, you may only use as input documents those that are already present in AVAILABLE DOCUMENTS or produced by actions that come earlier in the list. Do NOT use as input any document label that will be produced by a later action.**
- When generating multiple actions, you may only use as input documents those that are already present in AVAILABLE DOCUMENTS or produced by actions that come earlier in the list. Do NOT use as input any document label that will be produced by a later action.
- If AVAILABLE DOCUMENTS shows "NO DOCUMENTS AVAILABLE", you CANNOT create document extraction actions. Instead, create actions that generate new content or inform the user that documents are needed.
TASK STEP: {task_step.description} (ID: {task_step.id})
EXPECTED OUTPUTS: {expected_outputs_str}
TASK STEP: {task_step.objective} (ID: {task_step.id})
SUCCESS CRITERIA: {success_criteria_str}
TASK AI PROMPT: {task_ai_prompt if task_ai_prompt else 'None provided'}
CONTEXT - Chat History:
{messageSummary}
@@ -180,7 +209,8 @@ ACTION GENERATION PRINCIPLES:
INSTRUCTIONS:
- Generate actions to accomplish this task step using available documents, connections, and previous results
- Use docItem for single documents and docList labels for groups of documents as shown in AVAILABLE DOCUMENTS
- Always pass documentList as a LIST of references (docItem and/or docList)
- If AVAILABLE DOCUMENTS shows "NO DOCUMENTS AVAILABLE", you cannot create document extraction actions. Instead, create actions that generate new content or inform the user that documents are needed.
- Always pass documentList as a LIST of references (docItem and/or docList) - this list CANNOT be empty for document extraction actions
- For resultLabel, use the format: "task{{task_id}}_action{{action_number}}_{{short_label}}" where:
- {{task_id}} = the current task's id (e.g., 1)
- {{action_number}} = the sequence number of the action within the task (e.g., 2)
@@ -202,8 +232,8 @@ REQUIRED JSON STRUCTURE:
"resultLabel": "task1_action3_analysis_results",
"expectedDocumentFormats": [ // OPTIONAL: Specify expected document formats when needed
{{
"extension": ".csv",
"mimeType": "text/csv",
"extension": ".txt",
"mimeType": "text/plain",
"description": "Structured data output"
}}
],
@@ -314,19 +344,33 @@ EXAMPLES OF GOOD ACTIONS:
]
}}
NOTE: Respond with ONLY the JSON object. Do not include any explanatory text."""
logging.debug(f"[ACTION PLAN PROMPT] Available Documents Section:\n{available_documents_str}\nUser Connections Section:\n{available_connections_str}\nAvailable Methods (summarized):\n{', '.join(method_actions.keys())}")
6. When no documents are available (NO DOCUMENTS AVAILABLE scenario):
{{
"method": "document",
"action": "generateReport",
"parameters": {{
"documentList": [],
"title": "Workflow Status Report"
}},
"resultLabel": "task1_action1_status_report",
"description": "Generate a status report informing the user that no documents are available for processing and requesting document upload or alternative input."
}}
IMPORTANT NOTES:
- Respond with ONLY the JSON object. Do not include any explanatory text.
- Before creating any document extraction action, verify that AVAILABLE DOCUMENTS contains actual document references.
- If AVAILABLE DOCUMENTS shows "NO DOCUMENTS AVAILABLE", use example 6 above to create a status report action instead of document extraction."""
logging.debug(f"[ACTION PLAN PROMPT] Available Documents Section:\n{available_documents_str}\nUser Connections Section:\n{available_connections_str}\nAvailable Methods (detailed):\n{available_methods_str}")
return prompt
async def createResultReviewPrompt(self, review_context) -> str:
async def createResultReviewPrompt(review_context) -> str:
"""Create prompt for result review"""
task_step = review_context.task_step
step_result = review_context.step_result or {}
step_result_serializable = {
'task_step': {
'id': task_step.id,
'description': task_step.description,
'expected_outputs': task_step.expected_outputs or [],
'objective': task_step.objective,
'success_criteria': task_step.success_criteria or []
},
'action_results': [],
@@ -380,12 +424,10 @@ async def createResultReviewPrompt(self, review_context) -> str:
}
step_result_serializable['action_results'].append(serializable_action_result)
step_result_json = json.dumps(step_result_serializable, indent=2, ensure_ascii=False)
expected_outputs_str = ', '.join(task_step.expected_outputs or [])
success_criteria_str = ', '.join(task_step.success_criteria or [])
return f"""You are a result review AI that evaluates task step completion with BASIC validation.
TASK STEP: {task_step.description}
EXPECTED OUTPUTS: {expected_outputs_str}
TASK STEP: {task_step.objective}
SUCCESS CRITERIA: {success_criteria_str}
STEP RESULT: {step_result_json}
@@ -402,8 +444,8 @@ VALIDATION PRINCIPLES:
- Text outputs are SECONDARY indicators
- Only retry for CLEAR technical issues, not minor imperfections
- Don't be picky about formatting or minor details
- Check if ANY documents were produced (documents_count > 0), not specific expected output names
- If documents were produced, consider it a SUCCESS regardless of expected output names
- Check if ANY documents were produced (documents_count > 0)
- If documents were produced, consider it a SUCCESS
EXAMPLES OF SUCCESS:
- Document extraction produced a file (even if imperfect)
@@ -428,7 +470,6 @@ REQUIRED JSON STRUCTURE:
"reason": "Brief explanation",
"improvements": ["specific technical fixes only"],
"quality_score": 1-10,
"missing_outputs": [],
"met_criteria": ["basic functionality achieved"],
"unmet_criteria": []
}}
@@ -437,6 +478,6 @@ VALIDATION LOGIC:
- If ANY action has documents_count > 0, mark as SUCCESS
- If ALL actions have documents_count = 0 AND no meaningful text output, mark as FAILED
- Only mark as RETRY for clear technical issues that can be fixed
- Do NOT fail based on expected output name mismatches - focus on actual document production
- Focus on actual document production and functionality, not specific output names
NOTE: Respond with ONLY the JSON object. Be GENEROUS with success ratings."""
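A review payload satisfying the revised structure would look like this (shown as a Python dict, values invented); note that missing_outputs is gone from the schema and reviewTaskCompletion above now hard-codes it to an empty list:

example_review = {
    "status": "success",
    "reason": "Two documents produced; basic functionality achieved",
    "improvements": [],
    "quality_score": 8,
    "met_criteria": ["document extraction produced a file"],
    "unmet_criteria": [],
}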

View file

@@ -4,15 +4,7 @@ from modules.interfaces.interfaceAppModel import User
from modules.interfaces.interfaceChatModel import ChatWorkflow, UserInputRequest, TaskStep, TaskAction, ActionResult, ReviewResult, TaskPlan, WorkflowResult, TaskContext
from modules.chat.serviceCenter import ServiceCenter
from modules.interfaces.interfaceChatObjects import ChatObjects
from .handling.handlingTasks import HandlingTasks
logger = logging.getLogger(__name__)
# ===== STATE MANAGEMENT AND VALIDATION CLASSES =====
class WorkflowStoppedException(Exception):
"""Exception raised when workflow is stopped by user"""
pass
from .handling.handlingTasks import HandlingTasks, WorkflowStoppedException
logger = logging.getLogger(__name__)
@@ -47,16 +39,15 @@ class ChatManager:
raise Exception("No tasks generated in task plan.")
# Phase 2-5: For each task, execute and get results
logger.info(f"Phase 2: Executing {len(task_plan.tasks)} tasks")
total_tasks = len(task_plan.tasks)
logger.info(f"Phase 2: Executing {total_tasks} tasks")
all_task_results = []
previous_results = []
for idx, task_step in enumerate(task_plan.tasks):
logger.info(f"Task {idx+1}/{len(task_plan.tasks)}: {task_step.description}")
# Pass task index to executeTask method
current_task_index = idx + 1
# Check if workflow has been stopped before each task
if self.service.workflow.status == "stopped":
logger.info("Workflow stopped by user, aborting execution")
raise WorkflowStoppedException("Workflow was stopped by user")
logger.info(f"Task {idx+1}/{total_tasks}: {task_step.objective}")
# Create task context for this task
task_context = TaskContext(
@@ -67,7 +58,7 @@ class ChatManager:
previous_results=previous_results
)
# Execute task (this handles action generation, execution, and review internally)
task_result = await self.handlingTasks.executeTask(task_step, workflow, task_context)
task_result = await self.handlingTasks.executeTask(task_step, workflow, task_context, current_task_index, total_tasks)
# Handover
handover_data = await self.handlingTasks.prepareTaskHandover(task_step, [], task_result, workflow)
# Collect results
@@ -90,6 +81,15 @@ class ChatManager:
)
logger.info(f"Unified workflow execution completed successfully for workflow {workflow.id}")
return workflow_result
except WorkflowStoppedException:
logger.info(f"Workflow {workflow.id} was stopped by user")
return WorkflowResult(
status="stopped",
completed_tasks=0,
total_tasks=0,
execution_time=0.0,
final_results_count=0
)
except Exception as e:
logger.error(f"Error in executeUnifiedWorkflow: {str(e)}")
return WorkflowResult(

View file

@@ -310,54 +310,43 @@ class ServiceCenter:
chat_exchanges = []
history_exchanges = []
# Process messages in reverse order to find current chat round
# Process messages in reverse order; a message with status "first" marks the
# round boundary: exchanges up to and including that "first" message go into the
# chat container, everything older goes into the history container
in_current_round = True
for message in reversed(self.workflow.messages):
# Get document references from message
is_first = getattr(message, "status", None) == "first"
# Build a DocumentExchange if message has documents
doc_exchange = None
if message.documents:
# For messages with action context, create DocumentExchange with docList reference
if message.actionId and message.documentsLabel:
doc_ref = self.getDocumentReferenceFromMessage(message)
if doc_ref:
# Create DocumentExchange with single docList reference
doc_exchange = DocumentExchange(
documentsLabel=message.documentsLabel,
documents=[doc_ref]
)
# Add to appropriate list based on message status
if message.status == "first":
chat_exchanges.append(doc_exchange)
break # Stop after finding first message
elif message.status == "step":
chat_exchanges.append(doc_exchange)
else:
history_exchanges.append(doc_exchange)
# For regular messages, create DocumentExchange with individual docItem references
else:
doc_refs = []
for doc in message.documents:
doc_ref = self.getDocumentReferenceFromChatDocument(doc)
doc_refs.append(doc_ref)
if doc_refs:
# Create DocumentExchange with individual document references
doc_exchange = DocumentExchange(
documentsLabel=f"{message.id}:documents",
documents=doc_refs
)
# Add to appropriate list based on message status
if message.status == "first":
chat_exchanges.append(doc_exchange)
break # Stop after finding first message
elif message.status == "step":
# Append to appropriate container based on boundary
if doc_exchange:
if in_current_round:
chat_exchanges.append(doc_exchange)
else:
history_exchanges.append(doc_exchange)
# Stop processing if we hit a first message
if message.status == "first":
break
# Flip boundary after including the "first" message in chat
if in_current_round and is_first:
in_current_round = False
# Sort both lists by datetime in descending order
chat_exchanges.sort(key=lambda x: x.documentsLabel, reverse=True)
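A toy reduction of the new boundary behavior, assuming the status-based flip replaces the old break as the comment above describes:

def split_rounds(statuses):
    # Hypothetical model of the loop above: statuses arrive newest-first;
    # the current round ends with, and includes, the first "first" status.
    chat, history, current = [], [], True
    for status in statuses:
        (chat if current else history).append(status)
        if current and status == "first":
            current = False
    return chat, history

assert split_rounds(["step", "step", "first", "step"]) == (["step", "step", "first"], ["step"])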
@@ -393,31 +382,7 @@ class ServiceCenter:
try:
# ADDED LOGGING: Print workflow id, message count, and all message labels and document counts
import logging
logger = logging.getLogger(__name__)
logger.debug(f"WORKFLOW STATE at getChatDocumentsFromDocumentList: id={id(self.workflow)}, message_count={len(self.workflow.messages) if hasattr(self.workflow, 'messages') else 'N/A'}")
for idx, message in enumerate(getattr(self.workflow, 'messages', [])):
label = getattr(message, 'documentsLabel', None)
docs = getattr(message, 'documents', None)
logger.debug(f" Message {idx}: label='{label}', documents_count={len(docs) if docs else 0}")
# DEBUG LOGGING: Print all document labels and their documents before extraction
import logging
logger = logging.getLogger(__name__)
logger.info("==== DEBUG: Listing all workflow message document labels and contained documents ====")
for message in self.workflow.messages:
label = getattr(message, 'documentsLabel', None)
docs = getattr(message, 'documents', None)
if label is not None:
doc_names = []
if docs:
for doc in docs:
if hasattr(doc, 'filename'):
doc_names.append(doc.filename)
elif isinstance(doc, dict) and 'filename' in doc:
doc_names.append(doc['filename'])
else:
doc_names.append(str(doc))
logger.info(f"Message label: '{label}' | Documents: {doc_names if doc_names else 'None'}")
logger.info("==== END DEBUG LIST ====")
all_documents = []
for doc_ref in documentList:
# Parse reference format
@@ -434,12 +399,12 @@ class ServiceCenter:
found = True
break
if not found:
logger.warning(f"No documents found for label: {label}")
logger.debug(f"No documents found for label: {label}")
continue
# Handle structured reference format
if len(parts) < 3:
logger.warning(f"Invalid document reference format: {doc_ref}")
logger.debug(f"Invalid document reference format: {doc_ref}")
continue
ref_type = parts[0]
@@ -487,7 +452,7 @@ class ServiceCenter:
def getConnectionReferenceFromUserConnection(self, connection: UserConnection) -> str:
"""Get connection reference from UserConnection"""
return f"connection:{connection.authority}:{connection.externalUsername}:{connection.id}"
return f"connection:{connection.authority.value}:{connection.externalUsername}:{connection.id}"
def getUserConnectionFromConnectionReference(self, connectionReference: str) -> Optional[UserConnection]:
"""Get UserConnection from reference string"""
@@ -506,7 +471,7 @@ class ServiceCenter:
# Find matching connection
for conn in user_connections:
if str(conn.id) == conn_id and conn.authority == authority and conn.externalUsername == username:
if str(conn.id) == conn_id and conn.authority.value == authority and conn.externalUsername == username:
return conn
return None
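The .value change matters because the reference string is parsed back by getUserConnectionFromConnectionReference: interpolating the enum directly can leak its repr into the label while the lookup compares plain strings, so round-trips may never match. Illustration with hypothetical values:

# Old: f"connection:{connection.authority}:..."       -> "connection:AuthAuthority.MICROSOFT:user@example.com:42"
# New: f"connection:{connection.authority.value}:..." -> "connection:microsoft:user@example.com:42"
# Only the new form survives the split-and-compare in getUserConnectionFromConnectionReference.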
@@ -700,16 +665,16 @@ Please provide a clear summary of this message."""
async def extractContentFromDocument(self, prompt: str, document: ChatDocument) -> ExtractedContent:
"""Extract content from ChatDocument using prompt"""
try:
# Extract file data from ChatDocument
if document.data:
fileData = document.data.encode('utf-8') if isinstance(document.data, str) else document.data
else:
# Try to get file data from service center if document has fileId
if hasattr(document, 'fileId') and document.fileId:
# ChatDocument is just a reference, so we need to get file data using fileId
if not hasattr(document, 'fileId') or not document.fileId:
logger.error(f"Document {document.id} has no fileId")
raise ValueError("Document has no fileId")
# Get file data from service center using document's fileId
fileData = self.getFileData(document.fileId)
else:
logger.error(f"No file data available in document: {document}")
raise ValueError("No file data available in document")
if not fileData:
logger.error(f"No file data found for fileId: {document.fileId}")
raise ValueError("No file data found for document")
# Get filename and mime type from document
filename = document.filename if hasattr(document, 'filename') else "document"
@@ -739,11 +704,11 @@ Please provide a clear summary of this message."""
"""Extract content from file data directly using prompt"""
try:
return await self.documentProcessor.processFileData(
prompt=prompt,
fileData=fileData,
filename=filename,
mimeType=mimeType,
base64Encoded=base64Encoded,
prompt=prompt,
documentId=documentId
)
except Exception as e:
@@ -771,15 +736,19 @@ Please provide a clear summary of this message."""
return file_item.id
def createDocument(self, fileName: str, mimeType: str, content: str, base64encoded: bool = True) -> ChatDocument:
def createDocument(self, fileName: str, mimeType: str, content: str, base64encoded: bool = True, existing_file_id: str = None) -> ChatDocument:
"""Create document from file data object created by AI call"""
# Use existing file ID if provided, otherwise create new file
if existing_file_id:
file_id = existing_file_id
else:
# First create the file and get its ID
file_id = self.createFile(fileName, mimeType, content, base64encoded)
# Get file info for metadata
file_info = self.interfaceComponent.getFile(file_id)
# Create document with file reference
# Create document with file reference (ChatDocument is just a reference, not a data container)
return ChatDocument(
id=str(uuid.uuid4()),
fileId=file_id,
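Combined with the documentGeneration.py change above, the intended calling pattern is create the file once, then reference it. A condensed sketch assuming the ServiceCenter API in this diff:

file_id = service.createFile(fileName="report.txt", mimeType="text/plain",
                             content=content, base64encoded=False)
document = service.createDocument(fileName="report.txt", mimeType="text/plain",
                                  content=content, base64encoded=False,
                                  existing_file_id=file_id)  # reuses file_id, no second write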
@@ -807,8 +776,7 @@ Please provide a clear summary of this message."""
bytesReceived=bytesReceived
)
# Log the stats event
logger.debug(f"Workflow stats updated - Event: {eventLabel}, Sent: {bytesSent}, Received: {bytesReceived}, Tokens: {tokenCount}")
except Exception as e:
logger.error(f"Error updating workflow stats: {str(e)}")

View file

@@ -62,33 +62,42 @@ class AiCalls:
Advanced text processing using Anthropic.
Fallback to OpenAI if Anthropic is overloaded or rate-limited.
"""
messages = []
# For Anthropic, we need to handle system content differently
# Anthropic expects system content in a top-level parameter, not as a message role
try:
# Create messages without system role for Anthropic
anthropic_messages = []
if hasattr(self, 'userLanguage') and self.userLanguage:
ltext = f"Please respond in '{self.userLanguage}' language."
if context:
messages.append({
"role": "system",
"content": context
})
messages.append({
# Combine context and language instruction
full_context = f"{ltext}\n\n{context}"
else:
full_context = ltext
else:
full_context = context
# Add user message
anthropic_messages.append({
"role": "user",
"content": prompt
})
if hasattr(self, 'userLanguage') and self.userLanguage:
ltext = f"Please respond in '{self.userLanguage}' language."
if messages and messages[0]["role"] == "system":
if "language" not in messages[0]["content"].lower():
messages[0]["content"] = f"{ltext} {messages[0]['content']}"
# Call Anthropic - let the connector handle system content conversion
if full_context:
# Send context as part of the user message for Anthropic
enhanced_prompt = f"Context:\n{full_context}\n\nUser Request:\n{prompt}"
response = await self.anthropicService.callAiBasic([
{"role": "user", "content": enhanced_prompt}
])
else:
messages.insert(0, {
"role": "system",
"content": ltext
})
try:
response = await self.anthropicService.callAiBasic(messages)
response = await self.anthropicService.callAiBasic(anthropic_messages)
return response["choices"][0]["message"]["content"]
except Exception as e:
err_str = str(e)
logger.warning(f"[UI NOTICE] Advanced AI failed, falling back to Basic AI (OpenAI). Reason: {err_str}")
# Optionally, you could surface this message to the UI via a return value or error object
# Fallback to OpenAI basic
return await self.callAiTextBasic(prompt, context)
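The shape change in one place (values illustrative): the old path sent an OpenAI-style system message, while the new path folds context and language instruction into the single user message Anthropic receives:

# Old (OpenAI-style): system content as its own message
old_messages = [
    {"role": "system", "content": "Please respond in 'en' language."},
    {"role": "user", "content": "Summarize the report."},
]
# New (this commit): one user message with the context inlined
new_messages = [
    {"role": "user",
     "content": "Context:\nPlease respond in 'en' language.\n\nUser Request:\nSummarize the report."},
]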
async def callAiImageBasic(self, prompt: str, imageData: Union[str, bytes], mimeType: str = None) -> str:

View file

@@ -753,8 +753,8 @@ class AppObjects:
logger.error(f"Error saving token: {str(e)}")
raise
def getToken(self, authority: AuthAuthority) -> Optional[Token]:
"""Get the latest token for the current user and authority"""
def getToken(self, authority: str) -> Optional[Token]:
"""Get the latest valid token for the current user and authority"""
try:
# Get tokens for this user and authority
tokens = self.db.getRecordset("tokens", recordFilter={
@@ -767,13 +767,20 @@ class AppObjects:
# Sort by creation date and get the latest
tokens.sort(key=lambda x: x.get("createdAt", ""), reverse=True)
return Token(**tokens[0])
latest_token = Token(**tokens[0])
# Check if token is expired
if latest_token.expiresAt and latest_token.expiresAt < datetime.now().timestamp():
logger.warning(f"Token for {authority} is expired (expiresAt: {latest_token.expiresAt})")
return None # Don't return expired tokens
return latest_token
except Exception as e:
logger.error(f"Error getting token: {str(e)}")
return None
def deleteToken(self, authority: AuthAuthority) -> None:
def deleteToken(self, authority: str) -> None:
"""Delete all tokens for the current user and authority"""
try:
# Get tokens to delete

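The expiry check above compares expiresAt against the current epoch time. A small sketch of the same check with a safety margin, so tokens about to expire are treated as already expired (helper name and margin are illustrative):

from datetime import datetime, UTC
from typing import Optional

def is_token_usable(expires_at: Optional[float], margin_seconds: int = 60) -> bool:
    """Treat tokens within margin_seconds of expiry as already expired."""
    if expires_at is None:
        return True  # tokens without expiry metadata are assumed valid here
    return expires_at - margin_seconds > datetime.now(UTC).timestamp()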
View file

@ -256,6 +256,7 @@ class TaskAction(BaseModel, ModelMixin):
execResultLabel: Optional[str] = Field(None, description="Label for the set of result documents")
# NEW: Optional document format specification
expectedDocumentFormats: Optional[List[Dict[str, str]]] = Field(None, description="Expected document formats (optional)")
status: TaskStatus = Field(default=TaskStatus.PENDING, description="Action status")
error: Optional[str] = Field(None, description="Error message if action failed")
retryCount: int = Field(default=0, description="Number of retries attempted")
@ -530,13 +531,11 @@ register_model_labels(
class TaskStep(BaseModel, ModelMixin):
id: str
description: str
objective: str
dependencies: Optional[list[str]] = []
expected_outputs: Optional[list[str]] = []
success_criteria: Optional[list[str]] = []
required_documents: Optional[list[str]] = []
estimated_complexity: Optional[str] = None
ai_prompt: Optional[str] = None
class TaskContext(BaseModel, ModelMixin):
task_step: TaskStep

View file

@ -881,6 +881,29 @@ class ChatObjects:
if not workflow:
raise ValueError(f"Workflow {workflowId} not found")
# Check if workflow is currently running and stop it first
if workflow.status == "running":
logger.info(f"Stopping running workflow {workflowId} before processing new prompt")
# Stop the running workflow
workflow.status = "stopped"
workflow.lastActivity = currentTime
self.updateWorkflow(workflowId, {
"status": "stopped",
"lastActivity": currentTime
})
# Add log entry for workflow stop
self.createWorkflowLog({
"workflowId": workflowId,
"message": "Workflow stopped for new prompt",
"type": "info",
"status": "stopped",
"progress": 100
})
# Wait a moment for any running processes to detect the stop
await asyncio.sleep(0.1)
# Update workflow - set status back to running for resumed workflows
self.updateWorkflow(workflowId, {

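The short asyncio.sleep gives running tasks a checkpoint at which to notice the status flip. A sketch of the cooperative-cancellation pattern this relies on (names are illustrative; the shipped check lives in handlingTasks._checkWorkflowStopped):

class WorkflowStoppedException(Exception):
    """Raised when a workflow's status has been set to 'stopped'."""

def check_workflow_stopped(get_status, workflow_id: str) -> None:
    # Long-running loops call this between steps, so a stop takes effect
    # at the next checkpoint instead of interrupting an operation mid-flight.
    if get_status(workflow_id) == "stopped":
        raise WorkflowStoppedException(f"Workflow {workflow_id} was stopped")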
View file

@ -5,7 +5,6 @@ Handles document operations using the document service.
import logging
from typing import Dict, Any, List, Optional
import uuid
from datetime import datetime, UTC
from modules.chat.methodBase import MethodBase, ActionResult, action
@ -24,19 +23,19 @@ class MethodDocument(MethodBase):
@action
async def extract(self, parameters: Dict[str, Any]) -> ActionResult:
"""
Extract specific content from document with ai prompt and return it in the specified format
Extract specific content from document with AI prompt and return it in the specified format.
Parameters:
documentList (str): Reference to the document list to extract content from
aiPrompt (str): AI prompt for content extraction
includeMetadata (bool, optional): Whether to include metadata (default: True)
expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description
includeMetadata (bool, optional): Whether to include metadata (default: True)
"""
try:
documentList = parameters.get("documentList")
aiPrompt = parameters.get("aiPrompt")
includeMetadata = parameters.get("includeMetadata", True)
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
includeMetadata = parameters.get("includeMetadata", True)
if not documentList:
return self._createResult(
@ -60,32 +59,7 @@ class MethodDocument(MethodBase):
error="No documents found for the provided reference"
)
# Determine output format based on expected formats
output_extension = ".txt" # Default
output_mime_type = "text/plain" # Default
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
# Use the first expected format
expected_format = expectedDocumentFormats[0]
output_extension = expected_format.get("extension", ".txt")
output_mime_type = expected_format.get("mimeType", "text/plain")
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
logger.info(f"Expected document formats: {expectedDocumentFormats}")
else:
logger.info("No expected format specified, using default .txt format")
# Enhance AI prompt to specify output format
enhanced_prompt = aiPrompt
if output_extension == ".csv":
enhanced_prompt += f"\n\nCRITICAL: Deliver the result as pure CSV data without any markdown formatting, code blocks, or additional text. Output only the CSV content with proper headers and data rows. Do not include ```csv or ``` markers."
elif output_extension == ".json":
enhanced_prompt += f"\n\nCRITICAL: Deliver the result as pure JSON data without any markdown formatting, code blocks, or additional text. Output only the JSON content. Do not include ```json or ``` markers."
elif output_extension == ".xml":
enhanced_prompt += f"\n\nCRITICAL: Deliver the result as pure XML data without any markdown formatting, code blocks, or additional text. Output only the XML content. Do not include ```xml or ``` markers."
elif output_extension != ".txt":
enhanced_prompt += f"\n\nCRITICAL: Deliver the result as pure {output_extension.upper()} data without any markdown formatting, code blocks, or additional text. Output only the {output_extension.upper()} content. Do not include any markdown markers."
# Extract content from all documents
# Extract content from all documents using AI
all_extracted_content = []
file_infos = []
@ -99,7 +73,7 @@ class MethodDocument(MethodBase):
continue
extracted_content = await self.service.extractContentFromFileData(
prompt=enhanced_prompt, # Use enhanced prompt instead of original
prompt=aiPrompt,
fileData=file_data,
filename=file_info.get('name', 'document'),
mimeType=file_info.get('mimeType', 'application/octet-stream'),
@ -118,21 +92,7 @@ class MethodDocument(MethodBase):
error="No content could be extracted from any documents"
)
# Extract text content from ExtractedContent objects
text_contents = []
for content_obj in all_extracted_content:
if hasattr(content_obj, 'contents') and content_obj.contents:
# Extract text from ContentItem objects
for content_item in content_obj.contents:
if hasattr(content_item, 'data') and content_item.data:
text_contents.append(content_item.data)
elif isinstance(content_obj, str):
text_contents.append(content_obj)
else:
# Fallback: convert to string representation
text_contents.append(str(content_obj))
# Process each document individually and create separate output files
# Process each document individually with its own format conversion
output_documents = []
for i, (chatDocument, extracted_content) in enumerate(zip(chatDocuments, all_extracted_content)):
@ -140,36 +100,68 @@ class MethodDocument(MethodBase):
text_content = ""
if hasattr(extracted_content, 'contents') and extracted_content.contents:
# Extract text from ContentItem objects
text_parts = []
for content_item in extracted_content.contents:
if hasattr(content_item, 'data') and content_item.data:
text_content += content_item.data + "\n"
text_parts.append(content_item.data)
text_content = "\n".join(text_parts)
elif isinstance(extracted_content, str):
text_content = extracted_content
else:
# Fallback: convert to string representation
text_content = str(extracted_content)
# Create output filename based on original filename
# Get the expected format for this document (or use default)
target_format = None
if expectedDocumentFormats and i < len(expectedDocumentFormats):
target_format = expectedDocumentFormats[i]
elif expectedDocumentFormats and len(expectedDocumentFormats) > 0:
# If fewer formats than documents, use the last format for remaining documents
target_format = expectedDocumentFormats[-1]
# Determine output format and filename
if target_format:
target_extension = target_format.get("extension", ".txt")
target_mime_type = target_format.get("mimeType", "text/plain")
# Check if format conversion is needed
if target_extension not in [".txt", ".text"] or target_mime_type != "text/plain":
logger.info(f"Converting document {i+1} to format: {target_extension} ({target_mime_type})")
# Use AI to convert format
formatted_content = await self._convertContentToFormat(text_content, target_format)
final_content = formatted_content
final_mime_type = target_mime_type
final_extension = target_extension
else:
logger.info(f"Document {i+1}: No format conversion needed, using plain text")
final_content = text_content
final_mime_type = "text/plain"
final_extension = ".txt"
else:
logger.info(f"Document {i+1}: No expected format specified, using plain text")
final_content = text_content
final_mime_type = "text/plain"
final_extension = ".txt"
# Create output filename based on original filename and target format
original_filename = chatDocument.filename
base_name = original_filename.rsplit('.', 1)[0] if '.' in original_filename else original_filename
output_filename = f"{base_name}_extracted_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}{output_extension}"
output_filename = f"{base_name}_extracted_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}{final_extension}"
# Create result data for this document
result_data = {
"documentCount": 1,
"content": text_content,
"content": final_content,
"originalFilename": original_filename,
"fileInfos": [file_infos[i]] if includeMetadata and i < len(file_infos) else None,
"timestamp": datetime.now(UTC).isoformat()
}
logger.info(f"Created output document: {output_filename} with {len(text_content)} characters")
logger.info(f"Content preview: {text_content[:200]}...")
logger.info(f"Created output document: {output_filename} with {len(final_content)} characters")
output_documents.append({
"documentName": output_filename,
"documentData": result_data,
"mimeType": output_mime_type
"mimeType": final_mime_type
})
return self._createResult(
@ -186,6 +178,327 @@ class MethodDocument(MethodBase):
error=str(e)
)
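For clarity, expectedDocumentFormats is a list of dicts read via .get("extension") and .get("mimeType"); when there are fewer entries than documents, the last entry is reused for the remainder. An illustrative value (field contents are made up):

expected_document_formats = [
    {"extension": ".csv", "mimeType": "text/csv", "description": "Flat table of extracted line items"},
    {"extension": ".json", "mimeType": "application/json", "description": "Structured extraction result"},
]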
@action
async def generate(self, parameters: Dict[str, Any]) -> ActionResult:
"""
Generate documents in specific formats from document references.
This action automatically extracts content from documents and converts it to the specified format.
Parameters:
documentList (list): List of document references to extract content from
expectedDocumentFormats (list): Expected document formats with extension, mimeType, description
originalDocuments (list, optional): List of original document names
includeMetadata (bool, optional): Whether to include metadata (default: True)
"""
try:
document_list = parameters.get("documentList", [])
expected_document_formats = parameters.get("expectedDocumentFormats", [])
original_documents = parameters.get("originalDocuments", [])
include_metadata = parameters.get("includeMetadata", True)
if not document_list:
return self._createResult(
success=False,
data={},
error="Document list is required for generation"
)
if not expected_document_formats or len(expected_document_formats) == 0:
return self._createResult(
success=False,
data={},
error="Expected document formats specification is required"
)
# Get chat documents for original documents list
chat_documents = self.service.getChatDocumentsFromDocumentList(document_list)
logger.info(f"Found {len(chat_documents)} chat documents")
if not chat_documents:
return self._createResult(
success=False,
data={},
error="No documents found for the provided documentList reference"
)
# Update original documents list if not provided
if not original_documents:
original_documents = [doc.filename if hasattr(doc, 'filename') else str(doc.id) for doc in chat_documents]
# Process each document individually with its own format conversion
output_documents = []
for i, chat_document in enumerate(chat_documents):
# Extract content from this document
# ChatDocument is just a reference, so we need to get file data using fileId
content = ""
if hasattr(chat_document, 'fileId') and chat_document.fileId:
# Need to get file data
file_data = self.service.getFileData(chat_document.fileId)
if file_data:
if isinstance(file_data, bytes):
content = file_data.decode('utf-8', errors='ignore')
else:
content = str(file_data)
else:
logger.warning(f"Could not get file data for document {i+1}, skipping")
continue
else:
logger.warning(f"Document {i+1} has no fileId, skipping")
continue
if not content:
logger.warning(f"Could not extract content from document {i+1}, skipping")
continue
logger.info(f"Extracted content from document {i+1}: {len(content)} characters")
# Get the expected format for this document (or use default)
target_format = None
if i < len(expected_document_formats):
target_format = expected_document_formats[i]
elif len(expected_document_formats) > 0:
# If fewer formats than documents, use the last format for remaining documents
target_format = expected_document_formats[-1]
if not target_format:
logger.warning(f"No expected format for document {i+1}, skipping")
continue
# Use AI to convert format
formatted_content = await self._convertContentToFormat(content, target_format)
if not formatted_content:
logger.warning(f"Failed to format document {i+1}, skipping")
continue
target_extension = target_format.get("extension", ".txt")
target_mime_type = target_format.get("mimeType", "text/plain")
# Create output filename
timestamp = datetime.now(UTC).strftime('%Y%m%d_%H%M%S')
if i < len(original_documents):
base_name = original_documents[i].rsplit('.', 1)[0] if '.' in original_documents[i] else original_documents[i]
else:
base_name = f"document_{i+1}"
output_filename = f"{base_name}_generated_{timestamp}{target_extension}"
# Create result data
result_data = {
"documentCount": 1,
"content": formatted_content,
"outputFormat": target_format,
"originalDocument": original_documents[i] if i < len(original_documents) else f"document_{i+1}",
"timestamp": datetime.now(UTC).isoformat()
}
logger.info(f"Generated document: {output_filename} with {len(formatted_content)} characters")
output_documents.append({
"documentName": output_filename,
"documentData": result_data,
"mimeType": target_mime_type
})
if not output_documents:
return self._createResult(
success=False,
data={},
error="No documents could be generated"
)
return self._createResult(
success=True,
data={
"documents": output_documents
}
)
except Exception as e:
logger.error(f"Error generating document: {str(e)}")
return self._createResult(
success=False,
data={},
error=str(e)
)
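A hypothetical invocation of the generate action above, showing the expected parameter shapes (references and filenames are made up):

result = await method_document.generate({
    "documentList": ["docref:invoice_batch_07"],
    "expectedDocumentFormats": [
        {"extension": ".csv", "mimeType": "text/csv", "description": "Invoice line items"}
    ],
    "originalDocuments": ["invoice_07.pdf"],
    "includeMetadata": True,
})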
async def _convertContentToFormat(self, content: str, target_format: Dict[str, Any]) -> str:
"""
Helper function to convert content to the specified format using AI.
"""
try:
extension = target_format.get("extension", ".txt")
mime_type = target_format.get("mimeType", "text/plain")
logger.info(f"Converting content to format: {extension} ({mime_type})")
# Create AI prompt for format conversion
format_prompts = {
".csv": f"""
Convert the following content into a proper CSV format.
Requirements:
1. Output ONLY the CSV data without any markdown, code blocks, or additional text
2. Use appropriate headers based on the content
3. Ensure proper CSV formatting with commas and quotes where needed
4. Make the data easily readable and importable into spreadsheet applications
Content to convert:
{content}
Generate ONLY the CSV data:
""",
".json": f"""
Convert the following content into a proper JSON format.
Requirements:
1. Output ONLY the JSON data without any markdown, code blocks, or additional text
2. Structure the data logically with appropriate keys and values
3. Ensure valid JSON syntax
4. Make the data easily parseable and readable
Content to convert:
{content}
Generate ONLY the JSON data:
""",
".xml": f"""
Convert the following content into a proper XML format.
Requirements:
1. Output ONLY the XML data without any markdown, code blocks, or additional text
2. Use appropriate XML tags and structure
3. Ensure valid XML syntax
4. Make the data easily parseable and readable
Content to convert:
{content}
Generate ONLY the XML data:
""",
".html": f"""
Convert the following content into a proper HTML format.
Requirements:
1. Output ONLY the HTML data without any markdown, code blocks, or additional text
2. Use appropriate HTML tags and structure
3. Ensure valid HTML syntax
4. Make the data easily readable in web browsers
Content to convert:
{content}
Generate ONLY the HTML data:
""",
".md": f"""
Convert the following content into a proper Markdown format.
Requirements:
1. Output ONLY the Markdown data without any code blocks or additional text
2. Use appropriate Markdown syntax for headers, lists, emphasis, etc.
3. Structure the content logically
4. Make the data easily readable and convertible to other formats
Content to convert:
{content}
Generate ONLY the Markdown data:
"""
}
# Get the appropriate prompt for the target format
if extension in format_prompts:
ai_prompt = format_prompts[extension]
else:
# Generic format conversion
ai_prompt = f"""
Convert the following content into {extension.upper()} format.
Requirements:
1. Output ONLY the {extension.upper()} data without any markdown, code blocks, or additional text
2. Use appropriate formatting for {extension.upper()} files
3. Ensure the output is valid and usable
4. Make the data easily readable and importable
Content to convert:
{content}
Generate ONLY the {extension.upper()} data:
"""
# Call AI to generate the formatted content
logger.info(f"Calling AI for {extension} format conversion")
formatted_content = await self.service.callAiTextBasic(ai_prompt, content)
if not formatted_content or formatted_content.strip() == "":
logger.warning("AI format conversion failed, using fallback")
return self._generateFallbackFormattedContent(content, extension, mime_type)
# Clean up the AI response
formatted_content = formatted_content.strip()
# Remove markdown code blocks if present
if formatted_content.startswith("```") and formatted_content.endswith("```"):
lines = formatted_content.split('\n')
if len(lines) > 2:
formatted_content = '\n'.join(lines[1:-1])
return formatted_content
except Exception as e:
logger.error(f"Error in AI format conversion: {str(e)}")
return self._generateFallbackFormattedContent(content, extension, mime_type)
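The cleanup above drops the first and last lines when the whole response is fenced. A slightly more defensive variant that also handles a language tag on the opening fence, offered as a sketch rather than the shipped behavior:

import re

def strip_markdown_fences(text: str) -> str:
    text = text.strip()
    # Matches an optional language tag after the opening fence, e.g. ```json
    match = re.match(r"^```[A-Za-z0-9_+-]*\n(.*?)\n?```$", text, flags=re.DOTALL)
    return match.group(1) if match else text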
def _generateFallbackFormattedContent(self, content: str, extension: str, mime_type: str) -> str:
"""
Generate fallback formatted content when AI conversion fails.
"""
try:
if extension == ".csv":
# Simple CSV fallback - split by lines and create basic CSV
lines = content.strip().split('\n')
if lines:
# Create a simple CSV with line numbers and content
csv_lines = ["Line,Content"]
for i, line in enumerate(lines, 1):
# Quote the field and double embedded quotes when the line contains a comma or quote
if ',' in line or '"' in line:
escaped = line.replace('"', '""')
line = f'"{escaped}"'
csv_lines.append(f"{i},{line}")
return '\n'.join(csv_lines)
return "Line,Content\n1,No content available"
elif extension == ".json":
# Simple JSON fallback
content_escaped = content.replace('\\', '\\\\').replace('"', '\\"').replace('\n', '\\n').replace('\r', '\\r')
timestamp = datetime.now(UTC).isoformat()
return f'{{"content": "{content_escaped}", "format": "json", "timestamp": "{timestamp}"}}'
elif extension == ".xml":
# Simple XML fallback
timestamp = datetime.now(UTC).isoformat()
content_escaped = content.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
return f'<?xml version="1.0" encoding="UTF-8"?>\n<document>\n<content>{content_escaped}</content>\n<format>xml</format>\n<timestamp>{timestamp}</timestamp>\n</document>'
elif extension == ".html":
# Simple HTML fallback
timestamp = datetime.now(UTC).strftime('%Y-%m-%d %H:%M:%S UTC')
content_escaped = content.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
return f'<!DOCTYPE html>\n<html>\n<head><meta charset="UTF-8"><title>Generated Document</title></head>\n<body>\n<pre>{content_escaped}</pre>\n<p><em>Generated on {timestamp}</em></p>\n</body>\n</html>'
elif extension == ".md":
# Simple Markdown fallback
timestamp = datetime.now(UTC).strftime('%Y-%m-%d %H:%M:%S UTC')
return f"# Generated Document\n\n{content}\n\n---\n*Generated on {timestamp}*"
else:
# Generic fallback - return content as-is
return content
except Exception as e:
logger.error(f"Error in fallback format conversion: {str(e)}")
return content
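Where the standard library is acceptable, csv and json produce correctly escaped output without hand-rolled quoting; a sketch of equivalent fallbacks:

import csv
import io
import json
from datetime import datetime, UTC

def fallback_csv(content: str) -> str:
    buffer = io.StringIO()
    writer = csv.writer(buffer)
    writer.writerow(["Line", "Content"])
    for i, line in enumerate(content.strip().split("\n"), 1):
        writer.writerow([i, line])  # the csv module handles quoting and escaping
    return buffer.getvalue()

def fallback_json(content: str) -> str:
    # json.dumps escapes quotes, backslashes and control characters correctly
    return json.dumps({
        "content": content,
        "format": "json",
        "timestamp": datetime.now(UTC).isoformat(),
    })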
@action
async def generateReport(self, parameters: Dict[str, Any]) -> ActionResult:
"""
@ -209,6 +522,8 @@ class MethodDocument(MethodBase):
)
chatDocuments = self.service.getChatDocumentsFromDocumentList(documentList)
logger.info(f"Retrieved {len(chatDocuments)} chat documents for report generation")
if not chatDocuments:
return self._createResult(
success=False,
@ -261,15 +576,30 @@ class MethodDocument(MethodBase):
for doc in chatDocuments:
content = ""
if hasattr(doc, 'content') and doc.content:
content = doc.content.strip()
elif hasattr(doc, 'data') and doc.data:
content = doc.data.strip()
logger.info(f"Processing document: type={type(doc)}")
# Get actual file content using the fileId reference
try:
file_data = self.service.getFileData(doc.fileId)
if file_data:
# Convert bytes to string
if isinstance(file_data, bytes):
content = file_data.decode('utf-8', errors='ignore')
else:
content = str(file_data)
logger.info(f" Retrieved content from file: {len(content)} characters")
else:
logger.warning(f" No file data found for fileId: {doc.fileId}")
except Exception as e:
logger.error(f" Error retrieving file data: {str(e)}")
# Skip empty documents
if content:
validDocuments.append(doc)
allContent.append(f"Document: {doc.filename}\n{content}\n")
logger.info(f" Added document to valid documents list")
else:
logger.warning(f" Skipping document with no content")
if not validDocuments:
# If no valid documents, create a simple report
@ -354,10 +684,17 @@ class MethodDocument(MethodBase):
# Add document content if available
content = ""
if hasattr(doc, 'content') and doc.content:
content = doc.content
elif hasattr(doc, 'data') and doc.data:
content = doc.data
if hasattr(doc, 'fileId') and doc.fileId:
# ChatDocument is just a reference, so we need to get file data using fileId
try:
file_data = self.service.getFileData(doc.fileId)
if file_data:
if isinstance(file_data, bytes):
content = file_data.decode('utf-8', errors='ignore')
else:
content = str(file_data)
except Exception as e:
logger.warning(f"Could not retrieve content for document {doc.filename}: {str(e)}")
if content:
html.append(f"<div style='white-space:pre-wrap; border:1px solid #ccc; padding:0.5em; margin-bottom:1em; background-color:#f9f9f9;'>{content}</div>")

View file

@ -26,13 +26,13 @@ class MethodOutlook(MethodBase):
"""Get Microsoft connection from connection reference"""
try:
userConnection = self.service.getUserConnectionFromConnectionReference(connectionReference)
if not userConnection or userConnection.authority != "msft" or userConnection.status != "active":
if not userConnection or userConnection.authority.value != "msft" or userConnection.status.value != "active":
return None
# Get the corresponding token for this user and authority
token = self.service.interfaceApp.getToken(userConnection.authority)
token = self.service.interfaceApp.getToken(userConnection.authority.value)
if not token:
logger.warning(f"No token found for user {userConnection.userId} and authority {userConnection.authority}")
logger.warning(f"No token found for user {userConnection.userId} and authority {userConnection.authority.value}")
return None
return {
@ -80,7 +80,45 @@ class MethodOutlook(MethodBase):
error="No valid Microsoft connection found for the provided connection reference"
)
# Create email reading prompt
# Read emails using Microsoft Graph API
try:
import requests
# Microsoft Graph API endpoint for messages
graph_url = "https://graph.microsoft.com/v1.0"
headers = {
"Authorization": f"Bearer {connection['accessToken']}",
"Content-Type": "application/json"
}
# Build the API request
api_url = f"{graph_url}/me/mailFolders/{folder}/messages"
params = {
"$top": limit,
"$orderby": "receivedDateTime desc"
}
if filter:
params["$filter"] = filter
# Make the API call
response = requests.get(api_url, headers=headers, params=params)
response.raise_for_status()
emails_data = response.json()
email_data = {
"emails": emails_data.get("value", []),
"count": len(emails_data.get("value", [])),
"folder": folder,
"filter": filter,
"apiResponse": emails_data
}
logger.info(f"Successfully retrieved {len(emails_data.get('value', []))} emails from {folder}")
except ImportError:
logger.error("requests module not available, falling back to simulation")
# Fallback to simulation if requests module is not available
email_prompt = f"""
Simulate reading emails from Microsoft Outlook.
@ -95,8 +133,24 @@ class MethodOutlook(MethodBase):
3. Important or urgent emails highlighted
4. Email categorization if possible
"""
email_data = await self.service.interfaceAiCalls.callAiTextAdvanced(email_prompt)
except Exception as e:
logger.error(f"Error reading emails from Microsoft Graph API: {str(e)}")
# Fallback to simulation on API error
email_prompt = f"""
Simulate reading emails from Microsoft Outlook.
# Use AI to simulate email reading
Connection: {connection['id']}
Folder: {folder}
Limit: {limit}
Filter: {filter or 'None'}
Please provide:
1. List of emails with subject, sender, date, and content
2. Summary of email statistics
3. Important or urgent emails highlighted
4. Email categorization if possible
"""
email_data = await self.service.interfaceAiCalls.callAiTextAdvanced(email_prompt)
# Create result data
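The call above returns at most $top messages; Graph pages larger result sets through @odata.nextLink. A sketch of draining all pages, reusing the same headers as above:

import requests

def fetch_all_messages(headers: dict, first_url: str, params: dict) -> list:
    messages, url = [], first_url
    while url:
        response = requests.get(url, headers=headers, params=params)
        response.raise_for_status()
        payload = response.json()
        messages.extend(payload.get("value", []))
        url = payload.get("@odata.nextLink")  # absolute URL of the next page, or None
        params = None  # nextLink already encodes the query parameters
    return messages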
@ -151,7 +205,7 @@ class MethodOutlook(MethodBase):
@action
async def sendEmail(self, parameters: Dict[str, Any]) -> ActionResult:
"""
Send email via Outlook
Create an email draft in Outlook for later sending
Parameters:
connectionReference (str): Reference to the Microsoft connection
@ -160,6 +214,7 @@ class MethodOutlook(MethodBase):
body (str): Email body content
cc (List[str], optional): CC recipients
bcc (List[str], optional): BCC recipients
attachments (List[str], optional): List of document references to attach
expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description
"""
try:
@ -169,6 +224,7 @@ class MethodOutlook(MethodBase):
body = parameters.get("body")
cc = parameters.get("cc", [])
bcc = parameters.get("bcc", [])
attachments = parameters.get("attachments", [])
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
if not connectionReference or not to or not subject or not body:
@ -187,9 +243,72 @@ class MethodOutlook(MethodBase):
error="No valid Microsoft connection found for the provided connection reference"
)
# Create email sending prompt
# Create email draft using Microsoft Graph API
try:
import requests
# Microsoft Graph API endpoint for creating draft messages
graph_url = "https://graph.microsoft.com/v1.0"
headers = {
"Authorization": f"Bearer {connection['accessToken']}",
"Content-Type": "application/json"
}
# Build the email message
message = {
"subject": subject,
"body": {
"contentType": "HTML",
"content": body
},
"toRecipients": [{"emailAddress": {"address": email}} for email in to],
"ccRecipients": [{"emailAddress": {"address": email}} for email in cc] if cc else [],
"bccRecipients": [{"emailAddress": {"address": email}} for email in bcc] if bcc else []
}
# Add attachments if provided
if attachments:
import base64
message["attachments"] = []
for attachment_ref in attachments:
# Get attachment document from service center
attachment_docs = self.service.getChatDocumentsFromDocumentList([attachment_ref])
if attachment_docs:
for doc in attachment_docs:
# ChatDocument is only a reference, so resolve the actual bytes via fileId
file_data = self.service.getFileData(doc.fileId)
if isinstance(file_data, str):
file_data = file_data.encode('utf-8')
# Graph expects contentBytes to be base64-encoded
attachment = {
"@odata.type": "#microsoft.graph.fileAttachment",
"name": doc.filename,
"contentType": doc.mimeType,
"contentBytes": base64.b64encode(file_data or b"").decode('ascii')
}
message["attachments"].append(attachment)
# Create the draft message
api_url = f"{graph_url}/me/messages"
response = requests.post(api_url, headers=headers, json=message)
response.raise_for_status()
draft_data = response.json()
draft_result = {
"status": "draft_created",
"messageId": draft_data.get("id", "unknown"),
"draftId": draft_data.get("id", "unknown"),
"recipients": to,
"cc": cc,
"bcc": bcc,
"attachments": len(attachments) if attachments else 0,
"draftLocation": "Drafts folder",
"apiResponse": response.status_code,
"draftData": draft_data
}
logger.info(f"Successfully created email draft for {len(to)} recipients with {len(attachments) if attachments else 0} attachments")
except ImportError:
logger.error("requests module not available, falling back to simulation")
# Fallback to simulation if requests module is not available
send_prompt = f"""
Simulate sending an email via Microsoft Outlook.
Simulate creating an email draft in Microsoft Outlook.
Connection: {connection['id']}
To: {to}
@ -197,16 +316,38 @@ class MethodOutlook(MethodBase):
Body: {body}
CC: {cc}
BCC: {bcc}
Attachments: {attachments if attachments else 'None'}
Please provide:
1. Email composition details
2. Validation of email addresses
3. Email formatting and structure
4. Delivery confirmation simulation
4. Attachment processing and validation
5. Draft creation confirmation
"""
draft_result = await self.service.interfaceAiCalls.callAiTextAdvanced(send_prompt)
except Exception as e:
logger.error(f"Error creating email draft via Microsoft Graph API: {str(e)}")
# Fallback to simulation on API error
send_prompt = f"""
Simulate creating an email draft in Microsoft Outlook.
# Use AI to simulate email sending
send_result = await self.service.interfaceAiCalls.callAiTextAdvanced(send_prompt)
Connection: {connection['id']}
To: {to}
Subject: {subject}
Body: {body}
CC: {cc}
BCC: {bcc}
Attachments: {attachments if attachments else 'None'}
Please provide:
1. Email composition details
2. Validation of email addresses
3. Email formatting and structure
4. Attachment processing and validation
5. Draft creation confirmation
"""
draft_result = await self.service.interfaceAiCalls.callAiTextAdvanced(send_prompt)
# Create result data
result_data = {
@ -216,7 +357,8 @@ class MethodOutlook(MethodBase):
"body": body,
"cc": cc,
"bcc": bcc,
"sendResult": send_result,
"attachments": attachments,
"draftResult": draft_result,
"connection": {
"id": connection["id"],
"authority": "microsoft",
@ -243,7 +385,7 @@ class MethodOutlook(MethodBase):
data={
"documents": [
{
"documentName": f"outlook_email_sent_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}{output_extension}",
"documentName": f"outlook_email_draft_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}{output_extension}",
"documentData": result_data,
"mimeType": output_mime_type
}
@ -252,7 +394,7 @@ class MethodOutlook(MethodBase):
)
except Exception as e:
logger.error(f"Error sending email: {str(e)}")
logger.error(f"Error creating email draft: {str(e)}")
return self._createResult(
success=False,
data={},
@ -294,7 +436,48 @@ class MethodOutlook(MethodBase):
error="No valid Microsoft connection found for the provided connection reference"
)
# Create email search prompt
# Search emails using Microsoft Graph API
try:
import requests
# Microsoft Graph API endpoint for searching messages
graph_url = "https://graph.microsoft.com/v1.0"
headers = {
"Authorization": f"Bearer {connection['accessToken']}",
"Content-Type": "application/json"
}
# Build the search API request
api_url = f"{graph_url}/me/messages"
params = {
"$top": limit,
# Graph rejects $orderby and $filter in combination with $search,
# so results come back ranked by relevance instead of receivedDateTime
"$search": f'"{query}"'
}
# Scope the search to a folder if specified (instead of an unsupported $filter)
if folder and folder.lower() != "all":
api_url = f"{graph_url}/me/mailFolders/{folder}/messages"
# Make the API call
response = requests.get(api_url, headers=headers, params=params)
response.raise_for_status()
search_data = response.json()
search_result = {
"query": query,
"results": search_data.get("value", []),
"count": len(search_data.get("value", [])),
"folder": folder,
"limit": limit,
"apiResponse": search_data
}
logger.info(f"Successfully searched emails with query '{query}', found {len(search_data.get('value', []))} results")
except ImportError:
logger.error("requests module not available, falling back to simulation")
# Fallback to simulation if requests module is not available
search_prompt = f"""
Simulate searching emails in Microsoft Outlook.
@ -309,8 +492,24 @@ class MethodOutlook(MethodBase):
3. Email previews and key information
4. Search suggestions and refinements
"""
search_result = await self.service.interfaceAiCalls.callAiTextAdvanced(search_prompt)
except Exception as e:
logger.error(f"Error searching emails via Microsoft Graph API: {str(e)}")
# Fallback to simulation on API error
search_prompt = f"""
Simulate searching emails in Microsoft Outlook.
# Use AI to simulate email search
Connection: {connection['id']}
Query: {query}
Folder: {folder}
Limit: {limit}
Please provide:
1. Search results with relevant emails
2. Search statistics and relevance scores
3. Email previews and key information
4. Search suggestions and refinements
"""
search_result = await self.service.interfaceAiCalls.callAiTextAdvanced(search_prompt)
# Create result data
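Graph's $search accepts KQL-style property terms, which can often replace a folder filter; a couple of illustrative query values:

# Illustrative $search values (KQL property syntax):
params = {"$top": 25, "$search": '"subject:quarterly report"'}  # match on subject
params = {"$top": 25, "$search": '"from:alice@example.com"'}    # match on sender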

View file

@ -32,9 +32,9 @@ class MethodSharepoint(MethodBase):
return None
# Get the corresponding token for this user and authority
token = self.service.interfaceApp.getToken(userConnection.authority)
token = self.service.interfaceApp.getToken(userConnection.authority.value)
if not token:
logger.warning(f"No token found for user {userConnection.userId} and authority {userConnection.authority}")
logger.warning(f"No token found for user {userConnection.userId} and authority {userConnection.authority.value}")
return None
return {

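The .value changes in these hunks fix a string-vs-enum comparison: a plain str never equals an Enum member unless the enum also subclasses str. A minimal illustration (the enum definition is a stand-in for the real AuthAuthority):

from enum import Enum

class AuthAuthority(Enum):
    MSFT = "msft"

assert (AuthAuthority.MSFT == "msft") is False  # Enum member vs plain string
assert AuthAuthority.MSFT.value == "msft"       # compares the underlying value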
View file

@ -8,7 +8,8 @@ from modules.interfaces.interfaceAppObjects import User
from modules.interfaces.interfaceChatModel import (UserInputRequest, ChatMessage, ChatWorkflow, TaskItem, TaskStatus)
from modules.interfaces.interfaceChatObjects import ChatObjects
from modules.chat.managerChat import ChatManager, WorkflowStoppedException
from modules.chat.managerChat import ChatManager
from modules.chat.handling.handlingTasks import WorkflowStoppedException
from modules.interfaces.interfaceChatModel import WorkflowResult
logger = logging.getLogger(__name__)
@ -52,6 +53,21 @@ class WorkflowManager:
"lastActivity": workflow.lastActivity
})
# Create final stopped message
stopped_message = {
"workflowId": workflow.id,
"role": "assistant",
"message": "🛑 Workflow stopped by user",
"status": "last",
"sequenceNr": len(workflow.messages) + 1,
"publishedAt": datetime.now(UTC).isoformat(),
"documentsLabel": "workflow_stopped",
"documents": []
}
message = self.chatInterface.createWorkflowMessage(stopped_message)
if message:
workflow.messages.append(message)
# Add log entry
self.chatInterface.createWorkflowLog({
"workflowId": workflow.id,
@ -99,6 +115,8 @@ class WorkflowManager:
async def _sendFirstMessage(self, userInput: UserInputRequest, workflow: ChatWorkflow) -> ChatMessage:
"""Send first message to start workflow"""
try:
self.chatManager.handlingTasks._checkWorkflowStopped()
# Create initial message using interface
messageData = {
"workflowId": workflow.id,
@ -130,6 +148,8 @@ class WorkflowManager:
async def _generateWorkflowFeedback(self, workflow: ChatWorkflow) -> str:
"""Generate feedback message for workflow completion"""
try:
self.chatManager.handlingTasks._checkWorkflowStopped()
# Count messages by role
user_messages = [msg for msg in workflow.messages if msg.role == 'user']
assistant_messages = [msg for msg in workflow.messages if msg.role == 'assistant']
@ -155,9 +175,13 @@ class WorkflowManager:
async def _sendLastMessage(self, workflow: ChatWorkflow) -> None:
"""Send last message to complete workflow"""
try:
self.chatManager.handlingTasks._checkWorkflowStopped()
# Generate feedback
feedback = await self._generateWorkflowFeedback(workflow)
self.chatManager.handlingTasks._checkWorkflowStopped()
# Create last message using interface
messageData = {
"workflowId": workflow.id,
@ -199,7 +223,60 @@ class WorkflowManager:
async def _processWorkflowResults(self, workflow: ChatWorkflow, workflow_result: WorkflowResult, initial_message: ChatMessage) -> None:
"""Process workflow results and create appropriate messages"""
try:
if workflow_result.status == 'failed':
try:
self.chatManager.handlingTasks._checkWorkflowStopped()
except WorkflowStoppedException:
logger.info(f"Workflow {workflow.id} was stopped during result processing")
# Create final stopped message
stopped_message = {
"workflowId": workflow.id,
"role": "assistant",
"message": "🛑 Workflow stopped by user",
"status": "last",
"sequenceNr": len(workflow.messages) + 1,
"publishedAt": datetime.now(UTC).isoformat(),
"documentsLabel": "workflow_stopped",
"documents": []
}
message = self.chatInterface.createWorkflowMessage(stopped_message)
if message:
workflow.messages.append(message)
# Update workflow status to stopped
workflow.status = "stopped"
workflow.lastActivity = datetime.now(UTC).isoformat()
self.chatInterface.updateWorkflow(workflow.id, {
"status": "stopped",
"lastActivity": workflow.lastActivity
})
return
if workflow_result.status == 'stopped':
# Create stopped message
stopped_message = {
"workflowId": workflow.id,
"role": "assistant",
"message": "🛑 Workflow stopped by user",
"status": "last",
"sequenceNr": len(workflow.messages) + 1,
"publishedAt": datetime.now(UTC).isoformat(),
"documentsLabel": "workflow_stopped",
"documents": []
}
message = self.chatInterface.createWorkflowMessage(stopped_message)
if message:
workflow.messages.append(message)
# Update workflow status to stopped
workflow.status = "stopped"
workflow.lastActivity = datetime.now(UTC).isoformat()
self.chatInterface.updateWorkflow(workflow.id, {
"status": "stopped",
"lastActivity": workflow.lastActivity
})
return
elif workflow_result.status == 'failed':
# Create error message
error_message = {
"workflowId": workflow.id,

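The stopped-message dicts in the two branches above are identical; a small helper would keep them in sync (a sketch, not part of this commit):

from datetime import datetime, UTC

def _buildStoppedMessage(self, workflow) -> dict:
    return {
        "workflowId": workflow.id,
        "role": "assistant",
        "message": "🛑 Workflow stopped by user",
        "status": "last",
        "sequenceNr": len(workflow.messages) + 1,
        "publishedAt": datetime.now(UTC).isoformat(),
        "documentsLabel": "workflow_stopped",
        "documents": []
    }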
View file

@ -25,6 +25,7 @@ PyMuPDF>=1.23.7 # Instead of the imprecise 'fitz'
PyPDF2==3.0.1
python-docx>=0.8.11 # For Word documents
openpyxl>=3.1.2 # For Excel files
python-pptx>=0.6.21 # For PowerPoint files
## Data Processing & Analysis
numpy==1.26.3 # Version compatible with pandas and matplotlib
@ -52,3 +53,14 @@ sortedcontainers>=2.4.0 # Required by trio
## MSFT Integration
msal==1.24.1
# Enhanced Office document processing
# (python-docx, openpyxl, python-pptx, PyPDF2 and PyMuPDF are already pinned above)
xlrd>=2.0.1 # For legacy .xls files
Pillow>=9.0.0 # For image processing
beautifulsoup4>=4.11.0
chardet>=4.0.0 # For encoding detection

test_documentExtraction.py (new file, 855 lines)
View file

@ -0,0 +1,855 @@
#!/usr/bin/env python3
"""
Test script for DocumentExtraction class.
Processes all files in the input folder (default: d:/temp/test-extraction) and stores extracted content in an "extracted" subfolder.
Features:
- Option to extract content WITH AI processing (default)
- Option to extract content WITHOUT AI processing (content-only mode)
- Supports all document types: text, images, PDFs, Office documents, etc.
- Detailed logging and progress tracking
- Separate output directories for AI vs content-only modes
Usage:
- Interactive mode: python test_documentExtraction.py
- Content-only mode: python test_documentExtraction.py --no-ai
- Content-only mode: python test_documentExtraction.py --content-only
- Specify custom input/output: python test_documentExtraction.py --input-dir /path/to/input --output-dir /path/to/output --no-ai
"""
import os
import asyncio
import logging
import sys
import argparse
from pathlib import Path
from typing import List, Optional
from datetime import datetime, UTC
# Configure logging
logging.basicConfig(
level=logging.DEBUG, # Changed from INFO to DEBUG
format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Filter out specific unwanted log messages
class LogFilter(logging.Filter):
"""Filter to hide specific unwanted log messages."""
def filter(self, record):
# Hide workflow stats update errors
if "Workflow" in record.getMessage() and "not found for stats update" in record.getMessage():
return False
# Hide HTTP request info messages
if "HTTP Request:" in record.getMessage() and "POST https://api.openai.com" in record.getMessage():
return False
# Hide HTTP response info messages
if "HTTP/1.1 200 OK" in record.getMessage():
return False
return True
# Apply the filter to the root logger
root_logger = logging.getLogger()
root_logger.addFilter(LogFilter())
def check_dependencies():
"""Check if required dependencies are available and provide installation instructions."""
missing_deps = []
# Check for required dependencies
try:
import bs4
logger.info("✓ beautifulsoup4 is available")
except ImportError:
missing_deps.append("beautifulsoup4")
logger.error("✗ beautifulsoup4 is missing")
try:
import PyPDF2
logger.info("✓ PyPDF2 is available")
except ImportError:
missing_deps.append("PyPDF2")
logger.error("✗ PyPDF2 is missing")
try:
import fitz
logger.info("✓ PyMuPDF (fitz) is available")
except ImportError:
missing_deps.append("PyMuPDF")
logger.error("✗ PyMuPDF (fitz) is missing")
try:
import docx
logger.info("✓ python-docx is available")
except ImportError:
missing_deps.append("python-docx")
logger.error("✗ python-docx is missing")
try:
import openpyxl
logger.info("✓ openpyxl is available")
except ImportError:
missing_deps.append("openpyxl")
logger.error("✗ openpyxl is missing")
try:
import pptx
logger.info("✓ python-pptx is available")
except ImportError:
missing_deps.append("python-pptx")
logger.error("✗ python-pptx is missing")
try:
from PIL import Image
logger.info("✓ Pillow (PIL) is available")
except ImportError:
missing_deps.append("Pillow")
logger.error("✗ Pillow (PIL) is missing")
if missing_deps:
logger.error("\n" + "="*60)
logger.error("MISSING DEPENDENCIES DETECTED!")
logger.error("="*60)
logger.error("The following packages are required but not installed:")
for dep in missing_deps:
logger.error(f" - {dep}")
logger.error("\nTo install all dependencies, run:")
logger.error("pip install -r requirements.txt")
logger.error("\nOr install individual packages:")
for dep in missing_deps:
logger.error(f" pip install {dep}")
logger.error("="*60)
return False
logger.info("✓ All required dependencies are available!")
return True
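The repeated try/except blocks above can be collapsed with importlib.util.find_spec, which tests availability without importing; a sketch covering the same packages:

import importlib.util

def missing_dependencies() -> list:
    # Maps pip package name -> importable module name
    required = {
        "beautifulsoup4": "bs4",
        "PyPDF2": "PyPDF2",
        "PyMuPDF": "fitz",
        "python-docx": "docx",
        "openpyxl": "openpyxl",
        "python-pptx": "pptx",
        "Pillow": "PIL",
    }
    return [pkg for pkg, module in required.items()
            if importlib.util.find_spec(module) is None]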
def check_module_imports():
"""Check if we can import the required modules."""
try:
# Add the gateway directory to the path so we can import our modules
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', '..'))
from modules.chat.documents.documentExtraction import DocumentExtraction
from modules.chat.serviceCenter import ServiceCenter
from modules.interfaces.interfaceAppModel import User, UserConnection
from modules.interfaces.interfaceChatModel import ChatWorkflow, TaskItem
logger.info("✓ All required modules imported successfully")
return True
except ImportError as e:
logger.error(f"✗ Failed to import required modules: {e}")
logger.error("Make sure you're running this script from the gateway directory")
return False
except Exception as e:
logger.error(f"✗ Unexpected error importing modules: {e}")
return False
def create_mock_service_center():
"""Create a proper ServiceCenter for testing purposes with all required fields."""
try:
from modules.chat.serviceCenter import ServiceCenter
from modules.interfaces.interfaceAppModel import User, UserPrivilege, AuthAuthority
from modules.interfaces.interfaceChatModel import ChatWorkflow, TaskItem, TaskStatus
from modules.interfaces.interfaceChatModel import ChatLog, ChatMessage, ChatStat
# Create proper user with all required fields
mock_user = User(
id="test_user_001",
username="testuser",
email="test@example.com",
fullName="Test User",
language="en",
enabled=True,
privilege=UserPrivilege.USER,
authenticationAuthority=AuthAuthority.LOCAL,
mandateId="test_mandate_001"
)
# Create proper workflow with all required fields
current_time = datetime.now(UTC).isoformat()
mock_workflow = ChatWorkflow(
id="test_workflow_001",
mandateId="test_mandate_001",
status="active",
name="Test Document Extraction Workflow",
currentRound=1,
lastActivity=current_time,
startedAt=current_time,
logs=[],
messages=[],
stats=None,
tasks=[]
)
# Create service center
service_center = ServiceCenter(mock_user, mock_workflow)
logger.info("✓ ServiceCenter created successfully with proper objects")
return service_center
except Exception as e:
logger.error(f"✗ Failed to create ServiceCenter: {e}")
return None
class DocumentExtractionTester:
"""Test class for DocumentExtraction functionality."""
def __init__(self, input_dir: str = "d:/temp/test-extraction", output_dir: str = None, enable_ai: bool = True):
"""
Initialize the tester.
Args:
input_dir: Directory containing files to process
output_dir: Directory to store extracted content (auto-generated if None)
enable_ai: Whether to enable AI processing (default: True)
"""
self.input_dir = Path(input_dir)
# Auto-generate output directory if not specified
if output_dir is None:
if enable_ai:
self.output_dir = Path(input_dir) / "extracted"
else:
self.output_dir = Path(input_dir) / "extracted-raw"
else:
self.output_dir = Path(output_dir)
self.extractor = None
self.service_center = None
self.enable_ai = enable_ai
if enable_ai:
self.prompt = "Make a summary of each sentence for each page or chapter of the document"
else:
self.prompt = None # No prompt needed for content-only extraction
# Track processing results for summary
self.processing_results = []
# Ensure output directory exists
logger.info(f"Creating output directory: {self.output_dir}")
self.output_dir.mkdir(parents=True, exist_ok=True)
# Verify directory was created
if self.output_dir.exists():
logger.info(f"✓ Output directory created/verified: {self.output_dir}")
logger.info(f"Output directory absolute path: {self.output_dir.absolute()}")
else:
logger.error(f"✗ Failed to create output directory: {self.output_dir}")
# Log configuration
logger.info(f"Configuration: AI processing = {'ENABLED' if self.enable_ai else 'DISABLED'}")
logger.info(f"Input directory: {self.input_dir}")
logger.info(f"Output directory: {self.output_dir}")
# Test basic file writing capability
test_file = self.output_dir / "test_write_capability.txt"
try:
logger.info(f"Testing file write capability to: {test_file}")
logger.info(f"Absolute path: {test_file.absolute()}")
with open(test_file, 'w', encoding='utf-8') as f:
f.write("Test file to verify write capability")
if test_file.exists():
actual_size = test_file.stat().st_size
logger.info(f"✓ Basic file writing test passed: {test_file} (size: {actual_size} bytes)")
# Test reading the file back
with open(test_file, 'r', encoding='utf-8') as f:
content = f.read()
logger.info(f"✓ File read test passed: content length = {len(content)}")
# Clean up test file
test_file.unlink()
logger.info("✓ Test file cleaned up")
else:
logger.error(f"✗ Basic file writing test failed: {test_file}")
except Exception as e:
logger.error(f"✗ Basic file writing test failed with error: {e}")
import traceback
traceback.print_exc()
# Supported file extensions for content extraction
self.supported_extensions = {
# Text and data files
'.txt', '.csv', '.json', '.xml', '.html', '.htm', '.svg',
'.md', '.markdown', '.rst', '.log', '.ini', '.cfg', '.conf',
# Programming languages
'.js', '.ts', '.jsx', '.tsx', '.py', '.java', '.c', '.cpp', '.cc', '.cxx',
'.h', '.hpp', '.cs', '.php', '.rb', '.go', '.rs', '.swift', '.kt', '.scala',
'.r', '.m', '.pl', '.sh', '.bash', '.zsh', '.fish', '.ps1', '.bat', '.cmd',
'.vbs', '.lua', '.sql', '.dart', '.elm', '.clj', '.hs', '.fs', '.ml',
# Web technologies
'.css', '.scss', '.sass', '.less', '.vue', '.svelte', '.astro',
# Configuration and build files
'.yaml', '.yml', '.toml', '.env', '.gitignore', '.dockerfile', '.dockerignore',
'.makefile', '.cmake', '.gradle', '.maven', '.pom', '.sln', '.vcxproj',
'.csproj', '.fsproj', '.vbproj', '.xcodeproj', '.pbxproj',
# Documentation and markup
'.tex', '.bib', '.adoc', '.asciidoc', '.wiki', '.creole',
# Images
'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff', '.ico',
# Documents
'.pdf', '.docx', '.xlsx', '.pptx', '.odt', '.ods', '.odp',
# Legacy Office formats
'.doc', '.xls', '.ppt',
# Archives and binaries
'.zip', '.tar', '.gz', '.7z', '.rar', '.exe', '.dll', '.so', '.dylib'
}
def initialize_extractor(self):
"""Initialize the DocumentExtraction instance with a proper ServiceCenter."""
try:
# First create the service center
self.service_center = create_mock_service_center()
if not self.service_center:
logger.error("Failed to create ServiceCenter!")
return False
# Now create DocumentExtraction with the service center
from modules.chat.documents.documentExtraction import DocumentExtraction
self.extractor = DocumentExtraction(self.service_center)
logger.info("✓ DocumentExtraction initialized successfully with ServiceCenter")
return True
except Exception as e:
logger.error(f"✗ Failed to initialize DocumentExtraction: {e}")
return False
def get_files_to_process(self) -> List[Path]:
"""Get list of files to process from input directory."""
if not self.input_dir.exists():
logger.error(f"Input directory {self.input_dir} does not exist!")
logger.info("Creating input directory and adding a test file...")
self.input_dir.mkdir(parents=True, exist_ok=True)
# Create a test file if none exist
test_file = self.input_dir / "test.txt"
with open(test_file, 'w') as f:
f.write("This is a test file for document extraction.\nIt contains multiple lines.\nAnd some special characters: äöüß")
logger.info(f"Created test file: {test_file}")
files = []
all_files = list(self.input_dir.iterdir())
logger.info(f"All files in directory: {[f.name for f in all_files]}")
for file_path in all_files:
if file_path.is_file():
logger.debug(f"Checking file: {file_path.name} (extension: {file_path.suffix})")
if file_path.suffix.lower() in self.supported_extensions:
files.append(file_path)
logger.debug(f"Added file: {file_path.name}")
else:
logger.debug(f"Skipped file: {file_path.name} (unsupported extension)")
logger.info(f"Found {len(files)} supported files to process")
if files:
logger.info(f"Files to process: {[f.name for f in files]}")
return files
async def process_single_file(self, file_path: Path) -> bool:
"""
Process a single file and extract its content.
Args:
file_path: Path to the file to process
Returns:
True if successful, False otherwise
"""
if not self.extractor:
logger.error("DocumentExtraction not initialized!")
return False
try:
logger.info(f"Processing file: {file_path.name}")
# Read file data
with open(file_path, 'rb') as f:
file_data = f.read()
logger.debug(f"File size: {len(file_data)} bytes")
# Determine MIME type based on extension
mime_type = self._get_mime_type(file_path.suffix)
logger.debug(f"MIME type: {mime_type}")
# Process the file with or without AI based on configuration
extracted_content = await self.extractor.processFileData(
fileData=file_data,
filename=file_path.name,
mimeType=mime_type,
base64Encoded=False,
prompt=self.prompt,
enableAI=self.enable_ai
)
logger.debug(f"Extracted {len(extracted_content.contents)} content items")
# Debug: Show content details
for i, content_item in enumerate(extracted_content.contents):
logger.debug(f"Content item {i+1}: label='{content_item.label}', has_data={content_item.data is not None}, data_length={len(content_item.data) if content_item.data else 0}")
# Special logging for JavaScript files
if mime_type == "application/javascript":
logger.debug(f"JavaScript file detected: {file_path.name}")
logger.debug(f"Original file size: {len(file_data)} bytes")
for i, content_item in enumerate(extracted_content.contents):
if content_item.data:
content_size = len(content_item.data.encode('utf-8'))
logger.debug(f"JavaScript content item {i+1}: {content_size} bytes")
# Check if content was truncated
if content_size < len(file_data) * 0.9: # If less than 90% of original
logger.warning(f"JavaScript content may be truncated: {content_size} bytes vs {len(file_data)} bytes original")
# Track processing result
result = {
'filename': file_path.name,
'status': 'OK',
'content_items': 0,
'output_files': [],
'total_content_size': 0
}
# Save each content item as a separate file
if extracted_content.contents:
for i, content_item in enumerate(extracted_content.contents):
if content_item.data:
content_size = len(content_item.data.encode('utf-8'))
result['total_content_size'] += content_size
logger.debug(f"Content item {i+1}: {content_item.label}, size: {content_size} bytes")
# Generate filename with new naming convention
if len(extracted_content.contents) == 1:
# Single content item
output_filename = f"{file_path.stem} - {content_item.label} 1.txt"
else:
# Multiple content items - add sequence number
output_filename = f"{file_path.stem} - {content_item.label} {i+1}.txt"
output_file = self.output_dir / output_filename
# Write only the raw extracted content
logger.debug(f"Attempting to write to: {output_file}")
try:
with open(output_file, 'w', encoding='utf-8') as f:
f.write(content_item.data)
# Verify file was created
if output_file.exists():
actual_size = output_file.stat().st_size
logger.info(f"✓ File created successfully: {output_filename} (expected: {content_size} bytes, actual: {actual_size} bytes)")
else:
logger.error(f"✗ File was not created: {output_file}")
result['output_files'].append(output_filename)
result['content_items'] += 1
except Exception as write_error:
logger.error(f"✗ Error writing file {output_filename}: {write_error}")
import traceback
traceback.print_exc()
else:
logger.warning(f"Content item {i+1} has no data, skipping")
else:
logger.warning(f"No content extracted from {file_path.name}")
result['status'] = 'FAIL'
result['error'] = 'No content extracted'
# Add result to tracking list
self.processing_results.append(result)
logger.info(f"Successfully processed {file_path.name} - Total content: {result['total_content_size']} bytes")
return True
except Exception as e:
error_msg = str(e)
logger.error(f"Error processing {file_path.name}: {error_msg}")
# Track failed result
result = {
'filename': file_path.name,
'status': 'FAIL',
'content_items': 0,
'output_files': [],
'error': error_msg,
'total_content_size': 0
}
self.processing_results.append(result)
return False
def _get_mime_type(self, extension: str) -> str:
"""Get MIME type based on file extension."""
mime_types = {
# Text and data files
'.txt': 'text/plain',
'.csv': 'text/csv',
'.json': 'application/json',
'.xml': 'application/xml',
'.html': 'text/html',
'.htm': 'text/html',
'.svg': 'image/svg+xml',
'.md': 'text/markdown',
'.markdown': 'text/markdown',
'.rst': 'text/x-rst',
'.log': 'text/plain',
'.ini': 'text/plain',
'.cfg': 'text/plain',
'.conf': 'text/plain',
# Programming languages
'.js': 'application/javascript',
'.ts': 'application/typescript',
'.jsx': 'text/jsx',
'.tsx': 'text/tsx',
'.py': 'text/x-python',
'.java': 'text/x-java-source',
'.c': 'text/x-c',
'.cpp': 'text/x-c++src',
'.cc': 'text/x-c++src',
'.cxx': 'text/x-c++src',
'.h': 'text/x-c',
'.hpp': 'text/x-c++hdr',
'.cs': 'text/x-csharp',
'.php': 'application/x-httpd-php',
'.rb': 'text/x-ruby',
'.go': 'text/x-go',
'.rs': 'text/x-rust',
'.swift': 'text/x-swift',
'.kt': 'text/x-kotlin',
'.scala': 'text/x-scala',
'.r': 'text/x-r',
'.m': 'text/x-matlab',
'.pl': 'text/x-perl',
'.sh': 'application/x-sh',
'.bash': 'application/x-sh',
'.zsh': 'application/x-sh',
'.fish': 'application/x-sh',
'.ps1': 'application/x-powershell',
'.bat': 'application/x-msdos-program',
'.cmd': 'application/x-msdos-program',
'.vbs': 'text/vbscript',
'.lua': 'text/x-lua',
'.sql': 'application/sql',
'.dart': 'application/dart',
'.elm': 'text/x-elm',
'.clj': 'text/x-clojure',
'.hs': 'text/x-haskell',
'.fs': 'text/x-fsharp',
'.ml': 'text/x-ocaml',
# Web technologies
'.css': 'text/css',
'.scss': 'text/x-scss',
'.sass': 'text/x-sass',
'.less': 'text/x-less',
'.vue': 'text/x-vue',
'.svelte': 'text/x-svelte',
'.astro': 'text/x-astro',
# Configuration and build files
'.yaml': 'application/x-yaml',
'.yml': 'application/x-yaml',
'.toml': 'application/toml',
'.env': 'text/plain',
'.gitignore': 'text/plain',
'.dockerfile': 'text/x-dockerfile',
'.dockerignore': 'text/plain',
'.makefile': 'text/x-makefile',
'.cmake': 'text/x-cmake',
'.gradle': 'text/x-gradle',
'.maven': 'text/x-maven',
'.pom': 'application/xml',
'.sln': 'text/plain',
'.vcxproj': 'application/xml',
'.csproj': 'application/xml',
'.fsproj': 'application/xml',
'.vbproj': 'application/xml',
'.xcodeproj': 'text/plain',
'.pbxproj': 'text/plain',
# Documentation and markup
'.tex': 'application/x-tex',
'.bib': 'text/x-bibtex',
'.adoc': 'text/asciidoc',
'.asciidoc': 'text/asciidoc',
'.wiki': 'text/x-wiki',
'.creole': 'text/x-wiki',
# Images
'.jpg': 'image/jpeg',
'.jpeg': 'image/jpeg',
'.png': 'image/png',
'.gif': 'image/gif',
'.webp': 'image/webp',
'.bmp': 'image/bmp',
'.tiff': 'image/tiff',
'.ico': 'image/x-icon',
# Documents
'.pdf': 'application/pdf',
'.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
'.odt': 'application/vnd.oasis.opendocument.text',
'.ods': 'application/vnd.oasis.opendocument.spreadsheet',
'.odp': 'application/vnd.oasis.opendocument.presentation',
# Legacy Office formats
'.doc': 'application/msword',
'.xls': 'application/vnd.ms-excel',
'.ppt': 'application/vnd.ms-powerpoint',
# Archives and binaries (will be processed as binary)
'.zip': 'application/zip',
'.tar': 'application/x-tar',
'.gz': 'application/gzip',
'.7z': 'application/x-7z-compressed',
'.rar': 'application/vnd.rar',
'.exe': 'application/x-msdownload',
'.dll': 'application/x-msdownload',
'.so': 'application/x-sharedlib',
'.dylib': 'application/x-mach-binary'
}
return mime_types.get(extension.lower(), 'application/octet-stream')
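    # A quick sanity sketch of the lookup behavior (illustrative, not executed here):
    # extensions are lowercased before lookup, and anything unknown falls back to
    # the generic binary type.
    #   self._get_mime_type('.JSON')  -> 'application/json'
    #   self._get_mime_type('.foo')   -> 'application/octet-stream'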
async def run_tests(self) -> None:
"""Run the document extraction tests on all files."""
mode = "WITH AI" if self.enable_ai else "CONTENT ONLY (No AI)"
logger.info(f"Starting document extraction tests - {mode}")
logger.info(f"Input directory: {self.input_dir}")
logger.info(f"Output directory: {self.output_dir}")
if self.enable_ai:
logger.info(f"Processing prompt: {self.prompt}")
else:
logger.info("AI processing: DISABLED - Raw content extraction only")
# Initialize the extractor
if not self.initialize_extractor():
logger.error("Cannot proceed without DocumentExtraction!")
return
# Get files to process
files = self.get_files_to_process()
if not files:
logger.warning("No files found to process!")
return
# Process each file
successful = 0
failed = 0
logger.info(f"Starting to process {len(files)} files...")
for i, file_path in enumerate(files):
logger.info(f"Processing file {i+1}/{len(files)}: {file_path.name}")
try:
if await self.process_single_file(file_path):
successful += 1
logger.info(f"✓ File {i+1} processed successfully")
else:
failed += 1
logger.error(f"✗ File {i+1} processing failed")
except Exception as e:
failed += 1
logger.error(f"✗ Exception processing file {i+1}: {e}")
import traceback
traceback.print_exc()
# Print detailed summary
mode = "WITH AI" if self.enable_ai else "CONTENT ONLY (No AI)"
logger.info("\n" + "=" * 80)
logger.info(f"DETAILED TEST SUMMARY - {mode}")
logger.info("=" * 80)
logger.info(f"Total files processed: {len(files)}")
logger.info(f"Successful: {successful}")
logger.info(f"Failed: {failed}")
logger.info(f"Output directory: {self.output_dir}")
if self.enable_ai:
logger.info("AI processing: ENABLED")
else:
logger.info("AI processing: DISABLED")
logger.info("=" * 80)
# List all processed documents with results
logger.info("\nPROCESSING RESULTS:")
logger.info("-" * 80)
for result in self.processing_results:
            status_icon = "✓" if result['status'] == 'OK' else "✗"
logger.info(f"{status_icon} {result['filename']} - {result['status']}")
if result['status'] == 'OK':
if result['content_items'] == 1:
logger.info(f" └─ Generated: {result['output_files'][0]} ({result['total_content_size']} bytes)")
else:
logger.info(f" └─ Generated {result['content_items']} files ({result['total_content_size']} total bytes):")
for output_file in result['output_files']:
logger.info(f" └─ {output_file}")
else:
error_msg = result.get('error', 'Unknown error')
logger.info(f" └─ Error: {error_msg}")
logger.info("-" * 80)
logger.info("=" * 80)
def parse_arguments():
"""Parse command line arguments."""
parser = argparse.ArgumentParser(description='Document Extraction Test Script')
parser.add_argument('--no-ai', '--content-only', action='store_true',
help='Run in content-only mode without AI processing')
parser.add_argument('--input-dir', type=str, default='d:/temp/test-extraction',
help='Input directory containing files to process (default: d:/temp/test-extraction)')
parser.add_argument('--output-dir', type=str,
help='Output directory for extracted content (auto-generated if not specified)')
parser.add_argument('--verbose', '-v', action='store_true',
help='Enable verbose logging')
return parser.parse_args()
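# Example invocations (a sketch; the script filename is illustrative, the flags
# are the ones defined above):
#   python test_document_extraction.py --no-ai --input-dir d:/temp/test-extraction
#   python test_document_extraction.py --output-dir d:/temp/out --verbose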
async def main():
"""Main function to run the tests."""
# Parse command line arguments
args = parse_arguments()
# Set logging level based on verbosity
if args.verbose:
logging.getLogger().setLevel(logging.DEBUG)
else:
logging.getLogger().setLevel(logging.INFO)
logger.info("DocumentExtraction Test Script")
logger.info("=" * 50)
logger.info(f"Source: {args.input_dir}")
# Determine output directory
if args.output_dir:
output_dir = args.output_dir
else:
if args.no_ai:
output_dir = f"{args.input_dir}/extracted-raw"
else:
output_dir = f"{args.input_dir}/extracted"
logger.info(f"Output: {output_dir}")
logger.info("=" * 50)
# Check dependencies first
if not check_dependencies():
logger.error("Please install missing dependencies before running tests.")
return
# Check module imports
if not check_module_imports():
logger.error("Cannot import required modules. Please check your setup.")
return
# Determine mode based on command line arguments
if args.no_ai:
enable_ai = False
logger.info("Running in CONTENT ONLY mode (no AI processing)")
else:
# Interactive mode: ask user for choice
print("\n" + "=" * 50)
print("SELECT EXTRACTION MODE:")
print("=" * 50)
print("1. With AI processing (default)")
print("2. Content only (no AI processing)")
print("=" * 50)
        try:
            choice = input("Enter your choice (1 or 2, default is 1): ").strip()
            if choice == "2":
                enable_ai = False
                logger.info("Selected: Content only mode (no AI processing)")
            else:
                enable_ai = True
                logger.info("Selected: AI processing mode")
        except (EOFError, KeyboardInterrupt):
            # Default to AI mode if input fails (e.g. non-interactive shell)
            enable_ai = True
            logger.info("Defaulting to AI processing mode")
        # Recompute the default output directory only when the user did not pass
        # --output-dir explicitly, so an explicit choice is never clobbered.
        if not args.output_dir:
            output_dir = f"{args.input_dir}/extracted-raw" if not enable_ai else f"{args.input_dir}/extracted"
# Run tests with selected mode
tester = DocumentExtractionTester(
input_dir=args.input_dir,
output_dir=output_dir,
enable_ai=enable_ai
)
await tester.run_tests()
if __name__ == "__main__":
    # main() itself distinguishes automated (CLI flags) from interactive use,
    # so both entry paths simply run it.
    asyncio.run(main())
# Convenience function for easy content-only extraction
async def extract_documents_content_only(input_folder: str, output_folder: str | None = None):
"""
Convenience function to extract documents without AI processing.
Args:
input_folder: Path to folder containing documents to extract
output_folder: Path to folder where extracted content will be stored (optional)
Example:
# Extract from d:/temp to d:/temp/extracted-raw
asyncio.run(extract_documents_content_only("d:/temp"))
# Extract from custom folders
asyncio.run(extract_documents_content_only("c:/my_docs", "c:/my_docs/extracted"))
"""
if output_folder is None:
output_folder = f"{input_folder}/extracted-raw"
logger.info(f"Running content-only extraction from {input_folder} to {output_folder}")
# Check dependencies and imports
if not check_dependencies():
logger.error("Missing dependencies. Please install required packages.")
return False
if not check_module_imports():
logger.error("Cannot import required modules. Please check your setup.")
return False
# Create tester and run
tester = DocumentExtractionTester(
input_dir=input_folder,
output_dir=output_folder,
enable_ai=False
)
await tester.run_tests()
return True
# Example usage (uncomment to use):
# if __name__ == "__main__":
# # For content-only extraction from d:/temp to d:/temp/extracted-raw
# asyncio.run(extract_documents_content_only("d:/temp"))

test_excel_processing.py Normal file
View file

@@ -0,0 +1,189 @@
#!/usr/bin/env python3
"""
Simple test script for enhanced Excel processing functionality.
This script tests the DocumentExtraction class with Excel files.
"""
import os
import sys
import asyncio
import logging
from pathlib import Path
# Configure logging
logging.basicConfig(
level=logging.DEBUG,
format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Add the gateway directory to the path
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', '..'))
async def test_excel_processing():
"""Test Excel processing functionality."""
try:
# Import required modules
from modules.chat.documents.documentExtraction import DocumentExtraction
from modules.chat.serviceCenter import ServiceCenter
from modules.interfaces.interfaceAppModel import User, UserPrivilege, AuthAuthority
from modules.interfaces.interfaceChatModel import ChatWorkflow
from datetime import datetime, UTC
logger.info("Testing Excel processing functionality...")
# Create mock service center
mock_user = User(
id="test_user_001",
username="testuser",
email="test@example.com",
fullName="Test User",
language="en",
enabled=True,
privilege=UserPrivilege.USER,
authenticationAuthority=AuthAuthority.LOCAL,
mandateId="test_mandate_001"
)
current_time = datetime.now(UTC).isoformat()
mock_workflow = ChatWorkflow(
id="test_workflow_001",
mandateId="test_mandate_001",
status="active",
name="Test Excel Processing Workflow",
currentRound=1,
lastActivity=current_time,
startedAt=current_time,
logs=[],
messages=[],
stats=None,
tasks=[]
)
service_center = ServiceCenter(mock_user, mock_workflow)
logger.info("✓ ServiceCenter created successfully")
# Create DocumentExtraction instance
extractor = DocumentExtraction(service_center)
logger.info("✓ DocumentExtraction created successfully")
# Test with a sample Excel file if available
test_file_path = "d:/temp/test-extraction/test.xlsx"
if os.path.exists(test_file_path):
logger.info(f"Found test file: {test_file_path}")
# Read the file
with open(test_file_path, 'rb') as f:
file_data = f.read()
logger.info(f"File size: {len(file_data)} bytes")
# Process the Excel file
logger.info("Processing Excel file...")
result = await extractor.processFileData(
fileData=file_data,
filename="test.xlsx",
mimeType="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
base64Encoded=False,
prompt=None,
enableAI=False
)
logger.info(f"✓ Excel processing completed successfully!")
logger.info(f"Generated {len(result.contents)} content items:")
for i, content_item in enumerate(result.contents):
logger.info(f" Item {i+1}: {content_item.label}")
logger.info(f" MIME type: {content_item.metadata.mimeType}")
logger.info(f" Size: {content_item.metadata.size} bytes")
if content_item.data:
logger.info(f" Data preview: {content_item.data[:100]}...")
else:
logger.info(f" Data: None")
else:
logger.info("No test Excel file found. Creating a simple test...")
# Test the openpyxl library directly
try:
import openpyxl
from openpyxl import Workbook
# Create a test workbook
wb = Workbook()
ws = wb.active
ws.title = "Test Sheet"
# Add some test data
ws['A1'] = "Name"
ws['B1'] = "Age"
ws['C1'] = "City"
ws['A2'] = "John Doe"
ws['B2'] = 30
ws['C2'] = "New York"
ws['A3'] = "Jane Smith"
ws['B3'] = 25
ws['C3'] = "Los Angeles"
# Test properties
wb.properties.title = "Test Workbook"
wb.properties.creator = "Test User"
wb.properties.subject = "Test Subject"
logger.info("✓ Test workbook created successfully")
logger.info(f" Title: {wb.properties.title}")
logger.info(f" Creator: {wb.properties.creator}")
logger.info(f" Subject: {wb.properties.subject}")
logger.info(f" Sheets: {wb.sheetnames}")
# Test the DocumentExtraction with this workbook
from io import BytesIO
# Save to bytes
buffer = BytesIO()
wb.save(buffer)
buffer.seek(0)
file_data = buffer.getvalue()
logger.info(f"Test workbook size: {len(file_data)} bytes")
# Process with DocumentExtraction
result = await extractor.processFileData(
fileData=file_data,
filename="test_workbook.xlsx",
mimeType="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
base64Encoded=False,
prompt=None,
enableAI=False
)
logger.info(f"✓ Test workbook processing completed successfully!")
logger.info(f"Generated {len(result.contents)} content items:")
for i, content_item in enumerate(result.contents):
logger.info(f" Item {i+1}: {content_item.label}")
logger.info(f" MIME type: {content_item.metadata.mimeType}")
logger.info(f" Size: {content_item.metadata.size} bytes")
if content_item.data:
logger.info(f" Data preview: {content_item.data[:200]}...")
else:
logger.info(f" Data: None")
except ImportError as e:
logger.error(f"openpyxl not available: {e}")
except Exception as e:
logger.error(f"Error testing Excel functionality: {e}")
logger.info("Excel processing test completed!")
except ImportError as e:
logger.error(f"Failed to import required modules: {e}")
logger.error("Make sure you're running this script from the gateway directory")
except Exception as e:
logger.error(f"Unexpected error: {e}")
import traceback
traceback.print_exc()
if __name__ == "__main__":
asyncio.run(test_excel_processing())
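# Example run (a sketch; assumes execution from the gateway test directory so
# the relative sys.path entry above resolves):
#   python test_excel_processing.py
# If no d:/temp/test-extraction/test.xlsx is present, the script builds a small
# workbook in memory via openpyxl and processes that instead.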