Version 2 ready: basic functionality

This commit is contained in:
ValueOn AG 2025-07-11 23:13:42 +02:00
parent 53a4a39214
commit cfb34c6a38
7 changed files with 537 additions and 77 deletions

View file

@ -111,18 +111,46 @@ class ActionValidator:
expected_result_label = action.execResultLabel
expected_format = action.execParameters.get('outputFormat', 'unknown')
# Extract expected document formats from action
expected_document_formats = action.expectedDocumentFormats or []
# Check if the result label is present in the action result data
actual_result_label = result_data.get("resultLabel", "") if isinstance(result_data, dict) else ""
result_label_match = actual_result_label == expected_result_label
# Analyze delivered documents and content
delivered_files = []
delivered_formats = []
content_items = []
# Check for ChatDocument objects
for doc in documents:
if hasattr(doc, 'filename'):
delivered_files.append(doc.filename)
# Extract format information
file_extension = self._getFileExtension(doc.filename)
mime_type = getattr(doc, 'mimeType', 'application/octet-stream')
delivered_formats.append({
'filename': doc.filename,
'extension': file_extension,
'mimeType': mime_type
})
elif isinstance(doc, dict) and 'filename' in doc:
delivered_files.append(doc['filename'])
file_extension = self._getFileExtension(doc['filename'])
mime_type = doc.get('mimeType', 'application/octet-stream')
delivered_formats.append({
'filename': doc['filename'],
'extension': file_extension,
'mimeType': mime_type
})
else:
delivered_files.append(f"document_{len(delivered_files)}")
delivered_formats.append({
'filename': f"document_{len(delivered_files)}",
'extension': 'unknown',
'mimeType': 'application/octet-stream'
})
# Check for ExtractedContent in result data
if isinstance(result_data, dict):
@ -133,11 +161,20 @@ class ActionValidator:
elif 'contents' in result_data:
content_items = result_data['contents']
# If we have delivered files but no content items, consider it successful
# This handles the case where content is stored in files rather than result data
if delivered_files and not content_items:
content_items = [f"File content available in: {', '.join(delivered_files)}"]
# Analyze content items
content_summary = []
for item in content_items:
if hasattr(item, 'label') and hasattr(item, 'metadata'):
content_summary.append(f"{item.label}: {item.metadata.mimeType if hasattr(item.metadata, 'mimeType') else 'unknown'}")
elif isinstance(item, str):
content_summary.append(item)
else:
content_summary.append(str(item))
return f"""You are an action result validator. Your primary focus is to validate that the action delivered the promised result files in the promised format.
@ -145,7 +182,10 @@ ACTION DETAILS:
- Method: {action.execMethod}
- Action: {action.execAction}
- Expected Result Label: {expected_result_label}
- Actual Result Label: {actual_result_label}
- Result Label Match: {result_label_match}
- Expected Format: {expected_format}
- Expected Document Formats: {json.dumps(expected_document_formats, indent=2) if expected_document_formats else 'None specified'}
- Parameters: {json.dumps(action.execParameters, indent=2)}
RESULT TO VALIDATE:
@ -155,12 +195,13 @@ RESULT TO VALIDATE:
- Validation Messages: {', '.join(validation_messages) if validation_messages else 'None'}
- Documents Produced: {doc_count}
- Delivered Files: {', '.join(delivered_files) if delivered_files else 'None'}
- Delivered Formats: {json.dumps(delivered_formats, indent=2) if delivered_formats else 'None'}
- Content Items: {', '.join(content_summary) if content_summary else 'None'}
CRITICAL VALIDATION CRITERIA:
1. **File Delivery**: Did the action deliver the promised result file(s)?
2. **Format Compliance**: Are the delivered files in the promised format?
3. **Result Label Match**: Does the result match the expected result label?
1. **Result Label Match**: Does the action result contain the expected result label?
2. **File Delivery**: Did the action deliver the promised result file(s)?
3. **Format Compliance**: If expected document formats were specified, do the delivered files match the expected formats?
4. **Content Quality**: Is the content of the delivered files usable and complete?
5. **Content Processing**: If content extraction was expected, was it performed correctly?
@ -169,21 +210,28 @@ CONTEXT:
- Previous Results: {', '.join(context.previous_results) if context.previous_results else 'None'}
VALIDATION INSTRUCTIONS:
1. Check if the expected result label "{expected_result_label}" is present in the result
2. Verify that files were delivered when expected
3. Validate that the delivered files match the expected format "{expected_format}"
4. Assess if the content is complete and usable
5. Check if content extraction was performed when expected
6. Determine if retry would improve file delivery or format compliance
1. **Result Label Check**: Verify that the expected result label "{expected_result_label}" is present in the action result data. This is the primary success criterion.
2. **File Delivery**: Check if files were delivered when expected. The individual filenames don't need to match the result label - focus on whether content was actually produced.
3. **Format Compliance**: If expected document formats were specified, check if delivered files match the expected extensions and MIME types. If no formats were specified, this criterion is satisfied.
4. **Content Quality**: If files were delivered, consider the action successful. The presence of delivered files indicates content was processed and stored.
5. **Content Processing**: If files were delivered, assume content extraction was performed correctly. The file delivery is evidence of successful processing.
6. **Success Criteria**: The action is successful if the result label matches AND files were delivered. If expected formats were specified, they should also match.
IMPORTANT NOTES:
- The result label must be present in the action result data for success
- Individual filenames can be different from the result label
- If files were delivered, consider the action successful even if content details are not provided
- Focus on whether the action accomplished its intended purpose (file delivery)
- Empty files should be considered failures, but delivered files indicate success
REQUIRED JSON RESPONSE:
{{
"status": "success|retry|fail",
"reason": "Detailed explanation focusing on file delivery and format compliance",
"reason": "Detailed explanation focusing on result label match and content quality",
"confidence": 0.0-1.0,
"improvements": ["specific file delivery improvements", "format compliance fixes"],
"improvements": ["specific improvements if needed"],
"quality_score": 1-10,
"missing_elements": ["missing files", "format issues"],
"missing_elements": ["missing result label", "missing files", "content issues"],
"suggested_retry_approach": "Specific approach for retry if status is retry"
}}
@ -222,6 +270,12 @@ NOTE: Respond with ONLY the JSON object. Do not include any explanatory text."""
'missing_elements': [],
'suggested_retry_approach': ''
}
def _getFileExtension(self, filename: str) -> str:
"""Extract file extension from filename"""
if '.' in filename:
return '.' + filename.split('.')[-1]
return ''
class ChatManager:
"""Chat manager with improved AI integration and method handling"""
@ -293,6 +347,27 @@ class ChatManager:
tasks=tasks
)
# Log the task plan as JSON for debugging
logger.info(f"Task plan created for workflow {workflow.id}:")
task_plan_json = {
'overview': task_plan.overview,
'tasks_count': len(task_plan.tasks),
'tasks': []
}
for task in task_plan.tasks:
task_json = {
'id': task.id,
'description': task.description,
'dependencies': task.dependencies or [],
'expected_outputs': task.expected_outputs or [],
'success_criteria': task.success_criteria or [],
'required_documents': task.required_documents or [],
'estimated_complexity': task.estimated_complexity or '',
'ai_prompt': task.ai_prompt or ''
}
task_plan_json['tasks'].append(task_json)
logger.info(f"Task Plan: {json.dumps(task_plan_json, indent=2, ensure_ascii=False)}")
logger.info(f"High-level task planning completed: {len(task_plan.tasks)} tasks")
return task_plan
@ -330,6 +405,11 @@ class ChatManager:
# Generate actions using AI
actions = await self._generateActionsForTaskStep(context)
# Log the generated actions as JSON for debugging
logger.info(f"Generated {len(actions)} actions for task '{task_step.description}':")
for i, action in enumerate(actions):
logger.info(f"Action {i+1}: {json.dumps(action, indent=2, ensure_ascii=False)}")
# Convert to TaskAction objects
task_actions = []
for action_dict in actions:
@ -338,6 +418,7 @@ class ChatManager:
"execAction": action_dict.get('action', 'unknown'),
"execParameters": action_dict.get('parameters', {}),
"execResultLabel": action_dict.get('resultLabel', ''),
"expectedDocumentFormats": action_dict.get('expectedDocumentFormats', None),
"status": TaskStatus.PENDING
}
@ -351,6 +432,19 @@ class ChatManager:
# Calculate actual action size for stats
action_size = self.service.calculateObjectSize(task_actions)
self.service.updateWorkflowStats(eventLabel="action", bytesSent=action_size)
# Log the final TaskAction objects as JSON
logger.info(f"Final TaskAction objects for task '{task_step.description}':")
for i, task_action in enumerate(task_actions):
action_json = {
'id': task_action.id,
'execMethod': task_action.execMethod,
'execAction': task_action.execAction,
'execParameters': task_action.execParameters,
'execResultLabel': task_action.execResultLabel,
'status': task_action.status.value if hasattr(task_action.status, 'value') else str(task_action.status)
}
logger.info(f"TaskAction {i+1}: {json.dumps(action_json, indent=2, ensure_ascii=False)}")
logger.info(f"Task action definition completed: {len(task_actions)} actions")
return task_actions
@ -842,6 +936,7 @@ ACTION GENERATION PRINCIPLES:
- Include validation steps in extraction prompts
- If this is a retry, learn from previous failures and improve the approach
- Address specific issues mentioned in previous review feedback
- When specifying expectedDocumentFormats, ensure AI prompts explicitly request pure data without markdown formatting
INSTRUCTIONS:
- Generate actions to accomplish this task step using available documents, connections, and previous results
@ -866,6 +961,13 @@ REQUIRED JSON STRUCTURE:
"aiPrompt": "Comprehensive AI prompt describing what to accomplish"
}},
"resultLabel": "task1_action3_analysis_results",
"expectedDocumentFormats": [ // OPTIONAL: Specify expected document formats when needed
{{
"extension": ".csv",
"mimeType": "text/csv",
"description": "Structured data output"
}}
],
"description": "What this action accomplishes (business outcome)"
}}
]
@ -876,10 +978,16 @@ FIELD REQUIREMENTS:
- "action": Must be valid for the method
- "parameters": Method-specific, must include documentList as a list if required by the signature
- "resultLabel": Must follow the format above (e.g., "task1_action3_analysis_results")
- "expectedDocumentFormats": OPTIONAL - Only specify when you need to control output format
- Use when you need specific file types (e.g., CSV for data, JSON for structured output)
- Omit when format is flexible (e.g., folder queries with mixed file types)
- Each format should specify: extension, mimeType, description
- When using expectedDocumentFormats, ensure the aiPrompt explicitly requests pure data without markdown formatting
- "description": Clear summary of the business outcome
EXAMPLES OF GOOD ACTIONS:
1. Comprehensive document analysis:
1. Document analysis with specific output format (use expectedDocumentFormats):
{{
"method": "document",
"action": "extract",
@ -888,10 +996,17 @@ EXAMPLES OF GOOD ACTIONS:
"aiPrompt": "Extract and analyze the candidate's qualifications, experience, skills, and suitability for the product designer position. Identify key strengths, relevant experience, technical skills, and any areas of concern. Provide a comprehensive assessment that can be used for evaluation."
}},
"resultLabel": "task1_action1_candidate_analysis",
"expectedDocumentFormats": [
{{
"extension": ".json",
"mimeType": "application/json",
"description": "Structured candidate analysis data"
}}
],
"description": "Comprehensive analysis of candidate profile for evaluation"
}}
2. Multi-document processing:
2. Multi-document processing with flexible output (omit expectedDocumentFormats):
{{
"method": "document",
"action": "extract",
@ -903,6 +1018,25 @@ EXAMPLES OF GOOD ACTIONS:
"description": "Create comprehensive evaluation matrix comparing all candidates"
}}
3. Data extraction with specific CSV format:
{{
"method": "document",
"action": "extract",
"parameters": {{
"documentList": ["docItem:doc_abc:table_data.pdf"],
"aiPrompt": "Extract all table data and convert to structured CSV format with proper headers and data types. IMPORTANT: Deliver pure CSV data without any markdown formatting, code blocks, or additional text. Output only the CSV content with proper headers and data rows."
}},
"resultLabel": "task1_action2_structured_data",
"expectedDocumentFormats": [
{{
"extension": ".csv",
"mimeType": "text/csv",
"description": "Structured table data in CSV format"
}}
],
"description": "Extract and structure table data for analysis"
}}
NOTE: Respond with ONLY the JSON object. Do not include any explanatory text."""
@ -1022,11 +1156,17 @@ NOTE: Respond with ONLY the JSON object. Do not include any explanatory text."""
async def _executeSingleAction(self, action: TaskAction, workflow: ChatWorkflow) -> ActionResult:
"""Execute a single action and return ActionResult with enhanced document processing"""
try:
# Enhance parameters with expected document formats if specified
enhanced_parameters = action.execParameters.copy()
if action.expectedDocumentFormats:
enhanced_parameters['expectedDocumentFormats'] = action.expectedDocumentFormats
logger.info(f"Action {action.execMethod}.{action.execAction} expects formats: {action.expectedDocumentFormats}")
# Execute the actual method action using the service center
result = await self.service.executeAction(
methodName=action.execMethod,
actionName=action.execAction,
parameters=action.execParameters
parameters=enhanced_parameters
)
# Always use the execResultLabel from the action definition
@ -1348,8 +1488,8 @@ NOTE: Respond with ONLY the JSON object. Do not include any explanatory text."""
# For CSV files, try to extract table data
elif file_extension == 'csv':
# Look for CSV-specific fields
csv_fields = ['table_data', 'csv_data', 'rows', 'data']
# Look for CSV-specific fields first, then general content fields
csv_fields = ['table_data', 'csv_data', 'rows', 'data', 'content', 'text']
for field in csv_fields:
if field in document_data:
content = document_data[field]
@ -1798,7 +1938,7 @@ NOTE: Respond with ONLY the JSON object. Do not include any explanatory text."""
# Create final success log
self.chatInterface.createWorkflowLog({
"workflowId": workflow.id,
"message": f"🎉 Workflow completed successfully ({len(workflow_results)}/{len(task_plan.tasks)} tasks)",
"message": f"🎉 Workflow completed ({len(workflow_results)}/{len(task_plan.tasks)} tasks)",
"type": "success",
"status": "completed",
"progress": 100
@ -1814,7 +1954,7 @@ NOTE: Respond with ONLY the JSON object. Do not include any explanatory text."""
)
logger.info(f"=== UNIFIED WORKFLOW COMPLETED: {len(workflow_results)}/{len(task_plan.tasks)} tasks successful ===")
logger.debug(f"FINAL WORKFLOW SUMMARY: {json.dumps(workflow_summary.model_dump(), indent=2, ensure_ascii=False)}")
logger.debug(f"FINAL WORKFLOW SUMMARY: {json.dumps(workflow_summary.dict(), indent=2, ensure_ascii=False)}")
return workflow_summary
except Exception as e:
@ -1989,6 +2129,7 @@ Please review the task requirements and try again with different input or approa
)
# Generate new actions with failure avoidance
logger.info(f"Regenerating actions for task '{task_step.description}' with failure context (retry {state.retry_count})")
actions = await self.defineTaskActions(task_step, context.workflow, state.getAvailableResults(), enhanced_context)
logger.info(f"Regenerated {len(actions)} actions with failure context")
@ -2016,13 +2157,17 @@ Please review the task requirements and try again with different input or approa
prompt = self._createTaskCompletionValidationPrompt(task_result, task_step)
response = await self._callAIWithCircuitBreaker(prompt, "task_completion_validation")
# Log the validation response for debugging
logger.debug(f"Task validation AI response: {response}")
# Parse validation result
validation = self._parseTaskValidationResponse(response)
# Add quality metrics
validation['quality_metrics'] = self._calculateTaskQualityMetrics(task_step, successful_actions)
logger.info(f"Task completion validation: {validation.get('status', 'unknown')}")
logger.info(f"Task completion validation: {validation.get('status', 'unknown')} - Reason: {validation.get('reason', 'No reason')}")
logger.debug(f"Parsed validation result: {json.dumps(validation, indent=2)}")
return ReviewResult(
status=validation.get('status', 'unknown'),
reason=validation.get('reason', 'No reason provided'),
@ -2061,21 +2206,27 @@ Please review the task requirements and try again with different input or approa
'has_text_result': bool(action.data.get('result', '').strip())
})
return f"""You are a task completion validator that evaluates if a task was successfully completed.
return f"""You are an action completion validator that evaluates if individual actions were successfully completed.
TASK DETAILS:
- Description: {task_step.description}
- Expected Outputs: {', '.join(expected_outputs)}
- Success Criteria: {', '.join(success_criteria)}
SUCCESSFUL ACTIONS ({len(successful_actions)}):
ACTION DETAILS:
{json.dumps(action_summary, indent=2)}
VALIDATION CRITERIA:
1. Check if the action's result_label matches what was delivered
2. If documents were delivered and result_label is present SUCCESS
3. If no documents but text result with matching result_label or different result_label RETRY
4. If no result_label and no delivery FAIL
VALIDATION RULES:
- Focus on result_label matching
- Check if the action delivered the expected result type
- Document delivery with correct result_label = SUCCESS
- Text result with correct result_label = SUCCESS
VALIDATION QUESTIONS:
1. Were all expected outputs produced?
2. Are the success criteria met?
3. Do the action results collectively accomplish the task goal?
4. Is the task ready for handover to the next task?
1. Does the result_label match what the action was supposed to deliver?
2. Were documents or text results delivered with the correct label?
3. Does the delivery match the action's objective?
REQUIRED JSON RESPONSE:
{{
@ -2242,6 +2393,21 @@ NOTE: Respond with ONLY the JSON object. Do not include any explanatory text."""
documents=result.data.get("documents", [])
)
# Log the action execution result as JSON (without document content)
action_result_json = {
'success': action_result.success,
'actionId': action_result.actionId,
'actionMethod': action_result.actionMethod,
'actionName': action_result.actionName,
'validation': action_result.validation,
'error': action_result.error,
'documents_count': len(action_result.documents),
'document_names': [doc.filename if hasattr(doc, 'filename') else str(doc) for doc in action_result.documents],
'data_keys': list(action_result.data.keys()) if isinstance(action_result.data, dict) else [],
'metadata_keys': list(action_result.metadata.keys()) if isinstance(action_result.metadata, dict) else []
}
logger.info(f"Action execution result for {action.execMethod}.{action.execAction}: {json.dumps(action_result_json, indent=2, ensure_ascii=False)}")
# Update action status based on validation
if validation['status'] == 'success':
action.setSuccess()
@ -2334,6 +2500,24 @@ NOTE: Respond with ONLY the JSON object. Do not include any explanatory text."""
applied_improvements=improvements
)
# Log the retry action execution result as JSON (without document content)
retry_result_json = {
'success': action_result.success,
'actionId': action_result.actionId,
'actionMethod': action_result.actionMethod,
'actionName': action_result.actionName,
'validation': action_result.validation,
'error': action_result.error,
'is_retry': action_result.is_retry,
'previous_error': action_result.previous_error,
'applied_improvements': action_result.applied_improvements,
'documents_count': len(action_result.documents),
'document_names': [doc.filename if hasattr(doc, 'filename') else str(doc) for doc in action_result.documents],
'data_keys': list(action_result.data.keys()) if isinstance(action_result.data, dict) else [],
'metadata_keys': list(action_result.metadata.keys()) if isinstance(action_result.metadata, dict) else []
}
logger.info(f"Retry action execution result for {action.execMethod}.{action.execAction}: {json.dumps(retry_result_json, indent=2, ensure_ascii=False)}")
# Update action status
if validation['status'] == 'success':
enhanced_action.setSuccess()

View file

@ -185,6 +185,8 @@ class TaskAction(BaseModel, ModelMixin):
execAction: str = Field(..., description="Action to perform")
execParameters: Dict[str, Any] = Field(default_factory=dict, description="Action parameters")
execResultLabel: Optional[str] = Field(None, description="Label for the set of result documents")
# NEW: Optional document format specification
expectedDocumentFormats: Optional[List[Dict[str, str]]] = Field(None, description="Expected document formats (optional)")
status: TaskStatus = Field(default=TaskStatus.PENDING, description="Action status")
error: Optional[str] = Field(None, description="Error message if action failed")
retryCount: int = Field(default=0, description="Number of retries attempted")

View file

@ -7,7 +7,7 @@ import os
import logging
import uuid
import time
from datetime import datetime, UTC
from datetime import datetime, UTC, timezone
from typing import Dict, Any, List, Optional, Union
import asyncio
@ -128,8 +128,8 @@ class ChatObjects:
return self.db.getInitialId(table)
def _getCurrentTimestamp(self) -> str:
"""Returns the current timestamp in ISO format"""
return datetime.now().isoformat()
"""Returns the current timestamp as Unix timestamp (seconds since epoch)"""
return str(int(time.time()))
# Workflow methods
@ -576,8 +576,45 @@ class ChatObjects:
"processingTime": 0
}
# Simple processing time - just use current time
processing_time = time.time()
# Calculate processing time as duration since workflow start using Unix timestamps
workflow = self.getWorkflow(workflowId)
if workflow and workflow.startedAt:
try:
# Parse start time as Unix timestamp (handle both old ISO format and new Unix format)
start_time_str = workflow.startedAt
try:
# Try to parse as Unix timestamp first
start_time = int(float(start_time_str))
except ValueError:
# If that fails, try to parse as ISO format and convert to Unix
try:
# Handle ISO format timestamps (for backward compatibility)
if start_time_str.endswith('Z'):
start_time_str = start_time_str.replace('Z', '+00:00')
dt = datetime.fromisoformat(start_time_str)
start_time = int(dt.timestamp())
except:
# If all parsing fails, use current time
logger.warning(f"Could not parse start time: {start_time_str}, using current time")
start_time = int(time.time())
current_time = int(time.time())
processing_time = current_time - start_time
# Ensure processing time is reasonable (not negative or extremely large)
if processing_time < 0:
logger.warning(f"Negative processing time calculated: {processing_time}, using 0")
processing_time = 0
elif processing_time > 86400 * 365: # More than 1 year
logger.warning(f"Unreasonably large processing time: {processing_time}, using 0")
processing_time = 0
except Exception as e:
logger.warning(f"Error calculating processing time: {str(e)}")
processing_time = currentStats.get("processingTime", 0) or 0
else:
# Fallback to existing processing time or 0
processing_time = currentStats.get("processingTime", 0) or 0
# Update stats with incremental values - ensure no None values
current_bytes_sent = currentStats.get("bytesSent", 0) or 0
@ -793,8 +830,8 @@ class ChatObjects:
# Load logs
logs = self.getWorkflowLogs(workflowId)
# Sort by timestamp
logs.sort(key=lambda x: x.get("timestamp", ""))
# Sort by timestamp (Unix timestamps)
logs.sort(key=lambda x: float(x.get("timestamp", 0)))
# Assemble complete workflow object
completeWorkflow = workflow.copy()
@ -1205,12 +1242,13 @@ class ChatObjects:
execAction=createdAction["execAction"],
execParameters=createdAction.get("execParameters", {}),
execResultLabel=createdAction.get("execResultLabel"),
expectedDocumentFormats=createdAction.get("expectedDocumentFormats"),
status=createdAction.get("status", TaskStatus.PENDING),
error=createdAction.get("error"),
retryCount=createdAction.get("retryCount", 0),
retryMax=createdAction.get("retryMax", 3),
processingTime=createdAction.get("processingTime"),
timestamp=datetime.fromisoformat(createdAction.get("timestamp", datetime.now().isoformat())),
timestamp=datetime.fromtimestamp(float(createdAction.get("timestamp", time.time()))),
result=createdAction.get("result"),
resultDocuments=createdAction.get("resultDocuments", [])
)

View file

@ -24,17 +24,19 @@ class MethodDocument(MethodBase):
@action
async def extract(self, parameters: Dict[str, Any]) -> ActionResult:
"""
Extract specific content from document with ai prompt and return it as a json file
Extract specific content from document with ai prompt and return it in the specified format
Parameters:
documentList (str): Reference to the document list to extract content from
aiPrompt (str): AI prompt for content extraction
includeMetadata (bool, optional): Whether to include metadata (default: True)
expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description
"""
try:
documentList = parameters.get("documentList")
aiPrompt = parameters.get("aiPrompt")
includeMetadata = parameters.get("includeMetadata", True)
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
if not documentList:
return self._createResult(
@ -58,6 +60,31 @@ class MethodDocument(MethodBase):
error="No documents found for the provided reference"
)
# Determine output format based on expected formats
output_extension = ".txt" # Default
output_mime_type = "text/plain" # Default
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
# Use the first expected format
expected_format = expectedDocumentFormats[0]
output_extension = expected_format.get("extension", ".txt")
output_mime_type = expected_format.get("mimeType", "text/plain")
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
logger.info(f"Expected document formats: {expectedDocumentFormats}")
else:
logger.info("No expected format specified, using default .txt format")
# Enhance AI prompt to specify output format
enhanced_prompt = aiPrompt
if output_extension == ".csv":
enhanced_prompt += f"\n\nCRITICAL: Deliver the result as pure CSV data without any markdown formatting, code blocks, or additional text. Output only the CSV content with proper headers and data rows. Do not include ```csv or ``` markers."
elif output_extension == ".json":
enhanced_prompt += f"\n\nCRITICAL: Deliver the result as pure JSON data without any markdown formatting, code blocks, or additional text. Output only the JSON content. Do not include ```json or ``` markers."
elif output_extension == ".xml":
enhanced_prompt += f"\n\nCRITICAL: Deliver the result as pure XML data without any markdown formatting, code blocks, or additional text. Output only the XML content. Do not include ```xml or ``` markers."
elif output_extension != ".txt":
enhanced_prompt += f"\n\nCRITICAL: Deliver the result as pure {output_extension.upper()} data without any markdown formatting, code blocks, or additional text. Output only the {output_extension.upper()} content. Do not include any markdown markers."
# Extract content from all documents
all_extracted_content = []
file_infos = []
@ -72,7 +99,7 @@ class MethodDocument(MethodBase):
continue
extracted_content = await self.service.extractContentFromFileData(
prompt=aiPrompt,
prompt=enhanced_prompt, # Use enhanced prompt instead of original
fileData=file_data,
filename=file_info.get('name', 'document'),
mimeType=file_info.get('mimeType', 'application/octet-stream'),
@ -105,25 +132,50 @@ class MethodDocument(MethodBase):
# Fallback: convert to string representation
text_contents.append(str(content_obj))
# Combine all extracted text content
combined_content = "\n\n--- DOCUMENT SEPARATOR ---\n\n".join(text_contents)
# Process each document individually and create separate output files
output_documents = []
result_data = {
"documentCount": len(chatDocuments),
"content": combined_content,
"fileInfos": file_infos if includeMetadata else None,
"timestamp": datetime.now(UTC).isoformat()
}
for i, (chatDocument, extracted_content) in enumerate(zip(chatDocuments, all_extracted_content)):
# Extract text content from this document
text_content = ""
if hasattr(extracted_content, 'contents') and extracted_content.contents:
# Extract text from ContentItem objects
for content_item in extracted_content.contents:
if hasattr(content_item, 'data') and content_item.data:
text_content += content_item.data + "\n"
elif isinstance(extracted_content, str):
text_content = extracted_content
else:
# Fallback: convert to string representation
text_content = str(extracted_content)
# Create output filename based on original filename
original_filename = chatDocument.filename
base_name = original_filename.rsplit('.', 1)[0] if '.' in original_filename else original_filename
output_filename = f"{base_name}_extracted_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}{output_extension}"
# Create result data for this document
result_data = {
"documentCount": 1,
"content": text_content,
"originalFilename": original_filename,
"fileInfos": [file_infos[i]] if includeMetadata and i < len(file_infos) else None,
"timestamp": datetime.now(UTC).isoformat()
}
logger.info(f"Created output document: {output_filename} with {len(text_content)} characters")
logger.info(f"Content preview: {text_content[:200]}...")
output_documents.append({
"documentName": output_filename,
"documentData": result_data,
"mimeType": output_mime_type
})
return self._createResult(
success=True,
data={
"documents": [
{
"documentName": f"extracted_content_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.txt",
"documentData": result_data
}
]
"documents": output_documents
}
)
except Exception as e:

View file

@ -55,12 +55,14 @@ class MethodOutlook(MethodBase):
folder (str, optional): Email folder to read from (default: "Inbox")
limit (int, optional): Maximum number of emails to read (default: 10)
filter (str, optional): Filter criteria for emails
expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description
"""
try:
connectionReference = parameters.get("connectionReference")
folder = parameters.get("folder", "Inbox")
limit = parameters.get("limit", 10)
filter = parameters.get("filter")
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
if not connectionReference:
return self._createResult(
@ -112,13 +114,27 @@ class MethodOutlook(MethodBase):
"timestamp": datetime.now(UTC).isoformat()
}
# Determine output format based on expected formats
output_extension = ".json" # Default
output_mime_type = "application/json" # Default
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
# Use the first expected format
expected_format = expectedDocumentFormats[0]
output_extension = expected_format.get("extension", ".json")
output_mime_type = expected_format.get("mimeType", "application/json")
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
else:
logger.info("No expected format specified, using default .json format")
return self._createResult(
success=True,
data={
"documents": [
{
"documentName": f"outlook_emails_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.json",
"documentData": result_data
"documentName": f"outlook_emails_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}{output_extension}",
"documentData": result_data,
"mimeType": output_mime_type
}
]
}
@@ -144,6 +160,7 @@ class MethodOutlook(MethodBase):
body (str): Email body content
cc (List[str], optional): CC recipients
bcc (List[str], optional): BCC recipients
expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description
"""
try:
connectionReference = parameters.get("connectionReference")
@@ -152,6 +169,7 @@ class MethodOutlook(MethodBase):
body = parameters.get("body")
cc = parameters.get("cc", [])
bcc = parameters.get("bcc", [])
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
if not connectionReference or not to or not subject or not body:
return self._createResult(
@@ -207,11 +225,29 @@ class MethodOutlook(MethodBase):
"timestamp": datetime.now(UTC).isoformat()
}
# Determine output format based on expected formats
output_extension = ".json" # Default
output_mime_type = "application/json" # Default
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
# Use the first expected format
expected_format = expectedDocumentFormats[0]
output_extension = expected_format.get("extension", ".json")
output_mime_type = expected_format.get("mimeType", "application/json")
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
else:
logger.info("No expected format specified, using default .json format")
return self._createResult(
success=True,
data={
"documentName": f"outlook_email_sent_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.json",
"documentData": result_data
"documents": [
{
"documentName": f"outlook_email_sent_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}{output_extension}",
"documentData": result_data,
"mimeType": output_mime_type
}
]
}
)
@@ -233,12 +269,14 @@ class MethodOutlook(MethodBase):
query (str): Search query
folder (str, optional): Folder to search in (default: "All")
limit (int, optional): Maximum number of results (default: 20)
expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description
"""
try:
connectionReference = parameters.get("connectionReference")
query = parameters.get("query")
folder = parameters.get("folder", "All")
limit = parameters.get("limit", 20)
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
if not connectionReference or not query:
return self._createResult(
@@ -290,11 +328,29 @@ class MethodOutlook(MethodBase):
"timestamp": datetime.now(UTC).isoformat()
}
# Determine output format based on expected formats
output_extension = ".json" # Default
output_mime_type = "application/json" # Default
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
# Use the first expected format
expected_format = expectedDocumentFormats[0]
output_extension = expected_format.get("extension", ".json")
output_mime_type = expected_format.get("mimeType", "application/json")
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
else:
logger.info("No expected format specified, using default .json format")
return self._createResult(
success=True,
data={
"documentName": f"outlook_email_search_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.json",
"documentData": result_data
"documents": [
{
"documentName": f"outlook_email_search_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}{output_extension}",
"documentData": result_data,
"mimeType": output_mime_type
}
]
}
)

View file

@@ -54,12 +54,14 @@ class MethodSharepoint(MethodBase):
siteUrl (str): SharePoint site URL
query (str): Query or description to find document
searchScope (str, optional): Search scope (default: "all")
expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description
"""
try:
connectionReference = parameters.get("connectionReference")
siteUrl = parameters.get("siteUrl")
query = parameters.get("query")
searchScope = parameters.get("searchScope", "all")
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
if not connectionReference or not siteUrl or not query:
return self._createResult(
@@ -108,13 +110,27 @@ class MethodSharepoint(MethodBase):
"timestamp": datetime.now(UTC).isoformat()
}
# Determine output format based on expected formats
output_extension = ".json" # Default
output_mime_type = "application/json" # Default
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
# Use the first expected format
expected_format = expectedDocumentFormats[0]
output_extension = expected_format.get("extension", ".json")
output_mime_type = expected_format.get("mimeType", "application/json")
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
else:
logger.info("No expected format specified, using default .json format")
return self._createResult(
success=True,
data={
"documents": [
{
"documentName": f"sharepoint_find_path_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.json",
"documentData": result_data
"documentName": f"sharepoint_find_path_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}{output_extension}",
"documentData": result_data,
"mimeType": output_mime_type
}
]
}
@@ -139,6 +155,7 @@ class MethodSharepoint(MethodBase):
siteUrl (str): SharePoint site URL
documentPaths (List[str]): List of paths to the documents in SharePoint
includeMetadata (bool, optional): Whether to include metadata (default: True)
expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description
"""
try:
documentList = parameters.get("documentList")
@@ -146,6 +163,7 @@ class MethodSharepoint(MethodBase):
siteUrl = parameters.get("siteUrl")
documentPaths = parameters.get("documentPaths")
includeMetadata = parameters.get("includeMetadata", True)
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
if not documentList or not connectionReference or not siteUrl or not documentPaths:
return self._createResult(
@@ -218,13 +236,27 @@ class MethodSharepoint(MethodBase):
"timestamp": datetime.now(UTC).isoformat()
}
# Determine output format based on expected formats
output_extension = ".json" # Default
output_mime_type = "application/json" # Default
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
# Use the first expected format
expected_format = expectedDocumentFormats[0]
output_extension = expected_format.get("extension", ".json")
output_mime_type = expected_format.get("mimeType", "application/json")
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
else:
logger.info("No expected format specified, using default .json format")
return self._createResult(
success=True,
data={
"documents": [
{
"documentName": f"sharepoint_documents_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.json",
"documentData": result_data
"documentName": f"sharepoint_documents_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}{output_extension}",
"documentData": result_data,
"mimeType": output_mime_type
}
]
}
@@ -248,6 +280,7 @@ class MethodSharepoint(MethodBase):
documentPaths (List[str]): List of paths where to upload the documents
documentList (str): Reference to the document list to upload
fileNames (List[str]): List of names for the uploaded files
expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description
"""
try:
connectionReference = parameters.get("connectionReference")
@@ -255,6 +288,7 @@ class MethodSharepoint(MethodBase):
documentPaths = parameters.get("documentPaths")
documentList = parameters.get("documentList")
fileNames = parameters.get("fileNames")
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
if not connectionReference or not siteUrl or not documentPaths or not documentList or not fileNames:
return self._createResult(
@@ -339,13 +373,27 @@ class MethodSharepoint(MethodBase):
"timestamp": datetime.now(UTC).isoformat()
}
# Determine output format based on expected formats
output_extension = ".json" # Default
output_mime_type = "application/json" # Default
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
# Use the first expected format
expected_format = expectedDocumentFormats[0]
output_extension = expected_format.get("extension", ".json")
output_mime_type = expected_format.get("mimeType", "application/json")
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
else:
logger.info("No expected format specified, using default .json format")
return self._createResult(
success=True,
data={
"documents": [
{
"documentName": f"sharepoint_upload_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.json",
"documentData": result_data
"documentName": f"sharepoint_upload_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}{output_extension}",
"documentData": result_data,
"mimeType": output_mime_type
}
]
}
@@ -369,12 +417,14 @@ class MethodSharepoint(MethodBase):
siteUrl (str): SharePoint site URL
folderPaths (List[str]): List of paths to the folders to list
includeSubfolders (bool, optional): Whether to include subfolders (default: False)
expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description
"""
try:
connectionReference = parameters.get("connectionReference")
siteUrl = parameters.get("siteUrl")
folderPaths = parameters.get("folderPaths")
includeSubfolders = parameters.get("includeSubfolders", False)
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
if not connectionReference or not siteUrl or not folderPaths:
return self._createResult(
@@ -436,13 +486,27 @@ class MethodSharepoint(MethodBase):
"timestamp": datetime.now(UTC).isoformat()
}
# Determine output format based on expected formats
output_extension = ".json" # Default
output_mime_type = "application/json" # Default
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
# Use the first expected format
expected_format = expectedDocumentFormats[0]
output_extension = expected_format.get("extension", ".json")
output_mime_type = expected_format.get("mimeType", "application/json")
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
else:
logger.info("No expected format specified, using default .json format")
return self._createResult(
success=True,
data={
"documents": [
{
"documentName": f"sharepoint_document_list_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.json",
"documentData": result_data
"documentName": f"sharepoint_document_list_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}{output_extension}",
"documentData": result_data,
"mimeType": output_mime_type
}
]
}

View file

@@ -224,12 +224,14 @@ class MethodWeb(MethodBase):
maxDepth (int, optional): Maximum crawl depth (default: 2)
includeImages (bool, optional): Whether to include images (default: False)
followLinks (bool, optional): Whether to follow links (default: True)
expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description
"""
try:
urls = parameters.get("urls")
maxDepth = parameters.get("maxDepth", 2)
includeImages = parameters.get("includeImages", False)
followLinks = parameters.get("followLinks", True)
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
if not urls:
return self._createResult(
@@ -307,13 +309,27 @@ class MethodWeb(MethodBase):
"timestamp": datetime.now(UTC).isoformat()
}
# Determine output format based on expected formats
output_extension = ".json" # Default
output_mime_type = "application/json" # Default
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
# Use the first expected format
expected_format = expectedDocumentFormats[0]
output_extension = expected_format.get("extension", ".json")
output_mime_type = expected_format.get("mimeType", "application/json")
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
else:
logger.info("No expected format specified, using default .json format")
return self._createResult(
success=True,
data={
"documents": [
{
"documentName": f"web_crawl_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.json",
"documentData": result_data
"documentName": f"web_crawl_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}{output_extension}",
"documentData": result_data,
"mimeType": output_mime_type
}
]
}
@@ -336,11 +352,13 @@ class MethodWeb(MethodBase):
url (str): URL to scrape
selectors (Dict[str, str]): CSS selectors for data extraction
format (str, optional): Output format (default: "json")
expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description
"""
try:
url = parameters.get("url")
selectors = parameters.get("selectors")
format = parameters.get("format", "json")
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
if not url or not selectors:
return self._createResult(
@@ -400,13 +418,27 @@ class MethodWeb(MethodBase):
"timestamp": datetime.now(UTC).isoformat()
}
# Determine output format based on expected formats
output_extension = f".{format}" # Default to format parameter
output_mime_type = "application/json" # Default
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
# Use the first expected format
expected_format = expectedDocumentFormats[0]
output_extension = expected_format.get("extension", f".{format}")
output_mime_type = expected_format.get("mimeType", "application/json")
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
else:
logger.info(f"No expected format specified, using format parameter: {format}")
return self._createResult(
success=True,
data={
"documents": [
{
"documentName": f"web_scrape_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.{format}",
"documentData": result_data
"documentName": f"web_scrape_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}{output_extension}",
"documentData": result_data,
"mimeType": output_mime_type
}
]
}
@@ -430,12 +462,14 @@ class MethodWeb(MethodBase):
engine (str, optional): Search engine to use (default: "google")
maxResults (int, optional): Maximum number of results (default: 10)
filter (str, optional): Additional search filters
expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description
"""
try:
query = parameters.get("query")
engine = parameters.get("engine", "google")
maxResults = parameters.get("maxResults", 10)
filter = parameters.get("filter")
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
if not query:
return self._createResult(
@@ -533,13 +567,27 @@ class MethodWeb(MethodBase):
"timestamp": datetime.now(UTC).isoformat()
}
# Determine output format based on expected formats
output_extension = ".json" # Default
output_mime_type = "application/json" # Default
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
# Use the first expected format
expected_format = expectedDocumentFormats[0]
output_extension = expected_format.get("extension", ".json")
output_mime_type = expected_format.get("mimeType", "application/json")
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
else:
logger.info("No expected format specified, using default .json format")
return self._createResult(
success=True,
data={
"documents": [
{
"documentName": f"web_search_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.json",
"documentData": result_data
"documentName": f"web_search_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}{output_extension}",
"documentData": result_data,
"mimeType": output_mime_type
}
]
}
@@ -561,10 +609,12 @@ class MethodWeb(MethodBase):
Parameters:
url (str): URL to validate
checks (List[str], optional): Types of checks to perform (default: ["accessibility", "seo", "performance"])
expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description
"""
try:
url = parameters.get("url")
checks = parameters.get("checks", ["accessibility", "seo", "performance"])
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
if not url:
return self._createResult(
@@ -609,13 +659,27 @@ class MethodWeb(MethodBase):
"timestamp": datetime.now(UTC).isoformat()
}
# Determine output format based on expected formats
output_extension = ".json" # Default
output_mime_type = "application/json" # Default
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
# Use the first expected format
expected_format = expectedDocumentFormats[0]
output_extension = expected_format.get("extension", ".json")
output_mime_type = expected_format.get("mimeType", "application/json")
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
else:
logger.info("No expected format specified, using default .json format")
return self._createResult(
success=True,
data={
"documents": [
{
"documentName": f"web_validation_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.json",
"documentData": result_data
"documentName": f"web_validation_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}{output_extension}",
"documentData": result_data,
"mimeType": output_mime_type
}
]
}