From cfb34c6a383cc260268e966c18edc929e039f4df Mon Sep 17 00:00:00 2001 From: ValueOn AG Date: Fri, 11 Jul 2025 23:13:42 +0200 Subject: [PATCH] version 2 ready basic --- modules/chat/managerChat.py | 246 ++++++++++++++++++--- modules/interfaces/interfaceChatModel.py | 2 + modules/interfaces/interfaceChatObjects.py | 54 ++++- modules/methods/methodDocument.py | 84 +++++-- modules/methods/methodOutlook.py | 68 +++++- modules/methods/methodSharepoint.py | 80 ++++++- modules/methods/methodWeb.py | 80 ++++++- 7 files changed, 537 insertions(+), 77 deletions(-) diff --git a/modules/chat/managerChat.py b/modules/chat/managerChat.py index 68bc643d..3ed52fdb 100644 --- a/modules/chat/managerChat.py +++ b/modules/chat/managerChat.py @@ -111,18 +111,46 @@ class ActionValidator: expected_result_label = action.execResultLabel expected_format = action.execParameters.get('outputFormat', 'unknown') + # Extract expected document formats from action + expected_document_formats = action.expectedDocumentFormats or [] + + # Check if the result label is present in the action result data + actual_result_label = result_data.get("resultLabel", "") if isinstance(result_data, dict) else "" + result_label_match = actual_result_label == expected_result_label + # Analyze delivered documents and content delivered_files = [] + delivered_formats = [] content_items = [] # Check for ChatDocument objects for doc in documents: if hasattr(doc, 'filename'): delivered_files.append(doc.filename) + # Extract format information + file_extension = self._getFileExtension(doc.filename) + mime_type = getattr(doc, 'mimeType', 'application/octet-stream') + delivered_formats.append({ + 'filename': doc.filename, + 'extension': file_extension, + 'mimeType': mime_type + }) elif isinstance(doc, dict) and 'filename' in doc: delivered_files.append(doc['filename']) + file_extension = self._getFileExtension(doc['filename']) + mime_type = doc.get('mimeType', 'application/octet-stream') + delivered_formats.append({ + 'filename': doc['filename'], + 'extension': file_extension, + 'mimeType': mime_type + }) else: delivered_files.append(f"document_{len(delivered_files)}") + delivered_formats.append({ + 'filename': f"document_{len(delivered_files)}", + 'extension': 'unknown', + 'mimeType': 'application/octet-stream' + }) # Check for ExtractedContent in result data if isinstance(result_data, dict): @@ -133,11 +161,20 @@ class ActionValidator: elif 'contents' in result_data: content_items = result_data['contents'] + # If we have delivered files but no content items, consider it successful + # This handles the case where content is stored in files rather than result data + if delivered_files and not content_items: + content_items = [f"File content available in: {', '.join(delivered_files)}"] + # Analyze content items content_summary = [] for item in content_items: if hasattr(item, 'label') and hasattr(item, 'metadata'): content_summary.append(f"{item.label}: {item.metadata.mimeType if hasattr(item.metadata, 'mimeType') else 'unknown'}") + elif isinstance(item, str): + content_summary.append(item) + else: + content_summary.append(str(item)) return f"""You are an action result validator. Your primary focus is to validate that the action delivered the promised result files in the promised format. @@ -145,7 +182,10 @@ ACTION DETAILS: - Method: {action.execMethod} - Action: {action.execAction} - Expected Result Label: {expected_result_label} +- Actual Result Label: {actual_result_label} +- Result Label Match: {result_label_match} - Expected Format: {expected_format} +- Expected Document Formats: {json.dumps(expected_document_formats, indent=2) if expected_document_formats else 'None specified'} - Parameters: {json.dumps(action.execParameters, indent=2)} RESULT TO VALIDATE: @@ -155,12 +195,13 @@ RESULT TO VALIDATE: - Validation Messages: {', '.join(validation_messages) if validation_messages else 'None'} - Documents Produced: {doc_count} - Delivered Files: {', '.join(delivered_files) if delivered_files else 'None'} +- Delivered Formats: {json.dumps(delivered_formats, indent=2) if delivered_formats else 'None'} - Content Items: {', '.join(content_summary) if content_summary else 'None'} CRITICAL VALIDATION CRITERIA: -1. **File Delivery**: Did the action deliver the promised result file(s)? -2. **Format Compliance**: Are the delivered files in the promised format? -3. **Result Label Match**: Does the result match the expected result label? +1. **Result Label Match**: Does the action result contain the expected result label? +2. **File Delivery**: Did the action deliver the promised result file(s)? +3. **Format Compliance**: If expected document formats were specified, do the delivered files match the expected formats? 4. **Content Quality**: Is the content of the delivered files usable and complete? 5. **Content Processing**: If content extraction was expected, was it performed correctly? @@ -169,21 +210,28 @@ CONTEXT: - Previous Results: {', '.join(context.previous_results) if context.previous_results else 'None'} VALIDATION INSTRUCTIONS: -1. Check if the expected result label "{expected_result_label}" is present in the result -2. Verify that files were delivered when expected -3. Validate that the delivered files match the expected format "{expected_format}" -4. Assess if the content is complete and usable -5. Check if content extraction was performed when expected -6. Determine if retry would improve file delivery or format compliance +1. **Result Label Check**: Verify that the expected result label "{expected_result_label}" is present in the action result data. This is the primary success criterion. +2. **File Delivery**: Check if files were delivered when expected. The individual filenames don't need to match the result label - focus on whether content was actually produced. +3. **Format Compliance**: If expected document formats were specified, check if delivered files match the expected extensions and MIME types. If no formats were specified, this criterion is satisfied. +4. **Content Quality**: If files were delivered, consider the action successful. The presence of delivered files indicates content was processed and stored. +5. **Content Processing**: If files were delivered, assume content extraction was performed correctly. The file delivery is evidence of successful processing. +6. **Success Criteria**: The action is successful if the result label matches AND files were delivered. If expected formats were specified, they should also match. + +IMPORTANT NOTES: +- The result label must be present in the action result data for success +- Individual filenames can be different from the result label +- If files were delivered, consider the action successful even if content details are not provided +- Focus on whether the action accomplished its intended purpose (file delivery) +- Empty files should be considered failures, but delivered files indicate success REQUIRED JSON RESPONSE: {{ "status": "success|retry|fail", - "reason": "Detailed explanation focusing on file delivery and format compliance", + "reason": "Detailed explanation focusing on result label match and content quality", "confidence": 0.0-1.0, - "improvements": ["specific file delivery improvements", "format compliance fixes"], + "improvements": ["specific improvements if needed"], "quality_score": 1-10, - "missing_elements": ["missing files", "format issues"], + "missing_elements": ["missing result label", "missing files", "content issues"], "suggested_retry_approach": "Specific approach for retry if status is retry" }} @@ -222,6 +270,12 @@ NOTE: Respond with ONLY the JSON object. Do not include any explanatory text.""" 'missing_elements': [], 'suggested_retry_approach': '' } + + def _getFileExtension(self, filename: str) -> str: + """Extract file extension from filename""" + if '.' in filename: + return '.' + filename.split('.')[-1] + return '' class ChatManager: """Chat manager with improved AI integration and method handling""" @@ -293,6 +347,27 @@ class ChatManager: tasks=tasks ) + # Log the task plan as JSON for debugging + logger.info(f"Task plan created for workflow {workflow.id}:") + task_plan_json = { + 'overview': task_plan.overview, + 'tasks_count': len(task_plan.tasks), + 'tasks': [] + } + for task in task_plan.tasks: + task_json = { + 'id': task.id, + 'description': task.description, + 'dependencies': task.dependencies or [], + 'expected_outputs': task.expected_outputs or [], + 'success_criteria': task.success_criteria or [], + 'required_documents': task.required_documents or [], + 'estimated_complexity': task.estimated_complexity or '', + 'ai_prompt': task.ai_prompt or '' + } + task_plan_json['tasks'].append(task_json) + logger.info(f"Task Plan: {json.dumps(task_plan_json, indent=2, ensure_ascii=False)}") + logger.info(f"High-level task planning completed: {len(task_plan.tasks)} tasks") return task_plan @@ -330,6 +405,11 @@ class ChatManager: # Generate actions using AI actions = await self._generateActionsForTaskStep(context) + # Log the generated actions as JSON for debugging + logger.info(f"Generated {len(actions)} actions for task '{task_step.description}':") + for i, action in enumerate(actions): + logger.info(f"Action {i+1}: {json.dumps(action, indent=2, ensure_ascii=False)}") + # Convert to TaskAction objects task_actions = [] for action_dict in actions: @@ -338,6 +418,7 @@ class ChatManager: "execAction": action_dict.get('action', 'unknown'), "execParameters": action_dict.get('parameters', {}), "execResultLabel": action_dict.get('resultLabel', ''), + "expectedDocumentFormats": action_dict.get('expectedDocumentFormats', None), "status": TaskStatus.PENDING } @@ -351,6 +432,19 @@ class ChatManager: # Calculate actual action size for stats action_size = self.service.calculateObjectSize(task_actions) self.service.updateWorkflowStats(eventLabel="action", bytesSent=action_size) + + # Log the final TaskAction objects as JSON + logger.info(f"Final TaskAction objects for task '{task_step.description}':") + for i, task_action in enumerate(task_actions): + action_json = { + 'id': task_action.id, + 'execMethod': task_action.execMethod, + 'execAction': task_action.execAction, + 'execParameters': task_action.execParameters, + 'execResultLabel': task_action.execResultLabel, + 'status': task_action.status.value if hasattr(task_action.status, 'value') else str(task_action.status) + } + logger.info(f"TaskAction {i+1}: {json.dumps(action_json, indent=2, ensure_ascii=False)}") logger.info(f"Task action definition completed: {len(task_actions)} actions") return task_actions @@ -842,6 +936,7 @@ ACTION GENERATION PRINCIPLES: - Include validation steps in extraction prompts - If this is a retry, learn from previous failures and improve the approach - Address specific issues mentioned in previous review feedback +- When specifying expectedDocumentFormats, ensure AI prompts explicitly request pure data without markdown formatting INSTRUCTIONS: - Generate actions to accomplish this task step using available documents, connections, and previous results @@ -866,6 +961,13 @@ REQUIRED JSON STRUCTURE: "aiPrompt": "Comprehensive AI prompt describing what to accomplish" }}, "resultLabel": "task1_action3_analysis_results", + "expectedDocumentFormats": [ // OPTIONAL: Specify expected document formats when needed + {{ + "extension": ".csv", + "mimeType": "text/csv", + "description": "Structured data output" + }} + ], "description": "What this action accomplishes (business outcome)" }} ] @@ -876,10 +978,16 @@ FIELD REQUIREMENTS: - "action": Must be valid for the method - "parameters": Method-specific, must include documentList as a list if required by the signature - "resultLabel": Must follow the format above (e.g., "task1_action3_analysis_results") +- "expectedDocumentFormats": OPTIONAL - Only specify when you need to control output format + - Use when you need specific file types (e.g., CSV for data, JSON for structured output) + - Omit when format is flexible (e.g., folder queries with mixed file types) + - Each format should specify: extension, mimeType, description + - When using expectedDocumentFormats, ensure the aiPrompt explicitly requests pure data without markdown formatting - "description": Clear summary of the business outcome EXAMPLES OF GOOD ACTIONS: -1. Comprehensive document analysis: + +1. Document analysis with specific output format (use expectedDocumentFormats): {{ "method": "document", "action": "extract", @@ -888,10 +996,17 @@ EXAMPLES OF GOOD ACTIONS: "aiPrompt": "Extract and analyze the candidate's qualifications, experience, skills, and suitability for the product designer position. Identify key strengths, relevant experience, technical skills, and any areas of concern. Provide a comprehensive assessment that can be used for evaluation." }}, "resultLabel": "task1_action1_candidate_analysis", + "expectedDocumentFormats": [ + {{ + "extension": ".json", + "mimeType": "application/json", + "description": "Structured candidate analysis data" + }} + ], "description": "Comprehensive analysis of candidate profile for evaluation" }} -2. Multi-document processing: +2. Multi-document processing with flexible output (omit expectedDocumentFormats): {{ "method": "document", "action": "extract", @@ -903,6 +1018,25 @@ EXAMPLES OF GOOD ACTIONS: "description": "Create comprehensive evaluation matrix comparing all candidates" }} +3. Data extraction with specific CSV format: +{{ + "method": "document", + "action": "extract", + "parameters": {{ + "documentList": ["docItem:doc_abc:table_data.pdf"], + "aiPrompt": "Extract all table data and convert to structured CSV format with proper headers and data types. IMPORTANT: Deliver pure CSV data without any markdown formatting, code blocks, or additional text. Output only the CSV content with proper headers and data rows." + }}, + "resultLabel": "task1_action2_structured_data", + "expectedDocumentFormats": [ + {{ + "extension": ".csv", + "mimeType": "text/csv", + "description": "Structured table data in CSV format" + }} + ], + "description": "Extract and structure table data for analysis" +}} + NOTE: Respond with ONLY the JSON object. Do not include any explanatory text.""" @@ -1022,11 +1156,17 @@ NOTE: Respond with ONLY the JSON object. Do not include any explanatory text.""" async def _executeSingleAction(self, action: TaskAction, workflow: ChatWorkflow) -> ActionResult: """Execute a single action and return ActionResult with enhanced document processing""" try: + # Enhance parameters with expected document formats if specified + enhanced_parameters = action.execParameters.copy() + if action.expectedDocumentFormats: + enhanced_parameters['expectedDocumentFormats'] = action.expectedDocumentFormats + logger.info(f"Action {action.execMethod}.{action.execAction} expects formats: {action.expectedDocumentFormats}") + # Execute the actual method action using the service center result = await self.service.executeAction( methodName=action.execMethod, actionName=action.execAction, - parameters=action.execParameters + parameters=enhanced_parameters ) # Always use the execResultLabel from the action definition @@ -1348,8 +1488,8 @@ NOTE: Respond with ONLY the JSON object. Do not include any explanatory text.""" # For CSV files, try to extract table data elif file_extension == 'csv': - # Look for CSV-specific fields - csv_fields = ['table_data', 'csv_data', 'rows', 'data'] + # Look for CSV-specific fields first, then general content fields + csv_fields = ['table_data', 'csv_data', 'rows', 'data', 'content', 'text'] for field in csv_fields: if field in document_data: content = document_data[field] @@ -1798,7 +1938,7 @@ NOTE: Respond with ONLY the JSON object. Do not include any explanatory text.""" # Create final success log self.chatInterface.createWorkflowLog({ "workflowId": workflow.id, - "message": f"🎉 Workflow completed successfully ({len(workflow_results)}/{len(task_plan.tasks)} tasks)", + "message": f"🎉 Workflow completed ({len(workflow_results)}/{len(task_plan.tasks)} tasks)", "type": "success", "status": "completed", "progress": 100 @@ -1814,7 +1954,7 @@ NOTE: Respond with ONLY the JSON object. Do not include any explanatory text.""" ) logger.info(f"=== UNIFIED WORKFLOW COMPLETED: {len(workflow_results)}/{len(task_plan.tasks)} tasks successful ===") - logger.debug(f"FINAL WORKFLOW SUMMARY: {json.dumps(workflow_summary.model_dump(), indent=2, ensure_ascii=False)}") + logger.debug(f"FINAL WORKFLOW SUMMARY: {json.dumps(workflow_summary.dict(), indent=2, ensure_ascii=False)}") return workflow_summary except Exception as e: @@ -1989,6 +2129,7 @@ Please review the task requirements and try again with different input or approa ) # Generate new actions with failure avoidance + logger.info(f"Regenerating actions for task '{task_step.description}' with failure context (retry {state.retry_count})") actions = await self.defineTaskActions(task_step, context.workflow, state.getAvailableResults(), enhanced_context) logger.info(f"Regenerated {len(actions)} actions with failure context") @@ -2016,13 +2157,17 @@ Please review the task requirements and try again with different input or approa prompt = self._createTaskCompletionValidationPrompt(task_result, task_step) response = await self._callAIWithCircuitBreaker(prompt, "task_completion_validation") + # Log the validation response for debugging + logger.debug(f"Task validation AI response: {response}") + # Parse validation result validation = self._parseTaskValidationResponse(response) # Add quality metrics validation['quality_metrics'] = self._calculateTaskQualityMetrics(task_step, successful_actions) - logger.info(f"Task completion validation: {validation.get('status', 'unknown')}") + logger.info(f"Task completion validation: {validation.get('status', 'unknown')} - Reason: {validation.get('reason', 'No reason')}") + logger.debug(f"Parsed validation result: {json.dumps(validation, indent=2)}") return ReviewResult( status=validation.get('status', 'unknown'), reason=validation.get('reason', 'No reason provided'), @@ -2061,21 +2206,27 @@ Please review the task requirements and try again with different input or approa 'has_text_result': bool(action.data.get('result', '').strip()) }) - return f"""You are a task completion validator that evaluates if a task was successfully completed. + return f"""You are an action completion validator that evaluates if individual actions were successfully completed. -TASK DETAILS: -- Description: {task_step.description} -- Expected Outputs: {', '.join(expected_outputs)} -- Success Criteria: {', '.join(success_criteria)} - -SUCCESSFUL ACTIONS ({len(successful_actions)}): +ACTION DETAILS: {json.dumps(action_summary, indent=2)} +VALIDATION CRITERIA: +1. Check if the action's result_label matches what was delivered +2. If documents were delivered and result_label is present → SUCCESS +3. If no documents but text result with matching result_label or different result_label → RETRY +4. If no result_label and no delivery → FAIL + +VALIDATION RULES: +- Focus on result_label matching +- Check if the action delivered the expected result type +- Document delivery with correct result_label = SUCCESS +- Text result with correct result_label = SUCCESS + VALIDATION QUESTIONS: -1. Were all expected outputs produced? -2. Are the success criteria met? -3. Do the action results collectively accomplish the task goal? -4. Is the task ready for handover to the next task? +1. Does the result_label match what the action was supposed to deliver? +2. Were documents or text results delivered with the correct label? +3. Does the delivery match the action's objective? REQUIRED JSON RESPONSE: {{ @@ -2242,6 +2393,21 @@ NOTE: Respond with ONLY the JSON object. Do not include any explanatory text.""" documents=result.data.get("documents", []) ) + # Log the action execution result as JSON (without document content) + action_result_json = { + 'success': action_result.success, + 'actionId': action_result.actionId, + 'actionMethod': action_result.actionMethod, + 'actionName': action_result.actionName, + 'validation': action_result.validation, + 'error': action_result.error, + 'documents_count': len(action_result.documents), + 'document_names': [doc.filename if hasattr(doc, 'filename') else str(doc) for doc in action_result.documents], + 'data_keys': list(action_result.data.keys()) if isinstance(action_result.data, dict) else [], + 'metadata_keys': list(action_result.metadata.keys()) if isinstance(action_result.metadata, dict) else [] + } + logger.info(f"Action execution result for {action.execMethod}.{action.execAction}: {json.dumps(action_result_json, indent=2, ensure_ascii=False)}") + # Update action status based on validation if validation['status'] == 'success': action.setSuccess() @@ -2334,6 +2500,24 @@ NOTE: Respond with ONLY the JSON object. Do not include any explanatory text.""" applied_improvements=improvements ) + # Log the retry action execution result as JSON (without document content) + retry_result_json = { + 'success': action_result.success, + 'actionId': action_result.actionId, + 'actionMethod': action_result.actionMethod, + 'actionName': action_result.actionName, + 'validation': action_result.validation, + 'error': action_result.error, + 'is_retry': action_result.is_retry, + 'previous_error': action_result.previous_error, + 'applied_improvements': action_result.applied_improvements, + 'documents_count': len(action_result.documents), + 'document_names': [doc.filename if hasattr(doc, 'filename') else str(doc) for doc in action_result.documents], + 'data_keys': list(action_result.data.keys()) if isinstance(action_result.data, dict) else [], + 'metadata_keys': list(action_result.metadata.keys()) if isinstance(action_result.metadata, dict) else [] + } + logger.info(f"Retry action execution result for {action.execMethod}.{action.execAction}: {json.dumps(retry_result_json, indent=2, ensure_ascii=False)}") + # Update action status if validation['status'] == 'success': enhanced_action.setSuccess() diff --git a/modules/interfaces/interfaceChatModel.py b/modules/interfaces/interfaceChatModel.py index 28559d1f..439fa38a 100644 --- a/modules/interfaces/interfaceChatModel.py +++ b/modules/interfaces/interfaceChatModel.py @@ -185,6 +185,8 @@ class TaskAction(BaseModel, ModelMixin): execAction: str = Field(..., description="Action to perform") execParameters: Dict[str, Any] = Field(default_factory=dict, description="Action parameters") execResultLabel: Optional[str] = Field(None, description="Label for the set of result documents") + # NEW: Optional document format specification + expectedDocumentFormats: Optional[List[Dict[str, str]]] = Field(None, description="Expected document formats (optional)") status: TaskStatus = Field(default=TaskStatus.PENDING, description="Action status") error: Optional[str] = Field(None, description="Error message if action failed") retryCount: int = Field(default=0, description="Number of retries attempted") diff --git a/modules/interfaces/interfaceChatObjects.py b/modules/interfaces/interfaceChatObjects.py index 4132027d..86cf8cc8 100644 --- a/modules/interfaces/interfaceChatObjects.py +++ b/modules/interfaces/interfaceChatObjects.py @@ -7,7 +7,7 @@ import os import logging import uuid import time -from datetime import datetime, UTC +from datetime import datetime, UTC, timezone from typing import Dict, Any, List, Optional, Union import asyncio @@ -128,8 +128,8 @@ class ChatObjects: return self.db.getInitialId(table) def _getCurrentTimestamp(self) -> str: - """Returns the current timestamp in ISO format""" - return datetime.now().isoformat() + """Returns the current timestamp as Unix timestamp (seconds since epoch)""" + return str(int(time.time())) # Workflow methods @@ -576,8 +576,45 @@ class ChatObjects: "processingTime": 0 } - # Simple processing time - just use current time - processing_time = time.time() + # Calculate processing time as duration since workflow start using Unix timestamps + workflow = self.getWorkflow(workflowId) + if workflow and workflow.startedAt: + try: + # Parse start time as Unix timestamp (handle both old ISO format and new Unix format) + start_time_str = workflow.startedAt + try: + # Try to parse as Unix timestamp first + start_time = int(float(start_time_str)) + except ValueError: + # If that fails, try to parse as ISO format and convert to Unix + try: + # Handle ISO format timestamps (for backward compatibility) + if start_time_str.endswith('Z'): + start_time_str = start_time_str.replace('Z', '+00:00') + dt = datetime.fromisoformat(start_time_str) + start_time = int(dt.timestamp()) + except: + # If all parsing fails, use current time + logger.warning(f"Could not parse start time: {start_time_str}, using current time") + start_time = int(time.time()) + + current_time = int(time.time()) + processing_time = current_time - start_time + + # Ensure processing time is reasonable (not negative or extremely large) + if processing_time < 0: + logger.warning(f"Negative processing time calculated: {processing_time}, using 0") + processing_time = 0 + elif processing_time > 86400 * 365: # More than 1 year + logger.warning(f"Unreasonably large processing time: {processing_time}, using 0") + processing_time = 0 + + except Exception as e: + logger.warning(f"Error calculating processing time: {str(e)}") + processing_time = currentStats.get("processingTime", 0) or 0 + else: + # Fallback to existing processing time or 0 + processing_time = currentStats.get("processingTime", 0) or 0 # Update stats with incremental values - ensure no None values current_bytes_sent = currentStats.get("bytesSent", 0) or 0 @@ -793,8 +830,8 @@ class ChatObjects: # Load logs logs = self.getWorkflowLogs(workflowId) - # Sort by timestamp - logs.sort(key=lambda x: x.get("timestamp", "")) + # Sort by timestamp (Unix timestamps) + logs.sort(key=lambda x: float(x.get("timestamp", 0))) # Assemble complete workflow object completeWorkflow = workflow.copy() @@ -1205,12 +1242,13 @@ class ChatObjects: execAction=createdAction["execAction"], execParameters=createdAction.get("execParameters", {}), execResultLabel=createdAction.get("execResultLabel"), + expectedDocumentFormats=createdAction.get("expectedDocumentFormats"), status=createdAction.get("status", TaskStatus.PENDING), error=createdAction.get("error"), retryCount=createdAction.get("retryCount", 0), retryMax=createdAction.get("retryMax", 3), processingTime=createdAction.get("processingTime"), - timestamp=datetime.fromisoformat(createdAction.get("timestamp", datetime.now().isoformat())), + timestamp=datetime.fromtimestamp(float(createdAction.get("timestamp", time.time()))), result=createdAction.get("result"), resultDocuments=createdAction.get("resultDocuments", []) ) diff --git a/modules/methods/methodDocument.py b/modules/methods/methodDocument.py index 456bac95..fd2a67e7 100644 --- a/modules/methods/methodDocument.py +++ b/modules/methods/methodDocument.py @@ -24,17 +24,19 @@ class MethodDocument(MethodBase): @action async def extract(self, parameters: Dict[str, Any]) -> ActionResult: """ - Extract specific content from document with ai prompt and return it as a json file + Extract specific content from document with ai prompt and return it in the specified format Parameters: documentList (str): Reference to the document list to extract content from aiPrompt (str): AI prompt for content extraction includeMetadata (bool, optional): Whether to include metadata (default: True) + expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description """ try: documentList = parameters.get("documentList") aiPrompt = parameters.get("aiPrompt") includeMetadata = parameters.get("includeMetadata", True) + expectedDocumentFormats = parameters.get("expectedDocumentFormats", []) if not documentList: return self._createResult( @@ -58,6 +60,31 @@ class MethodDocument(MethodBase): error="No documents found for the provided reference" ) + # Determine output format based on expected formats + output_extension = ".txt" # Default + output_mime_type = "text/plain" # Default + + if expectedDocumentFormats and len(expectedDocumentFormats) > 0: + # Use the first expected format + expected_format = expectedDocumentFormats[0] + output_extension = expected_format.get("extension", ".txt") + output_mime_type = expected_format.get("mimeType", "text/plain") + logger.info(f"Using expected format: {output_extension} ({output_mime_type})") + logger.info(f"Expected document formats: {expectedDocumentFormats}") + else: + logger.info("No expected format specified, using default .txt format") + + # Enhance AI prompt to specify output format + enhanced_prompt = aiPrompt + if output_extension == ".csv": + enhanced_prompt += f"\n\nCRITICAL: Deliver the result as pure CSV data without any markdown formatting, code blocks, or additional text. Output only the CSV content with proper headers and data rows. Do not include ```csv or ``` markers." + elif output_extension == ".json": + enhanced_prompt += f"\n\nCRITICAL: Deliver the result as pure JSON data without any markdown formatting, code blocks, or additional text. Output only the JSON content. Do not include ```json or ``` markers." + elif output_extension == ".xml": + enhanced_prompt += f"\n\nCRITICAL: Deliver the result as pure XML data without any markdown formatting, code blocks, or additional text. Output only the XML content. Do not include ```xml or ``` markers." + elif output_extension != ".txt": + enhanced_prompt += f"\n\nCRITICAL: Deliver the result as pure {output_extension.upper()} data without any markdown formatting, code blocks, or additional text. Output only the {output_extension.upper()} content. Do not include any markdown markers." + # Extract content from all documents all_extracted_content = [] file_infos = [] @@ -72,7 +99,7 @@ class MethodDocument(MethodBase): continue extracted_content = await self.service.extractContentFromFileData( - prompt=aiPrompt, + prompt=enhanced_prompt, # Use enhanced prompt instead of original fileData=file_data, filename=file_info.get('name', 'document'), mimeType=file_info.get('mimeType', 'application/octet-stream'), @@ -105,25 +132,50 @@ class MethodDocument(MethodBase): # Fallback: convert to string representation text_contents.append(str(content_obj)) - # Combine all extracted text content - combined_content = "\n\n--- DOCUMENT SEPARATOR ---\n\n".join(text_contents) + # Process each document individually and create separate output files + output_documents = [] - result_data = { - "documentCount": len(chatDocuments), - "content": combined_content, - "fileInfos": file_infos if includeMetadata else None, - "timestamp": datetime.now(UTC).isoformat() - } + for i, (chatDocument, extracted_content) in enumerate(zip(chatDocuments, all_extracted_content)): + # Extract text content from this document + text_content = "" + if hasattr(extracted_content, 'contents') and extracted_content.contents: + # Extract text from ContentItem objects + for content_item in extracted_content.contents: + if hasattr(content_item, 'data') and content_item.data: + text_content += content_item.data + "\n" + elif isinstance(extracted_content, str): + text_content = extracted_content + else: + # Fallback: convert to string representation + text_content = str(extracted_content) + + # Create output filename based on original filename + original_filename = chatDocument.filename + base_name = original_filename.rsplit('.', 1)[0] if '.' in original_filename else original_filename + output_filename = f"{base_name}_extracted_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}{output_extension}" + + # Create result data for this document + result_data = { + "documentCount": 1, + "content": text_content, + "originalFilename": original_filename, + "fileInfos": [file_infos[i]] if includeMetadata and i < len(file_infos) else None, + "timestamp": datetime.now(UTC).isoformat() + } + + logger.info(f"Created output document: {output_filename} with {len(text_content)} characters") + logger.info(f"Content preview: {text_content[:200]}...") + + output_documents.append({ + "documentName": output_filename, + "documentData": result_data, + "mimeType": output_mime_type + }) return self._createResult( success=True, data={ - "documents": [ - { - "documentName": f"extracted_content_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.txt", - "documentData": result_data - } - ] + "documents": output_documents } ) except Exception as e: diff --git a/modules/methods/methodOutlook.py b/modules/methods/methodOutlook.py index a2e91896..4fbd3cdf 100644 --- a/modules/methods/methodOutlook.py +++ b/modules/methods/methodOutlook.py @@ -55,12 +55,14 @@ class MethodOutlook(MethodBase): folder (str, optional): Email folder to read from (default: "Inbox") limit (int, optional): Maximum number of emails to read (default: 10) filter (str, optional): Filter criteria for emails + expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description """ try: connectionReference = parameters.get("connectionReference") folder = parameters.get("folder", "Inbox") limit = parameters.get("limit", 10) filter = parameters.get("filter") + expectedDocumentFormats = parameters.get("expectedDocumentFormats", []) if not connectionReference: return self._createResult( @@ -112,13 +114,27 @@ class MethodOutlook(MethodBase): "timestamp": datetime.now(UTC).isoformat() } + # Determine output format based on expected formats + output_extension = ".json" # Default + output_mime_type = "application/json" # Default + + if expectedDocumentFormats and len(expectedDocumentFormats) > 0: + # Use the first expected format + expected_format = expectedDocumentFormats[0] + output_extension = expected_format.get("extension", ".json") + output_mime_type = expected_format.get("mimeType", "application/json") + logger.info(f"Using expected format: {output_extension} ({output_mime_type})") + else: + logger.info("No expected format specified, using default .json format") + return self._createResult( success=True, data={ "documents": [ { - "documentName": f"outlook_emails_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.json", - "documentData": result_data + "documentName": f"outlook_emails_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}{output_extension}", + "documentData": result_data, + "mimeType": output_mime_type } ] } @@ -144,6 +160,7 @@ class MethodOutlook(MethodBase): body (str): Email body content cc (List[str], optional): CC recipients bcc (List[str], optional): BCC recipients + expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description """ try: connectionReference = parameters.get("connectionReference") @@ -152,6 +169,7 @@ class MethodOutlook(MethodBase): body = parameters.get("body") cc = parameters.get("cc", []) bcc = parameters.get("bcc", []) + expectedDocumentFormats = parameters.get("expectedDocumentFormats", []) if not connectionReference or not to or not subject or not body: return self._createResult( @@ -207,11 +225,29 @@ class MethodOutlook(MethodBase): "timestamp": datetime.now(UTC).isoformat() } + # Determine output format based on expected formats + output_extension = ".json" # Default + output_mime_type = "application/json" # Default + + if expectedDocumentFormats and len(expectedDocumentFormats) > 0: + # Use the first expected format + expected_format = expectedDocumentFormats[0] + output_extension = expected_format.get("extension", ".json") + output_mime_type = expected_format.get("mimeType", "application/json") + logger.info(f"Using expected format: {output_extension} ({output_mime_type})") + else: + logger.info("No expected format specified, using default .json format") + return self._createResult( success=True, data={ - "documentName": f"outlook_email_sent_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.json", - "documentData": result_data + "documents": [ + { + "documentName": f"outlook_email_sent_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}{output_extension}", + "documentData": result_data, + "mimeType": output_mime_type + } + ] } ) @@ -233,12 +269,14 @@ class MethodOutlook(MethodBase): query (str): Search query folder (str, optional): Folder to search in (default: "All") limit (int, optional): Maximum number of results (default: 20) + expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description """ try: connectionReference = parameters.get("connectionReference") query = parameters.get("query") folder = parameters.get("folder", "All") limit = parameters.get("limit", 20) + expectedDocumentFormats = parameters.get("expectedDocumentFormats", []) if not connectionReference or not query: return self._createResult( @@ -290,11 +328,29 @@ class MethodOutlook(MethodBase): "timestamp": datetime.now(UTC).isoformat() } + # Determine output format based on expected formats + output_extension = ".json" # Default + output_mime_type = "application/json" # Default + + if expectedDocumentFormats and len(expectedDocumentFormats) > 0: + # Use the first expected format + expected_format = expectedDocumentFormats[0] + output_extension = expected_format.get("extension", ".json") + output_mime_type = expected_format.get("mimeType", "application/json") + logger.info(f"Using expected format: {output_extension} ({output_mime_type})") + else: + logger.info("No expected format specified, using default .json format") + return self._createResult( success=True, data={ - "documentName": f"outlook_email_search_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.json", - "documentData": result_data + "documents": [ + { + "documentName": f"outlook_email_search_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}{output_extension}", + "documentData": result_data, + "mimeType": output_mime_type + } + ] } ) diff --git a/modules/methods/methodSharepoint.py b/modules/methods/methodSharepoint.py index 8288119d..0560a754 100644 --- a/modules/methods/methodSharepoint.py +++ b/modules/methods/methodSharepoint.py @@ -54,12 +54,14 @@ class MethodSharepoint(MethodBase): siteUrl (str): SharePoint site URL query (str): Query or description to find document searchScope (str, optional): Search scope (default: "all") + expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description """ try: connectionReference = parameters.get("connectionReference") siteUrl = parameters.get("siteUrl") query = parameters.get("query") searchScope = parameters.get("searchScope", "all") + expectedDocumentFormats = parameters.get("expectedDocumentFormats", []) if not connectionReference or not siteUrl or not query: return self._createResult( @@ -108,13 +110,27 @@ class MethodSharepoint(MethodBase): "timestamp": datetime.now(UTC).isoformat() } + # Determine output format based on expected formats + output_extension = ".json" # Default + output_mime_type = "application/json" # Default + + if expectedDocumentFormats and len(expectedDocumentFormats) > 0: + # Use the first expected format + expected_format = expectedDocumentFormats[0] + output_extension = expected_format.get("extension", ".json") + output_mime_type = expected_format.get("mimeType", "application/json") + logger.info(f"Using expected format: {output_extension} ({output_mime_type})") + else: + logger.info("No expected format specified, using default .json format") + return self._createResult( success=True, data={ "documents": [ { - "documentName": f"sharepoint_find_path_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.json", - "documentData": result_data + "documentName": f"sharepoint_find_path_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}{output_extension}", + "documentData": result_data, + "mimeType": output_mime_type } ] } @@ -139,6 +155,7 @@ class MethodSharepoint(MethodBase): siteUrl (str): SharePoint site URL documentPaths (List[str]): List of paths to the documents in SharePoint includeMetadata (bool, optional): Whether to include metadata (default: True) + expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description """ try: documentList = parameters.get("documentList") @@ -146,6 +163,7 @@ class MethodSharepoint(MethodBase): siteUrl = parameters.get("siteUrl") documentPaths = parameters.get("documentPaths") includeMetadata = parameters.get("includeMetadata", True) + expectedDocumentFormats = parameters.get("expectedDocumentFormats", []) if not documentList or not connectionReference or not siteUrl or not documentPaths: return self._createResult( @@ -218,13 +236,27 @@ class MethodSharepoint(MethodBase): "timestamp": datetime.now(UTC).isoformat() } + # Determine output format based on expected formats + output_extension = ".json" # Default + output_mime_type = "application/json" # Default + + if expectedDocumentFormats and len(expectedDocumentFormats) > 0: + # Use the first expected format + expected_format = expectedDocumentFormats[0] + output_extension = expected_format.get("extension", ".json") + output_mime_type = expected_format.get("mimeType", "application/json") + logger.info(f"Using expected format: {output_extension} ({output_mime_type})") + else: + logger.info("No expected format specified, using default .json format") + return self._createResult( success=True, data={ "documents": [ { - "documentName": f"sharepoint_documents_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.json", - "documentData": result_data + "documentName": f"sharepoint_documents_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}{output_extension}", + "documentData": result_data, + "mimeType": output_mime_type } ] } @@ -248,6 +280,7 @@ class MethodSharepoint(MethodBase): documentPaths (List[str]): List of paths where to upload the documents documentList (str): Reference to the document list to upload fileNames (List[str]): List of names for the uploaded files + expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description """ try: connectionReference = parameters.get("connectionReference") @@ -255,6 +288,7 @@ class MethodSharepoint(MethodBase): documentPaths = parameters.get("documentPaths") documentList = parameters.get("documentList") fileNames = parameters.get("fileNames") + expectedDocumentFormats = parameters.get("expectedDocumentFormats", []) if not connectionReference or not siteUrl or not documentPaths or not documentList or not fileNames: return self._createResult( @@ -339,13 +373,27 @@ class MethodSharepoint(MethodBase): "timestamp": datetime.now(UTC).isoformat() } + # Determine output format based on expected formats + output_extension = ".json" # Default + output_mime_type = "application/json" # Default + + if expectedDocumentFormats and len(expectedDocumentFormats) > 0: + # Use the first expected format + expected_format = expectedDocumentFormats[0] + output_extension = expected_format.get("extension", ".json") + output_mime_type = expected_format.get("mimeType", "application/json") + logger.info(f"Using expected format: {output_extension} ({output_mime_type})") + else: + logger.info("No expected format specified, using default .json format") + return self._createResult( success=True, data={ "documents": [ { - "documentName": f"sharepoint_upload_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.json", - "documentData": result_data + "documentName": f"sharepoint_upload_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}{output_extension}", + "documentData": result_data, + "mimeType": output_mime_type } ] } @@ -369,12 +417,14 @@ class MethodSharepoint(MethodBase): siteUrl (str): SharePoint site URL folderPaths (List[str]): List of paths to the folders to list includeSubfolders (bool, optional): Whether to include subfolders (default: False) + expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description """ try: connectionReference = parameters.get("connectionReference") siteUrl = parameters.get("siteUrl") folderPaths = parameters.get("folderPaths") includeSubfolders = parameters.get("includeSubfolders", False) + expectedDocumentFormats = parameters.get("expectedDocumentFormats", []) if not connectionReference or not siteUrl or not folderPaths: return self._createResult( @@ -436,13 +486,27 @@ class MethodSharepoint(MethodBase): "timestamp": datetime.now(UTC).isoformat() } + # Determine output format based on expected formats + output_extension = ".json" # Default + output_mime_type = "application/json" # Default + + if expectedDocumentFormats and len(expectedDocumentFormats) > 0: + # Use the first expected format + expected_format = expectedDocumentFormats[0] + output_extension = expected_format.get("extension", ".json") + output_mime_type = expected_format.get("mimeType", "application/json") + logger.info(f"Using expected format: {output_extension} ({output_mime_type})") + else: + logger.info("No expected format specified, using default .json format") + return self._createResult( success=True, data={ "documents": [ { - "documentName": f"sharepoint_document_list_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.json", - "documentData": result_data + "documentName": f"sharepoint_document_list_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}{output_extension}", + "documentData": result_data, + "mimeType": output_mime_type } ] } diff --git a/modules/methods/methodWeb.py b/modules/methods/methodWeb.py index a03549e5..c257bed9 100644 --- a/modules/methods/methodWeb.py +++ b/modules/methods/methodWeb.py @@ -224,12 +224,14 @@ class MethodWeb(MethodBase): maxDepth (int, optional): Maximum crawl depth (default: 2) includeImages (bool, optional): Whether to include images (default: False) followLinks (bool, optional): Whether to follow links (default: True) + expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description """ try: urls = parameters.get("urls") maxDepth = parameters.get("maxDepth", 2) includeImages = parameters.get("includeImages", False) followLinks = parameters.get("followLinks", True) + expectedDocumentFormats = parameters.get("expectedDocumentFormats", []) if not urls: return self._createResult( @@ -307,13 +309,27 @@ class MethodWeb(MethodBase): "timestamp": datetime.now(UTC).isoformat() } + # Determine output format based on expected formats + output_extension = ".json" # Default + output_mime_type = "application/json" # Default + + if expectedDocumentFormats and len(expectedDocumentFormats) > 0: + # Use the first expected format + expected_format = expectedDocumentFormats[0] + output_extension = expected_format.get("extension", ".json") + output_mime_type = expected_format.get("mimeType", "application/json") + logger.info(f"Using expected format: {output_extension} ({output_mime_type})") + else: + logger.info("No expected format specified, using default .json format") + return self._createResult( success=True, data={ "documents": [ { - "documentName": f"web_crawl_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.json", - "documentData": result_data + "documentName": f"web_crawl_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}{output_extension}", + "documentData": result_data, + "mimeType": output_mime_type } ] } @@ -336,11 +352,13 @@ class MethodWeb(MethodBase): url (str): URL to scrape selectors (Dict[str, str]): CSS selectors for data extraction format (str, optional): Output format (default: "json") + expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description """ try: url = parameters.get("url") selectors = parameters.get("selectors") format = parameters.get("format", "json") + expectedDocumentFormats = parameters.get("expectedDocumentFormats", []) if not url or not selectors: return self._createResult( @@ -400,13 +418,27 @@ class MethodWeb(MethodBase): "timestamp": datetime.now(UTC).isoformat() } + # Determine output format based on expected formats + output_extension = f".{format}" # Default to format parameter + output_mime_type = "application/json" # Default + + if expectedDocumentFormats and len(expectedDocumentFormats) > 0: + # Use the first expected format + expected_format = expectedDocumentFormats[0] + output_extension = expected_format.get("extension", f".{format}") + output_mime_type = expected_format.get("mimeType", "application/json") + logger.info(f"Using expected format: {output_extension} ({output_mime_type})") + else: + logger.info(f"No expected format specified, using format parameter: {format}") + return self._createResult( success=True, data={ "documents": [ { - "documentName": f"web_scrape_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.{format}", - "documentData": result_data + "documentName": f"web_scrape_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}{output_extension}", + "documentData": result_data, + "mimeType": output_mime_type } ] } @@ -430,12 +462,14 @@ class MethodWeb(MethodBase): engine (str, optional): Search engine to use (default: "google") maxResults (int, optional): Maximum number of results (default: 10) filter (str, optional): Additional search filters + expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description """ try: query = parameters.get("query") engine = parameters.get("engine", "google") maxResults = parameters.get("maxResults", 10) filter = parameters.get("filter") + expectedDocumentFormats = parameters.get("expectedDocumentFormats", []) if not query: return self._createResult( @@ -533,13 +567,27 @@ class MethodWeb(MethodBase): "timestamp": datetime.now(UTC).isoformat() } + # Determine output format based on expected formats + output_extension = ".json" # Default + output_mime_type = "application/json" # Default + + if expectedDocumentFormats and len(expectedDocumentFormats) > 0: + # Use the first expected format + expected_format = expectedDocumentFormats[0] + output_extension = expected_format.get("extension", ".json") + output_mime_type = expected_format.get("mimeType", "application/json") + logger.info(f"Using expected format: {output_extension} ({output_mime_type})") + else: + logger.info("No expected format specified, using default .json format") + return self._createResult( success=True, data={ "documents": [ { - "documentName": f"web_search_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.json", - "documentData": result_data + "documentName": f"web_search_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}{output_extension}", + "documentData": result_data, + "mimeType": output_mime_type } ] } @@ -561,10 +609,12 @@ class MethodWeb(MethodBase): Parameters: url (str): URL to validate checks (List[str], optional): Types of checks to perform (default: ["accessibility", "seo", "performance"]) + expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description """ try: url = parameters.get("url") checks = parameters.get("checks", ["accessibility", "seo", "performance"]) + expectedDocumentFormats = parameters.get("expectedDocumentFormats", []) if not url: return self._createResult( @@ -609,13 +659,27 @@ class MethodWeb(MethodBase): "timestamp": datetime.now(UTC).isoformat() } + # Determine output format based on expected formats + output_extension = ".json" # Default + output_mime_type = "application/json" # Default + + if expectedDocumentFormats and len(expectedDocumentFormats) > 0: + # Use the first expected format + expected_format = expectedDocumentFormats[0] + output_extension = expected_format.get("extension", ".json") + output_mime_type = expected_format.get("mimeType", "application/json") + logger.info(f"Using expected format: {output_extension} ({output_mime_type})") + else: + logger.info("No expected format specified, using default .json format") + return self._createResult( success=True, data={ "documents": [ { - "documentName": f"web_validation_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.json", - "documentData": result_data + "documentName": f"web_validation_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}{output_extension}", + "documentData": result_data, + "mimeType": output_mime_type } ] }