From b1be8dd81ca1ddb68b63a007a25bf9d31eca68d5 Mon Sep 17 00:00:00 2001 From: ValueOn AG Date: Sat, 12 Jul 2025 16:26:20 +0200 Subject: [PATCH] web refactored --- app.py | 20 +- modules/chat/managerChat.py | 481 ++++++++++- modules/connectors/connectorAiAnthropic.py | 18 +- modules/interfaces/interfaceChatObjects.py | 26 +- modules/methods/methodWeb.py | 881 +++++++++++++++++---- modules/workflow/managerWorkflow.py | 2 +- 6 files changed, 1242 insertions(+), 186 deletions(-) diff --git a/app.py b/app.py index 05b54a9c..fda05eaa 100644 --- a/app.py +++ b/app.py @@ -37,6 +37,22 @@ def initLogging(): ('.well-known/appspecific/com.chrome.devtools.json' in record.msg or 'Request: /index.html' in record.msg)) + # Add filter to exclude HTTP debug messages + class HTTPDebugFilter(logging.Filter): + def filter(self, record): + if isinstance(record.msg, str): + # Filter out HTTP debug messages + http_debug_patterns = [ + 'receive_response_body.started', + 'receive_response_body.complete', + 'response_closed.started', + '_send_single_request', + 'httpcore.http11', + 'httpx._client' + ] + return not any(pattern in record.msg for pattern in http_debug_patterns) + return True + # Configure handlers based on config handlers = [] @@ -45,6 +61,7 @@ def initLogging(): consoleHandler = logging.StreamHandler() consoleHandler.setFormatter(consoleFormatter) consoleHandler.addFilter(ChromeDevToolsFilter()) + consoleHandler.addFilter(HTTPDebugFilter()) handlers.append(consoleHandler) # Add file handler if enabled @@ -71,6 +88,7 @@ def initLogging(): ) fileHandler.setFormatter(fileFormatter) fileHandler.addFilter(ChromeDevToolsFilter()) + fileHandler.addFilter(HTTPDebugFilter()) handlers.append(fileHandler) # Configure the root logger @@ -83,7 +101,7 @@ def initLogging(): ) # Silence noisy third-party libraries - use the same level as the root logger - noisyLoggers = ["httpx", "urllib3", "asyncio", "fastapi.security.oauth2"] + noisyLoggers = ["httpx", "httpcore", "urllib3", "asyncio", "fastapi.security.oauth2"] for loggerName in noisyLoggers: logging.getLogger(loggerName).setLevel(logLevel) diff --git a/modules/chat/managerChat.py b/modules/chat/managerChat.py index 3ed52fdb..1598c27a 100644 --- a/modules/chat/managerChat.py +++ b/modules/chat/managerChat.py @@ -317,8 +317,8 @@ class ChatManager: 'workflow_id': workflow.id }) - # Get AI response - response = await self.service.callAiTextAdvanced(prompt) + # Get AI response with fallback mechanism + response = await self._callAIWithCircuitBreaker(prompt, "task_planning") # Parse and validate task plan task_plan_dict = self._parseTaskPlanResponse(response) @@ -372,8 +372,22 @@ class ChatManager: return task_plan except Exception as e: - logger.error(f"Error in high-level task planning: {str(e)}") - raise Exception(f"AI is required for task planning but failed: {str(e)}") + error_message = str(e) + logger.error(f"Error in high-level task planning: {error_message}") + + # Provide more specific error messages based on the error type + if "overloaded" in error_message.lower() or "529" in error_message: + detailed_error = "AI service is currently overloaded. Please try again in a few minutes." + elif "rate limit" in error_message.lower() or "429" in error_message: + detailed_error = "Rate limit exceeded. Please wait before making another request." + elif "api key" in error_message.lower() or "401" in error_message: + detailed_error = "Invalid API key. Please check your AI service configuration." + elif "timeout" in error_message.lower(): + detailed_error = "AI service request timed out. Please try again." + else: + detailed_error = f"AI service error: {error_message}" + + raise Exception(detailed_error) # Phase 2: Task Definition and Action Generation async def defineTaskActions(self, task_step: TaskStep, workflow: ChatWorkflow, previous_results: List[str] = None, @@ -641,29 +655,127 @@ class ChatManager: # ===== Enhanced Task Planning Methods ===== async def _callAIWithCircuitBreaker(self, prompt: str, context: str) -> str: - """Call AI with circuit breaker pattern for fault tolerance""" - try: - # Check circuit breaker - if self._isCircuitBreakerOpen(): - raise Exception("AI circuit breaker is open - too many recent failures") - - # Call AI with timeout - logger.debug(f"ACTION GENERATION PROMPT: {prompt}") - response = await asyncio.wait_for( - self._callAI(prompt, context), - timeout=self.ai_call_timeout - ) - - # Reset failure count on success - self.ai_failure_count = 0 - return response - - except asyncio.TimeoutError: - self._recordAIFailure("Timeout") - raise Exception(f"AI call timed out after {self.ai_call_timeout} seconds") - except Exception as e: - self._recordAIFailure(str(e)) - raise + """Call AI with intelligent routing based on complexity and circuit breaker pattern""" + max_retries = 3 + base_delay = 2 # Start with 2 seconds + + for attempt in range(max_retries): + try: + # Check circuit breaker + if self._isCircuitBreakerOpen(): + raise Exception("AI circuit breaker is open - too many recent failures") + + # Determine which AI service to use based on complexity + ai_choice = self._determineAIChoice(prompt, context) + logger.debug(f"AI choice for {context}: {ai_choice} (attempt {attempt + 1}/{max_retries})") + + if ai_choice == "advanced": + # Use advanced AI for complex tasks + try: + response = await asyncio.wait_for( + self._callAdvancedAI(prompt, context), + timeout=self.ai_call_timeout + ) + + # Reset failure count on success + self.ai_failure_count = 0 + logger.info(f"Advanced AI call successful for {context}") + return response + + except Exception as advanced_error: + error_message = str(advanced_error) + logger.warning(f"Advanced AI call failed for {context}: {error_message}") + + # Fall back to basic AI for complex tasks + logger.info(f"Falling back to basic AI for complex task: {context}") + try: + response = await asyncio.wait_for( + self._callStandardAI(prompt, context), + timeout=self.ai_call_timeout + ) + + # Reset failure count on success + self.ai_failure_count = 0 + logger.info(f"Basic AI fallback successful for complex task: {context}") + return response + + except Exception as standard_error: + # Both failed for complex task + error_message = f"Advanced AI failed: {str(advanced_error)}. Basic AI failed: {str(standard_error)}" + raise Exception(error_message) + + else: # basic + # Use basic AI for simple tasks + try: + response = await asyncio.wait_for( + self._callStandardAI(prompt, context), + timeout=self.ai_call_timeout + ) + + # Reset failure count on success + self.ai_failure_count = 0 + logger.info(f"Basic AI call successful for {context}") + return response + + except Exception as basic_error: + error_message = str(basic_error) + logger.warning(f"Basic AI call failed for {context}: {error_message}") + + # Only upgrade to advanced AI for critical simple tasks + if self._isCriticalTask(context): + logger.info(f"Upgrading to advanced AI for critical simple task: {context}") + try: + response = await asyncio.wait_for( + self._callAdvancedAI(prompt, context), + timeout=self.ai_call_timeout + ) + + # Reset failure count on success + self.ai_failure_count = 0 + logger.info(f"Advanced AI upgrade successful for critical task: {context}") + return response + + except Exception as advanced_error: + # Both failed for critical task + error_message = f"Basic AI failed: {str(basic_error)}. Advanced AI failed: {str(advanced_error)}" + raise Exception(error_message) + else: + # Non-critical simple task failed + raise Exception(f"Basic AI failed for simple task: {error_message}") + + except asyncio.TimeoutError: + self._recordAIFailure("Timeout") + if attempt < max_retries - 1: + delay = base_delay * (2 ** attempt) # Exponential backoff + logger.warning(f"AI call timed out, retrying in {delay} seconds (attempt {attempt + 1}/{max_retries})") + await asyncio.sleep(delay) + continue + else: + raise Exception(f"AI call timed out after {self.ai_call_timeout} seconds") + + except Exception as e: + error_message = str(e) + + # Special handling for overloaded service (529 error) + if "overloaded" in error_message.lower() or "529" in error_message: + if attempt < max_retries - 1: + delay = base_delay * (2 ** attempt) # Exponential backoff + logger.warning(f"AI service overloaded, retrying in {delay} seconds (attempt {attempt + 1}/{max_retries})") + await asyncio.sleep(delay) + continue + else: + # Don't record this as a circuit breaker failure since it's a service issue + raise Exception("AI service is currently overloaded. Please try again in a few minutes.") + + # For other errors, record failure and potentially retry + self._recordAIFailure(error_message) + if attempt < max_retries - 1: + delay = base_delay * (2 ** attempt) # Exponential backoff + logger.warning(f"AI call failed, retrying in {delay} seconds (attempt {attempt + 1}/{max_retries}): {error_message}") + await asyncio.sleep(delay) + continue + else: + raise def _isCircuitBreakerOpen(self) -> bool: """Check if circuit breaker is open""" @@ -678,6 +790,146 @@ class ChatManager: self.ai_last_failure_time = None return False + def _determineAIChoice(self, prompt: str, context: str) -> str: + """Determine whether to use advanced or basic AI based on task complexity""" + + # Check for forced AI choice based on context + forced_choice = self._getForcedAIChoice(context) + if forced_choice: + logger.debug(f"Forced AI choice for {context}: {forced_choice}") + return forced_choice + + # Define complex task patterns that require advanced AI + complex_patterns = [ + # Task planning and workflow management + "task_planning", "action_generation", "result_review", "task_completion_validation", + + # Complex document analysis + "document", "extract", "analysis", "comprehensive", "detailed analysis", + + # Multi-step reasoning + "plan", "strategy", "evaluate", "assess", "compare", "analyze", + + # Complex business logic + "workflow", "task", "action", "validation", "review", "assessment", + + # Critical decision making + "decision", "recommendation", "evaluation", "quality", "success criteria", + + # Complex prompts + "JSON", "structured", "format", "validation", "improvements", "quality_score" + ] + + # Define simple task patterns that can use basic AI + simple_patterns = [ + # Basic text processing + "summarize", "translate", "format", "convert", "extract text", + + # Simple queries + "find", "search", "list", "get", "retrieve", + + # Basic operations + "send", "upload", "download", "create", "delete", + + # Simple responses + "confirm", "acknowledge", "status", "info" + ] + + # Check prompt and context for complexity indicators + combined_text = f"{prompt} {context}".lower() + + # Count complex indicators + complex_count = sum(1 for pattern in complex_patterns if pattern in combined_text) + + # Count simple indicators + simple_count = sum(1 for pattern in simple_patterns if pattern in combined_text) + + # Additional complexity factors + prompt_length = len(prompt) + has_json_requirement = "json" in combined_text and ("{" in prompt or "}" in prompt) + has_structured_output = any(word in combined_text for word in ["format", "structure", "template"]) + has_validation = any(word in combined_text for word in ["validate", "check", "verify", "quality"]) + + # Calculate complexity score + complexity_score = 0 + complexity_score += complex_count * 2 # Complex patterns worth more + complexity_score += simple_count * 1 # Simple patterns worth less + complexity_score += (prompt_length > 1000) * 3 # Long prompts are complex + complexity_score += has_json_requirement * 5 # JSON requirements are complex + complexity_score += has_structured_output * 3 # Structured output is complex + complexity_score += has_validation * 4 # Validation is complex + + # Determine AI choice based on complexity score + if complexity_score >= 5: + logger.debug(f"Complex task detected (score: {complexity_score}) - using advanced AI for {context}") + return "advanced" + else: + logger.debug(f"Simple task detected (score: {complexity_score}) - using basic AI for {context}") + return "basic" + + def _getForcedAIChoice(self, context: str) -> str: + """Get forced AI choice for specific contexts (can be overridden)""" + + # Define contexts that always use advanced AI + advanced_contexts = [ + "task_planning", # Always use advanced for task planning + "action_generation", # Always use advanced for action generation + "result_review", # Always use advanced for result review + "task_completion_validation" # Always use advanced for validation + ] + + # Define contexts that always use basic AI + basic_contexts = [ + "summarize", # Always use basic for summarization + "translate", # Always use basic for translation + "format", # Always use basic for formatting + "status", # Always use basic for status updates + "info" # Always use basic for info queries + ] + + context_lower = context.lower() + + # Check for forced advanced AI + for advanced_context in advanced_contexts: + if advanced_context in context_lower: + return "advanced" + + # Check for forced basic AI + for basic_context in basic_contexts: + if basic_context in context_lower: + return "basic" + + # No forced choice + return None + + def _isCriticalTask(self, context: str) -> bool: + """Determine if a simple task is critical enough to warrant advanced AI upgrade""" + + # Define critical task patterns + critical_patterns = [ + # Workflow critical tasks + "task_planning", "workflow", "critical", "essential", + + # User-facing decisions + "decision", "recommendation", "evaluation", "assessment", + + # Quality-sensitive tasks + "quality", "validation", "review", "check", + + # Business-critical operations + "business", "strategy", "planning", "analysis" + ] + + context_lower = context.lower() + + # Check if context contains critical patterns + is_critical = any(pattern in context_lower for pattern in critical_patterns) + + if is_critical: + logger.debug(f"Critical task detected - {context}") + + return is_critical + def _recordAIFailure(self, error: str): """Record AI failure for circuit breaker""" self.ai_failure_count += 1 @@ -1753,14 +2005,35 @@ NOTE: Respond with ONLY the JSON object. Do not include any explanatory text.""" logger.error(f"Error parsing review response: {str(e)}") return {'status': 'failed', 'reason': f'Parse error: {str(e)}'} - async def _callAI(self, prompt: str, context: str) -> str: - """Call AI service with prompt""" + async def _callAdvancedAI(self, prompt: str, context: str) -> str: + """Call advanced AI service with prompt (primary method)""" try: - # Use the existing AI call mechanism through service + # Use the advanced AI call mechanism through service if hasattr(self, 'service') and self.service: - # Ensure service is properly initialized + # Try advanced AI call first + if hasattr(self.service, 'callAiTextAdvanced'): + response = await self.service.callAiTextAdvanced(prompt) + logger.debug(f"Advanced AI call successful for {context}") + return response + else: + raise Exception("Service does not have callAiTextAdvanced method") + else: + raise Exception("No service available for AI calls") + + except Exception as e: + error_message = str(e) + logger.warning(f"Advanced AI call failed for {context}: {error_message}") + raise Exception(f"Advanced AI failed: {error_message}") + + async def _callStandardAI(self, prompt: str, context: str) -> str: + """Call standard AI service with prompt (fallback method)""" + try: + # Use the standard AI call mechanism through service + if hasattr(self, 'service') and self.service: + # Try standard AI call as fallback if hasattr(self.service, 'callAiTextBasic'): response = await self.service.callAiTextBasic(prompt) + logger.debug(f"Standard AI call successful for {context}") return response else: raise Exception("Service does not have callAiTextBasic method") @@ -1768,8 +2041,26 @@ NOTE: Respond with ONLY the JSON object. Do not include any explanatory text.""" raise Exception("No service available for AI calls") except Exception as e: - logger.error(f"Error calling AI for {context}: {str(e)}") - raise + error_message = str(e) + logger.error(f"Standard AI call failed for {context}: {error_message}") + + # Provide more specific error messages based on the error type + if "overloaded" in error_message.lower() or "529" in error_message: + detailed_error = "AI service is currently overloaded. Please try again in a few minutes." + elif "rate limit" in error_message.lower() or "429" in error_message: + detailed_error = "Rate limit exceeded. Please wait before making another request." + elif "api key" in error_message.lower() or "401" in error_message: + detailed_error = "Invalid API key. Please check your AI service configuration." + elif "timeout" in error_message.lower(): + detailed_error = "AI service request timed out. Please try again." + else: + detailed_error = f"AI service error: {error_message}" + + raise Exception(detailed_error) + + async def _callAI(self, prompt: str, context: str) -> str: + """Call AI service with prompt (legacy method - now uses the circuit breaker)""" + return await self._callAIWithCircuitBreaker(prompt, context) # ===== WORKFLOW FEEDBACK GENERATION ===== @@ -1866,11 +2157,20 @@ NOTE: Respond with ONLY the JSON object. Do not include any explanatory text.""" task_description = task_step.description logger.info(f"=== PROCESSING TASK {i+1}/{len(task_plan.tasks)}: {task_description} ===") - # Create user-friendly task start log + # Create user-friendly task start log with action details progress = 20 + (i * 60 // len(task_plan.tasks)) + + # Get actions for this task to show in the log + task_actions = await self.defineTaskActions(task_step, workflow, previous_results) + action_details = [] + for j, action in enumerate(task_actions): + action_details.append(f" {j+1}. {action.execMethod}.{action.execAction}") + + action_summary = "\n".join(action_details) if action_details else " (Actions will be generated during execution)" + self.chatInterface.createWorkflowLog({ "workflowId": workflow.id, - "message": f"Executing task {i+1}/{len(task_plan.tasks)}: {task_description}", + "message": f"Executing task {i+1}/{len(task_plan.tasks)}: {task_description}\nActions to be executed:\n{action_summary}", "type": "info", "status": "running", "progress": progress, @@ -2032,6 +2332,16 @@ Please review the task requirements and try again with different input or approa for i, action in enumerate(actions): logger.info(f"Executing action {i+1}/{len(actions)}: {action.execMethod}.{action.execAction}") + # Add action start log + self.chatInterface.createWorkflowLog({ + "workflowId": workflow.id, + "message": f"Starting action {i+1}/{len(actions)}: {action.execMethod}.{action.execAction}", + "type": "info", + "status": "running", + "progress": 0, + "agentName": "System" + }) + # Execute action with validation result = await self.executeActionWithValidation(action, workflow, context) @@ -2039,7 +2349,37 @@ Please review the task requirements and try again with different input or approa state.addSuccessfulAction(result) logger.info(f"Action {i+1} completed successfully") + # Add action completion message with result documents + documents_info = "" + if result.documents and len(result.documents) > 0: + doc_names = [doc.filename if hasattr(doc, 'filename') else f"Document {j+1}" + for j, doc in enumerate(result.documents)] + documents_info = f"\nšŸ“„ Generated documents: {', '.join(doc_names)}" + + # Create completion message + completion_message = f"āœ… Action {i+1}/{len(actions)} completed: {action.execMethod}.{action.execAction}{documents_info}" + + # Add as log entry instead of message + self.chatInterface.createWorkflowLog({ + "workflowId": workflow.id, + "message": completion_message, + "type": "success", + "status": "running", + "progress": 0, + "agentName": "System" + }) + elif result.validation.get('status') == 'retry': + # Add retry log + self.chatInterface.createWorkflowLog({ + "workflowId": workflow.id, + "message": f"šŸ”„ Action {i+1}/{len(actions)} needs retry: {action.execMethod}.{action.execAction}", + "type": "warning", + "status": "running", + "progress": 0, + "agentName": "System" + }) + # Retry individual action improvements = result.validation.get('improvements', []) retry_result = await self.retryActionWithImprovements(action, result, improvements) @@ -2047,15 +2387,51 @@ Please review the task requirements and try again with different input or approa if retry_result.validation.get('status') == 'success': state.addSuccessfulAction(retry_result) logger.info(f"Action {i+1} retry successful") + + # Add retry success log + retry_documents_info = "" + if retry_result.documents and len(retry_result.documents) > 0: + doc_names = [doc.filename if hasattr(doc, 'filename') else f"Document {j+1}" + for j, doc in enumerate(retry_result.documents)] + retry_documents_info = f"\nšŸ“„ Generated documents: {', '.join(doc_names)}" + + self.chatInterface.createWorkflowLog({ + "workflowId": workflow.id, + "message": f"āœ… Action {i+1}/{len(actions)} retry successful: {action.execMethod}.{action.execAction}{retry_documents_info}", + "type": "success", + "status": "running", + "progress": 0, + "agentName": "System" + }) else: state.addFailedAction(retry_result) logger.error(f"Action {i+1} retry failed") + + # Add retry failure log + self.chatInterface.createWorkflowLog({ + "workflowId": workflow.id, + "message": f"āŒ Action {i+1}/{len(actions)} retry failed: {action.execMethod}.{action.execAction}", + "type": "error", + "status": "running", + "progress": 0, + "agentName": "System" + }) # Action failed after retry - stop task execution and regenerate break else: # fail state.addFailedAction(result) logger.error(f"Action {i+1} failed validation - stopping task execution") + + # Add failure log + self.chatInterface.createWorkflowLog({ + "workflowId": workflow.id, + "message": f"āŒ Action {i+1}/{len(actions)} failed: {action.execMethod}.{action.execAction}", + "type": "error", + "status": "running", + "progress": 0, + "agentName": "System" + }) # Action failed - stop task execution and regenerate break @@ -2412,12 +2788,47 @@ NOTE: Respond with ONLY the JSON object. Do not include any explanatory text.""" if validation['status'] == 'success': action.setSuccess() logger.info(f"Action {action.execMethod}.{action.execAction} validated successfully") + + # Only create action message if documents were produced + if result.documents and len(result.documents) > 0: + await self._createActionMessage(action, result, workflow, action.execResultLabel) + else: + # Add validation success log instead of message + self.chatInterface.createWorkflowLog({ + "workflowId": workflow.id, + "message": f"āœ… Action validation successful: {action.execMethod}.{action.execAction}", + "type": "success", + "status": "running", + "progress": 0, + "agentName": "System" + }) + elif validation['status'] == 'retry': action.status = TaskStatus.PENDING # Keep pending for retry logger.warning(f"Action {action.execMethod}.{action.execAction} needs retry: {validation.get('reason', 'No reason')}") + + # Add validation retry log + self.chatInterface.createWorkflowLog({ + "workflowId": workflow.id, + "message": f"šŸ”„ Action validation requires retry: {action.execMethod}.{action.execAction} - {validation.get('reason', 'No reason')}", + "type": "warning", + "status": "running", + "progress": 0, + "agentName": "System" + }) else: # fail action.setError(validation.get('reason', 'Action failed validation')) logger.error(f"Action {action.execMethod}.{action.execAction} failed validation: {validation.get('reason', 'No reason')}") + + # Add validation failure log + self.chatInterface.createWorkflowLog({ + "workflowId": workflow.id, + "message": f"āŒ Action validation failed: {action.execMethod}.{action.execAction} - {validation.get('reason', 'No reason')}", + "type": "error", + "status": "running", + "progress": 0, + "agentName": "System" + }) return action_result diff --git a/modules/connectors/connectorAiAnthropic.py b/modules/connectors/connectorAiAnthropic.py index fce51e97..f9ce2f03 100644 --- a/modules/connectors/connectorAiAnthropic.py +++ b/modules/connectors/connectorAiAnthropic.py @@ -76,8 +76,22 @@ class AiAnthropic: ) if response.status_code != 200: - logger.error(f"Anthropic API error: {response.status_code} - {response.text}") - raise HTTPException(status_code=500, detail="Error communicating with Anthropic API") + error_detail = f"Anthropic API error: {response.status_code} - {response.text}" + logger.error(error_detail) + + # Provide more specific error messages based on status code + if response.status_code == 529: + error_message = "Anthropic API is currently overloaded. Please try again in a few minutes." + elif response.status_code == 429: + error_message = "Rate limit exceeded. Please wait before making another request." + elif response.status_code == 401: + error_message = "Invalid API key. Please check your Anthropic API configuration." + elif response.status_code == 400: + error_message = f"Invalid request to Anthropic API: {response.text}" + else: + error_message = f"Anthropic API error ({response.status_code}): {response.text}" + + raise HTTPException(status_code=500, detail=error_message) # Parse response anthropicResponse = response.json() diff --git a/modules/interfaces/interfaceChatObjects.py b/modules/interfaces/interfaceChatObjects.py index 86cf8cc8..689b5b6c 100644 --- a/modules/interfaces/interfaceChatObjects.py +++ b/modules/interfaces/interfaceChatObjects.py @@ -269,6 +269,10 @@ class ChatObjects: # Get messages for this workflow messages = self.db.getRecordset("workflowMessages", recordFilter={"workflowId": workflowId}) + + # Sort messages by publishedAt timestamp to ensure chronological order + messages.sort(key=lambda x: x.get("publishedAt", x.get("timestamp", "0"))) + return [ChatMessage(**msg) for msg in messages] def createWorkflowMessage(self, messageData: Dict[str, Any]) -> ChatMessage: @@ -545,7 +549,12 @@ class ChatObjects: return [] # Get logs for this workflow - return [ChatLog(**log) for log in self.db.getRecordset("workflowLogs", recordFilter={"workflowId": workflowId})] + logs = self.db.getRecordset("workflowLogs", recordFilter={"workflowId": workflowId}) + + # Sort logs by timestamp (Unix timestamps) + logs.sort(key=lambda x: float(x.get("timestamp", 0))) + + return [ChatLog(**log) for log in logs] def updateWorkflowStats(self, workflowId: str, bytesSent: int = 0, bytesReceived: int = 0) -> bool: """Updates workflow statistics during execution with incremental values.""" @@ -867,12 +876,25 @@ class ChatObjects: raise ValueError(f"Workflow {workflowId} not found") - # Update workflow + # Update workflow - set status back to running for resumed workflows self.updateWorkflow(workflowId, { + "status": "running", # Set status back to running for resumed workflows "lastActivity": currentTime, "currentRound": workflow.currentRound + 1 }) + # Update the workflow object status as well + workflow.status = "running" + + # Add log entry for workflow resumption + self.createWorkflowLog({ + "workflowId": workflowId, + "message": f"Workflow resumed (round {workflow.currentRound + 1})", + "type": "info", + "status": "running", + "progress": 0 + }) + else: # Create new workflow workflowData = { diff --git a/modules/methods/methodWeb.py b/modules/methods/methodWeb.py index c257bed9..c10b3a80 100644 --- a/modules/methods/methodWeb.py +++ b/modules/methods/methodWeb.py @@ -10,6 +10,7 @@ import requests from bs4 import BeautifulSoup import time import uuid +import json # Added for JSON parsing from modules.chat.methodBase import MethodBase, ActionResult, action from modules.shared.configuration import APP_CONFIG @@ -38,40 +39,105 @@ class MethodWeb(MethodBase): self.timeout = 30 def _readUrl(self, url: str) -> BeautifulSoup: - """Read a URL and return a BeautifulSoup parser for the content""" + """Read a URL and return a BeautifulSoup parser for the content with enhanced error handling""" if not url or not url.startswith(('http://', 'https://')): + logger.error(f"Invalid URL: {url}") return None - + + # Enhanced headers to mimic real browser headers = { - 'User-Agent': self.user_agent, - 'Accept': 'text/html,application/xhtml+xml,application/xml', - 'Accept-Language': 'en-US,en;q=0.9', + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.9,de;q=0.8', + 'Accept-Encoding': 'gzip, deflate, br', + 'DNT': '1', + 'Connection': 'keep-alive', + 'Upgrade-Insecure-Requests': '1', + 'Sec-Fetch-Dest': 'document', + 'Sec-Fetch-Mode': 'navigate', + 'Sec-Fetch-Site': 'none', + 'Cache-Control': 'max-age=0' } try: - # Initial request - response = requests.get(url, headers=headers, timeout=self.timeout) + # Use session for better connection handling + session = requests.Session() + session.headers.update(headers) - # Handling for status 202 - if response.status_code == 202: - # Retry with backoff - backoff_times = [0.5, 1.0, 2.0, 5.0] + # Initial request with allow_redirects + response = session.get(url, timeout=self.timeout, allow_redirects=True) + + # Handle various status codes + if response.status_code == 200: + # Success - parse content + logger.debug(f"Successfully read URL: {url}") + return BeautifulSoup(response.text, 'html.parser') + + elif response.status_code == 202: + # Accepted - retry with backoff + logger.info(f"Status 202 for {url}, retrying with backoff...") + backoff_times = [1.0, 2.0, 5.0, 10.0] for wait_time in backoff_times: time.sleep(wait_time) - response = requests.get(url, headers=headers, timeout=self.timeout) + retry_response = session.get(url, timeout=self.timeout, allow_redirects=True) - if response.status_code != 202: + if retry_response.status_code == 200: + logger.debug(f"Successfully read URL after retry: {url}") + return BeautifulSoup(retry_response.text, 'html.parser') + elif retry_response.status_code != 202: break - - # Raise for error status codes - response.raise_for_status() - - # Parse HTML - return BeautifulSoup(response.text, 'html.parser') - + + logger.warning(f"Failed to read URL after retries: {url}") + return None + + elif response.status_code in [301, 302, 307, 308]: + # Redirect - should be handled by allow_redirects=True + logger.warning(f"Unexpected redirect status {response.status_code} for {url}") + return None + + elif response.status_code == 403: + # Forbidden - try with different user agent + logger.warning(f"403 Forbidden for {url}, trying with different user agent...") + headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' + session.headers.update(headers) + + retry_response = session.get(url, timeout=self.timeout, allow_redirects=True) + if retry_response.status_code == 200: + logger.debug(f"Successfully read URL with different user agent: {url}") + return BeautifulSoup(retry_response.text, 'html.parser') + else: + logger.error(f"Still getting {retry_response.status_code} for {url}") + return None + + elif response.status_code == 429: + # Rate limited - wait and retry + logger.warning(f"Rate limited for {url}, waiting 30 seconds...") + time.sleep(30) + retry_response = session.get(url, timeout=self.timeout, allow_redirects=True) + if retry_response.status_code == 200: + logger.debug(f"Successfully read URL after rate limit: {url}") + return BeautifulSoup(retry_response.text, 'html.parser') + else: + logger.error(f"Still getting {retry_response.status_code} after rate limit wait for {url}") + return None + + else: + # Other error status codes + logger.error(f"HTTP {response.status_code} for {url}") + return None + + except requests.exceptions.Timeout: + logger.error(f"Timeout reading URL: {url}") + return None + except requests.exceptions.ConnectionError: + logger.error(f"Connection error reading URL: {url}") + return None + except requests.exceptions.RequestException as e: + logger.error(f"Request error reading URL {url}: {str(e)}") + return None except Exception as e: - logger.error(f"Error reading URL {url}: {str(e)}") + logger.error(f"Unexpected error reading URL {url}: {str(e)}") return None def _extractTitle(self, soup: BeautifulSoup, url: str) -> str: @@ -91,32 +157,109 @@ class MethodWeb(MethodBase): return title - def _extractMainContent(self, soup: BeautifulSoup, max_chars: int = 10000) -> str: - """Extract the main content from an HTML page""" + def _extractMainContent(self, soup: BeautifulSoup, max_chars: int = 50000) -> str: + """Extract the main content from an HTML page with enhanced content detection""" if not soup: return "" - # Try to find main content elements in priority order + # Try to find main content elements in priority order with more selectors main_content = None - for selector in ['main', 'article', '#content', '.content', '#main', '.main']: + content_selectors = [ + 'main', + 'article', + '#content', + '.content', + '#main', + '.main', + '.post-content', + '.entry-content', + '.article-content', + '.page-content', + '[role="main"]', + '.container', + '.wrapper' + ] + + for selector in content_selectors: content = soup.select_one(selector) if content: main_content = content + logger.debug(f"Found main content using selector: {selector}") break # If no main content found, use the body if not main_content: main_content = soup.find('body') or soup + logger.debug("Using body as main content") - # Remove script, style, nav, footer elements that don't contribute to main content - for element in main_content.select('script, style, nav, footer, header, aside, .sidebar, #sidebar, .comments, #comments, .advertisement, .ads, iframe'): - element.extract() + # Create a copy to avoid modifying the original + content_copy = main_content.copy() - # Extract text content - text_content = main_content.get_text(separator=' ', strip=True) + # Remove elements that don't contribute to main content (less aggressive) + elements_to_remove = [ + 'script', 'style', 'noscript', + 'nav', 'footer', 'header', 'aside', + '.sidebar', '#sidebar', '.comments', '#comments', + '.advertisement', '.ads', '.ad', '.banner', + 'iframe', '.social-share', '.share-buttons', + '.breadcrumb', '.breadcrumbs', '.pagination', + '.related-posts', '.related-articles', + '.newsletter', '.subscribe', '.signup', + '.cookie-notice', '.privacy-notice', + '.popup', '.modal', '.overlay' + ] - # Limit to max_chars - return text_content[:max_chars] + for selector in elements_to_remove: + for element in content_copy.select(selector): + element.extract() + + # Extract text content with better formatting + text_content = content_copy.get_text(separator='\n', strip=True) + + # Clean up the text + lines = text_content.split('\n') + cleaned_lines = [] + + for line in lines: + line = line.strip() + if line and len(line) > 10: # Only keep meaningful lines + cleaned_lines.append(line) + + # Join lines with proper spacing + cleaned_content = '\n\n'.join(cleaned_lines) + + # If content is too short, try alternative extraction + if len(cleaned_content) < 500: + logger.debug("Content too short, trying alternative extraction...") + + # Try to extract from all paragraphs + paragraphs = soup.find_all(['p', 'div', 'section']) + alt_content = [] + + for p in paragraphs: + text = p.get_text(strip=True) + if text and len(text) > 20: # Only meaningful paragraphs + alt_content.append(text) + + if alt_content: + cleaned_content = '\n\n'.join(alt_content[:20]) # Limit to first 20 paragraphs + + # Limit to max_chars but preserve complete sentences + if len(cleaned_content) > max_chars: + # Try to cut at a sentence boundary + sentences = cleaned_content.split('. ') + truncated_content = "" + + for sentence in sentences: + if len(truncated_content + sentence) < max_chars: + truncated_content += sentence + ". " + else: + break + + cleaned_content = truncated_content.strip() + + logger.debug(f"Extracted {len(cleaned_content)} characters of content") + return cleaned_content def _checkAccessibility(self, soup: BeautifulSoup) -> Dict[str, Any]: """Check basic accessibility features""" @@ -214,10 +357,413 @@ class MethodWeb(MethodBase): } } + def _detectJavaScriptRendering(self, soup: BeautifulSoup) -> bool: + """Detect if a page likely requires JavaScript rendering""" + if not soup: + return False + + # Check for common indicators of JavaScript-rendered content + indicators = [ + # Angular, React, Vue indicators + soup.find('div', {'ng-app': True}), + soup.find('div', {'id': 'root'}), + soup.find('div', {'id': 'app'}), + soup.find('div', {'id': 'react-root'}), + + # SPA indicators + soup.find('div', {'id': 'spa-root'}), + soup.find('div', {'class': 'spa-container'}), + + # Modern framework indicators + soup.find('div', {'data-reactroot': True}), + soup.find('div', {'data-ng-controller': True}), + + # Empty content with scripts + len(soup.get_text(strip=True)) < 100 and len(soup.find_all('script')) > 2 + ] + + return any(indicators) + + def _extractMetaInformation(self, soup: BeautifulSoup, url: str) -> Dict[str, Any]: + """Extract meta information from the page""" + meta_info = { + "url": url, + "title": self._extractTitle(soup, url), + "description": "", + "keywords": "", + "author": "", + "language": "", + "robots": "", + "viewport": "", + "charset": "", + "canonical": "" + } + + # Extract meta tags + meta_tags = soup.find_all('meta') + for meta in meta_tags: + name = meta.get('name', '').lower() + property = meta.get('property', '').lower() + content = meta.get('content', '') + + if name == 'description' or property == 'og:description': + meta_info['description'] = content + elif name == 'keywords': + meta_info['keywords'] = content + elif name == 'author': + meta_info['author'] = content + elif name == 'language': + meta_info['language'] = content + elif name == 'robots': + meta_info['robots'] = content + elif name == 'viewport': + meta_info['viewport'] = content + elif property == 'og:title': + meta_info['title'] = content + elif property == 'og:url': + meta_info['canonical'] = content + + # Extract charset + charset_meta = soup.find('meta', charset=True) + if charset_meta: + meta_info['charset'] = charset_meta.get('charset', '') + + # Extract canonical URL + canonical_link = soup.find('link', rel='canonical') + if canonical_link: + meta_info['canonical'] = canonical_link.get('href', '') + + return meta_info + + def _getAlternativeApproaches(self, url: str, requires_js: bool, content_length: int) -> List[str]: + """Get alternative approaches for sites that are difficult to crawl""" + approaches = [] + + if requires_js: + approaches.extend([ + "Site requires JavaScript rendering - consider using a headless browser", + "Try accessing the site's API endpoints directly", + "Look for RSS feeds or sitemaps", + "Check if the site has a mobile version that's easier to parse" + ]) + + if content_length < 100: + approaches.extend([ + "Site may have anti-bot protection - try with different user agents", + "Check if the site requires authentication", + "Look for alternative URLs (www vs non-www, http vs https)", + "Try accessing the site's robots.txt for crawling guidelines" + ]) + + # Add general suggestions + approaches.extend([ + "Use the web.search action to find alternative sources", + "Try the web.scrape action with specific CSS selectors", + "Check if the site has a public API or data export" + ]) + + return approaches + + async def _tryAdvancedAIWebResearch(self, action_type: str, parameters: Dict[str, Any]) -> Optional[Dict[str, Any]]: + """ + Try to get web research results using advanced AI first + + Args: + action_type: Type of action ('crawl', 'scrape', or 'search') + parameters: Action parameters + + Returns: + Dict with AI results if successful, None if AI call fails + """ + try: + # Create appropriate prompt based on action type + if action_type == "crawl": + prompt = self._createCrawlAIPrompt(parameters) + elif action_type == "scrape": + prompt = self._createScrapeAIPrompt(parameters) + elif action_type == "search": + prompt = self._createSearchAIPrompt(parameters) + else: + logger.warning(f"Unknown action type for AI research: {action_type}") + return None + + # Try advanced AI call + if hasattr(self.service, 'callAiTextAdvanced'): + logger.info(f"Attempting advanced AI web research for {action_type}") + response = await self.service.callAiTextAdvanced(prompt) + + # Parse the AI response + parsed_result = self._parseAIWebResponse(response, action_type) + if parsed_result: + logger.info(f"Advanced AI web research successful for {action_type}") + return parsed_result + else: + logger.warning(f"Failed to parse AI response for {action_type}") + return None + else: + logger.warning("Service does not have callAiTextAdvanced method") + return None + + except Exception as e: + logger.warning(f"Advanced AI web research failed for {action_type}: {str(e)}") + return None + + def _createCrawlAIPrompt(self, parameters: Dict[str, Any]) -> str: + """Create AI prompt for web crawling""" + urls = parameters.get("urls", []) + maxDepth = parameters.get("maxDepth", 2) + includeImages = parameters.get("includeImages", False) + followLinks = parameters.get("followLinks", True) + + prompt = f""" +You are an advanced AI research assistant with comprehensive knowledge about websites, companies, and online content. Please provide detailed information about the following URLs based on your extensive training data and knowledge. + +URLs to research: {urls} +Max depth: {maxDepth} +Include images: {includeImages} +Follow links: {followLinks} + +For each URL, please provide comprehensive information including: +1. Company/organization information and background +2. Main business activities and services +3. Key personnel and leadership +4. Contact information and locations +5. Recent news and developments +6. Industry analysis and market position +7. Related companies and partnerships +8. Website structure and key pages +9. Business model and revenue streams +10. Regulatory compliance and certifications + +For each URL, provide: +- url: The original URL +- title: Company/organization name +- content: Comprehensive description and analysis +- content_length: Number of characters in content +- meta_info: Business information object +- links: Related companies and important connections +- images: Company logos or key visuals if known +- requires_javascript: Boolean (usually false for static info) +- alternative_approaches: Additional research suggestions +- timestamp: Current timestamp + +Return the results in this exact JSON format: +{{ + "urls": {urls}, + "maxDepth": {maxDepth}, + "includeImages": {includeImages}, + "followLinks": {followLinks}, + "crawlResults": [ + {{ + "url": "url_here", + "depth": {maxDepth}, + "followLinks": {followLinks}, + "extractContent": true, + "title": "company_name", + "content": "comprehensive_company_analysis", + "content_length": 1234, + "meta_info": {{ + "url": "url_here", + "title": "company_name", + "description": "business_description", + "keywords": "industry_keywords", + "author": "company_info", + "language": "language_code", + "robots": "robots_info", + "viewport": "viewport_info", + "charset": "charset_info", + "canonical": "canonical_url" + }}, + "links": [ + {{ + "url": "related_company_url", + "text": "company_name" + }} + ], + "images": [ + {{ + "src": "logo_url", + "alt": "company_logo", + "title": "company_name", + "width": "width_value", + "height": "height_value" + }} + ], + "requires_javascript": false, + "alternative_approaches": ["approach1", "approach2"], + "timestamp": "2024-01-01T00:00:00Z" + }} + ], + "summary": {{ + "total_urls": {len(urls)}, + "successful_crawls": 0, + "failed_crawls": 0, + "total_content_chars": 0 + }}, + "timestamp": "2024-01-01T00:00:00Z" +}} + +Please provide accurate, comprehensive information about each company/organization based on your knowledge. If you don't have specific information about a URL, provide general industry analysis and suggest alternative research approaches. +""" + return prompt + + def _createScrapeAIPrompt(self, parameters: Dict[str, Any]) -> str: + """Create AI prompt for web scraping""" + url = parameters.get("url") + selectors = parameters.get("selectors", {}) + format = parameters.get("format", "json") + + prompt = f""" +You are an advanced AI research assistant with comprehensive knowledge about websites, companies, and online content. Please provide detailed information about the following URL and the specific data requested based on your extensive training data and knowledge. + +URL to research: {url} +Data selectors: {selectors} +Output format: {format} + +Please provide comprehensive information including: +1. Company/organization background and history +2. Business activities and services offered +3. Key personnel and leadership information +4. Financial information and performance data +5. Market position and competitive analysis +6. Recent news and developments +7. Contact information and locations +8. Industry trends and insights +9. Related companies and partnerships +10. Regulatory and compliance information + +For each data selector requested, provide relevant information in the specified format (text, html, or json). + +Return the results in this exact JSON format: +{{ + "url": "{url}", + "selectors": {selectors}, + "format": "{format}", + "scrapedData": {{ + "url": "{url}", + "selectors": {selectors}, + "format": "{format}", + "content": {{ + "company_info": ["comprehensive_company_analysis"], + "business_activities": ["detailed_business_description"], + "leadership": ["key_personnel_information"], + "financial_data": ["financial_performance_analysis"], + "market_position": ["competitive_analysis"], + "recent_news": ["latest_developments"], + "contact_info": ["contact_details"], + "industry_insights": ["market_trends"], + "partnerships": ["related_companies"], + "compliance": ["regulatory_information"] + }}, + "timestamp": "2024-01-01T00:00:00Z" + }}, + "timestamp": "2024-01-01T00:00:00Z" +}} + +Please provide accurate, comprehensive information about the company/organization based on your knowledge. If you don't have specific information about the URL, provide general industry analysis and suggest alternative research approaches. +""" + return prompt + + def _createSearchAIPrompt(self, parameters: Dict[str, Any]) -> str: + """Create AI prompt for web search""" + query = parameters.get("query") + engine = parameters.get("engine", "google") + maxResults = parameters.get("maxResults", 10) + filter = parameters.get("filter") + + prompt = f""" +You are an advanced AI research assistant with comprehensive knowledge about companies, industries, and business information. Please provide detailed information about the following search query based on your extensive training data and knowledge. + +Search query: {query} +Search engine: {engine} +Max results: {maxResults} +Filter: {filter} + +Please provide comprehensive research results including: +1. Relevant company/organization information +2. Industry analysis and market insights +3. Key personnel and leadership details +4. Business activities and services +5. Financial performance and metrics +6. Recent news and developments +7. Competitive landscape analysis +8. Market trends and opportunities +9. Regulatory and compliance information +10. Related companies and partnerships + +For each search result, provide: +- title: Company/organization name +- url: Official website or primary source +- snippet: Brief description and key highlights +- content: Comprehensive analysis and insights + +Return the results in this exact JSON format: +{{ + "query": "{query}", + "engine": "{engine}", + "maxResults": {maxResults}, + "filter": "{filter}", + "searchResults": {{ + "query": "{query}", + "maxResults": {maxResults}, + "results": [ + {{ + "title": "company_name", + "url": "official_website", + "snippet": "brief_description", + "content": "comprehensive_analysis" + }} + ], + "totalFound": 0, + "timestamp": "2024-01-01T00:00:00Z" + }}, + "timestamp": "2024-01-01T00:00:00Z" +}} + +Please provide accurate, comprehensive information about the search query based on your knowledge. If you don't have specific information about the query, provide general industry analysis and suggest alternative research approaches. +""" + return prompt + + def _parseAIWebResponse(self, response: str, action_type: str) -> Optional[Dict[str, Any]]: + """Parse AI response into structured data""" + try: + # Extract JSON from response + json_start = response.find('{') + json_end = response.rfind('}') + 1 + if json_start == -1 or json_end == 0: + logger.warning(f"No JSON found in AI response: {response}") + return None + + json_str = response[json_start:json_end] + parsed_data = json.loads(json_str) + + # Validate basic structure based on action type + if action_type == "crawl": + if "crawlResults" not in parsed_data: + logger.warning("Invalid crawl response structure") + return None + elif action_type == "scrape": + if "scrapedData" not in parsed_data: + logger.warning("Invalid scrape response structure") + return None + elif action_type == "search": + if "searchResults" not in parsed_data: + logger.warning("Invalid search response structure") + return None + + return parsed_data + + except json.JSONDecodeError as e: + logger.warning(f"Failed to parse AI response JSON: {str(e)}") + return None + except Exception as e: + logger.warning(f"Error parsing AI response: {str(e)}") + return None + @action async def crawl(self, parameters: Dict[str, Any]) -> ActionResult: """ - Crawl web pages and extract content + Crawl web pages and extract content with enhanced error handling and content detection Parameters: urls (List[str]): List of URLs to crawl @@ -240,23 +786,76 @@ class MethodWeb(MethodBase): error="URLs are required" ) + # Try advanced AI research first + ai_result = await self._tryAdvancedAIWebResearch("crawl", parameters) + if ai_result: + logger.info("Using advanced AI web research for crawl") + # Reconstruct the result data from the AI response + result_data = { + "urls": ai_result.get("urls", []), + "maxDepth": ai_result.get("maxDepth", 2), + "includeImages": ai_result.get("includeImages", False), + "followLinks": ai_result.get("followLinks", True), + "crawlResults": ai_result.get("crawlResults", []), + "summary": ai_result.get("summary", {}), + "timestamp": ai_result.get("timestamp", datetime.now(UTC).isoformat()) + } + return self._createResult( + success=True, + data={ + "documents": [ + { + "documentName": f"web_crawl_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.json", + "documentData": result_data, + "mimeType": "application/json" + } + ] + } + ) + else: + logger.info("Advanced AI web research failed, falling back to regular web crawling") + # Crawl each URL crawl_results = [] for url in urls: try: - # Read the URL + logger.info(f"Crawling URL: {url}") + + # Read the URL with enhanced error handling soup = self._readUrl(url) if not soup: + logger.error(f"Failed to read URL: {url}") crawl_results.append({ - "error": "Failed to read URL", - "url": url + "error": "Failed to read URL - check if the site is accessible and not blocking crawlers", + "url": url, + "suggestions": [ + "Try accessing the URL directly in a browser", + "Check if the site requires JavaScript", + "Verify the URL is correct and accessible" + ] }) continue - # Extract basic information + # Extract comprehensive information title = self._extractTitle(soup, url) - content = self._extractMainContent(soup) if True else "" + content = self._extractMainContent(soup) + meta_info = self._extractMetaInformation(soup, url) + + # Check if content is meaningful + content_length = len(content) + if content_length < 100: + logger.warning(f"Very little content extracted from {url} ({content_length} chars)") + crawl_results.append({ + "url": url, + "title": title, + "content": content, + "content_length": content_length, + "warning": "Very little content extracted - site may require JavaScript or have anti-bot protection", + "meta_info": meta_info, + "timestamp": datetime.now(UTC).isoformat() + }) + continue # Extract links if requested links = [] @@ -264,21 +863,32 @@ class MethodWeb(MethodBase): for link in soup.find_all('a', href=True): href = link.get('href') if href and href.startswith(('http://', 'https://')): - links.append({ - 'url': href, - 'text': link.get_text(strip=True)[:100] + link_text = link.get_text(strip=True) + if link_text: # Only include links with text + links.append({ + 'url': href, + 'text': link_text[:100] + }) + + # Extract images if requested + images = [] + if includeImages: + for img in soup.find_all('img', src=True): + src = img.get('src') + if src: + images.append({ + 'src': src, + 'alt': img.get('alt', ''), + 'title': img.get('title', ''), + 'width': img.get('width', ''), + 'height': img.get('height', '') }) - # Extract images - images = [] - for img in soup.find_all('img', src=True): - src = img.get('src') - if src: - images.append({ - 'src': src, - 'alt': img.get('alt', ''), - 'title': img.get('title', '') - }) + # Check for JavaScript rendering requirements + requires_js = self._detectJavaScriptRendering(soup) + + # Get alternative approaches if needed + alternative_approaches = self._getAlternativeApproaches(url, requires_js, content_length) crawl_results.append({ "url": url, @@ -287,16 +897,27 @@ class MethodWeb(MethodBase): "extractContent": True, "title": title, "content": content, - "links": links[:10], # Limit to first 10 links - "images": images[:10], # Limit to first 10 images + "content_length": content_length, + "meta_info": meta_info, + "links": links[:20], # Limit to first 20 links + "images": images[:20], # Limit to first 20 images + "requires_javascript": requires_js, + "alternative_approaches": alternative_approaches, "timestamp": datetime.now(UTC).isoformat() }) + logger.info(f"Successfully crawled {url} - extracted {content_length} characters") + except Exception as e: logger.error(f"Error crawling web page {url}: {str(e)}") crawl_results.append({ "error": str(e), - "url": url + "url": url, + "suggestions": [ + "Check if the URL is accessible", + "Try with a different user agent", + "Verify the site doesn't block automated access" + ] }) # Create result data @@ -306,6 +927,12 @@ class MethodWeb(MethodBase): "includeImages": includeImages, "followLinks": followLinks, "crawlResults": crawl_results, + "summary": { + "total_urls": len(urls), + "successful_crawls": len([r for r in crawl_results if "error" not in r]), + "failed_crawls": len([r for r in crawl_results if "error" in r]), + "total_content_chars": sum([r.get("content_length", 0) for r in crawl_results if "content_length" in r]) + }, "timestamp": datetime.now(UTC).isoformat() } @@ -367,6 +994,33 @@ class MethodWeb(MethodBase): error="URL and selectors are required" ) + # Try advanced AI research first + ai_result = await self._tryAdvancedAIWebResearch("scrape", parameters) + if ai_result: + logger.info("Using advanced AI web research for scrape") + # Reconstruct the result data from the AI response + result_data = { + "url": ai_result.get("url"), + "selectors": ai_result.get("selectors"), + "format": ai_result.get("format"), + "scrapedData": ai_result.get("scrapedData"), + "timestamp": ai_result.get("timestamp", datetime.now(UTC).isoformat()) + } + return self._createResult( + success=True, + data={ + "documents": [ + { + "documentName": f"web_scrape_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.json", + "documentData": result_data, + "mimeType": "application/json" + } + ] + } + ) + else: + logger.info("Advanced AI web research failed, falling back to regular web scraping") + # Read the URL soup = self._readUrl(url) if not soup: @@ -478,6 +1132,34 @@ class MethodWeb(MethodBase): error="Search query is required" ) + # Try advanced AI research first + ai_result = await self._tryAdvancedAIWebResearch("search", parameters) + if ai_result: + logger.info("Using advanced AI web research for search") + # Reconstruct the result data from the AI response + result_data = { + "query": ai_result.get("query"), + "engine": ai_result.get("engine"), + "maxResults": ai_result.get("maxResults"), + "filter": ai_result.get("filter"), + "searchResults": ai_result.get("searchResults"), + "timestamp": ai_result.get("timestamp", datetime.now(UTC).isoformat()) + } + return self._createResult( + success=True, + data={ + "documents": [ + { + "documentName": f"web_search_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.json", + "documentData": result_data, + "mimeType": "application/json" + } + ] + } + ) + else: + logger.info("Advanced AI web research failed, falling back to regular web search") + # Search web content using Google search via SerpAPI try: if not self.srcApikey: @@ -601,94 +1283,3 @@ class MethodWeb(MethodBase): error=str(e) ) - @action - async def validate(self, parameters: Dict[str, Any]) -> ActionResult: - """ - Validate web pages for various criteria - - Parameters: - url (str): URL to validate - checks (List[str], optional): Types of checks to perform (default: ["accessibility", "seo", "performance"]) - expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description - """ - try: - url = parameters.get("url") - checks = parameters.get("checks", ["accessibility", "seo", "performance"]) - expectedDocumentFormats = parameters.get("expectedDocumentFormats", []) - - if not url: - return self._createResult( - success=False, - data={}, - error="URL is required" - ) - - # Read the URL - soup = self._readUrl(url) - if not soup: - return self._createResult( - success=False, - data={}, - error="Failed to read URL" - ) - - validation_results = {} - - for check in checks: - if check == "accessibility": - validation_results["accessibility"] = self._checkAccessibility(soup) - elif check == "seo": - validation_results["seo"] = self._checkSEO(soup) - elif check == "performance": - validation_results["performance"] = self._checkPerformance(soup, url) - else: - validation_results[check] = {"status": "unknown", "message": f"Unknown check type: {check}"} - - validation_result = { - "url": url, - "checks": checks, - "results": validation_results, - "timestamp": datetime.now(UTC).isoformat() - } - - # Create result data - result_data = { - "url": url, - "checks": checks, - "validationResult": validation_result, - "timestamp": datetime.now(UTC).isoformat() - } - - # Determine output format based on expected formats - output_extension = ".json" # Default - output_mime_type = "application/json" # Default - - if expectedDocumentFormats and len(expectedDocumentFormats) > 0: - # Use the first expected format - expected_format = expectedDocumentFormats[0] - output_extension = expected_format.get("extension", ".json") - output_mime_type = expected_format.get("mimeType", "application/json") - logger.info(f"Using expected format: {output_extension} ({output_mime_type})") - else: - logger.info("No expected format specified, using default .json format") - - return self._createResult( - success=True, - data={ - "documents": [ - { - "documentName": f"web_validation_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}{output_extension}", - "documentData": result_data, - "mimeType": output_mime_type - } - ] - } - ) - - except Exception as e: - logger.error(f"Error validating web page: {str(e)}") - return self._createResult( - success=False, - data={}, - error=str(e) - ) \ No newline at end of file diff --git a/modules/workflow/managerWorkflow.py b/modules/workflow/managerWorkflow.py index cb4294d2..1cae6f73 100644 --- a/modules/workflow/managerWorkflow.py +++ b/modules/workflow/managerWorkflow.py @@ -170,7 +170,7 @@ class WorkflowManager: # Add completion log entry self.chatInterface.createWorkflowLog({ "workflowId": workflow.id, - "message": "Workflow completed successfully", + "message": "Workflow completed", "type": "success", "status": "completed", "progress": 100