web refactored

ValueOn AG 2025-07-12 16:26:20 +02:00
parent cfb34c6a38
commit b1be8dd81c
6 changed files with 1242 additions and 186 deletions

20
app.py
View file

@@ -37,6 +37,22 @@ def initLogging():
('.well-known/appspecific/com.chrome.devtools.json' in record.msg or
'Request: /index.html' in record.msg))
# Add filter to exclude HTTP debug messages
class HTTPDebugFilter(logging.Filter):
def filter(self, record):
if isinstance(record.msg, str):
# Filter out HTTP debug messages
http_debug_patterns = [
'receive_response_body.started',
'receive_response_body.complete',
'response_closed.started',
'_send_single_request',
'httpcore.http11',
'httpx._client'
]
return not any(pattern in record.msg for pattern in http_debug_patterns)
return True
# Configure handlers based on config
handlers = []
@@ -45,6 +61,7 @@ def initLogging():
consoleHandler = logging.StreamHandler()
consoleHandler.setFormatter(consoleFormatter)
consoleHandler.addFilter(ChromeDevToolsFilter())
consoleHandler.addFilter(HTTPDebugFilter())
handlers.append(consoleHandler)
# Add file handler if enabled
@@ -71,6 +88,7 @@ def initLogging():
)
fileHandler.setFormatter(fileFormatter)
fileHandler.addFilter(ChromeDevToolsFilter())
fileHandler.addFilter(HTTPDebugFilter())
handlers.append(fileHandler)
# Configure the root logger
@@ -83,7 +101,7 @@ def initLogging():
)
# Silence noisy third-party libraries - use the same level as the root logger
- noisyLoggers = ["httpx", "urllib3", "asyncio", "fastapi.security.oauth2"]
noisyLoggers = ["httpx", "httpcore", "urllib3", "asyncio", "fastapi.security.oauth2"]
for loggerName in noisyLoggers:
logging.getLogger(loggerName).setLevel(logLevel)
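For reference, the handler-level filtering introduced above follows the standard logging.Filter pattern; a minimal, self-contained sketch of the same idea (the class name, patterns, and logger names below are illustrative, not taken from this repository):

import logging

class NoiseFilter(logging.Filter):
    """Drop records whose message contains a known-noisy substring."""
    NOISY = ("receive_response_body.", "response_closed.", "httpcore.http11")

    def filter(self, record: logging.LogRecord) -> bool:
        # Returning False suppresses the record for this handler only.
        return not any(pattern in record.getMessage() for pattern in self.NOISY)

consoleHandler = logging.StreamHandler()
consoleHandler.addFilter(NoiseFilter())
logging.basicConfig(level=logging.DEBUG, handlers=[consoleHandler])

logging.getLogger("httpx").debug("receive_response_body.started")  # suppressed
logging.getLogger("app").debug("regular application message")      # shown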

View file

@@ -317,8 +317,8 @@ class ChatManager:
'workflow_id': workflow.id
})
- # Get AI response
- response = await self.service.callAiTextAdvanced(prompt)
# Get AI response with fallback mechanism
response = await self._callAIWithCircuitBreaker(prompt, "task_planning")
# Parse and validate task plan
task_plan_dict = self._parseTaskPlanResponse(response)
@@ -372,8 +372,22 @@ class ChatManager:
return task_plan
except Exception as e:
- logger.error(f"Error in high-level task planning: {str(e)}")
- raise Exception(f"AI is required for task planning but failed: {str(e)}")
error_message = str(e)
logger.error(f"Error in high-level task planning: {error_message}")
# Provide more specific error messages based on the error type
if "overloaded" in error_message.lower() or "529" in error_message:
detailed_error = "AI service is currently overloaded. Please try again in a few minutes."
elif "rate limit" in error_message.lower() or "429" in error_message:
detailed_error = "Rate limit exceeded. Please wait before making another request."
elif "api key" in error_message.lower() or "401" in error_message:
detailed_error = "Invalid API key. Please check your AI service configuration."
elif "timeout" in error_message.lower():
detailed_error = "AI service request timed out. Please try again."
else:
detailed_error = f"AI service error: {error_message}"
raise Exception(detailed_error)
# Phase 2: Task Definition and Action Generation
async def defineTaskActions(self, task_step: TaskStep, workflow: ChatWorkflow, previous_results: List[str] = None,
@@ -641,29 +655,127 @@ class ChatManager:
# ===== Enhanced Task Planning Methods =====
async def _callAIWithCircuitBreaker(self, prompt: str, context: str) -> str:
- """Call AI with circuit breaker pattern for fault tolerance"""
- try:
- # Check circuit breaker
- if self._isCircuitBreakerOpen():
- raise Exception("AI circuit breaker is open - too many recent failures")
- # Call AI with timeout
- logger.debug(f"ACTION GENERATION PROMPT: {prompt}")
- response = await asyncio.wait_for(
- self._callAI(prompt, context),
- timeout=self.ai_call_timeout
- )
- # Reset failure count on success
- self.ai_failure_count = 0
- return response
- except asyncio.TimeoutError:
- self._recordAIFailure("Timeout")
- raise Exception(f"AI call timed out after {self.ai_call_timeout} seconds")
- except Exception as e:
- self._recordAIFailure(str(e))
- raise
"""Call AI with intelligent routing based on complexity and circuit breaker pattern"""
max_retries = 3
base_delay = 2  # Start with 2 seconds
for attempt in range(max_retries):
try:
# Check circuit breaker
if self._isCircuitBreakerOpen():
raise Exception("AI circuit breaker is open - too many recent failures")
# Determine which AI service to use based on complexity
ai_choice = self._determineAIChoice(prompt, context)
logger.debug(f"AI choice for {context}: {ai_choice} (attempt {attempt + 1}/{max_retries})")
if ai_choice == "advanced":
# Use advanced AI for complex tasks
try:
response = await asyncio.wait_for(
self._callAdvancedAI(prompt, context),
timeout=self.ai_call_timeout
)
# Reset failure count on success
self.ai_failure_count = 0
logger.info(f"Advanced AI call successful for {context}")
return response
except Exception as advanced_error:
error_message = str(advanced_error)
logger.warning(f"Advanced AI call failed for {context}: {error_message}")
# Fall back to basic AI for complex tasks
logger.info(f"Falling back to basic AI for complex task: {context}")
try:
response = await asyncio.wait_for(
self._callStandardAI(prompt, context),
timeout=self.ai_call_timeout
)
# Reset failure count on success
self.ai_failure_count = 0
logger.info(f"Basic AI fallback successful for complex task: {context}")
return response
except Exception as standard_error:
# Both failed for complex task
error_message = f"Advanced AI failed: {str(advanced_error)}. Basic AI failed: {str(standard_error)}"
raise Exception(error_message)
else: # basic
# Use basic AI for simple tasks
try:
response = await asyncio.wait_for(
self._callStandardAI(prompt, context),
timeout=self.ai_call_timeout
)
# Reset failure count on success
self.ai_failure_count = 0
logger.info(f"Basic AI call successful for {context}")
return response
except Exception as basic_error:
error_message = str(basic_error)
logger.warning(f"Basic AI call failed for {context}: {error_message}")
# Only upgrade to advanced AI for critical simple tasks
if self._isCriticalTask(context):
logger.info(f"Upgrading to advanced AI for critical simple task: {context}")
try:
response = await asyncio.wait_for(
self._callAdvancedAI(prompt, context),
timeout=self.ai_call_timeout
)
# Reset failure count on success
self.ai_failure_count = 0
logger.info(f"Advanced AI upgrade successful for critical task: {context}")
return response
except Exception as advanced_error:
# Both failed for critical task
error_message = f"Basic AI failed: {str(basic_error)}. Advanced AI failed: {str(advanced_error)}"
raise Exception(error_message)
else:
# Non-critical simple task failed
raise Exception(f"Basic AI failed for simple task: {error_message}")
except asyncio.TimeoutError:
self._recordAIFailure("Timeout")
if attempt < max_retries - 1:
delay = base_delay * (2 ** attempt) # Exponential backoff
logger.warning(f"AI call timed out, retrying in {delay} seconds (attempt {attempt + 1}/{max_retries})")
await asyncio.sleep(delay)
continue
else:
raise Exception(f"AI call timed out after {self.ai_call_timeout} seconds")
except Exception as e:
error_message = str(e)
# Special handling for overloaded service (529 error)
if "overloaded" in error_message.lower() or "529" in error_message:
if attempt < max_retries - 1:
delay = base_delay * (2 ** attempt) # Exponential backoff
logger.warning(f"AI service overloaded, retrying in {delay} seconds (attempt {attempt + 1}/{max_retries})")
await asyncio.sleep(delay)
continue
else:
# Don't record this as a circuit breaker failure since it's a service issue
raise Exception("AI service is currently overloaded. Please try again in a few minutes.")
# For other errors, record failure and potentially retry
self._recordAIFailure(error_message)
if attempt < max_retries - 1:
delay = base_delay * (2 ** attempt) # Exponential backoff
logger.warning(f"AI call failed, retrying in {delay} seconds (attempt {attempt + 1}/{max_retries}): {error_message}")
await asyncio.sleep(delay)
continue
else:
raise
def _isCircuitBreakerOpen(self) -> bool:
"""Check if circuit breaker is open"""
@@ -678,6 +790,146 @@ class ChatManager:
self.ai_last_failure_time = None
return False
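The bodies of _isCircuitBreakerOpen and _recordAIFailure lie mostly outside this hunk; a minimal sketch of the kind of breaker they imply, assuming a failure threshold and a cooldown window (names and defaults are assumptions, not the repository's values):

import time

class SimpleCircuitBreaker:
    """Open after repeated failures, close again once a cooldown has passed."""

    def __init__(self, failure_threshold: int = 5, cooldown_seconds: float = 60.0):
        self.failure_threshold = failure_threshold
        self.cooldown_seconds = cooldown_seconds
        self.failure_count = 0
        self.last_failure_time = None

    def record_failure(self) -> None:
        self.failure_count += 1
        self.last_failure_time = time.monotonic()

    def is_open(self) -> bool:
        if self.failure_count < self.failure_threshold:
            return False
        if time.monotonic() - self.last_failure_time > self.cooldown_seconds:
            # Cooldown elapsed: reset the breaker and allow calls again.
            self.failure_count = 0
            self.last_failure_time = None
            return False
        return True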
def _determineAIChoice(self, prompt: str, context: str) -> str:
"""Determine whether to use advanced or basic AI based on task complexity"""
# Check for forced AI choice based on context
forced_choice = self._getForcedAIChoice(context)
if forced_choice:
logger.debug(f"Forced AI choice for {context}: {forced_choice}")
return forced_choice
# Define complex task patterns that require advanced AI
complex_patterns = [
# Task planning and workflow management
"task_planning", "action_generation", "result_review", "task_completion_validation",
# Complex document analysis
"document", "extract", "analysis", "comprehensive", "detailed analysis",
# Multi-step reasoning
"plan", "strategy", "evaluate", "assess", "compare", "analyze",
# Complex business logic
"workflow", "task", "action", "validation", "review", "assessment",
# Critical decision making
"decision", "recommendation", "evaluation", "quality", "success criteria",
# Complex prompts
"JSON", "structured", "format", "validation", "improvements", "quality_score"
]
# Define simple task patterns that can use basic AI
simple_patterns = [
# Basic text processing
"summarize", "translate", "format", "convert", "extract text",
# Simple queries
"find", "search", "list", "get", "retrieve",
# Basic operations
"send", "upload", "download", "create", "delete",
# Simple responses
"confirm", "acknowledge", "status", "info"
]
# Check prompt and context for complexity indicators
combined_text = f"{prompt} {context}".lower()
# Count complex indicators
complex_count = sum(1 for pattern in complex_patterns if pattern in combined_text)
# Count simple indicators
simple_count = sum(1 for pattern in simple_patterns if pattern in combined_text)
# Additional complexity factors
prompt_length = len(prompt)
has_json_requirement = "json" in combined_text and ("{" in prompt or "}" in prompt)
has_structured_output = any(word in combined_text for word in ["format", "structure", "template"])
has_validation = any(word in combined_text for word in ["validate", "check", "verify", "quality"])
# Calculate complexity score
complexity_score = 0
complexity_score += complex_count * 2 # Complex patterns worth more
complexity_score += simple_count * 1 # Simple patterns worth less
complexity_score += (prompt_length > 1000) * 3 # Long prompts are complex
complexity_score += has_json_requirement * 5 # JSON requirements are complex
complexity_score += has_structured_output * 3 # Structured output is complex
complexity_score += has_validation * 4 # Validation is complex
# Determine AI choice based on complexity score
if complexity_score >= 5:
logger.debug(f"Complex task detected (score: {complexity_score}) - using advanced AI for {context}")
return "advanced"
else:
logger.debug(f"Simple task detected (score: {complexity_score}) - using basic AI for {context}")
return "basic"
def _getForcedAIChoice(self, context: str) -> str:
"""Get forced AI choice for specific contexts (can be overridden)"""
# Define contexts that always use advanced AI
advanced_contexts = [
"task_planning", # Always use advanced for task planning
"action_generation", # Always use advanced for action generation
"result_review", # Always use advanced for result review
"task_completion_validation" # Always use advanced for validation
]
# Define contexts that always use basic AI
basic_contexts = [
"summarize", # Always use basic for summarization
"translate", # Always use basic for translation
"format", # Always use basic for formatting
"status", # Always use basic for status updates
"info" # Always use basic for info queries
]
context_lower = context.lower()
# Check for forced advanced AI
for advanced_context in advanced_contexts:
if advanced_context in context_lower:
return "advanced"
# Check for forced basic AI
for basic_context in basic_contexts:
if basic_context in context_lower:
return "basic"
# No forced choice
return None
def _isCriticalTask(self, context: str) -> bool:
"""Determine if a simple task is critical enough to warrant advanced AI upgrade"""
# Define critical task patterns
critical_patterns = [
# Workflow critical tasks
"task_planning", "workflow", "critical", "essential",
# User-facing decisions
"decision", "recommendation", "evaluation", "assessment",
# Quality-sensitive tasks
"quality", "validation", "review", "check",
# Business-critical operations
"business", "strategy", "planning", "analysis"
]
context_lower = context.lower()
# Check if context contains critical patterns
is_critical = any(pattern in context_lower for pattern in critical_patterns)
if is_critical:
logger.debug(f"Critical task detected - {context}")
return is_critical
def _recordAIFailure(self, error: str):
"""Record AI failure for circuit breaker"""
self.ai_failure_count += 1
@@ -1753,14 +2005,35 @@ NOTE: Respond with ONLY the JSON object. Do not include any explanatory text."""
logger.error(f"Error parsing review response: {str(e)}")
return {'status': 'failed', 'reason': f'Parse error: {str(e)}'}
- async def _callAI(self, prompt: str, context: str) -> str:
- """Call AI service with prompt"""
async def _callAdvancedAI(self, prompt: str, context: str) -> str:
"""Call advanced AI service with prompt (primary method)"""
try:
- # Use the existing AI call mechanism through service
# Use the advanced AI call mechanism through service
if hasattr(self, 'service') and self.service:
- # Ensure service is properly initialized
# Try advanced AI call first
if hasattr(self.service, 'callAiTextAdvanced'):
response = await self.service.callAiTextAdvanced(prompt)
logger.debug(f"Advanced AI call successful for {context}")
return response
else:
raise Exception("Service does not have callAiTextAdvanced method")
else:
raise Exception("No service available for AI calls")
except Exception as e:
error_message = str(e)
logger.warning(f"Advanced AI call failed for {context}: {error_message}")
raise Exception(f"Advanced AI failed: {error_message}")
async def _callStandardAI(self, prompt: str, context: str) -> str:
"""Call standard AI service with prompt (fallback method)"""
try:
# Use the standard AI call mechanism through service
if hasattr(self, 'service') and self.service:
# Try standard AI call as fallback
if hasattr(self.service, 'callAiTextBasic'):
response = await self.service.callAiTextBasic(prompt)
logger.debug(f"Standard AI call successful for {context}")
return response
else:
raise Exception("Service does not have callAiTextBasic method")
@@ -1768,8 +2041,26 @@ NOTE: Respond with ONLY the JSON object. Do not include any explanatory text."""
raise Exception("No service available for AI calls")
except Exception as e:
- logger.error(f"Error calling AI for {context}: {str(e)}")
- raise
error_message = str(e)
logger.error(f"Standard AI call failed for {context}: {error_message}")
# Provide more specific error messages based on the error type
if "overloaded" in error_message.lower() or "529" in error_message:
detailed_error = "AI service is currently overloaded. Please try again in a few minutes."
elif "rate limit" in error_message.lower() or "429" in error_message:
detailed_error = "Rate limit exceeded. Please wait before making another request."
elif "api key" in error_message.lower() or "401" in error_message:
detailed_error = "Invalid API key. Please check your AI service configuration."
elif "timeout" in error_message.lower():
detailed_error = "AI service request timed out. Please try again."
else:
detailed_error = f"AI service error: {error_message}"
raise Exception(detailed_error)
async def _callAI(self, prompt: str, context: str) -> str:
"""Call AI service with prompt (legacy method - now uses the circuit breaker)"""
return await self._callAIWithCircuitBreaker(prompt, context)
# ===== WORKFLOW FEEDBACK GENERATION =====
@@ -1866,11 +2157,20 @@ NOTE: Respond with ONLY the JSON object. Do not include any explanatory text."""
task_description = task_step.description
logger.info(f"=== PROCESSING TASK {i+1}/{len(task_plan.tasks)}: {task_description} ===")
- # Create user-friendly task start log
# Create user-friendly task start log with action details
progress = 20 + (i * 60 // len(task_plan.tasks))
# Get actions for this task to show in the log
task_actions = await self.defineTaskActions(task_step, workflow, previous_results)
action_details = []
for j, action in enumerate(task_actions):
action_details.append(f" {j+1}. {action.execMethod}.{action.execAction}")
action_summary = "\n".join(action_details) if action_details else " (Actions will be generated during execution)"
self.chatInterface.createWorkflowLog({
"workflowId": workflow.id,
- "message": f"Executing task {i+1}/{len(task_plan.tasks)}: {task_description}",
"message": f"Executing task {i+1}/{len(task_plan.tasks)}: {task_description}\nActions to be executed:\n{action_summary}",
"type": "info",
"status": "running",
"progress": progress,
@@ -2032,6 +2332,16 @@ Please review the task requirements and try again with different input or approach
for i, action in enumerate(actions):
logger.info(f"Executing action {i+1}/{len(actions)}: {action.execMethod}.{action.execAction}")
# Add action start log
self.chatInterface.createWorkflowLog({
"workflowId": workflow.id,
"message": f"Starting action {i+1}/{len(actions)}: {action.execMethod}.{action.execAction}",
"type": "info",
"status": "running",
"progress": 0,
"agentName": "System"
})
# Execute action with validation
result = await self.executeActionWithValidation(action, workflow, context)
@@ -2039,7 +2349,37 @@ Please review the task requirements and try again with different input or approach
state.addSuccessfulAction(result)
logger.info(f"Action {i+1} completed successfully")
# Add action completion message with result documents
documents_info = ""
if result.documents and len(result.documents) > 0:
doc_names = [doc.filename if hasattr(doc, 'filename') else f"Document {j+1}"
for j, doc in enumerate(result.documents)]
documents_info = f"\n📄 Generated documents: {', '.join(doc_names)}"
# Create completion message
completion_message = f"✅ Action {i+1}/{len(actions)} completed: {action.execMethod}.{action.execAction}{documents_info}"
# Add as log entry instead of message
self.chatInterface.createWorkflowLog({
"workflowId": workflow.id,
"message": completion_message,
"type": "success",
"status": "running",
"progress": 0,
"agentName": "System"
})
elif result.validation.get('status') == 'retry':
# Add retry log
self.chatInterface.createWorkflowLog({
"workflowId": workflow.id,
"message": f"🔄 Action {i+1}/{len(actions)} needs retry: {action.execMethod}.{action.execAction}",
"type": "warning",
"status": "running",
"progress": 0,
"agentName": "System"
})
# Retry individual action
improvements = result.validation.get('improvements', [])
retry_result = await self.retryActionWithImprovements(action, result, improvements)
@@ -2047,15 +2387,51 @@ Please review the task requirements and try again with different input or approach
if retry_result.validation.get('status') == 'success':
state.addSuccessfulAction(retry_result)
logger.info(f"Action {i+1} retry successful")
# Add retry success log
retry_documents_info = ""
if retry_result.documents and len(retry_result.documents) > 0:
doc_names = [doc.filename if hasattr(doc, 'filename') else f"Document {j+1}"
for j, doc in enumerate(retry_result.documents)]
retry_documents_info = f"\n📄 Generated documents: {', '.join(doc_names)}"
self.chatInterface.createWorkflowLog({
"workflowId": workflow.id,
"message": f"✅ Action {i+1}/{len(actions)} retry successful: {action.execMethod}.{action.execAction}{retry_documents_info}",
"type": "success",
"status": "running",
"progress": 0,
"agentName": "System"
})
else:
state.addFailedAction(retry_result)
logger.error(f"Action {i+1} retry failed")
# Add retry failure log
self.chatInterface.createWorkflowLog({
"workflowId": workflow.id,
"message": f"❌ Action {i+1}/{len(actions)} retry failed: {action.execMethod}.{action.execAction}",
"type": "error",
"status": "running",
"progress": 0,
"agentName": "System"
})
# Action failed after retry - stop task execution and regenerate
break
else: # fail
state.addFailedAction(result)
logger.error(f"Action {i+1} failed validation - stopping task execution")
# Add failure log
self.chatInterface.createWorkflowLog({
"workflowId": workflow.id,
"message": f"❌ Action {i+1}/{len(actions)} failed: {action.execMethod}.{action.execAction}",
"type": "error",
"status": "running",
"progress": 0,
"agentName": "System"
})
# Action failed - stop task execution and regenerate
break
@@ -2412,12 +2788,47 @@ NOTE: Respond with ONLY the JSON object. Do not include any explanatory text."""
if validation['status'] == 'success':
action.setSuccess()
logger.info(f"Action {action.execMethod}.{action.execAction} validated successfully")
# Only create action message if documents were produced
if result.documents and len(result.documents) > 0:
await self._createActionMessage(action, result, workflow, action.execResultLabel)
else:
# Add validation success log instead of message
self.chatInterface.createWorkflowLog({
"workflowId": workflow.id,
"message": f"✅ Action validation successful: {action.execMethod}.{action.execAction}",
"type": "success",
"status": "running",
"progress": 0,
"agentName": "System"
})
elif validation['status'] == 'retry':
action.status = TaskStatus.PENDING # Keep pending for retry
logger.warning(f"Action {action.execMethod}.{action.execAction} needs retry: {validation.get('reason', 'No reason')}")
# Add validation retry log
self.chatInterface.createWorkflowLog({
"workflowId": workflow.id,
"message": f"🔄 Action validation requires retry: {action.execMethod}.{action.execAction} - {validation.get('reason', 'No reason')}",
"type": "warning",
"status": "running",
"progress": 0,
"agentName": "System"
})
else: # fail
action.setError(validation.get('reason', 'Action failed validation'))
logger.error(f"Action {action.execMethod}.{action.execAction} failed validation: {validation.get('reason', 'No reason')}")
# Add validation failure log
self.chatInterface.createWorkflowLog({
"workflowId": workflow.id,
"message": f"❌ Action validation failed: {action.execMethod}.{action.execAction} - {validation.get('reason', 'No reason')}",
"type": "error",
"status": "running",
"progress": 0,
"agentName": "System"
})
return action_result

View file

@@ -76,8 +76,22 @@ class AiAnthropic:
)
if response.status_code != 200:
- logger.error(f"Anthropic API error: {response.status_code} - {response.text}")
- raise HTTPException(status_code=500, detail="Error communicating with Anthropic API")
error_detail = f"Anthropic API error: {response.status_code} - {response.text}"
logger.error(error_detail)
# Provide more specific error messages based on status code
if response.status_code == 529:
error_message = "Anthropic API is currently overloaded. Please try again in a few minutes."
elif response.status_code == 429:
error_message = "Rate limit exceeded. Please wait before making another request."
elif response.status_code == 401:
error_message = "Invalid API key. Please check your Anthropic API configuration."
elif response.status_code == 400:
error_message = f"Invalid request to Anthropic API: {response.text}"
else:
error_message = f"Anthropic API error ({response.status_code}): {response.text}"
raise HTTPException(status_code=500, detail=error_message)
# Parse response
anthropicResponse = response.json()

View file

@@ -269,6 +269,10 @@ class ChatObjects:
# Get messages for this workflow
messages = self.db.getRecordset("workflowMessages", recordFilter={"workflowId": workflowId})
# Sort messages by publishedAt timestamp to ensure chronological order
messages.sort(key=lambda x: x.get("publishedAt", x.get("timestamp", "0")))
return [ChatMessage(**msg) for msg in messages]
def createWorkflowMessage(self, messageData: Dict[str, Any]) -> ChatMessage:
@@ -545,7 +549,12 @@ class ChatObjects:
return []
# Get logs for this workflow
- return [ChatLog(**log) for log in self.db.getRecordset("workflowLogs", recordFilter={"workflowId": workflowId})]
logs = self.db.getRecordset("workflowLogs", recordFilter={"workflowId": workflowId})
# Sort logs by timestamp (Unix timestamps)
logs.sort(key=lambda x: float(x.get("timestamp", 0)))
return [ChatLog(**log) for log in logs]
def updateWorkflowStats(self, workflowId: str, bytesSent: int = 0, bytesReceived: int = 0) -> bool:
"""Updates workflow statistics during execution with incremental values."""
@@ -867,12 +876,25 @@ class ChatObjects:
raise ValueError(f"Workflow {workflowId} not found")
- # Update workflow
# Update workflow - set status back to running for resumed workflows
self.updateWorkflow(workflowId, {
"status": "running", # Set status back to running for resumed workflows
"lastActivity": currentTime, "lastActivity": currentTime,
"currentRound": workflow.currentRound + 1 "currentRound": workflow.currentRound + 1
}) })
# Update the workflow object status as well
workflow.status = "running"
# Add log entry for workflow resumption
self.createWorkflowLog({
"workflowId": workflowId,
"message": f"Workflow resumed (round {workflow.currentRound + 1})",
"type": "info",
"status": "running",
"progress": 0
})
else:
# Create new workflow
workflowData = {

View file

@@ -10,6 +10,7 @@ import requests
from bs4 import BeautifulSoup
import time
import uuid
import json # Added for JSON parsing
from modules.chat.methodBase import MethodBase, ActionResult, action
from modules.shared.configuration import APP_CONFIG
@@ -38,40 +39,105 @@ class MethodWeb(MethodBase):
self.timeout = 30
def _readUrl(self, url: str) -> BeautifulSoup:
- """Read a URL and return a BeautifulSoup parser for the content"""
"""Read a URL and return a BeautifulSoup parser for the content with enhanced error handling"""
if not url or not url.startswith(('http://', 'https://')):
logger.error(f"Invalid URL: {url}")
return None
# Enhanced headers to mimic real browser
headers = {
- 'User-Agent': self.user_agent,
- 'Accept': 'text/html,application/xhtml+xml,application/xml',
- 'Accept-Language': 'en-US,en;q=0.9',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.9,de;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'DNT': '1',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Cache-Control': 'max-age=0'
}
try:
- # Initial request
- response = requests.get(url, headers=headers, timeout=self.timeout)
- # Handling for status 202
- if response.status_code == 202:
- # Retry with backoff
- backoff_times = [0.5, 1.0, 2.0, 5.0]
# Use session for better connection handling
session = requests.Session()
session.headers.update(headers)
# Initial request with allow_redirects
response = session.get(url, timeout=self.timeout, allow_redirects=True)
# Handle various status codes
if response.status_code == 200:
# Success - parse content
logger.debug(f"Successfully read URL: {url}")
return BeautifulSoup(response.text, 'html.parser')
elif response.status_code == 202:
# Accepted - retry with backoff
logger.info(f"Status 202 for {url}, retrying with backoff...")
backoff_times = [1.0, 2.0, 5.0, 10.0]
for wait_time in backoff_times:
time.sleep(wait_time)
- response = requests.get(url, headers=headers, timeout=self.timeout)
- if response.status_code != 202:
retry_response = session.get(url, timeout=self.timeout, allow_redirects=True)
if retry_response.status_code == 200:
logger.debug(f"Successfully read URL after retry: {url}")
return BeautifulSoup(retry_response.text, 'html.parser')
elif retry_response.status_code != 202:
break
- # Raise for error status codes
- response.raise_for_status()
- # Parse HTML
- return BeautifulSoup(response.text, 'html.parser')
logger.warning(f"Failed to read URL after retries: {url}")
return None
elif response.status_code in [301, 302, 307, 308]:
# Redirect - should be handled by allow_redirects=True
logger.warning(f"Unexpected redirect status {response.status_code} for {url}")
return None
elif response.status_code == 403:
# Forbidden - try with different user agent
logger.warning(f"403 Forbidden for {url}, trying with different user agent...")
headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
session.headers.update(headers)
retry_response = session.get(url, timeout=self.timeout, allow_redirects=True)
if retry_response.status_code == 200:
logger.debug(f"Successfully read URL with different user agent: {url}")
return BeautifulSoup(retry_response.text, 'html.parser')
else:
logger.error(f"Still getting {retry_response.status_code} for {url}")
return None
elif response.status_code == 429:
# Rate limited - wait and retry
logger.warning(f"Rate limited for {url}, waiting 30 seconds...")
time.sleep(30)
retry_response = session.get(url, timeout=self.timeout, allow_redirects=True)
if retry_response.status_code == 200:
logger.debug(f"Successfully read URL after rate limit: {url}")
return BeautifulSoup(retry_response.text, 'html.parser')
else:
logger.error(f"Still getting {retry_response.status_code} after rate limit wait for {url}")
return None
else:
# Other error status codes
logger.error(f"HTTP {response.status_code} for {url}")
return None
except requests.exceptions.Timeout:
logger.error(f"Timeout reading URL: {url}")
return None
except requests.exceptions.ConnectionError:
logger.error(f"Connection error reading URL: {url}")
return None
except requests.exceptions.RequestException as e:
logger.error(f"Request error reading URL {url}: {str(e)}")
return None
except Exception as e:
- logger.error(f"Error reading URL {url}: {str(e)}")
logger.error(f"Unexpected error reading URL {url}: {str(e)}")
return None
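As an aside, much of the manual backoff above can also be delegated to urllib3's Retry mounted on the session through an HTTPAdapter; a sketch under the assumption of a reasonably recent urllib3 (parameter values are illustrative, not the committed behaviour):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session(user_agent: str) -> requests.Session:
    """Session that retries transient statuses with exponential backoff."""
    retry = Retry(
        total=4,
        backoff_factor=1.0,                      # 1s, 2s, 4s, 8s between attempts
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET", "HEAD"],
    )
    session = requests.Session()
    session.headers.update({"User-Agent": user_agent})
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    return session

# session = make_session("Mozilla/5.0 ...")
# response = session.get("https://example.com", timeout=30)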
def _extractTitle(self, soup: BeautifulSoup, url: str) -> str:
@@ -91,32 +157,109 @@ class MethodWeb(MethodBase):
return title
- def _extractMainContent(self, soup: BeautifulSoup, max_chars: int = 10000) -> str:
- """Extract the main content from an HTML page"""
def _extractMainContent(self, soup: BeautifulSoup, max_chars: int = 50000) -> str:
"""Extract the main content from an HTML page with enhanced content detection"""
if not soup:
return ""
- # Try to find main content elements in priority order
# Try to find main content elements in priority order with more selectors
main_content = None
- for selector in ['main', 'article', '#content', '.content', '#main', '.main']:
content_selectors = [
'main',
'article',
'#content',
'.content',
'#main',
'.main',
'.post-content',
'.entry-content',
'.article-content',
'.page-content',
'[role="main"]',
'.container',
'.wrapper'
]
for selector in content_selectors:
content = soup.select_one(selector)
if content:
main_content = content
logger.debug(f"Found main content using selector: {selector}")
break
# If no main content found, use the body
if not main_content:
main_content = soup.find('body') or soup
logger.debug("Using body as main content")
- # Remove script, style, nav, footer elements that don't contribute to main content
- for element in main_content.select('script, style, nav, footer, header, aside, .sidebar, #sidebar, .comments, #comments, .advertisement, .ads, iframe'):
- element.extract()
- # Extract text content
- text_content = main_content.get_text(separator=' ', strip=True)
# Create a copy to avoid modifying the original
content_copy = main_content.copy()
# Remove elements that don't contribute to main content (less aggressive)
elements_to_remove = [
'script', 'style', 'noscript',
'nav', 'footer', 'header', 'aside',
'.sidebar', '#sidebar', '.comments', '#comments',
'.advertisement', '.ads', '.ad', '.banner',
'iframe', '.social-share', '.share-buttons',
'.breadcrumb', '.breadcrumbs', '.pagination',
'.related-posts', '.related-articles',
'.newsletter', '.subscribe', '.signup',
'.cookie-notice', '.privacy-notice',
'.popup', '.modal', '.overlay'
]
- # Limit to max_chars
- return text_content[:max_chars]
for selector in elements_to_remove:
for element in content_copy.select(selector):
element.extract()
# Extract text content with better formatting
text_content = content_copy.get_text(separator='\n', strip=True)
# Clean up the text
lines = text_content.split('\n')
cleaned_lines = []
for line in lines:
line = line.strip()
if line and len(line) > 10: # Only keep meaningful lines
cleaned_lines.append(line)
# Join lines with proper spacing
cleaned_content = '\n\n'.join(cleaned_lines)
# If content is too short, try alternative extraction
if len(cleaned_content) < 500:
logger.debug("Content too short, trying alternative extraction...")
# Try to extract from all paragraphs
paragraphs = soup.find_all(['p', 'div', 'section'])
alt_content = []
for p in paragraphs:
text = p.get_text(strip=True)
if text and len(text) > 20: # Only meaningful paragraphs
alt_content.append(text)
if alt_content:
cleaned_content = '\n\n'.join(alt_content[:20]) # Limit to first 20 paragraphs
# Limit to max_chars but preserve complete sentences
if len(cleaned_content) > max_chars:
# Try to cut at a sentence boundary
sentences = cleaned_content.split('. ')
truncated_content = ""
for sentence in sentences:
if len(truncated_content + sentence) < max_chars:
truncated_content += sentence + ". "
else:
break
cleaned_content = truncated_content.strip()
logger.debug(f"Extracted {len(cleaned_content)} characters of content")
return cleaned_content
def _checkAccessibility(self, soup: BeautifulSoup) -> Dict[str, Any]:
"""Check basic accessibility features"""
@@ -214,10 +357,413 @@ class MethodWeb(MethodBase):
}
}
def _detectJavaScriptRendering(self, soup: BeautifulSoup) -> bool:
"""Detect if a page likely requires JavaScript rendering"""
if not soup:
return False
# Check for common indicators of JavaScript-rendered content
indicators = [
# Angular, React, Vue indicators
soup.find('div', {'ng-app': True}),
soup.find('div', {'id': 'root'}),
soup.find('div', {'id': 'app'}),
soup.find('div', {'id': 'react-root'}),
# SPA indicators
soup.find('div', {'id': 'spa-root'}),
soup.find('div', {'class': 'spa-container'}),
# Modern framework indicators
soup.find('div', {'data-reactroot': True}),
soup.find('div', {'data-ng-controller': True}),
# Empty content with scripts
len(soup.get_text(strip=True)) < 100 and len(soup.find_all('script')) > 2
]
return any(indicators)
def _extractMetaInformation(self, soup: BeautifulSoup, url: str) -> Dict[str, Any]:
"""Extract meta information from the page"""
meta_info = {
"url": url,
"title": self._extractTitle(soup, url),
"description": "",
"keywords": "",
"author": "",
"language": "",
"robots": "",
"viewport": "",
"charset": "",
"canonical": ""
}
# Extract meta tags
meta_tags = soup.find_all('meta')
for meta in meta_tags:
name = meta.get('name', '').lower()
property = meta.get('property', '').lower()
content = meta.get('content', '')
if name == 'description' or property == 'og:description':
meta_info['description'] = content
elif name == 'keywords':
meta_info['keywords'] = content
elif name == 'author':
meta_info['author'] = content
elif name == 'language':
meta_info['language'] = content
elif name == 'robots':
meta_info['robots'] = content
elif name == 'viewport':
meta_info['viewport'] = content
elif property == 'og:title':
meta_info['title'] = content
elif property == 'og:url':
meta_info['canonical'] = content
# Extract charset
charset_meta = soup.find('meta', charset=True)
if charset_meta:
meta_info['charset'] = charset_meta.get('charset', '')
# Extract canonical URL
canonical_link = soup.find('link', rel='canonical')
if canonical_link:
meta_info['canonical'] = canonical_link.get('href', '')
return meta_info
def _getAlternativeApproaches(self, url: str, requires_js: bool, content_length: int) -> List[str]:
"""Get alternative approaches for sites that are difficult to crawl"""
approaches = []
if requires_js:
approaches.extend([
"Site requires JavaScript rendering - consider using a headless browser",
"Try accessing the site's API endpoints directly",
"Look for RSS feeds or sitemaps",
"Check if the site has a mobile version that's easier to parse"
])
if content_length < 100:
approaches.extend([
"Site may have anti-bot protection - try with different user agents",
"Check if the site requires authentication",
"Look for alternative URLs (www vs non-www, http vs https)",
"Try accessing the site's robots.txt for crawling guidelines"
])
# Add general suggestions
approaches.extend([
"Use the web.search action to find alternative sources",
"Try the web.scrape action with specific CSS selectors",
"Check if the site has a public API or data export"
])
return approaches
async def _tryAdvancedAIWebResearch(self, action_type: str, parameters: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""
Try to get web research results using advanced AI first
Args:
action_type: Type of action ('crawl', 'scrape', or 'search')
parameters: Action parameters
Returns:
Dict with AI results if successful, None if AI call fails
"""
try:
# Create appropriate prompt based on action type
if action_type == "crawl":
prompt = self._createCrawlAIPrompt(parameters)
elif action_type == "scrape":
prompt = self._createScrapeAIPrompt(parameters)
elif action_type == "search":
prompt = self._createSearchAIPrompt(parameters)
else:
logger.warning(f"Unknown action type for AI research: {action_type}")
return None
# Try advanced AI call
if hasattr(self.service, 'callAiTextAdvanced'):
logger.info(f"Attempting advanced AI web research for {action_type}")
response = await self.service.callAiTextAdvanced(prompt)
# Parse the AI response
parsed_result = self._parseAIWebResponse(response, action_type)
if parsed_result:
logger.info(f"Advanced AI web research successful for {action_type}")
return parsed_result
else:
logger.warning(f"Failed to parse AI response for {action_type}")
return None
else:
logger.warning("Service does not have callAiTextAdvanced method")
return None
except Exception as e:
logger.warning(f"Advanced AI web research failed for {action_type}: {str(e)}")
return None
def _createCrawlAIPrompt(self, parameters: Dict[str, Any]) -> str:
"""Create AI prompt for web crawling"""
urls = parameters.get("urls", [])
maxDepth = parameters.get("maxDepth", 2)
includeImages = parameters.get("includeImages", False)
followLinks = parameters.get("followLinks", True)
prompt = f"""
You are an advanced AI research assistant with comprehensive knowledge about websites, companies, and online content. Please provide detailed information about the following URLs based on your extensive training data and knowledge.
URLs to research: {urls}
Max depth: {maxDepth}
Include images: {includeImages}
Follow links: {followLinks}
For each URL, please provide comprehensive information including:
1. Company/organization information and background
2. Main business activities and services
3. Key personnel and leadership
4. Contact information and locations
5. Recent news and developments
6. Industry analysis and market position
7. Related companies and partnerships
8. Website structure and key pages
9. Business model and revenue streams
10. Regulatory compliance and certifications
For each URL, provide:
- url: The original URL
- title: Company/organization name
- content: Comprehensive description and analysis
- content_length: Number of characters in content
- meta_info: Business information object
- links: Related companies and important connections
- images: Company logos or key visuals if known
- requires_javascript: Boolean (usually false for static info)
- alternative_approaches: Additional research suggestions
- timestamp: Current timestamp
Return the results in this exact JSON format:
{{
"urls": {urls},
"maxDepth": {maxDepth},
"includeImages": {includeImages},
"followLinks": {followLinks},
"crawlResults": [
{{
"url": "url_here",
"depth": {maxDepth},
"followLinks": {followLinks},
"extractContent": true,
"title": "company_name",
"content": "comprehensive_company_analysis",
"content_length": 1234,
"meta_info": {{
"url": "url_here",
"title": "company_name",
"description": "business_description",
"keywords": "industry_keywords",
"author": "company_info",
"language": "language_code",
"robots": "robots_info",
"viewport": "viewport_info",
"charset": "charset_info",
"canonical": "canonical_url"
}},
"links": [
{{
"url": "related_company_url",
"text": "company_name"
}}
],
"images": [
{{
"src": "logo_url",
"alt": "company_logo",
"title": "company_name",
"width": "width_value",
"height": "height_value"
}}
],
"requires_javascript": false,
"alternative_approaches": ["approach1", "approach2"],
"timestamp": "2024-01-01T00:00:00Z"
}}
],
"summary": {{
"total_urls": {len(urls)},
"successful_crawls": 0,
"failed_crawls": 0,
"total_content_chars": 0
}},
"timestamp": "2024-01-01T00:00:00Z"
}}
Please provide accurate, comprehensive information about each company/organization based on your knowledge. If you don't have specific information about a URL, provide general industry analysis and suggest alternative research approaches.
"""
return prompt
def _createScrapeAIPrompt(self, parameters: Dict[str, Any]) -> str:
"""Create AI prompt for web scraping"""
url = parameters.get("url")
selectors = parameters.get("selectors", {})
format = parameters.get("format", "json")
prompt = f"""
You are an advanced AI research assistant with comprehensive knowledge about websites, companies, and online content. Please provide detailed information about the following URL and the specific data requested based on your extensive training data and knowledge.
URL to research: {url}
Data selectors: {selectors}
Output format: {format}
Please provide comprehensive information including:
1. Company/organization background and history
2. Business activities and services offered
3. Key personnel and leadership information
4. Financial information and performance data
5. Market position and competitive analysis
6. Recent news and developments
7. Contact information and locations
8. Industry trends and insights
9. Related companies and partnerships
10. Regulatory and compliance information
For each data selector requested, provide relevant information in the specified format (text, html, or json).
Return the results in this exact JSON format:
{{
"url": "{url}",
"selectors": {selectors},
"format": "{format}",
"scrapedData": {{
"url": "{url}",
"selectors": {selectors},
"format": "{format}",
"content": {{
"company_info": ["comprehensive_company_analysis"],
"business_activities": ["detailed_business_description"],
"leadership": ["key_personnel_information"],
"financial_data": ["financial_performance_analysis"],
"market_position": ["competitive_analysis"],
"recent_news": ["latest_developments"],
"contact_info": ["contact_details"],
"industry_insights": ["market_trends"],
"partnerships": ["related_companies"],
"compliance": ["regulatory_information"]
}},
"timestamp": "2024-01-01T00:00:00Z"
}},
"timestamp": "2024-01-01T00:00:00Z"
}}
Please provide accurate, comprehensive information about the company/organization based on your knowledge. If you don't have specific information about the URL, provide general industry analysis and suggest alternative research approaches.
"""
return prompt
def _createSearchAIPrompt(self, parameters: Dict[str, Any]) -> str:
"""Create AI prompt for web search"""
query = parameters.get("query")
engine = parameters.get("engine", "google")
maxResults = parameters.get("maxResults", 10)
filter = parameters.get("filter")
prompt = f"""
You are an advanced AI research assistant with comprehensive knowledge about companies, industries, and business information. Please provide detailed information about the following search query based on your extensive training data and knowledge.
Search query: {query}
Search engine: {engine}
Max results: {maxResults}
Filter: {filter}
Please provide comprehensive research results including:
1. Relevant company/organization information
2. Industry analysis and market insights
3. Key personnel and leadership details
4. Business activities and services
5. Financial performance and metrics
6. Recent news and developments
7. Competitive landscape analysis
8. Market trends and opportunities
9. Regulatory and compliance information
10. Related companies and partnerships
For each search result, provide:
- title: Company/organization name
- url: Official website or primary source
- snippet: Brief description and key highlights
- content: Comprehensive analysis and insights
Return the results in this exact JSON format:
{{
"query": "{query}",
"engine": "{engine}",
"maxResults": {maxResults},
"filter": "{filter}",
"searchResults": {{
"query": "{query}",
"maxResults": {maxResults},
"results": [
{{
"title": "company_name",
"url": "official_website",
"snippet": "brief_description",
"content": "comprehensive_analysis"
}}
],
"totalFound": 0,
"timestamp": "2024-01-01T00:00:00Z"
}},
"timestamp": "2024-01-01T00:00:00Z"
}}
Please provide accurate, comprehensive information about the search query based on your knowledge. If you don't have specific information about the query, provide general industry analysis and suggest alternative research approaches.
"""
return prompt
def _parseAIWebResponse(self, response: str, action_type: str) -> Optional[Dict[str, Any]]:
"""Parse AI response into structured data"""
try:
# Extract JSON from response
json_start = response.find('{')
json_end = response.rfind('}') + 1
if json_start == -1 or json_end == 0:
logger.warning(f"No JSON found in AI response: {response}")
return None
json_str = response[json_start:json_end]
parsed_data = json.loads(json_str)
# Validate basic structure based on action type
if action_type == "crawl":
if "crawlResults" not in parsed_data:
logger.warning("Invalid crawl response structure")
return None
elif action_type == "scrape":
if "scrapedData" not in parsed_data:
logger.warning("Invalid scrape response structure")
return None
elif action_type == "search":
if "searchResults" not in parsed_data:
logger.warning("Invalid search response structure")
return None
return parsed_data
except json.JSONDecodeError as e:
logger.warning(f"Failed to parse AI response JSON: {str(e)}")
return None
except Exception as e:
logger.warning(f"Error parsing AI response: {str(e)}")
return None
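The brace-scanning in _parseAIWebResponse can be made a little more tolerant of prose surrounding the JSON by using json.JSONDecoder.raw_decode; a sketch of that variant (not the code actually committed here):

import json
from typing import Any, Dict, Optional

def extract_first_json_object(text: str) -> Optional[Dict[str, Any]]:
    """Return the first JSON object embedded in an AI response, or None."""
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            obj, _end = decoder.raw_decode(text, start)
            if isinstance(obj, dict):
                return obj
        except json.JSONDecodeError:
            pass
        start = text.find("{", start + 1)
    return None

# extract_first_json_object('Here you go:\n{"searchResults": {"results": []}} Thanks!')
# -> {'searchResults': {'results': []}}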
@action
async def crawl(self, parameters: Dict[str, Any]) -> ActionResult:
"""
- Crawl web pages and extract content
Crawl web pages and extract content with enhanced error handling and content detection
Parameters:
urls (List[str]): List of URLs to crawl
@@ -240,23 +786,76 @@ class MethodWeb(MethodBase):
error="URLs are required"
)
# Try advanced AI research first
ai_result = await self._tryAdvancedAIWebResearch("crawl", parameters)
if ai_result:
logger.info("Using advanced AI web research for crawl")
# Reconstruct the result data from the AI response
result_data = {
"urls": ai_result.get("urls", []),
"maxDepth": ai_result.get("maxDepth", 2),
"includeImages": ai_result.get("includeImages", False),
"followLinks": ai_result.get("followLinks", True),
"crawlResults": ai_result.get("crawlResults", []),
"summary": ai_result.get("summary", {}),
"timestamp": ai_result.get("timestamp", datetime.now(UTC).isoformat())
}
return self._createResult(
success=True,
data={
"documents": [
{
"documentName": f"web_crawl_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.json",
"documentData": result_data,
"mimeType": "application/json"
}
]
}
)
else:
logger.info("Advanced AI web research failed, falling back to regular web crawling")
# Crawl each URL
crawl_results = []
for url in urls:
try:
- # Read the URL
logger.info(f"Crawling URL: {url}")
# Read the URL with enhanced error handling
soup = self._readUrl(url)
if not soup:
logger.error(f"Failed to read URL: {url}")
crawl_results.append({
- "error": "Failed to read URL",
- "url": url
"error": "Failed to read URL - check if the site is accessible and not blocking crawlers",
"url": url,
"suggestions": [
"Try accessing the URL directly in a browser",
"Check if the site requires JavaScript",
"Verify the URL is correct and accessible"
]
})
continue
- # Extract basic information
# Extract comprehensive information
title = self._extractTitle(soup, url)
- content = self._extractMainContent(soup) if True else ""
content = self._extractMainContent(soup)
meta_info = self._extractMetaInformation(soup, url)
# Check if content is meaningful
content_length = len(content)
if content_length < 100:
logger.warning(f"Very little content extracted from {url} ({content_length} chars)")
crawl_results.append({
"url": url,
"title": title,
"content": content,
"content_length": content_length,
"warning": "Very little content extracted - site may require JavaScript or have anti-bot protection",
"meta_info": meta_info,
"timestamp": datetime.now(UTC).isoformat()
})
continue
# Extract links if requested
links = []
@@ -264,21 +863,32 @@ class MethodWeb(MethodBase):
for link in soup.find_all('a', href=True):
href = link.get('href')
if href and href.startswith(('http://', 'https://')):
- links.append({
- 'url': href,
- 'text': link.get_text(strip=True)[:100]
link_text = link.get_text(strip=True)
if link_text: # Only include links with text
links.append({
'url': href,
'text': link_text[:100]
})
# Extract images if requested
images = []
if includeImages:
for img in soup.find_all('img', src=True):
src = img.get('src')
if src:
images.append({
'src': src,
'alt': img.get('alt', ''),
'title': img.get('title', ''),
'width': img.get('width', ''),
'height': img.get('height', '')
})
- })
- # Extract images
- images = []
- for img in soup.find_all('img', src=True):
- src = img.get('src')
- if src:
- images.append({
- 'src': src,
- 'alt': img.get('alt', ''),
- 'title': img.get('title', '')
- })
# Check for JavaScript rendering requirements
requires_js = self._detectJavaScriptRendering(soup)
# Get alternative approaches if needed
alternative_approaches = self._getAlternativeApproaches(url, requires_js, content_length)
crawl_results.append({
"url": url,
@@ -287,16 +897,27 @@ class MethodWeb(MethodBase):
"extractContent": True,
"title": title,
"content": content,
- "links": links[:10], # Limit to first 10 links
- "images": images[:10], # Limit to first 10 images
"content_length": content_length,
"meta_info": meta_info,
"links": links[:20], # Limit to first 20 links
"images": images[:20], # Limit to first 20 images
"requires_javascript": requires_js,
"alternative_approaches": alternative_approaches,
"timestamp": datetime.now(UTC).isoformat() "timestamp": datetime.now(UTC).isoformat()
}) })
logger.info(f"Successfully crawled {url} - extracted {content_length} characters")
except Exception as e:
logger.error(f"Error crawling web page {url}: {str(e)}")
crawl_results.append({
"error": str(e),
- "url": url
"url": url,
"suggestions": [
"Check if the URL is accessible",
"Try with a different user agent",
"Verify the site doesn't block automated access"
]
})
# Create result data
@@ -306,6 +927,12 @@ class MethodWeb(MethodBase):
"includeImages": includeImages,
"followLinks": followLinks,
"crawlResults": crawl_results,
"summary": {
"total_urls": len(urls),
"successful_crawls": len([r for r in crawl_results if "error" not in r]),
"failed_crawls": len([r for r in crawl_results if "error" in r]),
"total_content_chars": sum([r.get("content_length", 0) for r in crawl_results if "content_length" in r])
},
"timestamp": datetime.now(UTC).isoformat() "timestamp": datetime.now(UTC).isoformat()
} }
@@ -367,6 +994,33 @@ class MethodWeb(MethodBase):
error="URL and selectors are required"
)
# Try advanced AI research first
ai_result = await self._tryAdvancedAIWebResearch("scrape", parameters)
if ai_result:
logger.info("Using advanced AI web research for scrape")
# Reconstruct the result data from the AI response
result_data = {
"url": ai_result.get("url"),
"selectors": ai_result.get("selectors"),
"format": ai_result.get("format"),
"scrapedData": ai_result.get("scrapedData"),
"timestamp": ai_result.get("timestamp", datetime.now(UTC).isoformat())
}
return self._createResult(
success=True,
data={
"documents": [
{
"documentName": f"web_scrape_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.json",
"documentData": result_data,
"mimeType": "application/json"
}
]
}
)
else:
logger.info("Advanced AI web research failed, falling back to regular web scraping")
# Read the URL
soup = self._readUrl(url)
if not soup:
@@ -478,6 +1132,34 @@ class MethodWeb(MethodBase):
error="Search query is required"
)
# Try advanced AI research first
ai_result = await self._tryAdvancedAIWebResearch("search", parameters)
if ai_result:
logger.info("Using advanced AI web research for search")
# Reconstruct the result data from the AI response
result_data = {
"query": ai_result.get("query"),
"engine": ai_result.get("engine"),
"maxResults": ai_result.get("maxResults"),
"filter": ai_result.get("filter"),
"searchResults": ai_result.get("searchResults"),
"timestamp": ai_result.get("timestamp", datetime.now(UTC).isoformat())
}
return self._createResult(
success=True,
data={
"documents": [
{
"documentName": f"web_search_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.json",
"documentData": result_data,
"mimeType": "application/json"
}
]
}
)
else:
logger.info("Advanced AI web research failed, falling back to regular web search")
# Search web content using Google search via SerpAPI
try:
if not self.srcApikey:
@@ -601,94 +1283,3 @@ class MethodWeb(MethodBase):
error=str(e)
)
@action
async def validate(self, parameters: Dict[str, Any]) -> ActionResult:
"""
Validate web pages for various criteria
Parameters:
url (str): URL to validate
checks (List[str], optional): Types of checks to perform (default: ["accessibility", "seo", "performance"])
expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description
"""
try:
url = parameters.get("url")
checks = parameters.get("checks", ["accessibility", "seo", "performance"])
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
if not url:
return self._createResult(
success=False,
data={},
error="URL is required"
)
# Read the URL
soup = self._readUrl(url)
if not soup:
return self._createResult(
success=False,
data={},
error="Failed to read URL"
)
validation_results = {}
for check in checks:
if check == "accessibility":
validation_results["accessibility"] = self._checkAccessibility(soup)
elif check == "seo":
validation_results["seo"] = self._checkSEO(soup)
elif check == "performance":
validation_results["performance"] = self._checkPerformance(soup, url)
else:
validation_results[check] = {"status": "unknown", "message": f"Unknown check type: {check}"}
validation_result = {
"url": url,
"checks": checks,
"results": validation_results,
"timestamp": datetime.now(UTC).isoformat()
}
# Create result data
result_data = {
"url": url,
"checks": checks,
"validationResult": validation_result,
"timestamp": datetime.now(UTC).isoformat()
}
# Determine output format based on expected formats
output_extension = ".json" # Default
output_mime_type = "application/json" # Default
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
# Use the first expected format
expected_format = expectedDocumentFormats[0]
output_extension = expected_format.get("extension", ".json")
output_mime_type = expected_format.get("mimeType", "application/json")
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
else:
logger.info("No expected format specified, using default .json format")
return self._createResult(
success=True,
data={
"documents": [
{
"documentName": f"web_validation_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}{output_extension}",
"documentData": result_data,
"mimeType": output_mime_type
}
]
}
)
except Exception as e:
logger.error(f"Error validating web page: {str(e)}")
return self._createResult(
success=False,
data={},
error=str(e)
)

View file

@@ -170,7 +170,7 @@ class WorkflowManager:
# Add completion log entry
self.chatInterface.createWorkflowLog({
"workflowId": workflow.id,
- "message": "Workflow completed successfully",
"message": "Workflow completed",
"type": "success",
"status": "completed",
"progress": 100