web refactored
This commit is contained in:
parent
cfb34c6a38
commit
b1be8dd81c
6 changed files with 1242 additions and 186 deletions
20
app.py
20
app.py
|
|
@ -37,6 +37,22 @@ def initLogging():
|
||||||
('.well-known/appspecific/com.chrome.devtools.json' in record.msg or
|
('.well-known/appspecific/com.chrome.devtools.json' in record.msg or
|
||||||
'Request: /index.html' in record.msg))
|
'Request: /index.html' in record.msg))
|
||||||
|
|
||||||
|
# Add filter to exclude HTTP debug messages
|
||||||
|
class HTTPDebugFilter(logging.Filter):
|
||||||
|
def filter(self, record):
|
||||||
|
if isinstance(record.msg, str):
|
||||||
|
# Filter out HTTP debug messages
|
||||||
|
http_debug_patterns = [
|
||||||
|
'receive_response_body.started',
|
||||||
|
'receive_response_body.complete',
|
||||||
|
'response_closed.started',
|
||||||
|
'_send_single_request',
|
||||||
|
'httpcore.http11',
|
||||||
|
'httpx._client'
|
||||||
|
]
|
||||||
|
return not any(pattern in record.msg for pattern in http_debug_patterns)
|
||||||
|
return True
|
||||||
|
|
||||||
# Configure handlers based on config
|
# Configure handlers based on config
|
||||||
handlers = []
|
handlers = []
|
||||||
|
|
||||||
|
|
@ -45,6 +61,7 @@ def initLogging():
|
||||||
consoleHandler = logging.StreamHandler()
|
consoleHandler = logging.StreamHandler()
|
||||||
consoleHandler.setFormatter(consoleFormatter)
|
consoleHandler.setFormatter(consoleFormatter)
|
||||||
consoleHandler.addFilter(ChromeDevToolsFilter())
|
consoleHandler.addFilter(ChromeDevToolsFilter())
|
||||||
|
consoleHandler.addFilter(HTTPDebugFilter())
|
||||||
handlers.append(consoleHandler)
|
handlers.append(consoleHandler)
|
||||||
|
|
||||||
# Add file handler if enabled
|
# Add file handler if enabled
|
||||||
|
|
@ -71,6 +88,7 @@ def initLogging():
|
||||||
)
|
)
|
||||||
fileHandler.setFormatter(fileFormatter)
|
fileHandler.setFormatter(fileFormatter)
|
||||||
fileHandler.addFilter(ChromeDevToolsFilter())
|
fileHandler.addFilter(ChromeDevToolsFilter())
|
||||||
|
fileHandler.addFilter(HTTPDebugFilter())
|
||||||
handlers.append(fileHandler)
|
handlers.append(fileHandler)
|
||||||
|
|
||||||
# Configure the root logger
|
# Configure the root logger
|
||||||
|
|
@ -83,7 +101,7 @@ def initLogging():
|
||||||
)
|
)
|
||||||
|
|
||||||
# Silence noisy third-party libraries - use the same level as the root logger
|
# Silence noisy third-party libraries - use the same level as the root logger
|
||||||
noisyLoggers = ["httpx", "urllib3", "asyncio", "fastapi.security.oauth2"]
|
noisyLoggers = ["httpx", "httpcore", "urllib3", "asyncio", "fastapi.security.oauth2"]
|
||||||
for loggerName in noisyLoggers:
|
for loggerName in noisyLoggers:
|
||||||
logging.getLogger(loggerName).setLevel(logLevel)
|
logging.getLogger(loggerName).setLevel(logLevel)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -317,8 +317,8 @@ class ChatManager:
|
||||||
'workflow_id': workflow.id
|
'workflow_id': workflow.id
|
||||||
})
|
})
|
||||||
|
|
||||||
# Get AI response
|
# Get AI response with fallback mechanism
|
||||||
response = await self.service.callAiTextAdvanced(prompt)
|
response = await self._callAIWithCircuitBreaker(prompt, "task_planning")
|
||||||
|
|
||||||
# Parse and validate task plan
|
# Parse and validate task plan
|
||||||
task_plan_dict = self._parseTaskPlanResponse(response)
|
task_plan_dict = self._parseTaskPlanResponse(response)
|
||||||
|
|
@ -372,8 +372,22 @@ class ChatManager:
|
||||||
return task_plan
|
return task_plan
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error in high-level task planning: {str(e)}")
|
error_message = str(e)
|
||||||
raise Exception(f"AI is required for task planning but failed: {str(e)}")
|
logger.error(f"Error in high-level task planning: {error_message}")
|
||||||
|
|
||||||
|
# Provide more specific error messages based on the error type
|
||||||
|
if "overloaded" in error_message.lower() or "529" in error_message:
|
||||||
|
detailed_error = "AI service is currently overloaded. Please try again in a few minutes."
|
||||||
|
elif "rate limit" in error_message.lower() or "429" in error_message:
|
||||||
|
detailed_error = "Rate limit exceeded. Please wait before making another request."
|
||||||
|
elif "api key" in error_message.lower() or "401" in error_message:
|
||||||
|
detailed_error = "Invalid API key. Please check your AI service configuration."
|
||||||
|
elif "timeout" in error_message.lower():
|
||||||
|
detailed_error = "AI service request timed out. Please try again."
|
||||||
|
else:
|
||||||
|
detailed_error = f"AI service error: {error_message}"
|
||||||
|
|
||||||
|
raise Exception(detailed_error)
|
||||||
|
|
||||||
# Phase 2: Task Definition and Action Generation
|
# Phase 2: Task Definition and Action Generation
|
||||||
async def defineTaskActions(self, task_step: TaskStep, workflow: ChatWorkflow, previous_results: List[str] = None,
|
async def defineTaskActions(self, task_step: TaskStep, workflow: ChatWorkflow, previous_results: List[str] = None,
|
||||||
|
|
@ -641,29 +655,127 @@ class ChatManager:
|
||||||
# ===== Enhanced Task Planning Methods =====
|
# ===== Enhanced Task Planning Methods =====
|
||||||
|
|
||||||
async def _callAIWithCircuitBreaker(self, prompt: str, context: str) -> str:
|
async def _callAIWithCircuitBreaker(self, prompt: str, context: str) -> str:
|
||||||
"""Call AI with circuit breaker pattern for fault tolerance"""
|
"""Call AI with intelligent routing based on complexity and circuit breaker pattern"""
|
||||||
try:
|
max_retries = 3
|
||||||
# Check circuit breaker
|
base_delay = 2 # Start with 2 seconds
|
||||||
if self._isCircuitBreakerOpen():
|
|
||||||
raise Exception("AI circuit breaker is open - too many recent failures")
|
for attempt in range(max_retries):
|
||||||
|
try:
|
||||||
# Call AI with timeout
|
# Check circuit breaker
|
||||||
logger.debug(f"ACTION GENERATION PROMPT: {prompt}")
|
if self._isCircuitBreakerOpen():
|
||||||
response = await asyncio.wait_for(
|
raise Exception("AI circuit breaker is open - too many recent failures")
|
||||||
self._callAI(prompt, context),
|
|
||||||
timeout=self.ai_call_timeout
|
# Determine which AI service to use based on complexity
|
||||||
)
|
ai_choice = self._determineAIChoice(prompt, context)
|
||||||
|
logger.debug(f"AI choice for {context}: {ai_choice} (attempt {attempt + 1}/{max_retries})")
|
||||||
# Reset failure count on success
|
|
||||||
self.ai_failure_count = 0
|
if ai_choice == "advanced":
|
||||||
return response
|
# Use advanced AI for complex tasks
|
||||||
|
try:
|
||||||
except asyncio.TimeoutError:
|
response = await asyncio.wait_for(
|
||||||
self._recordAIFailure("Timeout")
|
self._callAdvancedAI(prompt, context),
|
||||||
raise Exception(f"AI call timed out after {self.ai_call_timeout} seconds")
|
timeout=self.ai_call_timeout
|
||||||
except Exception as e:
|
)
|
||||||
self._recordAIFailure(str(e))
|
|
||||||
raise
|
# Reset failure count on success
|
||||||
|
self.ai_failure_count = 0
|
||||||
|
logger.info(f"Advanced AI call successful for {context}")
|
||||||
|
return response
|
||||||
|
|
||||||
|
except Exception as advanced_error:
|
||||||
|
error_message = str(advanced_error)
|
||||||
|
logger.warning(f"Advanced AI call failed for {context}: {error_message}")
|
||||||
|
|
||||||
|
# Fall back to basic AI for complex tasks
|
||||||
|
logger.info(f"Falling back to basic AI for complex task: {context}")
|
||||||
|
try:
|
||||||
|
response = await asyncio.wait_for(
|
||||||
|
self._callStandardAI(prompt, context),
|
||||||
|
timeout=self.ai_call_timeout
|
||||||
|
)
|
||||||
|
|
||||||
|
# Reset failure count on success
|
||||||
|
self.ai_failure_count = 0
|
||||||
|
logger.info(f"Basic AI fallback successful for complex task: {context}")
|
||||||
|
return response
|
||||||
|
|
||||||
|
except Exception as standard_error:
|
||||||
|
# Both failed for complex task
|
||||||
|
error_message = f"Advanced AI failed: {str(advanced_error)}. Basic AI failed: {str(standard_error)}"
|
||||||
|
raise Exception(error_message)
|
||||||
|
|
||||||
|
else: # basic
|
||||||
|
# Use basic AI for simple tasks
|
||||||
|
try:
|
||||||
|
response = await asyncio.wait_for(
|
||||||
|
self._callStandardAI(prompt, context),
|
||||||
|
timeout=self.ai_call_timeout
|
||||||
|
)
|
||||||
|
|
||||||
|
# Reset failure count on success
|
||||||
|
self.ai_failure_count = 0
|
||||||
|
logger.info(f"Basic AI call successful for {context}")
|
||||||
|
return response
|
||||||
|
|
||||||
|
except Exception as basic_error:
|
||||||
|
error_message = str(basic_error)
|
||||||
|
logger.warning(f"Basic AI call failed for {context}: {error_message}")
|
||||||
|
|
||||||
|
# Only upgrade to advanced AI for critical simple tasks
|
||||||
|
if self._isCriticalTask(context):
|
||||||
|
logger.info(f"Upgrading to advanced AI for critical simple task: {context}")
|
||||||
|
try:
|
||||||
|
response = await asyncio.wait_for(
|
||||||
|
self._callAdvancedAI(prompt, context),
|
||||||
|
timeout=self.ai_call_timeout
|
||||||
|
)
|
||||||
|
|
||||||
|
# Reset failure count on success
|
||||||
|
self.ai_failure_count = 0
|
||||||
|
logger.info(f"Advanced AI upgrade successful for critical task: {context}")
|
||||||
|
return response
|
||||||
|
|
||||||
|
except Exception as advanced_error:
|
||||||
|
# Both failed for critical task
|
||||||
|
error_message = f"Basic AI failed: {str(basic_error)}. Advanced AI failed: {str(advanced_error)}"
|
||||||
|
raise Exception(error_message)
|
||||||
|
else:
|
||||||
|
# Non-critical simple task failed
|
||||||
|
raise Exception(f"Basic AI failed for simple task: {error_message}")
|
||||||
|
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
self._recordAIFailure("Timeout")
|
||||||
|
if attempt < max_retries - 1:
|
||||||
|
delay = base_delay * (2 ** attempt) # Exponential backoff
|
||||||
|
logger.warning(f"AI call timed out, retrying in {delay} seconds (attempt {attempt + 1}/{max_retries})")
|
||||||
|
await asyncio.sleep(delay)
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
raise Exception(f"AI call timed out after {self.ai_call_timeout} seconds")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
error_message = str(e)
|
||||||
|
|
||||||
|
# Special handling for overloaded service (529 error)
|
||||||
|
if "overloaded" in error_message.lower() or "529" in error_message:
|
||||||
|
if attempt < max_retries - 1:
|
||||||
|
delay = base_delay * (2 ** attempt) # Exponential backoff
|
||||||
|
logger.warning(f"AI service overloaded, retrying in {delay} seconds (attempt {attempt + 1}/{max_retries})")
|
||||||
|
await asyncio.sleep(delay)
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
# Don't record this as a circuit breaker failure since it's a service issue
|
||||||
|
raise Exception("AI service is currently overloaded. Please try again in a few minutes.")
|
||||||
|
|
||||||
|
# For other errors, record failure and potentially retry
|
||||||
|
self._recordAIFailure(error_message)
|
||||||
|
if attempt < max_retries - 1:
|
||||||
|
delay = base_delay * (2 ** attempt) # Exponential backoff
|
||||||
|
logger.warning(f"AI call failed, retrying in {delay} seconds (attempt {attempt + 1}/{max_retries}): {error_message}")
|
||||||
|
await asyncio.sleep(delay)
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
raise
|
||||||
|
|
||||||
def _isCircuitBreakerOpen(self) -> bool:
|
def _isCircuitBreakerOpen(self) -> bool:
|
||||||
"""Check if circuit breaker is open"""
|
"""Check if circuit breaker is open"""
|
||||||
|
|
@ -678,6 +790,146 @@ class ChatManager:
|
||||||
self.ai_last_failure_time = None
|
self.ai_last_failure_time = None
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def _determineAIChoice(self, prompt: str, context: str) -> str:
|
||||||
|
"""Determine whether to use advanced or basic AI based on task complexity"""
|
||||||
|
|
||||||
|
# Check for forced AI choice based on context
|
||||||
|
forced_choice = self._getForcedAIChoice(context)
|
||||||
|
if forced_choice:
|
||||||
|
logger.debug(f"Forced AI choice for {context}: {forced_choice}")
|
||||||
|
return forced_choice
|
||||||
|
|
||||||
|
# Define complex task patterns that require advanced AI
|
||||||
|
complex_patterns = [
|
||||||
|
# Task planning and workflow management
|
||||||
|
"task_planning", "action_generation", "result_review", "task_completion_validation",
|
||||||
|
|
||||||
|
# Complex document analysis
|
||||||
|
"document", "extract", "analysis", "comprehensive", "detailed analysis",
|
||||||
|
|
||||||
|
# Multi-step reasoning
|
||||||
|
"plan", "strategy", "evaluate", "assess", "compare", "analyze",
|
||||||
|
|
||||||
|
# Complex business logic
|
||||||
|
"workflow", "task", "action", "validation", "review", "assessment",
|
||||||
|
|
||||||
|
# Critical decision making
|
||||||
|
"decision", "recommendation", "evaluation", "quality", "success criteria",
|
||||||
|
|
||||||
|
# Complex prompts
|
||||||
|
"JSON", "structured", "format", "validation", "improvements", "quality_score"
|
||||||
|
]
|
||||||
|
|
||||||
|
# Define simple task patterns that can use basic AI
|
||||||
|
simple_patterns = [
|
||||||
|
# Basic text processing
|
||||||
|
"summarize", "translate", "format", "convert", "extract text",
|
||||||
|
|
||||||
|
# Simple queries
|
||||||
|
"find", "search", "list", "get", "retrieve",
|
||||||
|
|
||||||
|
# Basic operations
|
||||||
|
"send", "upload", "download", "create", "delete",
|
||||||
|
|
||||||
|
# Simple responses
|
||||||
|
"confirm", "acknowledge", "status", "info"
|
||||||
|
]
|
||||||
|
|
||||||
|
# Check prompt and context for complexity indicators
|
||||||
|
combined_text = f"{prompt} {context}".lower()
|
||||||
|
|
||||||
|
# Count complex indicators
|
||||||
|
complex_count = sum(1 for pattern in complex_patterns if pattern in combined_text)
|
||||||
|
|
||||||
|
# Count simple indicators
|
||||||
|
simple_count = sum(1 for pattern in simple_patterns if pattern in combined_text)
|
||||||
|
|
||||||
|
# Additional complexity factors
|
||||||
|
prompt_length = len(prompt)
|
||||||
|
has_json_requirement = "json" in combined_text and ("{" in prompt or "}" in prompt)
|
||||||
|
has_structured_output = any(word in combined_text for word in ["format", "structure", "template"])
|
||||||
|
has_validation = any(word in combined_text for word in ["validate", "check", "verify", "quality"])
|
||||||
|
|
||||||
|
# Calculate complexity score
|
||||||
|
complexity_score = 0
|
||||||
|
complexity_score += complex_count * 2 # Complex patterns worth more
|
||||||
|
complexity_score += simple_count * 1 # Simple patterns worth less
|
||||||
|
complexity_score += (prompt_length > 1000) * 3 # Long prompts are complex
|
||||||
|
complexity_score += has_json_requirement * 5 # JSON requirements are complex
|
||||||
|
complexity_score += has_structured_output * 3 # Structured output is complex
|
||||||
|
complexity_score += has_validation * 4 # Validation is complex
|
||||||
|
|
||||||
|
# Determine AI choice based on complexity score
|
||||||
|
if complexity_score >= 5:
|
||||||
|
logger.debug(f"Complex task detected (score: {complexity_score}) - using advanced AI for {context}")
|
||||||
|
return "advanced"
|
||||||
|
else:
|
||||||
|
logger.debug(f"Simple task detected (score: {complexity_score}) - using basic AI for {context}")
|
||||||
|
return "basic"
|
||||||
|
|
||||||
|
def _getForcedAIChoice(self, context: str) -> str:
|
||||||
|
"""Get forced AI choice for specific contexts (can be overridden)"""
|
||||||
|
|
||||||
|
# Define contexts that always use advanced AI
|
||||||
|
advanced_contexts = [
|
||||||
|
"task_planning", # Always use advanced for task planning
|
||||||
|
"action_generation", # Always use advanced for action generation
|
||||||
|
"result_review", # Always use advanced for result review
|
||||||
|
"task_completion_validation" # Always use advanced for validation
|
||||||
|
]
|
||||||
|
|
||||||
|
# Define contexts that always use basic AI
|
||||||
|
basic_contexts = [
|
||||||
|
"summarize", # Always use basic for summarization
|
||||||
|
"translate", # Always use basic for translation
|
||||||
|
"format", # Always use basic for formatting
|
||||||
|
"status", # Always use basic for status updates
|
||||||
|
"info" # Always use basic for info queries
|
||||||
|
]
|
||||||
|
|
||||||
|
context_lower = context.lower()
|
||||||
|
|
||||||
|
# Check for forced advanced AI
|
||||||
|
for advanced_context in advanced_contexts:
|
||||||
|
if advanced_context in context_lower:
|
||||||
|
return "advanced"
|
||||||
|
|
||||||
|
# Check for forced basic AI
|
||||||
|
for basic_context in basic_contexts:
|
||||||
|
if basic_context in context_lower:
|
||||||
|
return "basic"
|
||||||
|
|
||||||
|
# No forced choice
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _isCriticalTask(self, context: str) -> bool:
|
||||||
|
"""Determine if a simple task is critical enough to warrant advanced AI upgrade"""
|
||||||
|
|
||||||
|
# Define critical task patterns
|
||||||
|
critical_patterns = [
|
||||||
|
# Workflow critical tasks
|
||||||
|
"task_planning", "workflow", "critical", "essential",
|
||||||
|
|
||||||
|
# User-facing decisions
|
||||||
|
"decision", "recommendation", "evaluation", "assessment",
|
||||||
|
|
||||||
|
# Quality-sensitive tasks
|
||||||
|
"quality", "validation", "review", "check",
|
||||||
|
|
||||||
|
# Business-critical operations
|
||||||
|
"business", "strategy", "planning", "analysis"
|
||||||
|
]
|
||||||
|
|
||||||
|
context_lower = context.lower()
|
||||||
|
|
||||||
|
# Check if context contains critical patterns
|
||||||
|
is_critical = any(pattern in context_lower for pattern in critical_patterns)
|
||||||
|
|
||||||
|
if is_critical:
|
||||||
|
logger.debug(f"Critical task detected - {context}")
|
||||||
|
|
||||||
|
return is_critical
|
||||||
|
|
||||||
def _recordAIFailure(self, error: str):
|
def _recordAIFailure(self, error: str):
|
||||||
"""Record AI failure for circuit breaker"""
|
"""Record AI failure for circuit breaker"""
|
||||||
self.ai_failure_count += 1
|
self.ai_failure_count += 1
|
||||||
|
|
@ -1753,14 +2005,35 @@ NOTE: Respond with ONLY the JSON object. Do not include any explanatory text."""
|
||||||
logger.error(f"Error parsing review response: {str(e)}")
|
logger.error(f"Error parsing review response: {str(e)}")
|
||||||
return {'status': 'failed', 'reason': f'Parse error: {str(e)}'}
|
return {'status': 'failed', 'reason': f'Parse error: {str(e)}'}
|
||||||
|
|
||||||
async def _callAI(self, prompt: str, context: str) -> str:
|
async def _callAdvancedAI(self, prompt: str, context: str) -> str:
|
||||||
"""Call AI service with prompt"""
|
"""Call advanced AI service with prompt (primary method)"""
|
||||||
try:
|
try:
|
||||||
# Use the existing AI call mechanism through service
|
# Use the advanced AI call mechanism through service
|
||||||
if hasattr(self, 'service') and self.service:
|
if hasattr(self, 'service') and self.service:
|
||||||
# Ensure service is properly initialized
|
# Try advanced AI call first
|
||||||
|
if hasattr(self.service, 'callAiTextAdvanced'):
|
||||||
|
response = await self.service.callAiTextAdvanced(prompt)
|
||||||
|
logger.debug(f"Advanced AI call successful for {context}")
|
||||||
|
return response
|
||||||
|
else:
|
||||||
|
raise Exception("Service does not have callAiTextAdvanced method")
|
||||||
|
else:
|
||||||
|
raise Exception("No service available for AI calls")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
error_message = str(e)
|
||||||
|
logger.warning(f"Advanced AI call failed for {context}: {error_message}")
|
||||||
|
raise Exception(f"Advanced AI failed: {error_message}")
|
||||||
|
|
||||||
|
async def _callStandardAI(self, prompt: str, context: str) -> str:
|
||||||
|
"""Call standard AI service with prompt (fallback method)"""
|
||||||
|
try:
|
||||||
|
# Use the standard AI call mechanism through service
|
||||||
|
if hasattr(self, 'service') and self.service:
|
||||||
|
# Try standard AI call as fallback
|
||||||
if hasattr(self.service, 'callAiTextBasic'):
|
if hasattr(self.service, 'callAiTextBasic'):
|
||||||
response = await self.service.callAiTextBasic(prompt)
|
response = await self.service.callAiTextBasic(prompt)
|
||||||
|
logger.debug(f"Standard AI call successful for {context}")
|
||||||
return response
|
return response
|
||||||
else:
|
else:
|
||||||
raise Exception("Service does not have callAiTextBasic method")
|
raise Exception("Service does not have callAiTextBasic method")
|
||||||
|
|
@ -1768,8 +2041,26 @@ NOTE: Respond with ONLY the JSON object. Do not include any explanatory text."""
|
||||||
raise Exception("No service available for AI calls")
|
raise Exception("No service available for AI calls")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error calling AI for {context}: {str(e)}")
|
error_message = str(e)
|
||||||
raise
|
logger.error(f"Standard AI call failed for {context}: {error_message}")
|
||||||
|
|
||||||
|
# Provide more specific error messages based on the error type
|
||||||
|
if "overloaded" in error_message.lower() or "529" in error_message:
|
||||||
|
detailed_error = "AI service is currently overloaded. Please try again in a few minutes."
|
||||||
|
elif "rate limit" in error_message.lower() or "429" in error_message:
|
||||||
|
detailed_error = "Rate limit exceeded. Please wait before making another request."
|
||||||
|
elif "api key" in error_message.lower() or "401" in error_message:
|
||||||
|
detailed_error = "Invalid API key. Please check your AI service configuration."
|
||||||
|
elif "timeout" in error_message.lower():
|
||||||
|
detailed_error = "AI service request timed out. Please try again."
|
||||||
|
else:
|
||||||
|
detailed_error = f"AI service error: {error_message}"
|
||||||
|
|
||||||
|
raise Exception(detailed_error)
|
||||||
|
|
||||||
|
async def _callAI(self, prompt: str, context: str) -> str:
|
||||||
|
"""Call AI service with prompt (legacy method - now uses the circuit breaker)"""
|
||||||
|
return await self._callAIWithCircuitBreaker(prompt, context)
|
||||||
|
|
||||||
# ===== WORKFLOW FEEDBACK GENERATION =====
|
# ===== WORKFLOW FEEDBACK GENERATION =====
|
||||||
|
|
||||||
|
|
@ -1866,11 +2157,20 @@ NOTE: Respond with ONLY the JSON object. Do not include any explanatory text."""
|
||||||
task_description = task_step.description
|
task_description = task_step.description
|
||||||
logger.info(f"=== PROCESSING TASK {i+1}/{len(task_plan.tasks)}: {task_description} ===")
|
logger.info(f"=== PROCESSING TASK {i+1}/{len(task_plan.tasks)}: {task_description} ===")
|
||||||
|
|
||||||
# Create user-friendly task start log
|
# Create user-friendly task start log with action details
|
||||||
progress = 20 + (i * 60 // len(task_plan.tasks))
|
progress = 20 + (i * 60 // len(task_plan.tasks))
|
||||||
|
|
||||||
|
# Get actions for this task to show in the log
|
||||||
|
task_actions = await self.defineTaskActions(task_step, workflow, previous_results)
|
||||||
|
action_details = []
|
||||||
|
for j, action in enumerate(task_actions):
|
||||||
|
action_details.append(f" {j+1}. {action.execMethod}.{action.execAction}")
|
||||||
|
|
||||||
|
action_summary = "\n".join(action_details) if action_details else " (Actions will be generated during execution)"
|
||||||
|
|
||||||
self.chatInterface.createWorkflowLog({
|
self.chatInterface.createWorkflowLog({
|
||||||
"workflowId": workflow.id,
|
"workflowId": workflow.id,
|
||||||
"message": f"Executing task {i+1}/{len(task_plan.tasks)}: {task_description}",
|
"message": f"Executing task {i+1}/{len(task_plan.tasks)}: {task_description}\nActions to be executed:\n{action_summary}",
|
||||||
"type": "info",
|
"type": "info",
|
||||||
"status": "running",
|
"status": "running",
|
||||||
"progress": progress,
|
"progress": progress,
|
||||||
|
|
@ -2032,6 +2332,16 @@ Please review the task requirements and try again with different input or approa
|
||||||
for i, action in enumerate(actions):
|
for i, action in enumerate(actions):
|
||||||
logger.info(f"Executing action {i+1}/{len(actions)}: {action.execMethod}.{action.execAction}")
|
logger.info(f"Executing action {i+1}/{len(actions)}: {action.execMethod}.{action.execAction}")
|
||||||
|
|
||||||
|
# Add action start log
|
||||||
|
self.chatInterface.createWorkflowLog({
|
||||||
|
"workflowId": workflow.id,
|
||||||
|
"message": f"Starting action {i+1}/{len(actions)}: {action.execMethod}.{action.execAction}",
|
||||||
|
"type": "info",
|
||||||
|
"status": "running",
|
||||||
|
"progress": 0,
|
||||||
|
"agentName": "System"
|
||||||
|
})
|
||||||
|
|
||||||
# Execute action with validation
|
# Execute action with validation
|
||||||
result = await self.executeActionWithValidation(action, workflow, context)
|
result = await self.executeActionWithValidation(action, workflow, context)
|
||||||
|
|
||||||
|
|
@ -2039,7 +2349,37 @@ Please review the task requirements and try again with different input or approa
|
||||||
state.addSuccessfulAction(result)
|
state.addSuccessfulAction(result)
|
||||||
logger.info(f"Action {i+1} completed successfully")
|
logger.info(f"Action {i+1} completed successfully")
|
||||||
|
|
||||||
|
# Add action completion message with result documents
|
||||||
|
documents_info = ""
|
||||||
|
if result.documents and len(result.documents) > 0:
|
||||||
|
doc_names = [doc.filename if hasattr(doc, 'filename') else f"Document {j+1}"
|
||||||
|
for j, doc in enumerate(result.documents)]
|
||||||
|
documents_info = f"\n📄 Generated documents: {', '.join(doc_names)}"
|
||||||
|
|
||||||
|
# Create completion message
|
||||||
|
completion_message = f"✅ Action {i+1}/{len(actions)} completed: {action.execMethod}.{action.execAction}{documents_info}"
|
||||||
|
|
||||||
|
# Add as log entry instead of message
|
||||||
|
self.chatInterface.createWorkflowLog({
|
||||||
|
"workflowId": workflow.id,
|
||||||
|
"message": completion_message,
|
||||||
|
"type": "success",
|
||||||
|
"status": "running",
|
||||||
|
"progress": 0,
|
||||||
|
"agentName": "System"
|
||||||
|
})
|
||||||
|
|
||||||
elif result.validation.get('status') == 'retry':
|
elif result.validation.get('status') == 'retry':
|
||||||
|
# Add retry log
|
||||||
|
self.chatInterface.createWorkflowLog({
|
||||||
|
"workflowId": workflow.id,
|
||||||
|
"message": f"🔄 Action {i+1}/{len(actions)} needs retry: {action.execMethod}.{action.execAction}",
|
||||||
|
"type": "warning",
|
||||||
|
"status": "running",
|
||||||
|
"progress": 0,
|
||||||
|
"agentName": "System"
|
||||||
|
})
|
||||||
|
|
||||||
# Retry individual action
|
# Retry individual action
|
||||||
improvements = result.validation.get('improvements', [])
|
improvements = result.validation.get('improvements', [])
|
||||||
retry_result = await self.retryActionWithImprovements(action, result, improvements)
|
retry_result = await self.retryActionWithImprovements(action, result, improvements)
|
||||||
|
|
@ -2047,15 +2387,51 @@ Please review the task requirements and try again with different input or approa
|
||||||
if retry_result.validation.get('status') == 'success':
|
if retry_result.validation.get('status') == 'success':
|
||||||
state.addSuccessfulAction(retry_result)
|
state.addSuccessfulAction(retry_result)
|
||||||
logger.info(f"Action {i+1} retry successful")
|
logger.info(f"Action {i+1} retry successful")
|
||||||
|
|
||||||
|
# Add retry success log
|
||||||
|
retry_documents_info = ""
|
||||||
|
if retry_result.documents and len(retry_result.documents) > 0:
|
||||||
|
doc_names = [doc.filename if hasattr(doc, 'filename') else f"Document {j+1}"
|
||||||
|
for j, doc in enumerate(retry_result.documents)]
|
||||||
|
retry_documents_info = f"\n📄 Generated documents: {', '.join(doc_names)}"
|
||||||
|
|
||||||
|
self.chatInterface.createWorkflowLog({
|
||||||
|
"workflowId": workflow.id,
|
||||||
|
"message": f"✅ Action {i+1}/{len(actions)} retry successful: {action.execMethod}.{action.execAction}{retry_documents_info}",
|
||||||
|
"type": "success",
|
||||||
|
"status": "running",
|
||||||
|
"progress": 0,
|
||||||
|
"agentName": "System"
|
||||||
|
})
|
||||||
else:
|
else:
|
||||||
state.addFailedAction(retry_result)
|
state.addFailedAction(retry_result)
|
||||||
logger.error(f"Action {i+1} retry failed")
|
logger.error(f"Action {i+1} retry failed")
|
||||||
|
|
||||||
|
# Add retry failure log
|
||||||
|
self.chatInterface.createWorkflowLog({
|
||||||
|
"workflowId": workflow.id,
|
||||||
|
"message": f"❌ Action {i+1}/{len(actions)} retry failed: {action.execMethod}.{action.execAction}",
|
||||||
|
"type": "error",
|
||||||
|
"status": "running",
|
||||||
|
"progress": 0,
|
||||||
|
"agentName": "System"
|
||||||
|
})
|
||||||
# Action failed after retry - stop task execution and regenerate
|
# Action failed after retry - stop task execution and regenerate
|
||||||
break
|
break
|
||||||
|
|
||||||
else: # fail
|
else: # fail
|
||||||
state.addFailedAction(result)
|
state.addFailedAction(result)
|
||||||
logger.error(f"Action {i+1} failed validation - stopping task execution")
|
logger.error(f"Action {i+1} failed validation - stopping task execution")
|
||||||
|
|
||||||
|
# Add failure log
|
||||||
|
self.chatInterface.createWorkflowLog({
|
||||||
|
"workflowId": workflow.id,
|
||||||
|
"message": f"❌ Action {i+1}/{len(actions)} failed: {action.execMethod}.{action.execAction}",
|
||||||
|
"type": "error",
|
||||||
|
"status": "running",
|
||||||
|
"progress": 0,
|
||||||
|
"agentName": "System"
|
||||||
|
})
|
||||||
# Action failed - stop task execution and regenerate
|
# Action failed - stop task execution and regenerate
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|
@ -2412,12 +2788,47 @@ NOTE: Respond with ONLY the JSON object. Do not include any explanatory text."""
|
||||||
if validation['status'] == 'success':
|
if validation['status'] == 'success':
|
||||||
action.setSuccess()
|
action.setSuccess()
|
||||||
logger.info(f"Action {action.execMethod}.{action.execAction} validated successfully")
|
logger.info(f"Action {action.execMethod}.{action.execAction} validated successfully")
|
||||||
|
|
||||||
|
# Only create action message if documents were produced
|
||||||
|
if result.documents and len(result.documents) > 0:
|
||||||
|
await self._createActionMessage(action, result, workflow, action.execResultLabel)
|
||||||
|
else:
|
||||||
|
# Add validation success log instead of message
|
||||||
|
self.chatInterface.createWorkflowLog({
|
||||||
|
"workflowId": workflow.id,
|
||||||
|
"message": f"✅ Action validation successful: {action.execMethod}.{action.execAction}",
|
||||||
|
"type": "success",
|
||||||
|
"status": "running",
|
||||||
|
"progress": 0,
|
||||||
|
"agentName": "System"
|
||||||
|
})
|
||||||
|
|
||||||
elif validation['status'] == 'retry':
|
elif validation['status'] == 'retry':
|
||||||
action.status = TaskStatus.PENDING # Keep pending for retry
|
action.status = TaskStatus.PENDING # Keep pending for retry
|
||||||
logger.warning(f"Action {action.execMethod}.{action.execAction} needs retry: {validation.get('reason', 'No reason')}")
|
logger.warning(f"Action {action.execMethod}.{action.execAction} needs retry: {validation.get('reason', 'No reason')}")
|
||||||
|
|
||||||
|
# Add validation retry log
|
||||||
|
self.chatInterface.createWorkflowLog({
|
||||||
|
"workflowId": workflow.id,
|
||||||
|
"message": f"🔄 Action validation requires retry: {action.execMethod}.{action.execAction} - {validation.get('reason', 'No reason')}",
|
||||||
|
"type": "warning",
|
||||||
|
"status": "running",
|
||||||
|
"progress": 0,
|
||||||
|
"agentName": "System"
|
||||||
|
})
|
||||||
else: # fail
|
else: # fail
|
||||||
action.setError(validation.get('reason', 'Action failed validation'))
|
action.setError(validation.get('reason', 'Action failed validation'))
|
||||||
logger.error(f"Action {action.execMethod}.{action.execAction} failed validation: {validation.get('reason', 'No reason')}")
|
logger.error(f"Action {action.execMethod}.{action.execAction} failed validation: {validation.get('reason', 'No reason')}")
|
||||||
|
|
||||||
|
# Add validation failure log
|
||||||
|
self.chatInterface.createWorkflowLog({
|
||||||
|
"workflowId": workflow.id,
|
||||||
|
"message": f"❌ Action validation failed: {action.execMethod}.{action.execAction} - {validation.get('reason', 'No reason')}",
|
||||||
|
"type": "error",
|
||||||
|
"status": "running",
|
||||||
|
"progress": 0,
|
||||||
|
"agentName": "System"
|
||||||
|
})
|
||||||
|
|
||||||
return action_result
|
return action_result
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -76,8 +76,22 @@ class AiAnthropic:
|
||||||
)
|
)
|
||||||
|
|
||||||
if response.status_code != 200:
|
if response.status_code != 200:
|
||||||
logger.error(f"Anthropic API error: {response.status_code} - {response.text}")
|
error_detail = f"Anthropic API error: {response.status_code} - {response.text}"
|
||||||
raise HTTPException(status_code=500, detail="Error communicating with Anthropic API")
|
logger.error(error_detail)
|
||||||
|
|
||||||
|
# Provide more specific error messages based on status code
|
||||||
|
if response.status_code == 529:
|
||||||
|
error_message = "Anthropic API is currently overloaded. Please try again in a few minutes."
|
||||||
|
elif response.status_code == 429:
|
||||||
|
error_message = "Rate limit exceeded. Please wait before making another request."
|
||||||
|
elif response.status_code == 401:
|
||||||
|
error_message = "Invalid API key. Please check your Anthropic API configuration."
|
||||||
|
elif response.status_code == 400:
|
||||||
|
error_message = f"Invalid request to Anthropic API: {response.text}"
|
||||||
|
else:
|
||||||
|
error_message = f"Anthropic API error ({response.status_code}): {response.text}"
|
||||||
|
|
||||||
|
raise HTTPException(status_code=500, detail=error_message)
|
||||||
|
|
||||||
# Parse response
|
# Parse response
|
||||||
anthropicResponse = response.json()
|
anthropicResponse = response.json()
|
||||||
|
|
|
||||||
|
|
@ -269,6 +269,10 @@ class ChatObjects:
|
||||||
|
|
||||||
# Get messages for this workflow
|
# Get messages for this workflow
|
||||||
messages = self.db.getRecordset("workflowMessages", recordFilter={"workflowId": workflowId})
|
messages = self.db.getRecordset("workflowMessages", recordFilter={"workflowId": workflowId})
|
||||||
|
|
||||||
|
# Sort messages by publishedAt timestamp to ensure chronological order
|
||||||
|
messages.sort(key=lambda x: x.get("publishedAt", x.get("timestamp", "0")))
|
||||||
|
|
||||||
return [ChatMessage(**msg) for msg in messages]
|
return [ChatMessage(**msg) for msg in messages]
|
||||||
|
|
||||||
def createWorkflowMessage(self, messageData: Dict[str, Any]) -> ChatMessage:
|
def createWorkflowMessage(self, messageData: Dict[str, Any]) -> ChatMessage:
|
||||||
|
|
@ -545,7 +549,12 @@ class ChatObjects:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# Get logs for this workflow
|
# Get logs for this workflow
|
||||||
return [ChatLog(**log) for log in self.db.getRecordset("workflowLogs", recordFilter={"workflowId": workflowId})]
|
logs = self.db.getRecordset("workflowLogs", recordFilter={"workflowId": workflowId})
|
||||||
|
|
||||||
|
# Sort logs by timestamp (Unix timestamps)
|
||||||
|
logs.sort(key=lambda x: float(x.get("timestamp", 0)))
|
||||||
|
|
||||||
|
return [ChatLog(**log) for log in logs]
|
||||||
|
|
||||||
def updateWorkflowStats(self, workflowId: str, bytesSent: int = 0, bytesReceived: int = 0) -> bool:
|
def updateWorkflowStats(self, workflowId: str, bytesSent: int = 0, bytesReceived: int = 0) -> bool:
|
||||||
"""Updates workflow statistics during execution with incremental values."""
|
"""Updates workflow statistics during execution with incremental values."""
|
||||||
|
|
@ -867,12 +876,25 @@ class ChatObjects:
|
||||||
raise ValueError(f"Workflow {workflowId} not found")
|
raise ValueError(f"Workflow {workflowId} not found")
|
||||||
|
|
||||||
|
|
||||||
# Update workflow
|
# Update workflow - set status back to running for resumed workflows
|
||||||
self.updateWorkflow(workflowId, {
|
self.updateWorkflow(workflowId, {
|
||||||
|
"status": "running", # Set status back to running for resumed workflows
|
||||||
"lastActivity": currentTime,
|
"lastActivity": currentTime,
|
||||||
"currentRound": workflow.currentRound + 1
|
"currentRound": workflow.currentRound + 1
|
||||||
})
|
})
|
||||||
|
|
||||||
|
# Update the workflow object status as well
|
||||||
|
workflow.status = "running"
|
||||||
|
|
||||||
|
# Add log entry for workflow resumption
|
||||||
|
self.createWorkflowLog({
|
||||||
|
"workflowId": workflowId,
|
||||||
|
"message": f"Workflow resumed (round {workflow.currentRound + 1})",
|
||||||
|
"type": "info",
|
||||||
|
"status": "running",
|
||||||
|
"progress": 0
|
||||||
|
})
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# Create new workflow
|
# Create new workflow
|
||||||
workflowData = {
|
workflowData = {
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,7 @@ import requests
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
import time
|
import time
|
||||||
import uuid
|
import uuid
|
||||||
|
import json # Added for JSON parsing
|
||||||
|
|
||||||
from modules.chat.methodBase import MethodBase, ActionResult, action
|
from modules.chat.methodBase import MethodBase, ActionResult, action
|
||||||
from modules.shared.configuration import APP_CONFIG
|
from modules.shared.configuration import APP_CONFIG
|
||||||
|
|
@ -38,40 +39,105 @@ class MethodWeb(MethodBase):
|
||||||
self.timeout = 30
|
self.timeout = 30
|
||||||
|
|
||||||
def _readUrl(self, url: str) -> BeautifulSoup:
|
def _readUrl(self, url: str) -> BeautifulSoup:
|
||||||
"""Read a URL and return a BeautifulSoup parser for the content"""
|
"""Read a URL and return a BeautifulSoup parser for the content with enhanced error handling"""
|
||||||
if not url or not url.startswith(('http://', 'https://')):
|
if not url or not url.startswith(('http://', 'https://')):
|
||||||
|
logger.error(f"Invalid URL: {url}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
# Enhanced headers to mimic real browser
|
||||||
headers = {
|
headers = {
|
||||||
'User-Agent': self.user_agent,
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||||
'Accept': 'text/html,application/xhtml+xml,application/xml',
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
|
||||||
'Accept-Language': 'en-US,en;q=0.9',
|
'Accept-Language': 'en-US,en;q=0.9,de;q=0.8',
|
||||||
|
'Accept-Encoding': 'gzip, deflate, br',
|
||||||
|
'DNT': '1',
|
||||||
|
'Connection': 'keep-alive',
|
||||||
|
'Upgrade-Insecure-Requests': '1',
|
||||||
|
'Sec-Fetch-Dest': 'document',
|
||||||
|
'Sec-Fetch-Mode': 'navigate',
|
||||||
|
'Sec-Fetch-Site': 'none',
|
||||||
|
'Cache-Control': 'max-age=0'
|
||||||
}
|
}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Initial request
|
# Use session for better connection handling
|
||||||
response = requests.get(url, headers=headers, timeout=self.timeout)
|
session = requests.Session()
|
||||||
|
session.headers.update(headers)
|
||||||
|
|
||||||
# Handling for status 202
|
# Initial request with allow_redirects
|
||||||
if response.status_code == 202:
|
response = session.get(url, timeout=self.timeout, allow_redirects=True)
|
||||||
# Retry with backoff
|
|
||||||
backoff_times = [0.5, 1.0, 2.0, 5.0]
|
# Handle various status codes
|
||||||
|
if response.status_code == 200:
|
||||||
|
# Success - parse content
|
||||||
|
logger.debug(f"Successfully read URL: {url}")
|
||||||
|
return BeautifulSoup(response.text, 'html.parser')
|
||||||
|
|
||||||
|
elif response.status_code == 202:
|
||||||
|
# Accepted - retry with backoff
|
||||||
|
logger.info(f"Status 202 for {url}, retrying with backoff...")
|
||||||
|
backoff_times = [1.0, 2.0, 5.0, 10.0]
|
||||||
|
|
||||||
for wait_time in backoff_times:
|
for wait_time in backoff_times:
|
||||||
time.sleep(wait_time)
|
time.sleep(wait_time)
|
||||||
response = requests.get(url, headers=headers, timeout=self.timeout)
|
retry_response = session.get(url, timeout=self.timeout, allow_redirects=True)
|
||||||
|
|
||||||
if response.status_code != 202:
|
if retry_response.status_code == 200:
|
||||||
|
logger.debug(f"Successfully read URL after retry: {url}")
|
||||||
|
return BeautifulSoup(retry_response.text, 'html.parser')
|
||||||
|
elif retry_response.status_code != 202:
|
||||||
break
|
break
|
||||||
|
|
||||||
# Raise for error status codes
|
logger.warning(f"Failed to read URL after retries: {url}")
|
||||||
response.raise_for_status()
|
return None
|
||||||
|
|
||||||
# Parse HTML
|
elif response.status_code in [301, 302, 307, 308]:
|
||||||
return BeautifulSoup(response.text, 'html.parser')
|
# Redirect - should be handled by allow_redirects=True
|
||||||
|
logger.warning(f"Unexpected redirect status {response.status_code} for {url}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
elif response.status_code == 403:
|
||||||
|
# Forbidden - try with different user agent
|
||||||
|
logger.warning(f"403 Forbidden for {url}, trying with different user agent...")
|
||||||
|
headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
|
||||||
|
session.headers.update(headers)
|
||||||
|
|
||||||
|
retry_response = session.get(url, timeout=self.timeout, allow_redirects=True)
|
||||||
|
if retry_response.status_code == 200:
|
||||||
|
logger.debug(f"Successfully read URL with different user agent: {url}")
|
||||||
|
return BeautifulSoup(retry_response.text, 'html.parser')
|
||||||
|
else:
|
||||||
|
logger.error(f"Still getting {retry_response.status_code} for {url}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
elif response.status_code == 429:
|
||||||
|
# Rate limited - wait and retry
|
||||||
|
logger.warning(f"Rate limited for {url}, waiting 30 seconds...")
|
||||||
|
time.sleep(30)
|
||||||
|
retry_response = session.get(url, timeout=self.timeout, allow_redirects=True)
|
||||||
|
if retry_response.status_code == 200:
|
||||||
|
logger.debug(f"Successfully read URL after rate limit: {url}")
|
||||||
|
return BeautifulSoup(retry_response.text, 'html.parser')
|
||||||
|
else:
|
||||||
|
logger.error(f"Still getting {retry_response.status_code} after rate limit wait for {url}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
else:
|
||||||
|
# Other error status codes
|
||||||
|
logger.error(f"HTTP {response.status_code} for {url}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
except requests.exceptions.Timeout:
|
||||||
|
logger.error(f"Timeout reading URL: {url}")
|
||||||
|
return None
|
||||||
|
except requests.exceptions.ConnectionError:
|
||||||
|
logger.error(f"Connection error reading URL: {url}")
|
||||||
|
return None
|
||||||
|
except requests.exceptions.RequestException as e:
|
||||||
|
logger.error(f"Request error reading URL {url}: {str(e)}")
|
||||||
|
return None
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error reading URL {url}: {str(e)}")
|
logger.error(f"Unexpected error reading URL {url}: {str(e)}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _extractTitle(self, soup: BeautifulSoup, url: str) -> str:
|
def _extractTitle(self, soup: BeautifulSoup, url: str) -> str:
|
||||||
|
|
@ -91,32 +157,109 @@ class MethodWeb(MethodBase):
|
||||||
|
|
||||||
return title
|
return title
|
||||||
|
|
||||||
def _extractMainContent(self, soup: BeautifulSoup, max_chars: int = 10000) -> str:
|
def _extractMainContent(self, soup: BeautifulSoup, max_chars: int = 50000) -> str:
|
||||||
"""Extract the main content from an HTML page"""
|
"""Extract the main content from an HTML page with enhanced content detection"""
|
||||||
if not soup:
|
if not soup:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
# Try to find main content elements in priority order
|
# Try to find main content elements in priority order with more selectors
|
||||||
main_content = None
|
main_content = None
|
||||||
for selector in ['main', 'article', '#content', '.content', '#main', '.main']:
|
content_selectors = [
|
||||||
|
'main',
|
||||||
|
'article',
|
||||||
|
'#content',
|
||||||
|
'.content',
|
||||||
|
'#main',
|
||||||
|
'.main',
|
||||||
|
'.post-content',
|
||||||
|
'.entry-content',
|
||||||
|
'.article-content',
|
||||||
|
'.page-content',
|
||||||
|
'[role="main"]',
|
||||||
|
'.container',
|
||||||
|
'.wrapper'
|
||||||
|
]
|
||||||
|
|
||||||
|
for selector in content_selectors:
|
||||||
content = soup.select_one(selector)
|
content = soup.select_one(selector)
|
||||||
if content:
|
if content:
|
||||||
main_content = content
|
main_content = content
|
||||||
|
logger.debug(f"Found main content using selector: {selector}")
|
||||||
break
|
break
|
||||||
|
|
||||||
# If no main content found, use the body
|
# If no main content found, use the body
|
||||||
if not main_content:
|
if not main_content:
|
||||||
main_content = soup.find('body') or soup
|
main_content = soup.find('body') or soup
|
||||||
|
logger.debug("Using body as main content")
|
||||||
|
|
||||||
# Remove script, style, nav, footer elements that don't contribute to main content
|
# Create a copy to avoid modifying the original
|
||||||
for element in main_content.select('script, style, nav, footer, header, aside, .sidebar, #sidebar, .comments, #comments, .advertisement, .ads, iframe'):
|
content_copy = main_content.copy()
|
||||||
element.extract()
|
|
||||||
|
|
||||||
# Extract text content
|
# Remove elements that don't contribute to main content (less aggressive)
|
||||||
text_content = main_content.get_text(separator=' ', strip=True)
|
elements_to_remove = [
|
||||||
|
'script', 'style', 'noscript',
|
||||||
|
'nav', 'footer', 'header', 'aside',
|
||||||
|
'.sidebar', '#sidebar', '.comments', '#comments',
|
||||||
|
'.advertisement', '.ads', '.ad', '.banner',
|
||||||
|
'iframe', '.social-share', '.share-buttons',
|
||||||
|
'.breadcrumb', '.breadcrumbs', '.pagination',
|
||||||
|
'.related-posts', '.related-articles',
|
||||||
|
'.newsletter', '.subscribe', '.signup',
|
||||||
|
'.cookie-notice', '.privacy-notice',
|
||||||
|
'.popup', '.modal', '.overlay'
|
||||||
|
]
|
||||||
|
|
||||||
# Limit to max_chars
|
for selector in elements_to_remove:
|
||||||
return text_content[:max_chars]
|
for element in content_copy.select(selector):
|
||||||
|
element.extract()
|
||||||
|
|
||||||
|
# Extract text content with better formatting
|
||||||
|
text_content = content_copy.get_text(separator='\n', strip=True)
|
||||||
|
|
||||||
|
# Clean up the text
|
||||||
|
lines = text_content.split('\n')
|
||||||
|
cleaned_lines = []
|
||||||
|
|
||||||
|
for line in lines:
|
||||||
|
line = line.strip()
|
||||||
|
if line and len(line) > 10: # Only keep meaningful lines
|
||||||
|
cleaned_lines.append(line)
|
||||||
|
|
||||||
|
# Join lines with proper spacing
|
||||||
|
cleaned_content = '\n\n'.join(cleaned_lines)
|
||||||
|
|
||||||
|
# If content is too short, try alternative extraction
|
||||||
|
if len(cleaned_content) < 500:
|
||||||
|
logger.debug("Content too short, trying alternative extraction...")
|
||||||
|
|
||||||
|
# Try to extract from all paragraphs
|
||||||
|
paragraphs = soup.find_all(['p', 'div', 'section'])
|
||||||
|
alt_content = []
|
||||||
|
|
||||||
|
for p in paragraphs:
|
||||||
|
text = p.get_text(strip=True)
|
||||||
|
if text and len(text) > 20: # Only meaningful paragraphs
|
||||||
|
alt_content.append(text)
|
||||||
|
|
||||||
|
if alt_content:
|
||||||
|
cleaned_content = '\n\n'.join(alt_content[:20]) # Limit to first 20 paragraphs
|
||||||
|
|
||||||
|
# Limit to max_chars but preserve complete sentences
|
||||||
|
if len(cleaned_content) > max_chars:
|
||||||
|
# Try to cut at a sentence boundary
|
||||||
|
sentences = cleaned_content.split('. ')
|
||||||
|
truncated_content = ""
|
||||||
|
|
||||||
|
for sentence in sentences:
|
||||||
|
if len(truncated_content + sentence) < max_chars:
|
||||||
|
truncated_content += sentence + ". "
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
|
||||||
|
cleaned_content = truncated_content.strip()
|
||||||
|
|
||||||
|
logger.debug(f"Extracted {len(cleaned_content)} characters of content")
|
||||||
|
return cleaned_content
|
||||||
|
|
||||||
def _checkAccessibility(self, soup: BeautifulSoup) -> Dict[str, Any]:
|
def _checkAccessibility(self, soup: BeautifulSoup) -> Dict[str, Any]:
|
||||||
"""Check basic accessibility features"""
|
"""Check basic accessibility features"""
|
||||||
|
|
@ -214,10 +357,413 @@ class MethodWeb(MethodBase):
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def _detectJavaScriptRendering(self, soup: BeautifulSoup) -> bool:
|
||||||
|
"""Detect if a page likely requires JavaScript rendering"""
|
||||||
|
if not soup:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Check for common indicators of JavaScript-rendered content
|
||||||
|
indicators = [
|
||||||
|
# Angular, React, Vue indicators
|
||||||
|
soup.find('div', {'ng-app': True}),
|
||||||
|
soup.find('div', {'id': 'root'}),
|
||||||
|
soup.find('div', {'id': 'app'}),
|
||||||
|
soup.find('div', {'id': 'react-root'}),
|
||||||
|
|
||||||
|
# SPA indicators
|
||||||
|
soup.find('div', {'id': 'spa-root'}),
|
||||||
|
soup.find('div', {'class': 'spa-container'}),
|
||||||
|
|
||||||
|
# Modern framework indicators
|
||||||
|
soup.find('div', {'data-reactroot': True}),
|
||||||
|
soup.find('div', {'data-ng-controller': True}),
|
||||||
|
|
||||||
|
# Empty content with scripts
|
||||||
|
len(soup.get_text(strip=True)) < 100 and len(soup.find_all('script')) > 2
|
||||||
|
]
|
||||||
|
|
||||||
|
return any(indicators)
|
||||||
|
|
||||||
|
def _extractMetaInformation(self, soup: BeautifulSoup, url: str) -> Dict[str, Any]:
|
||||||
|
"""Extract meta information from the page"""
|
||||||
|
meta_info = {
|
||||||
|
"url": url,
|
||||||
|
"title": self._extractTitle(soup, url),
|
||||||
|
"description": "",
|
||||||
|
"keywords": "",
|
||||||
|
"author": "",
|
||||||
|
"language": "",
|
||||||
|
"robots": "",
|
||||||
|
"viewport": "",
|
||||||
|
"charset": "",
|
||||||
|
"canonical": ""
|
||||||
|
}
|
||||||
|
|
||||||
|
# Extract meta tags
|
||||||
|
meta_tags = soup.find_all('meta')
|
||||||
|
for meta in meta_tags:
|
||||||
|
name = meta.get('name', '').lower()
|
||||||
|
property = meta.get('property', '').lower()
|
||||||
|
content = meta.get('content', '')
|
||||||
|
|
||||||
|
if name == 'description' or property == 'og:description':
|
||||||
|
meta_info['description'] = content
|
||||||
|
elif name == 'keywords':
|
||||||
|
meta_info['keywords'] = content
|
||||||
|
elif name == 'author':
|
||||||
|
meta_info['author'] = content
|
||||||
|
elif name == 'language':
|
||||||
|
meta_info['language'] = content
|
||||||
|
elif name == 'robots':
|
||||||
|
meta_info['robots'] = content
|
||||||
|
elif name == 'viewport':
|
||||||
|
meta_info['viewport'] = content
|
||||||
|
elif property == 'og:title':
|
||||||
|
meta_info['title'] = content
|
||||||
|
elif property == 'og:url':
|
||||||
|
meta_info['canonical'] = content
|
||||||
|
|
||||||
|
# Extract charset
|
||||||
|
charset_meta = soup.find('meta', charset=True)
|
||||||
|
if charset_meta:
|
||||||
|
meta_info['charset'] = charset_meta.get('charset', '')
|
||||||
|
|
||||||
|
# Extract canonical URL
|
||||||
|
canonical_link = soup.find('link', rel='canonical')
|
||||||
|
if canonical_link:
|
||||||
|
meta_info['canonical'] = canonical_link.get('href', '')
|
||||||
|
|
||||||
|
return meta_info
|
||||||
|
|
||||||
|
def _getAlternativeApproaches(self, url: str, requires_js: bool, content_length: int) -> List[str]:
|
||||||
|
"""Get alternative approaches for sites that are difficult to crawl"""
|
||||||
|
approaches = []
|
||||||
|
|
||||||
|
if requires_js:
|
||||||
|
approaches.extend([
|
||||||
|
"Site requires JavaScript rendering - consider using a headless browser",
|
||||||
|
"Try accessing the site's API endpoints directly",
|
||||||
|
"Look for RSS feeds or sitemaps",
|
||||||
|
"Check if the site has a mobile version that's easier to parse"
|
||||||
|
])
|
||||||
|
|
||||||
|
if content_length < 100:
|
||||||
|
approaches.extend([
|
||||||
|
"Site may have anti-bot protection - try with different user agents",
|
||||||
|
"Check if the site requires authentication",
|
||||||
|
"Look for alternative URLs (www vs non-www, http vs https)",
|
||||||
|
"Try accessing the site's robots.txt for crawling guidelines"
|
||||||
|
])
|
||||||
|
|
||||||
|
# Add general suggestions
|
||||||
|
approaches.extend([
|
||||||
|
"Use the web.search action to find alternative sources",
|
||||||
|
"Try the web.scrape action with specific CSS selectors",
|
||||||
|
"Check if the site has a public API or data export"
|
||||||
|
])
|
||||||
|
|
||||||
|
return approaches
|
||||||
|
|
||||||
|
async def _tryAdvancedAIWebResearch(self, action_type: str, parameters: Dict[str, Any]) -> Optional[Dict[str, Any]]:
|
||||||
|
"""
|
||||||
|
Try to get web research results using advanced AI first
|
||||||
|
|
||||||
|
Args:
|
||||||
|
action_type: Type of action ('crawl', 'scrape', or 'search')
|
||||||
|
parameters: Action parameters
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict with AI results if successful, None if AI call fails
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
# Create appropriate prompt based on action type
|
||||||
|
if action_type == "crawl":
|
||||||
|
prompt = self._createCrawlAIPrompt(parameters)
|
||||||
|
elif action_type == "scrape":
|
||||||
|
prompt = self._createScrapeAIPrompt(parameters)
|
||||||
|
elif action_type == "search":
|
||||||
|
prompt = self._createSearchAIPrompt(parameters)
|
||||||
|
else:
|
||||||
|
logger.warning(f"Unknown action type for AI research: {action_type}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Try advanced AI call
|
||||||
|
if hasattr(self.service, 'callAiTextAdvanced'):
|
||||||
|
logger.info(f"Attempting advanced AI web research for {action_type}")
|
||||||
|
response = await self.service.callAiTextAdvanced(prompt)
|
||||||
|
|
||||||
|
# Parse the AI response
|
||||||
|
parsed_result = self._parseAIWebResponse(response, action_type)
|
||||||
|
if parsed_result:
|
||||||
|
logger.info(f"Advanced AI web research successful for {action_type}")
|
||||||
|
return parsed_result
|
||||||
|
else:
|
||||||
|
logger.warning(f"Failed to parse AI response for {action_type}")
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
logger.warning("Service does not have callAiTextAdvanced method")
|
||||||
|
return None
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Advanced AI web research failed for {action_type}: {str(e)}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _createCrawlAIPrompt(self, parameters: Dict[str, Any]) -> str:
|
||||||
|
"""Create AI prompt for web crawling"""
|
||||||
|
urls = parameters.get("urls", [])
|
||||||
|
maxDepth = parameters.get("maxDepth", 2)
|
||||||
|
includeImages = parameters.get("includeImages", False)
|
||||||
|
followLinks = parameters.get("followLinks", True)
|
||||||
|
|
||||||
|
prompt = f"""
|
||||||
|
You are an advanced AI research assistant with comprehensive knowledge about websites, companies, and online content. Please provide detailed information about the following URLs based on your extensive training data and knowledge.
|
||||||
|
|
||||||
|
URLs to research: {urls}
|
||||||
|
Max depth: {maxDepth}
|
||||||
|
Include images: {includeImages}
|
||||||
|
Follow links: {followLinks}
|
||||||
|
|
||||||
|
For each URL, please provide comprehensive information including:
|
||||||
|
1. Company/organization information and background
|
||||||
|
2. Main business activities and services
|
||||||
|
3. Key personnel and leadership
|
||||||
|
4. Contact information and locations
|
||||||
|
5. Recent news and developments
|
||||||
|
6. Industry analysis and market position
|
||||||
|
7. Related companies and partnerships
|
||||||
|
8. Website structure and key pages
|
||||||
|
9. Business model and revenue streams
|
||||||
|
10. Regulatory compliance and certifications
|
||||||
|
|
||||||
|
For each URL, provide:
|
||||||
|
- url: The original URL
|
||||||
|
- title: Company/organization name
|
||||||
|
- content: Comprehensive description and analysis
|
||||||
|
- content_length: Number of characters in content
|
||||||
|
- meta_info: Business information object
|
||||||
|
- links: Related companies and important connections
|
||||||
|
- images: Company logos or key visuals if known
|
||||||
|
- requires_javascript: Boolean (usually false for static info)
|
||||||
|
- alternative_approaches: Additional research suggestions
|
||||||
|
- timestamp: Current timestamp
|
||||||
|
|
||||||
|
Return the results in this exact JSON format:
|
||||||
|
{{
|
||||||
|
"urls": {urls},
|
||||||
|
"maxDepth": {maxDepth},
|
||||||
|
"includeImages": {includeImages},
|
||||||
|
"followLinks": {followLinks},
|
||||||
|
"crawlResults": [
|
||||||
|
{{
|
||||||
|
"url": "url_here",
|
||||||
|
"depth": {maxDepth},
|
||||||
|
"followLinks": {followLinks},
|
||||||
|
"extractContent": true,
|
||||||
|
"title": "company_name",
|
||||||
|
"content": "comprehensive_company_analysis",
|
||||||
|
"content_length": 1234,
|
||||||
|
"meta_info": {{
|
||||||
|
"url": "url_here",
|
||||||
|
"title": "company_name",
|
||||||
|
"description": "business_description",
|
||||||
|
"keywords": "industry_keywords",
|
||||||
|
"author": "company_info",
|
||||||
|
"language": "language_code",
|
||||||
|
"robots": "robots_info",
|
||||||
|
"viewport": "viewport_info",
|
||||||
|
"charset": "charset_info",
|
||||||
|
"canonical": "canonical_url"
|
||||||
|
}},
|
||||||
|
"links": [
|
||||||
|
{{
|
||||||
|
"url": "related_company_url",
|
||||||
|
"text": "company_name"
|
||||||
|
}}
|
||||||
|
],
|
||||||
|
"images": [
|
||||||
|
{{
|
||||||
|
"src": "logo_url",
|
||||||
|
"alt": "company_logo",
|
||||||
|
"title": "company_name",
|
||||||
|
"width": "width_value",
|
||||||
|
"height": "height_value"
|
||||||
|
}}
|
||||||
|
],
|
||||||
|
"requires_javascript": false,
|
||||||
|
"alternative_approaches": ["approach1", "approach2"],
|
||||||
|
"timestamp": "2024-01-01T00:00:00Z"
|
||||||
|
}}
|
||||||
|
],
|
||||||
|
"summary": {{
|
||||||
|
"total_urls": {len(urls)},
|
||||||
|
"successful_crawls": 0,
|
||||||
|
"failed_crawls": 0,
|
||||||
|
"total_content_chars": 0
|
||||||
|
}},
|
||||||
|
"timestamp": "2024-01-01T00:00:00Z"
|
||||||
|
}}
|
||||||
|
|
||||||
|
Please provide accurate, comprehensive information about each company/organization based on your knowledge. If you don't have specific information about a URL, provide general industry analysis and suggest alternative research approaches.
|
||||||
|
"""
|
||||||
|
return prompt
|
||||||
|
|
||||||
|
def _createScrapeAIPrompt(self, parameters: Dict[str, Any]) -> str:
|
||||||
|
"""Create AI prompt for web scraping"""
|
||||||
|
url = parameters.get("url")
|
||||||
|
selectors = parameters.get("selectors", {})
|
||||||
|
format = parameters.get("format", "json")
|
||||||
|
|
||||||
|
prompt = f"""
|
||||||
|
You are an advanced AI research assistant with comprehensive knowledge about websites, companies, and online content. Please provide detailed information about the following URL and the specific data requested based on your extensive training data and knowledge.
|
||||||
|
|
||||||
|
URL to research: {url}
|
||||||
|
Data selectors: {selectors}
|
||||||
|
Output format: {format}
|
||||||
|
|
||||||
|
Please provide comprehensive information including:
|
||||||
|
1. Company/organization background and history
|
||||||
|
2. Business activities and services offered
|
||||||
|
3. Key personnel and leadership information
|
||||||
|
4. Financial information and performance data
|
||||||
|
5. Market position and competitive analysis
|
||||||
|
6. Recent news and developments
|
||||||
|
7. Contact information and locations
|
||||||
|
8. Industry trends and insights
|
||||||
|
9. Related companies and partnerships
|
||||||
|
10. Regulatory and compliance information
|
||||||
|
|
||||||
|
For each data selector requested, provide relevant information in the specified format (text, html, or json).
|
||||||
|
|
||||||
|
Return the results in this exact JSON format:
|
||||||
|
{{
|
||||||
|
"url": "{url}",
|
||||||
|
"selectors": {selectors},
|
||||||
|
"format": "{format}",
|
||||||
|
"scrapedData": {{
|
||||||
|
"url": "{url}",
|
||||||
|
"selectors": {selectors},
|
||||||
|
"format": "{format}",
|
||||||
|
"content": {{
|
||||||
|
"company_info": ["comprehensive_company_analysis"],
|
||||||
|
"business_activities": ["detailed_business_description"],
|
||||||
|
"leadership": ["key_personnel_information"],
|
||||||
|
"financial_data": ["financial_performance_analysis"],
|
||||||
|
"market_position": ["competitive_analysis"],
|
||||||
|
"recent_news": ["latest_developments"],
|
||||||
|
"contact_info": ["contact_details"],
|
||||||
|
"industry_insights": ["market_trends"],
|
||||||
|
"partnerships": ["related_companies"],
|
||||||
|
"compliance": ["regulatory_information"]
|
||||||
|
}},
|
||||||
|
"timestamp": "2024-01-01T00:00:00Z"
|
||||||
|
}},
|
||||||
|
"timestamp": "2024-01-01T00:00:00Z"
|
||||||
|
}}
|
||||||
|
|
||||||
|
Please provide accurate, comprehensive information about the company/organization based on your knowledge. If you don't have specific information about the URL, provide general industry analysis and suggest alternative research approaches.
|
||||||
|
"""
|
||||||
|
return prompt
|
||||||
|
|
||||||
|
def _createSearchAIPrompt(self, parameters: Dict[str, Any]) -> str:
|
||||||
|
"""Create AI prompt for web search"""
|
||||||
|
query = parameters.get("query")
|
||||||
|
engine = parameters.get("engine", "google")
|
||||||
|
maxResults = parameters.get("maxResults", 10)
|
||||||
|
filter = parameters.get("filter")
|
||||||
|
|
||||||
|
prompt = f"""
|
||||||
|
You are an advanced AI research assistant with comprehensive knowledge about companies, industries, and business information. Please provide detailed information about the following search query based on your extensive training data and knowledge.
|
||||||
|
|
||||||
|
Search query: {query}
|
||||||
|
Search engine: {engine}
|
||||||
|
Max results: {maxResults}
|
||||||
|
Filter: {filter}
|
||||||
|
|
||||||
|
Please provide comprehensive research results including:
|
||||||
|
1. Relevant company/organization information
|
||||||
|
2. Industry analysis and market insights
|
||||||
|
3. Key personnel and leadership details
|
||||||
|
4. Business activities and services
|
||||||
|
5. Financial performance and metrics
|
||||||
|
6. Recent news and developments
|
||||||
|
7. Competitive landscape analysis
|
||||||
|
8. Market trends and opportunities
|
||||||
|
9. Regulatory and compliance information
|
||||||
|
10. Related companies and partnerships
|
||||||
|
|
||||||
|
For each search result, provide:
|
||||||
|
- title: Company/organization name
|
||||||
|
- url: Official website or primary source
|
||||||
|
- snippet: Brief description and key highlights
|
||||||
|
- content: Comprehensive analysis and insights
|
||||||
|
|
||||||
|
Return the results in this exact JSON format:
|
||||||
|
{{
|
||||||
|
"query": "{query}",
|
||||||
|
"engine": "{engine}",
|
||||||
|
"maxResults": {maxResults},
|
||||||
|
"filter": "{filter}",
|
||||||
|
"searchResults": {{
|
||||||
|
"query": "{query}",
|
||||||
|
"maxResults": {maxResults},
|
||||||
|
"results": [
|
||||||
|
{{
|
||||||
|
"title": "company_name",
|
||||||
|
"url": "official_website",
|
||||||
|
"snippet": "brief_description",
|
||||||
|
"content": "comprehensive_analysis"
|
||||||
|
}}
|
||||||
|
],
|
||||||
|
"totalFound": 0,
|
||||||
|
"timestamp": "2024-01-01T00:00:00Z"
|
||||||
|
}},
|
||||||
|
"timestamp": "2024-01-01T00:00:00Z"
|
||||||
|
}}
|
||||||
|
|
||||||
|
Please provide accurate, comprehensive information about the search query based on your knowledge. If you don't have specific information about the query, provide general industry analysis and suggest alternative research approaches.
|
||||||
|
"""
|
||||||
|
return prompt
|
||||||
|
|
||||||
|
def _parseAIWebResponse(self, response: str, action_type: str) -> Optional[Dict[str, Any]]:
|
||||||
|
"""Parse AI response into structured data"""
|
||||||
|
try:
|
||||||
|
# Extract JSON from response
|
||||||
|
json_start = response.find('{')
|
||||||
|
json_end = response.rfind('}') + 1
|
||||||
|
if json_start == -1 or json_end == 0:
|
||||||
|
logger.warning(f"No JSON found in AI response: {response}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
json_str = response[json_start:json_end]
|
||||||
|
parsed_data = json.loads(json_str)
|
||||||
|
|
||||||
|
# Validate basic structure based on action type
|
||||||
|
if action_type == "crawl":
|
||||||
|
if "crawlResults" not in parsed_data:
|
||||||
|
logger.warning("Invalid crawl response structure")
|
||||||
|
return None
|
||||||
|
elif action_type == "scrape":
|
||||||
|
if "scrapedData" not in parsed_data:
|
||||||
|
logger.warning("Invalid scrape response structure")
|
||||||
|
return None
|
||||||
|
elif action_type == "search":
|
||||||
|
if "searchResults" not in parsed_data:
|
||||||
|
logger.warning("Invalid search response structure")
|
||||||
|
return None
|
||||||
|
|
||||||
|
return parsed_data
|
||||||
|
|
||||||
|
except json.JSONDecodeError as e:
|
||||||
|
logger.warning(f"Failed to parse AI response JSON: {str(e)}")
|
||||||
|
return None
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Error parsing AI response: {str(e)}")
|
||||||
|
return None
|
||||||
|
|
||||||
@action
|
@action
|
||||||
async def crawl(self, parameters: Dict[str, Any]) -> ActionResult:
|
async def crawl(self, parameters: Dict[str, Any]) -> ActionResult:
|
||||||
"""
|
"""
|
||||||
Crawl web pages and extract content
|
Crawl web pages and extract content with enhanced error handling and content detection
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
urls (List[str]): List of URLs to crawl
|
urls (List[str]): List of URLs to crawl
|
||||||
|
|
@ -240,23 +786,76 @@ class MethodWeb(MethodBase):
|
||||||
error="URLs are required"
|
error="URLs are required"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Try advanced AI research first
|
||||||
|
ai_result = await self._tryAdvancedAIWebResearch("crawl", parameters)
|
||||||
|
if ai_result:
|
||||||
|
logger.info("Using advanced AI web research for crawl")
|
||||||
|
# Reconstruct the result data from the AI response
|
||||||
|
result_data = {
|
||||||
|
"urls": ai_result.get("urls", []),
|
||||||
|
"maxDepth": ai_result.get("maxDepth", 2),
|
||||||
|
"includeImages": ai_result.get("includeImages", False),
|
||||||
|
"followLinks": ai_result.get("followLinks", True),
|
||||||
|
"crawlResults": ai_result.get("crawlResults", []),
|
||||||
|
"summary": ai_result.get("summary", {}),
|
||||||
|
"timestamp": ai_result.get("timestamp", datetime.now(UTC).isoformat())
|
||||||
|
}
|
||||||
|
return self._createResult(
|
||||||
|
success=True,
|
||||||
|
data={
|
||||||
|
"documents": [
|
||||||
|
{
|
||||||
|
"documentName": f"web_crawl_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.json",
|
||||||
|
"documentData": result_data,
|
||||||
|
"mimeType": "application/json"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
logger.info("Advanced AI web research failed, falling back to regular web crawling")
|
||||||
|
|
||||||
# Crawl each URL
|
# Crawl each URL
|
||||||
crawl_results = []
|
crawl_results = []
|
||||||
|
|
||||||
for url in urls:
|
for url in urls:
|
||||||
try:
|
try:
|
||||||
# Read the URL
|
logger.info(f"Crawling URL: {url}")
|
||||||
|
|
||||||
|
# Read the URL with enhanced error handling
|
||||||
soup = self._readUrl(url)
|
soup = self._readUrl(url)
|
||||||
if not soup:
|
if not soup:
|
||||||
|
logger.error(f"Failed to read URL: {url}")
|
||||||
crawl_results.append({
|
crawl_results.append({
|
||||||
"error": "Failed to read URL",
|
"error": "Failed to read URL - check if the site is accessible and not blocking crawlers",
|
||||||
"url": url
|
"url": url,
|
||||||
|
"suggestions": [
|
||||||
|
"Try accessing the URL directly in a browser",
|
||||||
|
"Check if the site requires JavaScript",
|
||||||
|
"Verify the URL is correct and accessible"
|
||||||
|
]
|
||||||
})
|
})
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Extract basic information
|
# Extract comprehensive information
|
||||||
title = self._extractTitle(soup, url)
|
title = self._extractTitle(soup, url)
|
||||||
content = self._extractMainContent(soup) if True else ""
|
content = self._extractMainContent(soup)
|
||||||
|
meta_info = self._extractMetaInformation(soup, url)
|
||||||
|
|
||||||
|
# Check if content is meaningful
|
||||||
|
content_length = len(content)
|
||||||
|
if content_length < 100:
|
||||||
|
logger.warning(f"Very little content extracted from {url} ({content_length} chars)")
|
||||||
|
crawl_results.append({
|
||||||
|
"url": url,
|
||||||
|
"title": title,
|
||||||
|
"content": content,
|
||||||
|
"content_length": content_length,
|
||||||
|
"warning": "Very little content extracted - site may require JavaScript or have anti-bot protection",
|
||||||
|
"meta_info": meta_info,
|
||||||
|
"timestamp": datetime.now(UTC).isoformat()
|
||||||
|
})
|
||||||
|
continue
|
||||||
|
|
||||||
# Extract links if requested
|
# Extract links if requested
|
||||||
links = []
|
links = []
|
||||||
|
|
@ -264,21 +863,32 @@ class MethodWeb(MethodBase):
|
||||||
for link in soup.find_all('a', href=True):
|
for link in soup.find_all('a', href=True):
|
||||||
href = link.get('href')
|
href = link.get('href')
|
||||||
if href and href.startswith(('http://', 'https://')):
|
if href and href.startswith(('http://', 'https://')):
|
||||||
links.append({
|
link_text = link.get_text(strip=True)
|
||||||
'url': href,
|
if link_text: # Only include links with text
|
||||||
'text': link.get_text(strip=True)[:100]
|
links.append({
|
||||||
|
'url': href,
|
||||||
|
'text': link_text[:100]
|
||||||
|
})
|
||||||
|
|
||||||
|
# Extract images if requested
|
||||||
|
images = []
|
||||||
|
if includeImages:
|
||||||
|
for img in soup.find_all('img', src=True):
|
||||||
|
src = img.get('src')
|
||||||
|
if src:
|
||||||
|
images.append({
|
||||||
|
'src': src,
|
||||||
|
'alt': img.get('alt', ''),
|
||||||
|
'title': img.get('title', ''),
|
||||||
|
'width': img.get('width', ''),
|
||||||
|
'height': img.get('height', '')
|
||||||
})
|
})
|
||||||
|
|
||||||
# Extract images
|
# Check for JavaScript rendering requirements
|
||||||
images = []
|
requires_js = self._detectJavaScriptRendering(soup)
|
||||||
for img in soup.find_all('img', src=True):
|
|
||||||
src = img.get('src')
|
# Get alternative approaches if needed
|
||||||
if src:
|
alternative_approaches = self._getAlternativeApproaches(url, requires_js, content_length)
|
||||||
images.append({
|
|
||||||
'src': src,
|
|
||||||
'alt': img.get('alt', ''),
|
|
||||||
'title': img.get('title', '')
|
|
||||||
})
|
|
||||||
|
|
||||||
crawl_results.append({
|
crawl_results.append({
|
||||||
"url": url,
|
"url": url,
|
||||||
|
|
@ -287,16 +897,27 @@ class MethodWeb(MethodBase):
|
||||||
"extractContent": True,
|
"extractContent": True,
|
||||||
"title": title,
|
"title": title,
|
||||||
"content": content,
|
"content": content,
|
||||||
"links": links[:10], # Limit to first 10 links
|
"content_length": content_length,
|
||||||
"images": images[:10], # Limit to first 10 images
|
"meta_info": meta_info,
|
||||||
|
"links": links[:20], # Limit to first 20 links
|
||||||
|
"images": images[:20], # Limit to first 20 images
|
||||||
|
"requires_javascript": requires_js,
|
||||||
|
"alternative_approaches": alternative_approaches,
|
||||||
"timestamp": datetime.now(UTC).isoformat()
|
"timestamp": datetime.now(UTC).isoformat()
|
||||||
})
|
})
|
||||||
|
|
||||||
|
logger.info(f"Successfully crawled {url} - extracted {content_length} characters")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error crawling web page {url}: {str(e)}")
|
logger.error(f"Error crawling web page {url}: {str(e)}")
|
||||||
crawl_results.append({
|
crawl_results.append({
|
||||||
"error": str(e),
|
"error": str(e),
|
||||||
"url": url
|
"url": url,
|
||||||
|
"suggestions": [
|
||||||
|
"Check if the URL is accessible",
|
||||||
|
"Try with a different user agent",
|
||||||
|
"Verify the site doesn't block automated access"
|
||||||
|
]
|
||||||
})
|
})
|
||||||
|
|
||||||
# Create result data
|
# Create result data
|
||||||
|
|
@ -306,6 +927,12 @@ class MethodWeb(MethodBase):
|
||||||
"includeImages": includeImages,
|
"includeImages": includeImages,
|
||||||
"followLinks": followLinks,
|
"followLinks": followLinks,
|
||||||
"crawlResults": crawl_results,
|
"crawlResults": crawl_results,
|
||||||
|
"summary": {
|
||||||
|
"total_urls": len(urls),
|
||||||
|
"successful_crawls": len([r for r in crawl_results if "error" not in r]),
|
||||||
|
"failed_crawls": len([r for r in crawl_results if "error" in r]),
|
||||||
|
"total_content_chars": sum([r.get("content_length", 0) for r in crawl_results if "content_length" in r])
|
||||||
|
},
|
||||||
"timestamp": datetime.now(UTC).isoformat()
|
"timestamp": datetime.now(UTC).isoformat()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -367,6 +994,33 @@ class MethodWeb(MethodBase):
|
||||||
error="URL and selectors are required"
|
error="URL and selectors are required"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Try advanced AI research first
|
||||||
|
ai_result = await self._tryAdvancedAIWebResearch("scrape", parameters)
|
||||||
|
if ai_result:
|
||||||
|
logger.info("Using advanced AI web research for scrape")
|
||||||
|
# Reconstruct the result data from the AI response
|
||||||
|
result_data = {
|
||||||
|
"url": ai_result.get("url"),
|
||||||
|
"selectors": ai_result.get("selectors"),
|
||||||
|
"format": ai_result.get("format"),
|
||||||
|
"scrapedData": ai_result.get("scrapedData"),
|
||||||
|
"timestamp": ai_result.get("timestamp", datetime.now(UTC).isoformat())
|
||||||
|
}
|
||||||
|
return self._createResult(
|
||||||
|
success=True,
|
||||||
|
data={
|
||||||
|
"documents": [
|
||||||
|
{
|
||||||
|
"documentName": f"web_scrape_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.json",
|
||||||
|
"documentData": result_data,
|
||||||
|
"mimeType": "application/json"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
logger.info("Advanced AI web research failed, falling back to regular web scraping")
|
||||||
|
|
||||||
# Read the URL
|
# Read the URL
|
||||||
soup = self._readUrl(url)
|
soup = self._readUrl(url)
|
||||||
if not soup:
|
if not soup:
|
||||||
|
|
@ -478,6 +1132,34 @@ class MethodWeb(MethodBase):
|
||||||
error="Search query is required"
|
error="Search query is required"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Try advanced AI research first
|
||||||
|
ai_result = await self._tryAdvancedAIWebResearch("search", parameters)
|
||||||
|
if ai_result:
|
||||||
|
logger.info("Using advanced AI web research for search")
|
||||||
|
# Reconstruct the result data from the AI response
|
||||||
|
result_data = {
|
||||||
|
"query": ai_result.get("query"),
|
||||||
|
"engine": ai_result.get("engine"),
|
||||||
|
"maxResults": ai_result.get("maxResults"),
|
||||||
|
"filter": ai_result.get("filter"),
|
||||||
|
"searchResults": ai_result.get("searchResults"),
|
||||||
|
"timestamp": ai_result.get("timestamp", datetime.now(UTC).isoformat())
|
||||||
|
}
|
||||||
|
return self._createResult(
|
||||||
|
success=True,
|
||||||
|
data={
|
||||||
|
"documents": [
|
||||||
|
{
|
||||||
|
"documentName": f"web_search_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}.json",
|
||||||
|
"documentData": result_data,
|
||||||
|
"mimeType": "application/json"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
logger.info("Advanced AI web research failed, falling back to regular web search")
|
||||||
|
|
||||||
# Search web content using Google search via SerpAPI
|
# Search web content using Google search via SerpAPI
|
||||||
try:
|
try:
|
||||||
if not self.srcApikey:
|
if not self.srcApikey:
|
||||||
|
|
@ -601,94 +1283,3 @@ class MethodWeb(MethodBase):
|
||||||
error=str(e)
|
error=str(e)
|
||||||
)
|
)
|
||||||
|
|
||||||
@action
|
|
||||||
async def validate(self, parameters: Dict[str, Any]) -> ActionResult:
|
|
||||||
"""
|
|
||||||
Validate web pages for various criteria
|
|
||||||
|
|
||||||
Parameters:
|
|
||||||
url (str): URL to validate
|
|
||||||
checks (List[str], optional): Types of checks to perform (default: ["accessibility", "seo", "performance"])
|
|
||||||
expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
url = parameters.get("url")
|
|
||||||
checks = parameters.get("checks", ["accessibility", "seo", "performance"])
|
|
||||||
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
|
|
||||||
|
|
||||||
if not url:
|
|
||||||
return self._createResult(
|
|
||||||
success=False,
|
|
||||||
data={},
|
|
||||||
error="URL is required"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Read the URL
|
|
||||||
soup = self._readUrl(url)
|
|
||||||
if not soup:
|
|
||||||
return self._createResult(
|
|
||||||
success=False,
|
|
||||||
data={},
|
|
||||||
error="Failed to read URL"
|
|
||||||
)
|
|
||||||
|
|
||||||
validation_results = {}
|
|
||||||
|
|
||||||
for check in checks:
|
|
||||||
if check == "accessibility":
|
|
||||||
validation_results["accessibility"] = self._checkAccessibility(soup)
|
|
||||||
elif check == "seo":
|
|
||||||
validation_results["seo"] = self._checkSEO(soup)
|
|
||||||
elif check == "performance":
|
|
||||||
validation_results["performance"] = self._checkPerformance(soup, url)
|
|
||||||
else:
|
|
||||||
validation_results[check] = {"status": "unknown", "message": f"Unknown check type: {check}"}
|
|
||||||
|
|
||||||
validation_result = {
|
|
||||||
"url": url,
|
|
||||||
"checks": checks,
|
|
||||||
"results": validation_results,
|
|
||||||
"timestamp": datetime.now(UTC).isoformat()
|
|
||||||
}
|
|
||||||
|
|
||||||
# Create result data
|
|
||||||
result_data = {
|
|
||||||
"url": url,
|
|
||||||
"checks": checks,
|
|
||||||
"validationResult": validation_result,
|
|
||||||
"timestamp": datetime.now(UTC).isoformat()
|
|
||||||
}
|
|
||||||
|
|
||||||
# Determine output format based on expected formats
|
|
||||||
output_extension = ".json" # Default
|
|
||||||
output_mime_type = "application/json" # Default
|
|
||||||
|
|
||||||
if expectedDocumentFormats and len(expectedDocumentFormats) > 0:
|
|
||||||
# Use the first expected format
|
|
||||||
expected_format = expectedDocumentFormats[0]
|
|
||||||
output_extension = expected_format.get("extension", ".json")
|
|
||||||
output_mime_type = expected_format.get("mimeType", "application/json")
|
|
||||||
logger.info(f"Using expected format: {output_extension} ({output_mime_type})")
|
|
||||||
else:
|
|
||||||
logger.info("No expected format specified, using default .json format")
|
|
||||||
|
|
||||||
return self._createResult(
|
|
||||||
success=True,
|
|
||||||
data={
|
|
||||||
"documents": [
|
|
||||||
{
|
|
||||||
"documentName": f"web_validation_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S')}{output_extension}",
|
|
||||||
"documentData": result_data,
|
|
||||||
"mimeType": output_mime_type
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error validating web page: {str(e)}")
|
|
||||||
return self._createResult(
|
|
||||||
success=False,
|
|
||||||
data={},
|
|
||||||
error=str(e)
|
|
||||||
)
|
|
||||||
|
|
@ -170,7 +170,7 @@ class WorkflowManager:
|
||||||
# Add completion log entry
|
# Add completion log entry
|
||||||
self.chatInterface.createWorkflowLog({
|
self.chatInterface.createWorkflowLog({
|
||||||
"workflowId": workflow.id,
|
"workflowId": workflow.id,
|
||||||
"message": "Workflow completed successfully",
|
"message": "Workflow completed",
|
||||||
"type": "success",
|
"type": "success",
|
||||||
"status": "completed",
|
"status": "completed",
|
||||||
"progress": 100
|
"progress": 100
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue