backend running for mvp1
This commit is contained in:
parent
b0c45fb798
commit
c75a3b67ce
10 changed files with 2529 additions and 615 deletions
1014
gwserver/modules/BAKcoder.py
Normal file
1014
gwserver/modules/BAKcoder.py
Normal file
File diff suppressed because it is too large
Load diff
613
gwserver/modules/BAKwebcrawler.py
Normal file
613
gwserver/modules/BAKwebcrawler.py
Normal file
|
|
@ -0,0 +1,613 @@
|
||||||
|
"""
|
||||||
|
WebCrawler-Agent für die Recherche und Beschaffung von Informationen aus dem Web.
|
||||||
|
Angepasst für das refaktorisierte Core-Modul.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import random
|
||||||
|
import time
|
||||||
|
import traceback
|
||||||
|
from typing import List, Dict, Any, Optional, Union
|
||||||
|
import re
|
||||||
|
import uuid
|
||||||
|
from datetime import datetime
|
||||||
|
from urllib.parse import quote_plus, unquote
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import requests
|
||||||
|
from modules.agentservice_base import BaseAgent
|
||||||
|
from connectors.connector_aichat_openai import ChatService
|
||||||
|
from modules.agentservice_utils import WorkflowUtils, MessageUtils, LoggingUtils
|
||||||
|
from modules.agentservice_protocol import AgentMessage, AgentCommunicationProtocol
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
class WebcrawlerAgent(BaseAgent):
|
||||||
|
|
||||||
|
"""Agent für Web-Recherche und Informationsbeschaffung"""
|
||||||
|
|
||||||
|
def __init__(self):
    """Set up identity, document support, collaborators and crawl limits."""
    super().__init__()

    # Identity as shown in the agent registry
    self.id, self.name, self.type = "webcrawler", "Webscraper", "scraper"
    self.description = "Recherchiert Informationen im Web"
    self.capabilities = "web_search,information_retrieval,data_collection,source_verification,content_integration"
    self.result_format = "SearchResults"

    # Document support; the handler is injected later via set_document_handler()
    self.supports_documents = True
    self.document_capabilities = ["read", "create"]
    self.required_context = ["workflow_id"]
    self.document_handler = None

    # Collaborators: message protocol, chat backend, message helpers
    self.protocol = AgentCommunicationProtocol()
    self.chat_service = ChatService()
    self.message_utils = MessageUtils()

    # Crawl limits
    self.max_url = 3
    self.max_key = 3
    self.max_result = 3
    self.timeout = 10

    # Browser-like request headers
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Referer': 'https://www.google.com/',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }
    self.headers = browser_headers

    self.max_urls = 10
    self.max_content_length = 100000
|
||||||
|
|
||||||
|
def get_agent_info(self) -> Dict[str, Any]:
    """Return the parent's registry entry extended with the crawl limits.

    Returns:
        The dict produced by the base class, updated with a "metadata"
        entry holding ``max_url``, ``max_result`` and ``timeout``.
    """
    registry_entry = super().get_agent_info()
    crawl_limits = {
        "max_url": self.max_url,
        "max_result": self.max_result,
        "timeout": self.timeout,
    }
    registry_entry.update(metadata=crawl_limits)
    return registry_entry
|
||||||
|
|
||||||
|
def set_document_handler(self, document_handler):
    """Store the handler this agent should use for file operations.

    Args:
        document_handler: Object kept on ``self.document_handler``.
    """
    self.document_handler = document_handler
|
||||||
|
|
||||||
|
async def process_message(self, message: Dict[str, Any], context: Dict[str, Any] = None) -> Dict[str, Any]:
    """
    Process a message by running a web research for its content.

    Args:
        message: The message to process; its "content" is used as the query.
        context: Optional extra context. "workflow_id" and "log_func" are
            read from it when present.

    Returns:
        A response dict (role, content, agent_id, agent_type, agent_name,
        result_format, workflow_id). On failure "content" holds a markdown
        error report including the traceback instead of the research result.
    """
    ctx = context or {}

    # Resolve the workflow id from the context first, then the message.
    # The message is consulted even when a context without workflow_id is
    # passed, so the id never silently becomes None.
    workflow_id = ctx.get("workflow_id") or message.get("workflow_id") or "unknown"

    log_func = ctx.get("log_func")
    logging_utils = LoggingUtils(workflow_id, log_func)

    def _report_status(description: str, status: str, progress: float) -> None:
        """Send one protocol status update through log_func, if there is one."""
        if not log_func:
            return
        status_message = self.protocol.create_status_update_message(
            status_description=description,
            sender_id=self.id,
            status=status,
            progress=progress,
            context_id=workflow_id,
        )
        log_func(workflow_id, status_message.content, "info", self.id, self.name)

    _report_status("Starte Web-Recherche", "in_progress", 0.0)

    response = {
        "role": "assistant",
        "content": "",
        "agent_id": self.id,
        "agent_type": self.type,
        "agent_name": self.name,
        "result_format": self.result_format,
        "workflow_id": workflow_id,
    }

    try:
        # The prompt is only needed here for logging and progress text;
        # get_web_query() derives it again from the message.
        prompt = await self.get_prompt(message)
        logging_utils.info(f"Web-Recherche für: {prompt[:50]}...", "agents")
        _report_status(f"Recherchiere: {prompt[:30]}...", "in_progress", 0.3)

        web_query_result = await self.get_web_query(message)

        _report_status("Web-Recherche abgeschlossen", "completed", 1.0)
        response["content"] = web_query_result
        return response

    except Exception as e:
        error_msg = f"Fehler bei der Web-Recherche: {str(e)}"
        trace = traceback.format_exc()  # format once, reuse below
        logging_utils.error(error_msg, "error")

        # The created protocol message is not used further in this method.
        # NOTE(review): the call is kept in case the factory has side
        # effects; drop it if create_error_message is a pure constructor.
        self.protocol.create_error_message(
            error_description=error_msg,
            sender_id=self.id,
            error_type="web_search",
            error_details={"traceback": trace},
            context_id=workflow_id,
        )

        response["content"] = f"## Fehler bei der Web-Recherche\n\n{error_msg}\n\n```\n{trace}\n```"
        return response
|
||||||
|
|
||||||
|
def send_document_request(self, document_description: str, sender_id: str, receiver_id: str, filters: Dict[str, Any] = None, context_id: str = None) -> AgentMessage:
    """Build a document request message via the communication protocol.

    All arguments are forwarded unchanged, as keyword arguments, to
    ``self.protocol.create_document_request_message``.

    Returns:
        Whatever the protocol factory returns.
    """
    request_fields = {
        "document_description": document_description,
        "sender_id": sender_id,
        "receiver_id": receiver_id,
        "filters": filters,
        "context_id": context_id,
    }
    return self.protocol.create_document_request_message(**request_fields)
|
||||||
|
|
||||||
|
def send_result_message(self, result_content: str, sender_id: str, receiver_id: str, task_id: str,
                        output_data: Dict[str, Any] = None, context_id: str = None) -> AgentMessage:
    """Build a result message via the communication protocol.

    The arguments are forwarded as keyword arguments to
    ``self.protocol.create_result_message``; the result format is always
    "SearchResults".

    Returns:
        Whatever the protocol factory returns.
    """
    result_fields = {
        "result_content": result_content,
        "sender_id": sender_id,
        "receiver_id": receiver_id,
        "task_id": task_id,
        "output_data": output_data,
        "result_format": "SearchResults",
        "context_id": context_id,
    }
    return self.protocol.create_result_message(**result_fields)
|
||||||
|
|
||||||
|
async def get_prompt(self, message_context: Dict[str, Any]) -> str:
    """Return the stripped "content" of a message as the research prompt.

    Args:
        message_context: Message dict; only its "content" entry is read.

    Returns:
        The content without surrounding whitespace, or "" when the entry
        is missing, None or otherwise empty.
    """
    # `or ""` also covers an explicit `"content": None`, which would
    # otherwise fail on .strip().
    task = message_context.get("content") or ""
    return task.strip()
|
||||||
|
|
||||||
|
async def get_web_query(self, message_context: Dict[str, Any]) -> str:
    """Run the web research for a message and return a markdown report.

    Each search result is summarised by the chat service on its own; the
    per-result summaries are then condensed into one overall summary.

    Args:
        message_context: Message dict whose "content" is the research task.

    Returns:
        Markdown with a "Zusammenfassung" section and a
        "Detaillierte Ergebnisse" section. The latter contains
        "no data received" when no result could be processed.
    """
    prompt = await self.get_prompt(message_context)
    result_json = await self.run_web_query(prompt)
    clean_prompt = prompt.replace("'", "")

    detail_parts = []   # one formatted entry per processed result
    summary_parts = []  # per-result summaries feeding the overall summary

    result_count = len(result_json) if isinstance(result_json, list) else 0
    logger.info(f"Web analysis prompt '{prompt}' delivers {result_count} results.")

    if isinstance(result_json, list):
        total_tokens = 0

        for i, result in enumerate(result_json, 1):
            # Cap each page at ~15000 tokens
            result_data_limited = self.limit_text_for_api(result.get('data', ''), max_tokens=15000)

            web_answer_instructions = "\n".join([
                "",
                f"Fass das Resultat gemäss dem Auftrag zusammen in maximal rund 2000 Zeichen. Auftrag = '{clean_prompt}'",
                "Fasse die wichtigsten Erkenntnisse zusammen und setze sie in Bezug zur ursprünglichen Anfrage. Die Einleitung kannst Du weglassen.",
                "Achte darauf, nur relevante und qualitativ hochwertige Informationen zu extrahieren, welche einen Bezug zum Auftrag haben, und übersichtlich zu präsentieren. Vermittle ein ausgewogenes Bild der recherchierten Informationen.",
                "",
                "Dies ist das Resultat:",
                f"{result_data_limited}",
                "",
            ])

            # Stop once the accumulated budget of 60000 tokens would be exceeded
            instruction_tokens = self.count_tokens(web_answer_instructions)
            if total_tokens + instruction_tokens > 60000:
                logger.warning(f"Skipping result {i} to avoid exceeding token limit")
                break

            total_tokens += instruction_tokens

            content_text = await self.chat_service.call_api(
                messages=[
                    {
                        "role": "system",
                        "content": "Du bist ein Informationsanalyst, der Webinhalte präzise und relevant zusammenfasst."
                    },
                    {
                        "role": "user",
                        "content": web_answer_instructions
                    }
                ]
            )

            # Guard against an empty API reply and cap at ~2000 characters
            content_summary = (content_text or "")[:2000]
            detail_parts.append(
                f"\n\n[{i}] {result.get('title', 'No title')}"
                f"\nURL: {result.get('url', 'No URL')}"
                f"\nSnippet: {result.get('snippet', 'No description')}"
                f"\nContent: {content_summary}"
            )
            summary_parts.append(f"\n{content_summary}")

            # +100 as buffer for the formatting around the summary
            total_tokens += self.count_tokens(content_summary) + 100

    # The leading blank line keeps the fallback text off the heading line.
    result_data = "".join(detail_parts) or "\n\nno data received"
    summary_src = "".join(summary_parts)

    logger.info(f"Web analysis result sent {len(result_data)}B")

    # Overall summary across all processed results
    summary = ""
    if len(summary_src) > 1:
        summary_src_limited = self.limit_text_for_api(summary_src, max_tokens=10000)

        summary = await self.chat_service.call_api(
            messages=[
                {
                    "role": "system",
                    "content": "Du erstellst prägnante Zusammenfassungen von Rechercheergbnissen."
                },
                {
                    "role": "user",
                    "content": f"Bitte fasse diese Erkenntnisse in maximal 5-6 Sätzen zusammen: {summary_src_limited}\n"
                }
            ]
        )

    result = f"## Web-Recherche Ergebnisse\n\n### Zusammenfassung\n{summary}\n\n### Detaillierte Ergebnisse{result_data}"
    return result
|
||||||
|
|
||||||
|
async def run_web_query(self, prompt: str) -> List[Dict]:
    """Ask the chat service for a research strategy and execute it.

    The model is asked for a JSON object with the keys 'url' and 'skey';
    the parsed object is handed to scrape_json().

    Args:
        prompt: The research task. An empty prompt yields no results.

    Returns:
        The list of scraped results, or [] when the prompt is empty, the
        reply cannot be parsed as JSON, or scraping did not return a list.
    """
    if not prompt:
        return []

    clean_prompt = prompt.replace("'", "")
    ptext = "\n".join([
        f"Create a comprehensive web research strategy for the task = '{clean_prompt}'. Return the results as a Python dictionary with these specific keys. If specific url are provided and the task requires analysis only on the provided url, then leave 'skey' open.",
        "",
        f"'url': A list of maximum {self.max_url} specific URLs extracted from the task string.",
        "",
        f"'skey': A list of maximum {self.max_key} key sentences to search for on the web. These should be precise, diverse, and targeted to get the most relevant information.",
        "",
        "Format your response as a valid json object with these two keys. Do not include any explanatory text or markdown outside of the object definition.",
        "",
    ])

    content_text = await self.chat_service.call_api(
        messages=[
            {
                "role": "system",
                "content": "Du bist ein Webrecherche-Experte, der präzise Suchstrategien entwickelt."
            },
            {
                "role": "user",
                "content": ptext
            }
        ]
    )

    # Keep only the outermost {...} span. This drops markdown fences (with
    # or without a language tag or closing fence) and any explanatory text
    # around the object.
    cleaned = (content_text or "").strip()
    first_brace, last_brace = cleaned.find("{"), cleaned.rfind("}")
    if first_brace != -1 and last_brace > first_brace:
        cleaned = cleaned[first_brace:last_brace + 1]

    try:
        pjson = json.loads(cleaned)
    except json.JSONDecodeError as e:
        logger.error(f"Failed to parse JSON: {e}")
        logger.error(f"Cleaned content: {cleaned[:100]}...")
        return []

    # Logged only after parsing succeeded
    logger.info(f"Valid json received: {cleaned}")

    result_json = await self.scrape_json(pjson)
    if not isinstance(result_json, list):
        # scrape_json reports an invalid strategy as an error dict; callers
        # of this method are promised a list.
        logger.error(f"Scraping returned no result list: {result_json}")
        return []
    return result_json
|
||||||
|
|
||||||
|
async def scrape_json(self, research_strategy: Dict[str, List]) -> List[Dict]:
    """
    Scrape web content based on a research strategy.

    Args:
        research_strategy: A dictionary containing:
            - 'skey': List of search keywords
            - 'url': List of direct URLs to scrape
            Missing or null entries are treated as empty lists.

    Returns:
        A list of result dicts (title, url, snippet, data). For an invalid
        strategy a dict with an "error" key is returned instead; this
        differs from the annotation and is kept for existing callers.
    """
    logger.info("Starting JSON-based web scraping")
    results = []

    # Validate input structure
    if not isinstance(research_strategy, dict):
        logger.error("Invalid research_strategy format: not a dictionary")
        return {"error": "Invalid research_strategy format: not a dictionary"}

    # `or []` accepts null/empty values, e.g. when 'skey' is left open
    keys = research_strategy.get("skey") or []
    direct_urls = research_strategy.get("url") or []

    if not isinstance(keys, list) or not isinstance(direct_urls, list):
        logger.error("Invalid research_strategy format: keys, or url is not a list")
        return {"error": "Invalid research_strategy format: keys, or url is not a list"}

    # Run every keyword through the search engine
    for keyword in keys:
        logger.info(f"Processing keyword: {keyword}")
        found_results = self.search_web(keyword)  # list of dicts: title, url, snippet, data
        logger.info(f"... {len(found_results)} results found")
        results.extend(found_results)

    # URLs already covered by the keyword search; compared against the
    # 'url' field, since `results` holds dicts rather than URL strings.
    seen_urls = {entry.get("url") for entry in results if isinstance(entry, dict)}

    logger.info(f"Processing {len(direct_urls)} direct URLs")
    for url in direct_urls:
        if not isinstance(url, str) or not url.strip():
            logger.warning(f"Ignoring invalid direct URL entry: {url!r}")
            continue
        if url in seen_urls:
            logger.info(f"Skipping already scraped URL: {url}")
            continue
        seen_urls.add(url)

        soup = self.read_url(url)

        if isinstance(soup, BeautifulSoup):
            # Prefer <title>, fall back to the first <h1>
            title_tag = soup.find('title')
            title = title_tag.text.strip() if title_tag else ""
            if not title:
                h1_tag = soup.find('h1')
                title = h1_tag.text.strip() if h1_tag else ""
            results.append(self.parse_result(soup, title or "No title", url))
        else:
            # Not parsed HTML: record it without calling parse_result,
            # which needs a soup object.
            results.append({
                'title': "Error fetching page",
                'url': url,
                'snippet': 'No description',
                'data': str(soup),
            })

    logger.info(f"JSON scraping completed. Scraped {len(results)} URLs in total")
    return results
|
||||||
|
|
||||||
|
def extract_main_content(self, soup: BeautifulSoup, max_chars: int = 30000) -> str:
    """
    Return the readable main text of a page, cut to ``max_chars``.

    The first match among a fixed list of content selectors is used as
    container; without a match the <body> (or the whole document) is used.
    Boilerplate elements are detached from the passed soup before the text
    is read, so the soup is modified.

    Args:
        soup: Parsed page. Any other value is stringified and cut.
        max_chars: Maximum number of characters to return.

    Returns:
        The extracted text.
    """
    if not isinstance(soup, BeautifulSoup):
        return str(soup)[:max_chars]

    # Candidates in order of priority; evaluated lazily, first hit wins
    candidate_selectors = ('main', 'article', '#content', '.content', '#main', '.main')
    container = next(
        (node for node in map(soup.select_one, candidate_selectors) if node),
        None,
    )
    if not container:
        container = soup.find('body') or soup

    # Elements that do not contribute to the main content
    noise_selector = 'script, style, nav, footer, header, aside, .sidebar, #sidebar, .comments, #comments, .advertisement, .ads, iframe'
    for noise in container.select(noise_selector):
        noise.extract()

    page_text = container.get_text(separator=' ', strip=True)
    return page_text[:max_chars]
|
||||||
|
|
||||||
|
def tokenize_for_counting(self, text: str) -> List[str]:
    """
    Split text into rough tokens for estimating token usage.

    A token is either a run of word characters or a single character that
    is neither a word character nor whitespace. This only approximates
    model tokenization.

    Args:
        text: Input text

    Returns:
        List of tokens in order of appearance
    """
    token_pattern = r'\w+|[^\w\s]'
    return re.findall(token_pattern, text)
|
||||||
|
|
||||||
|
def count_tokens(self, text: str) -> int:
    """
    Estimate how many tokens a text contains.

    Args:
        text: Input text

    Returns:
        Number of tokens produced by tokenize_for_counting()
    """
    return len(self.tokenize_for_counting(text))
|
||||||
|
|
||||||
|
def limit_text_for_api(self, text: str, max_tokens: int = 60000) -> str:
    """
    Limit a text to a maximum number of (approximate) tokens.

    Text within the limit is returned unchanged. Longer text is cut right
    after the last allowed token and marked as truncated. The original
    characters are kept as they are, so URLs, punctuation and spacing are
    not altered by the cut.

    Args:
        text: Input text
        max_tokens: Maximum number of tokens allowed; values below 0 are
            treated as 0

    Returns:
        The text, possibly shortened and followed by
        "... [content truncated due to length]"; "" for empty input
    """
    if not text:
        return ""

    limit = max(max_tokens, 0)
    cut = 0  # end offset of the last token still inside the limit

    # Same token definition as tokenize_for_counting(); finditer provides
    # the offsets needed to slice the original text and stops early.
    for index, match in enumerate(re.finditer(r'\w+|[^\w\s]', text)):
        if index >= limit:
            return text[:cut].rstrip() + "... [content truncated due to length]"
        cut = match.end()

    # Fewer tokens than the limit: nothing to cut
    return text
|
||||||
|
|
||||||
|
def search_web(self, query: str) -> List[Dict]:
    """Search DuckDuckGo's HTML endpoint and fetch the pages it lists.

    Args:
        query: Search phrase.

    Returns:
        Result dicts with 'title', 'url', 'snippet' (taken from the search
        page) and 'data' (main text of the target page, at most 30000
        characters). [] when the search page contains no results.
    """
    # Local import: these helpers are only needed for link unwrapping here
    from urllib.parse import parse_qs, urlsplit

    def _resolve_target(href: str) -> str:
        """Turn a result link into an absolute URL, unwrapping redirect links."""
        if href.startswith('/d.js?q='):
            start = href.find('?q=') + 3  # skip '?q='
            end = href.find('&', start)
            target = unquote(href[start:end] if end != -1 else href[start:])
        else:
            # NOTE(review): the HTML endpoint appears to wrap targets as
            # ".../l/?uddg=<encoded url>" - confirm against live markup.
            # Links without that parameter are used as they are.
            wrapped = parse_qs(urlsplit(href).query).get('uddg')
            target = wrapped[0] if wrapped else href

        # requests needs an explicit scheme
        if target.startswith('//'):
            return 'https:' + target
        if not target.startswith(('http://', 'https://')):
            return 'https://' + target
        return target

    formatted_query = quote_plus(query)
    url = f"https://html.duckduckgo.com/html/?q={formatted_query}"

    search_results_soup = self.read_url(url)
    result_elements = search_results_soup.select('.result') if search_results_soup else []
    if not result_elements:
        logger.warning(f"Keine Suchergebnisse gefunden für: {query}")
        return []

    results = []
    for result in result_elements:
        title_element = result.select_one('.result__a')
        title = title_element.text.strip() if title_element else 'No title'

        href = title_element.get('href') if title_element else ''
        extracted_url = _resolve_target(href) if href else 'No URL'

        # Snippet comes straight from the search results page
        snippet_element = result.select_one('.result__snippet')
        snippet = snippet_element.text.strip() if snippet_element else 'No description'

        # Fetch the target page only when there is a link to follow
        if href:
            target_page_soup = self.read_url(extracted_url)
            content = self.extract_main_content(target_page_soup, max_chars=30000)
        else:
            content = ''

        results.append({
            'title': title,
            'url': extracted_url,
            'snippet': snippet,
            'data': content
        })

        # Stop once enough results were collected
        if len(results) >= self.max_result:
            break

    return results
|
||||||
|
|
||||||
|
def read_url(self, url: str) -> BeautifulSoup:
    """
    Fetch a URL and return a BeautifulSoup parser for its content.

    A 202 response is polled again after 0.5s, 1s, 2s and 5s until a
    different status arrives. Any failure (network error, HTTP error
    status, parsing problem) is logged and answered with an empty document.

    Args:
        url: The URL to read

    Returns:
        BeautifulSoup object with the page content, or an empty
        "<html><body></body></html>" document on errors
    """
    # NOTE(review): self.headers from __init__ is not used here; this
    # smaller header set is what has been sent so far and is kept as is.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml',
        'Accept-Language': 'en-US,en;q=0.9',
    }
    backoff_times = (0.5, 1.0, 2.0, 5.0)  # seconds before each retry

    try:
        # Use the configured timeout instead of a second hard-coded value
        response = requests.get(url, headers=headers, timeout=self.timeout)

        # Poll while the server answers 202
        if response.status_code == 202:
            for wait_time in backoff_times:
                time.sleep(wait_time)
                response = requests.get(url, headers=headers, timeout=self.timeout)
                if response.status_code != 202:
                    break

        # Raise for error statuses (4xx/5xx)
        response.raise_for_status()

        return BeautifulSoup(response.text, 'html.parser')

    except Exception as e:
        # Best effort: callers get an empty document, but the reason is
        # no longer lost.
        logger.warning(f"Could not read URL {url}: {e}")
        return BeautifulSoup("<html><body></body></html>", 'html.parser')
|
||||||
|
|
||||||
|
def parse_result(self, data: BeautifulSoup, title: str, url: str) -> Dict[str, str]:
    """Build a result dict for an already fetched page.

    Args:
        data: Parsed page.
        title: Title to store for the page.
        url: Address the page was fetched from.

    Returns:
        Dict with 'title', 'url', 'snippet' and 'data'. 'data' holds the
        main text from extract_main_content() (at most 30000 characters),
        the same form search_web() stores, rather than raw HTML markup.
    """
    # Read the description before extract_main_content() strips elements
    # from the soup.
    snippet_element = data.select_one('.result__snippet')
    if snippet_element:
        snippet = snippet_element.text.strip()
    else:
        # Ordinary pages have no search-result snippet; fall back to the
        # page's meta description.
        meta_description = data.find('meta', attrs={'name': 'description'})
        snippet = (meta_description.get('content') or '').strip() if meta_description else ''

    return {
        'title': title,
        'url': url,
        'snippet': snippet or 'No description',
        'data': self.extract_main_content(data, max_chars=30000),
    }
|
||||||
|
|
||||||
|
|
||||||
|
# Module-wide singleton, created on first request
_webcrawler_agent = None


def get_webcrawler_agent():
    """Return the shared WebcrawlerAgent, creating it on the first call."""
    global _webcrawler_agent
    if _webcrawler_agent is not None:
        return _webcrawler_agent
    _webcrawler_agent = WebcrawlerAgent()
    return _webcrawler_agent
|
||||||
|
|
@ -39,10 +39,14 @@ class AnalystAgent(BaseAgent):
|
||||||
self.capabilities = "data_analysis,pattern_recognition,statistics,visualization,data_interpretation"
|
self.capabilities = "data_analysis,pattern_recognition,statistics,visualization,data_interpretation"
|
||||||
self.result_format = "AnalysisReport"
|
self.result_format = "AnalysisReport"
|
||||||
|
|
||||||
|
# Initialize AI service
|
||||||
|
self.ai_service = None
|
||||||
|
|
||||||
# Document capabilities
|
# Document capabilities
|
||||||
self.supports_documents = True
|
self.supports_documents = True
|
||||||
self.document_capabilities = ["read", "analyze", "extract"]
|
self.document_capabilities = ["read", "analyze", "extract"]
|
||||||
self.required_context = ["data_source", "analysis_objectives"]
|
self.required_context = ["data_source", "analysis_objectives"]
|
||||||
|
self.document_handler = None
|
||||||
|
|
||||||
# Initialize protocol
|
# Initialize protocol
|
||||||
self.protocol = AgentCommunicationProtocol()
|
self.protocol = AgentCommunicationProtocol()
|
||||||
|
|
@ -69,6 +73,10 @@ class AnalystAgent(BaseAgent):
|
||||||
})
|
})
|
||||||
return info
|
return info
|
||||||
|
|
||||||
|
def set_document_handler(self, document_handler):
|
||||||
|
"""Set the document handler for file operations"""
|
||||||
|
self.document_handler = document_handler
|
||||||
|
|
||||||
async def process_message(self, message: Dict[str, Any], context: Dict[str, Any] = None) -> Dict[str, Any]:
|
async def process_message(self, message: Dict[str, Any], context: Dict[str, Any] = None) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Process a message and perform data analysis.
|
Process a message and perform data analysis.
|
||||||
|
|
@ -277,7 +285,7 @@ class AnalystAgent(BaseAgent):
|
||||||
for document in message.get("documents", []):
|
for document in message.get("documents", []):
|
||||||
source = document.get("source", {})
|
source = document.get("source", {})
|
||||||
filename = source.get("name", "")
|
filename = source.get("name", "")
|
||||||
file_id = source.get("id", "")
|
file_id = source.get("id", 0)
|
||||||
content_type = source.get("content_type", "")
|
content_type = source.get("content_type", "")
|
||||||
|
|
||||||
# Skip if not a recognizable data file
|
# Skip if not a recognizable data file
|
||||||
|
|
@ -288,7 +296,7 @@ class AnalystAgent(BaseAgent):
|
||||||
# Try to get file content through document handler first
|
# Try to get file content through document handler first
|
||||||
file_content = None
|
file_content = None
|
||||||
if self.document_handler:
|
if self.document_handler:
|
||||||
file_content = await self.document_handler.get_file_content(file_id)
|
file_content = self.document_handler.get_file_content_from_message(message, file_id=file_id)
|
||||||
|
|
||||||
# Process based on file type
|
# Process based on file type
|
||||||
if filename.lower().endswith('.csv'):
|
if filename.lower().endswith('.csv'):
|
||||||
|
|
@ -323,32 +331,41 @@ class AnalystAgent(BaseAgent):
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def _process_csv(self, file_content: bytes, filename: str) -> Optional[pd.DataFrame]:
|
def _process_csv(self, file_content: Union[bytes, str], filename: str) -> Optional[pd.DataFrame]:
|
||||||
"""Process CSV file content into a pandas DataFrame"""
|
"""Process CSV file content into a pandas DataFrame"""
|
||||||
if file_content is None:
|
if file_content is None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Try various encodings
|
# Handle the case where file_content is already a string
|
||||||
for encoding in ['utf-8', 'latin1', 'cp1252']:
|
if isinstance(file_content, str):
|
||||||
try:
|
text_content = file_content
|
||||||
# Use StringIO to create a file-like object
|
df = pd.read_csv(io.StringIO(text_content))
|
||||||
text_content = file_content.decode(encoding)
|
df = self._preprocess_dataframe(df)
|
||||||
df = pd.read_csv(io.StringIO(text_content))
|
return df
|
||||||
|
|
||||||
# Basic preprocessing
|
# Handle the case where file_content is bytes
|
||||||
df = self._preprocess_dataframe(df)
|
else:
|
||||||
return df
|
# Try various encodings
|
||||||
except UnicodeDecodeError:
|
for encoding in ['utf-8', 'latin1', 'cp1252']:
|
||||||
continue
|
try:
|
||||||
except Exception as e:
|
# Use StringIO to create a file-like object
|
||||||
logger.error(f"Error processing CSV with {encoding} encoding: {str(e)}")
|
text_content = file_content.decode(encoding)
|
||||||
|
df = pd.read_csv(io.StringIO(text_content))
|
||||||
|
|
||||||
# If all encodings fail, try one more time with errors='replace'
|
# Basic preprocessing
|
||||||
text_content = file_content.decode('utf-8', errors='replace')
|
df = self._preprocess_dataframe(df)
|
||||||
df = pd.read_csv(io.StringIO(text_content))
|
return df
|
||||||
df = self._preprocess_dataframe(df)
|
except UnicodeDecodeError:
|
||||||
return df
|
continue
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error processing CSV with {encoding} encoding: {str(e)}")
|
||||||
|
|
||||||
|
# If all encodings fail, try one more time with errors='replace'
|
||||||
|
text_content = file_content.decode('utf-8', errors='replace')
|
||||||
|
df = pd.read_csv(io.StringIO(text_content))
|
||||||
|
df = self._preprocess_dataframe(df)
|
||||||
|
return df
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to process CSV file {filename}: {str(e)}")
|
logger.error(f"Failed to process CSV file {filename}: {str(e)}")
|
||||||
|
|
@ -1482,6 +1499,7 @@ class AnalystAgent(BaseAgent):
|
||||||
Generated analysis
|
Generated analysis
|
||||||
"""
|
"""
|
||||||
if not self.ai_service:
|
if not self.ai_service:
|
||||||
|
logging.warning("AI service not available for analysis generation")
|
||||||
return f"## Data Analysis ({analysis_type})\n\nUnable to generate analysis: AI service not available."
|
return f"## Data Analysis ({analysis_type})\n\nUnable to generate analysis: AI service not available."
|
||||||
|
|
||||||
# Create specialized prompt based on analysis type
|
# Create specialized prompt based on analysis type
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
"""
|
"""
|
||||||
CoderAgent - A unified agent for developing and executing Python code.
|
CoderAgent - A unified agent for developing and executing Python code.
|
||||||
Includes code execution capabilities previously in separate modules.
|
Includes code execution capabilities previously in separate modules.
|
||||||
|
Enhanced with auto-correction loop for handling execution errors.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
|
@ -13,6 +14,7 @@ import subprocess
|
||||||
import tempfile
|
import tempfile
|
||||||
import shutil
|
import shutil
|
||||||
import sys
|
import sys
|
||||||
|
import pandas as pd
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import List, Dict, Any, Optional, Tuple
|
from typing import List, Dict, Any, Optional, Tuple
|
||||||
|
|
||||||
|
|
@ -23,7 +25,9 @@ from modules.agentservice_protocol import AgentMessage, AgentCommunicationProtoc
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Existing SimpleCodeExecutor class remains unchanged
|
||||||
class SimpleCodeExecutor:
|
class SimpleCodeExecutor:
|
||||||
|
# ... existing code ...
|
||||||
"""
|
"""
|
||||||
A simplified executor that runs Python code in isolated virtual environments.
|
A simplified executor that runs Python code in isolated virtual environments.
|
||||||
"""
|
"""
|
||||||
|
|
@ -36,7 +40,8 @@ class SimpleCodeExecutor:
|
||||||
timeout: int = 30,
|
timeout: int = 30,
|
||||||
max_memory_mb: int = 512,
|
max_memory_mb: int = 512,
|
||||||
requirements: List[str] = None,
|
requirements: List[str] = None,
|
||||||
blocked_packages: List[str] = None):
|
blocked_packages: List[str] = None,
|
||||||
|
ai_service = None):
|
||||||
"""
|
"""
|
||||||
Initialize the SimpleCodeExecutor.
|
Initialize the SimpleCodeExecutor.
|
||||||
|
|
||||||
|
|
@ -57,6 +62,7 @@ class SimpleCodeExecutor:
|
||||||
"tensorflow", "pytorch", "scikit-learn" # Resource intensive
|
"tensorflow", "pytorch", "scikit-learn" # Resource intensive
|
||||||
]
|
]
|
||||||
self.is_persistent = workflow_id is not None
|
self.is_persistent = workflow_id is not None
|
||||||
|
self.ai_service = ai_service
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_workflow_environment(cls, workflow_id: str) -> Optional[str]:
|
def get_workflow_environment(cls, workflow_id: str) -> Optional[str]:
|
||||||
|
|
@ -176,50 +182,16 @@ class SimpleCodeExecutor:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def _extract_required_packages(self, code: str) -> List[str]:
|
def _extract_required_packages(self, code: str) -> List[str]:
|
||||||
"""Extract required packages from import statements and requirements comments in the code."""
|
# Extract required packages from a "# REQUIREMENTS:" comment found in the first few lines of the code
|
||||||
import re
|
|
||||||
packages = set()
|
packages = set()
|
||||||
|
# Check for special REQUIREMENTS comment - specific format we're looking for
|
||||||
# Check for special REQUIREMENTS comment
|
first_lines = code.split('\n')[:5] # Only check first few lines
|
||||||
requirements_match = re.search(r'# REQUIREMENTS:\s*([^\n]+)', code)
|
for line in first_lines:
|
||||||
if requirements_match:
|
if line.strip().startswith("# REQUIREMENTS:"):
|
||||||
req_str = requirements_match.group(1).strip()
|
req_str = line.replace("# REQUIREMENTS:", "").strip()
|
||||||
for pkg in req_str.split(','):
|
for pkg in req_str.split(','):
|
||||||
if pkg.strip():
|
if pkg.strip():
|
||||||
packages.add(pkg.strip())
|
packages.add(pkg.strip())
|
||||||
|
|
||||||
# Add common base packages
|
|
||||||
base_packages = [
|
|
||||||
"requests", "urllib3", "pydantic",
|
|
||||||
"pandas", "numpy", "matplotlib"
|
|
||||||
]
|
|
||||||
|
|
||||||
for pkg in base_packages:
|
|
||||||
packages.add(pkg)
|
|
||||||
|
|
||||||
# Detect pip install comments
|
|
||||||
pip_comments = re.findall(r'#\s*pip\s+install\s+([^#\n]+)', code)
|
|
||||||
for comment in pip_comments:
|
|
||||||
for pkg in comment.split():
|
|
||||||
if pkg and not pkg.startswith('-'):
|
|
||||||
packages.add(pkg.strip())
|
|
||||||
|
|
||||||
# Analyze import statements
|
|
||||||
import_lines = re.findall(r'^(?:import|from)\s+([^\s.]+)(?:\s+import|\s*$|\.)', code, re.MULTILINE)
|
|
||||||
|
|
||||||
# Standard modules that don't need installation
|
|
||||||
std_modules = {
|
|
||||||
'os', 'sys', 'time', 'datetime', 'math', 're', 'random', 'json',
|
|
||||||
'collections', 'itertools', 'functools', 'pathlib', 'shutil',
|
|
||||||
'tempfile', 'uuid', 'subprocess', 'threading', 'logging',
|
|
||||||
'traceback', 'io', 'copy', 'typing', 'asyncio'
|
|
||||||
}
|
|
||||||
|
|
||||||
# Process all imports
|
|
||||||
for module in import_lines:
|
|
||||||
if module not in std_modules:
|
|
||||||
packages.add(module)
|
|
||||||
|
|
||||||
return list(packages)
|
return list(packages)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -247,8 +219,8 @@ class SimpleCodeExecutor:
|
||||||
all_requirements = []
|
all_requirements = []
|
||||||
|
|
||||||
# Add explicitly provided requirements
|
# Add explicitly provided requirements
|
||||||
if self.requirements:
|
# if self.requirements:
|
||||||
all_requirements.extend(self.requirements)
|
# all_requirements.extend(self.requirements)
|
||||||
|
|
||||||
# Extract requirements from code
|
# Extract requirements from code
|
||||||
extracted_requirements = self._extract_required_packages(code)
|
extracted_requirements = self._extract_required_packages(code)
|
||||||
|
|
@ -340,12 +312,13 @@ class SimpleCodeExecutor:
|
||||||
# Run the code from root dir
|
# Run the code from root dir
|
||||||
working_dir = os.path.dirname(code_file) # This should be the project root
|
working_dir = os.path.dirname(code_file) # This should be the project root
|
||||||
logger.info(f"DEBUG PATH Root: {os.getcwd()} Code: {code_file} Working Dir: {working_dir}")
|
logger.info(f"DEBUG PATH Root: {os.getcwd()} Code: {code_file} Working Dir: {working_dir}")
|
||||||
|
logger.debug(f"|{code}|")
|
||||||
process = subprocess.run(
|
process = subprocess.run(
|
||||||
[python_executable, code_file],
|
[python_executable, code_file],
|
||||||
timeout=self.timeout,
|
timeout=self.timeout,
|
||||||
capture_output=True,
|
capture_output=True,
|
||||||
text=True,
|
text=True,
|
||||||
cwd=self.temp_dir
|
cwd=working_dir
|
||||||
)
|
)
|
||||||
|
|
||||||
# Process the output
|
# Process the output
|
||||||
|
|
@ -393,7 +366,7 @@ class SimpleCodeExecutor:
|
||||||
execution_result = {
|
execution_result = {
|
||||||
"success": False,
|
"success": False,
|
||||||
"output": "",
|
"output": "",
|
||||||
"error": f"Execution error: {str(e)}",
|
"error": f"Execution error: {str(e)} for code {code}",
|
||||||
"result": None,
|
"result": None,
|
||||||
"exit_code": -1
|
"exit_code": -1
|
||||||
}
|
}
|
||||||
|
|
@ -426,7 +399,7 @@ class SimpleCodeExecutor:
|
||||||
"""Clean up during garbage collection."""
|
"""Clean up during garbage collection."""
|
||||||
self.cleanup()
|
self.cleanup()
|
||||||
|
|
||||||
|
# Unchanged error recommendation function
|
||||||
def get_error_recommendation(error_message: str) -> str:
|
def get_error_recommendation(error_message: str) -> str:
|
||||||
"""Generate recommendations based on error message."""
|
"""Generate recommendations based on error message."""
|
||||||
if "ImportError" in error_message or "ModuleNotFoundError" in error_message:
|
if "ImportError" in error_message or "ModuleNotFoundError" in error_message:
|
||||||
|
|
@ -460,7 +433,7 @@ To fix the error:
|
||||||
|
|
||||||
|
|
||||||
class CoderAgent(BaseAgent):
|
class CoderAgent(BaseAgent):
|
||||||
"""Agent for developing and executing Python code"""
|
"""Agent for developing and executing Python code with auto-correction capabilities"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
"""Initialize the coder agent with proper type and capabilities"""
|
"""Initialize the coder agent with proper type and capabilities"""
|
||||||
|
|
@ -474,10 +447,14 @@ class CoderAgent(BaseAgent):
|
||||||
self.capabilities = "code_development,data_processing,file_processing,automation"
|
self.capabilities = "code_development,data_processing,file_processing,automation"
|
||||||
self.result_format = "python_code"
|
self.result_format = "python_code"
|
||||||
|
|
||||||
|
# Initialize AI service
|
||||||
|
self.ai_service = None
|
||||||
|
|
||||||
# Add document capabilities
|
# Add document capabilities
|
||||||
self.supports_documents = True
|
self.supports_documents = True
|
||||||
self.document_capabilities = ["read", "reference", "create"]
|
self.document_capabilities = ["read", "reference", "create"]
|
||||||
self.required_context = ["workflow_id"]
|
self.required_context = ["workflow_id"]
|
||||||
|
self.document_handler = None
|
||||||
|
|
||||||
# Initialize protocol
|
# Initialize protocol
|
||||||
self.protocol = AgentCommunicationProtocol()
|
self.protocol = AgentCommunicationProtocol()
|
||||||
|
|
@ -491,23 +468,33 @@ class CoderAgent(BaseAgent):
|
||||||
self.executor_memory_limit = 512 # MB
|
self.executor_memory_limit = 512 # MB
|
||||||
|
|
||||||
# AI service settings
|
# AI service settings
|
||||||
self.ai_temperature = 0.2 # Lower temperature for more deterministic code generation
|
self.ai_temperature = 0.1 # Lower temperature for more deterministic code generation
|
||||||
self.ai_max_tokens = 2000 # Enough tokens for complex code
|
self.ai_max_tokens = 2000 # Enough tokens for complex code
|
||||||
|
|
||||||
|
# Auto-correction settings (new)
|
||||||
|
self.max_correction_attempts = 3 # Maximum number of correction attempts
|
||||||
|
self.correction_temperature = 0.1 # Low temperature for deterministic corrections (currently equal to ai_temperature)
|
||||||
|
|
||||||
def get_agent_info(self) -> Dict[str, Any]:
|
def get_agent_info(self) -> Dict[str, Any]:
|
||||||
"""Get agent information for agent registry"""
|
"""Get agent information for agent registry"""
|
||||||
info = super().get_agent_info()
|
info = super().get_agent_info()
|
||||||
info.update({
|
info.update({
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"timeout": self.executor_timeout,
|
"timeout": self.executor_timeout,
|
||||||
"memory_limit": self.executor_memory_limit
|
"memory_limit": self.executor_memory_limit,
|
||||||
|
"max_correction_attempts": self.max_correction_attempts
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
return info
|
return info
|
||||||
|
|
||||||
|
def set_document_handler(self, document_handler):
|
||||||
|
"""Set the document handler for file operations"""
|
||||||
|
self.document_handler = document_handler
|
||||||
|
|
||||||
|
|
||||||
async def process_message(self, message: Dict[str, Any], context: Dict[str, Any] = None) -> Dict[str, Any]:
|
async def process_message(self, message: Dict[str, Any], context: Dict[str, Any] = None) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Process a message to develop and execute Python code.
|
Process a message to develop and execute Python code with auto-correction.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
message: The message to process
|
message: The message to process
|
||||||
|
|
@ -539,34 +526,23 @@ class CoderAgent(BaseAgent):
|
||||||
content = message.get("content", "")
|
content = message.get("content", "")
|
||||||
documents = message.get("documents", [])
|
documents = message.get("documents", [])
|
||||||
|
|
||||||
# Extract code from message content
|
|
||||||
code_blocks = re.findall(r'```(?:python)?\s*([\s\S]*?)```', content)
|
|
||||||
code_to_execute = None
|
code_to_execute = None
|
||||||
requirements = []
|
requirements = []
|
||||||
|
|
||||||
if code_blocks:
|
# Generate code based on the message content using AI
|
||||||
# Use the first code block found
|
logging_utils.info("Generating new code with AI", "agents")
|
||||||
code_to_execute = code_blocks[0]
|
|
||||||
# Clean the code to remove any markdown formatting
|
|
||||||
code_to_execute = self._clean_code(code_to_execute)
|
|
||||||
logging_utils.info(f"Code extracted from message ({len(code_to_execute)} characters)", "agents")
|
|
||||||
else:
|
|
||||||
# Generate code based on the message content using AI
|
|
||||||
logging_utils.info("No code found in message, generating new code with AI", "agents")
|
|
||||||
|
|
||||||
# Generate code using AI
|
# Generate code using AI
|
||||||
code_to_execute, requirements = await self._generate_code_from_prompt(content, documents)
|
code_to_execute, requirements = await self._generate_code_from_prompt(content, documents)
|
||||||
if not code_to_execute:
|
if not code_to_execute:
|
||||||
logging_utils.warning("AI could not generate code", "agents")
|
logging_utils.warning("AI could not generate code", "agents")
|
||||||
response["content"] = "I couldn't generate executable code based on your request. Please provide more detailed instructions."
|
response["content"] = "I couldn't generate executable code based on your request. Please provide more detailed instructions."
|
||||||
self.message_utils.finalize_message(response)
|
self.message_utils.finalize_message(response)
|
||||||
return response
|
return response
|
||||||
logging_utils.info(f"Code generated with AI ({len(code_to_execute)} characters)", "agents")
|
logging_utils.info(f"Code generated with AI ({len(code_to_execute)} characters)", "agents")
|
||||||
|
|
||||||
# Execute the code
|
# Execute the code with auto-correction loop
|
||||||
if code_to_execute:
|
if code_to_execute:
|
||||||
logging_utils.info("Executing code", "execution")
|
|
||||||
|
|
||||||
# Prepare execution context
|
# Prepare execution context
|
||||||
execution_context = {
|
execution_context = {
|
||||||
"workflow_id": workflow_id,
|
"workflow_id": workflow_id,
|
||||||
|
|
@ -575,21 +551,16 @@ class CoderAgent(BaseAgent):
|
||||||
"log_func": log_func
|
"log_func": log_func
|
||||||
}
|
}
|
||||||
|
|
||||||
# Send a status update
|
# Enhanced execution with auto-correction
|
||||||
if log_func:
|
result, attempts_info = await self._execute_with_auto_correction(
|
||||||
status_message = self.protocol.create_status_update_message(
|
code_to_execute,
|
||||||
status_description="Processing code execution request",
|
requirements,
|
||||||
sender_id=self.id,
|
execution_context,
|
||||||
status="in_progress",
|
content, # Original prompt/message
|
||||||
progress=0.5,
|
logging_utils
|
||||||
context_id=workflow_id
|
)
|
||||||
)
|
|
||||||
log_func(workflow_id, status_message.content, "info", self.id, self.name)
|
|
||||||
|
|
||||||
# Execute code
|
# Prepare response based on the final result (success or failure)
|
||||||
result = await self._execute_code(code_to_execute, requirements, execution_context)
|
|
||||||
|
|
||||||
# Prepare response
|
|
||||||
if result.get("success", False):
|
if result.get("success", False):
|
||||||
# Code execution successful
|
# Code execution successful
|
||||||
output = result.get("output", "")
|
output = result.get("output", "")
|
||||||
|
|
@ -597,20 +568,39 @@ class CoderAgent(BaseAgent):
|
||||||
logging_utils.info("Code executed successfully", "execution")
|
logging_utils.info("Code executed successfully", "execution")
|
||||||
|
|
||||||
# Format response content
|
# Format response content
|
||||||
response_content = f"## Code executed successfully\n\n"
|
response_content = f"## Code executed successfully"
|
||||||
|
|
||||||
|
# Add correction attempts info if any corrections were made
|
||||||
|
if attempts_info and len(attempts_info) > 1:
|
||||||
|
response_content += f" (after {len(attempts_info)-1} correction attempts)"
|
||||||
|
|
||||||
|
response_content += "\n\n"
|
||||||
|
|
||||||
# Include the executed code
|
# Include the executed code
|
||||||
response_content += f"### Executed Code\n\n```python\n{code_to_execute}\n```\n\n"
|
response_content += f"### Final Executed Code\n\n```python\n{attempts_info[-1]['code']}\n```\n\n"
|
||||||
|
|
||||||
# Include the output if available
|
# Include the output if available
|
||||||
if output:
|
if output:
|
||||||
response_content += f"### Output\n\n```\n{output}\n```\n\n"
|
response_content += f"### Output\n\n```\n{output}\n```\n\n"
|
||||||
|
|
||||||
|
# Create document with results
|
||||||
|
data_document = self._create_document_from_result(execution_result)
|
||||||
|
if data_document:
|
||||||
|
response["documents"].append(data_document)
|
||||||
|
|
||||||
# Include the execution result if available
|
# Include the execution result if available
|
||||||
if execution_result:
|
if execution_result:
|
||||||
result_str = json.dumps(execution_result, indent=2) if isinstance(execution_result, (dict, list)) else str(execution_result)
|
result_str = json.dumps(execution_result, indent=2) if isinstance(execution_result, (dict, list)) else str(execution_result)
|
||||||
response_content += f"### Result\n\n```\n{result_str}\n```\n\n"
|
response_content += f"### Result\n\n```\n{result_str}\n```\n\n"
|
||||||
|
|
||||||
|
# Include correction history if any corrections were made
|
||||||
|
if attempts_info and len(attempts_info) > 1:
|
||||||
|
response_content += f"### Code Correction History\n\n"
|
||||||
|
for i, attempt in enumerate(attempts_info[:-1], 1):
|
||||||
|
response_content += f"**Attempt {i}:**\n\n"
|
||||||
|
response_content += f"```python\n{attempt['code']}\n```\n\n"
|
||||||
|
response_content += f"**Error:**\n\n```\n{attempt['error']}\n```\n\n"
|
||||||
|
|
||||||
response["content"] = response_content
|
response["content"] = response_content
|
||||||
|
|
||||||
# Process any files created by the code
|
# Process any files created by the code
|
||||||
|
|
@ -628,17 +618,38 @@ class CoderAgent(BaseAgent):
|
||||||
}
|
}
|
||||||
response["documents"].append(doc)
|
response["documents"].append(doc)
|
||||||
else:
|
else:
|
||||||
# Code execution failed
|
# Code execution failed after all attempts
|
||||||
error = result.get("error", "Unknown error")
|
error = result.get("error", "Unknown error")
|
||||||
logging_utils.error(f"Error during code execution: {error}", "execution")
|
logging_utils.error(f"Error during code execution after all correction attempts: {error}", "execution")
|
||||||
|
|
||||||
# Format error response
|
# Format error response
|
||||||
response_content = f"## Error during code execution\n\n"
|
response_content = f"## Error during code execution\n\n"
|
||||||
response_content += f"### Executed Code\n\n```python\n{code_to_execute}\n```\n\n"
|
|
||||||
response_content += f"### Error\n\n```\n{error}\n```\n\n"
|
|
||||||
|
|
||||||
# Add recommendation based on error
|
# Include correction attempts information
|
||||||
response_content += get_error_recommendation(error)
|
if attempts_info:
|
||||||
|
response_content += f"I made {len(attempts_info)} attempts to correct the code, but couldn't resolve all issues.\n\n"
|
||||||
|
|
||||||
|
# Add the final attempt
|
||||||
|
response_content += f"### Final Code Attempt\n\n```python\n{attempts_info[-1]['code']}\n```\n\n"
|
||||||
|
response_content += f"### Final Error\n\n```\n{attempts_info[-1]['error']}\n```\n\n"
|
||||||
|
|
||||||
|
# Add recommendation based on error
|
||||||
|
response_content += get_error_recommendation(error)
|
||||||
|
|
||||||
|
# Add correction history
|
||||||
|
if len(attempts_info) > 1:
|
||||||
|
response_content += f"\n### Code Correction History\n\n"
|
||||||
|
for i, attempt in enumerate(attempts_info[:-1], 1):
|
||||||
|
response_content += f"**Attempt {i}:**\n\n"
|
||||||
|
response_content += f"```python\n{attempt['code']}\n```\n\n"
|
||||||
|
response_content += f"**Error:**\n\n```\n{attempt['error']}\n```\n\n"
|
||||||
|
else:
|
||||||
|
# Just show the code and error
|
||||||
|
response_content += f"### Executed Code\n\n```python\n{code_to_execute}\n```\n\n"
|
||||||
|
response_content += f"### Error\n\n```\n{error}\n```\n\n"
|
||||||
|
|
||||||
|
# Add recommendation based on error
|
||||||
|
response_content += get_error_recommendation(error)
|
||||||
|
|
||||||
response["content"] = response_content
|
response["content"] = response_content
|
||||||
else:
|
else:
|
||||||
|
|
@ -663,6 +674,252 @@ class CoderAgent(BaseAgent):
|
||||||
|
|
||||||
return response
|
return response
|
||||||
|
|
||||||
|
def _create_document_from_result(self, execution_result, output_format="json"):
|
||||||
|
"""
|
||||||
|
Create a document object from execution results
|
||||||
|
|
||||||
|
Args:
|
||||||
|
execution_result: The data returned from code execution
|
||||||
|
output_format: Desired format (json, csv, etc.)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Document object for passing to other agents
|
||||||
|
"""
|
||||||
|
if not execution_result:
|
||||||
|
return None
|
||||||
|
|
||||||
|
doc_id = f"data_{uuid.uuid4()}"
|
||||||
|
|
||||||
|
# Determine filename and content type based on the data
|
||||||
|
if isinstance(execution_result, pd.DataFrame):
|
||||||
|
# Handle DataFrame result
|
||||||
|
filename = "processed_data.csv"
|
||||||
|
content_type = "text/csv"
|
||||||
|
content = execution_result.to_csv(index=False)
|
||||||
|
elif isinstance(execution_result, dict) or isinstance(execution_result, list):
|
||||||
|
# Handle dictionary or list result
|
||||||
|
filename = "processed_data.json"
|
||||||
|
content_type = "application/json"
|
||||||
|
content = json.dumps(execution_result)
|
||||||
|
elif isinstance(execution_result, str):
|
||||||
|
# Try to determine if string is JSON, CSV, or plain text
|
||||||
|
if execution_result.strip().startswith('{') or execution_result.strip().startswith('['):
|
||||||
|
filename = "processed_data.json"
|
||||||
|
content_type = "application/json"
|
||||||
|
elif ',' in execution_result and '\n' in execution_result:
|
||||||
|
filename = "processed_data.csv"
|
||||||
|
content_type = "text/csv"
|
||||||
|
else:
|
||||||
|
filename = "processed_data.txt"
|
||||||
|
content_type = "text/plain"
|
||||||
|
content = str(execution_result)
|
||||||
|
else:
|
||||||
|
# Default case for other types
|
||||||
|
filename = "processed_data.txt"
|
||||||
|
content_type = "text/plain"
|
||||||
|
content = str(execution_result)
|
||||||
|
|
||||||
|
# Create document object
|
||||||
|
document = {
|
||||||
|
"id": doc_id,
|
||||||
|
"source": {
|
||||||
|
"type": "generated",
|
||||||
|
"id": doc_id,
|
||||||
|
"name": filename,
|
||||||
|
"content_type": content_type,
|
||||||
|
},
|
||||||
|
"contents": [{
|
||||||
|
"type": "text",
|
||||||
|
"text": content,
|
||||||
|
"is_extracted": True
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
|
||||||
|
return document
|
||||||
|
|
||||||
|
async def _execute_with_auto_correction(
|
||||||
|
self,
|
||||||
|
initial_code: str,
|
||||||
|
requirements: List[str],
|
||||||
|
context: Dict[str, Any],
|
||||||
|
original_prompt: str,
|
||||||
|
logging_utils: LoggingUtils = None
|
||||||
|
) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]:
|
||||||
|
"""
|
||||||
|
Execute code with automatic error correction and retries.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
initial_code: The initial Python code to execute
|
||||||
|
requirements: List of required packages
|
||||||
|
context: Additional context for execution
|
||||||
|
original_prompt: The original user request/prompt
|
||||||
|
logging_utils: Optional logging utility
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (final execution result, list of attempt info dictionaries)
|
||||||
|
"""
|
||||||
|
# Initialize tracking data
|
||||||
|
current_code = initial_code
|
||||||
|
current_requirements = requirements.copy() if requirements else []
|
||||||
|
attempts_info = []
|
||||||
|
|
||||||
|
# Execute with correction loop
|
||||||
|
for attempt in range(1, self.max_correction_attempts + 1):
|
||||||
|
if logging_utils:
|
||||||
|
if attempt == 1:
|
||||||
|
logging_utils.info(f"Executing code (attempt {attempt}/{self.max_correction_attempts})", "execution")
|
||||||
|
else:
|
||||||
|
logging_utils.info(f"Executing corrected code (attempt {attempt}/{self.max_correction_attempts})", "execution")
|
||||||
|
|
||||||
|
# Execute the current code version
|
||||||
|
result = await self._execute_code(current_code, current_requirements, context)
|
||||||
|
|
||||||
|
# Record attempt information
|
||||||
|
attempts_info.append({
|
||||||
|
"attempt": attempt,
|
||||||
|
"code": current_code,
|
||||||
|
"error": result.get("error", ""),
|
||||||
|
"success": result.get("success", False)
|
||||||
|
})
|
||||||
|
|
||||||
|
# Check if execution was successful
|
||||||
|
if result.get("success", False):
|
||||||
|
# Success! Return the result and attempt info
|
||||||
|
return result, attempts_info
|
||||||
|
|
||||||
|
# Failed execution - check if we've reached the maximum attempt limit
|
||||||
|
if attempt >= self.max_correction_attempts:
|
||||||
|
if logging_utils:
|
||||||
|
logging_utils.warning(f"Maximum correction attempts ({self.max_correction_attempts}) reached, giving up", "execution")
|
||||||
|
break
|
||||||
|
|
||||||
|
# Need to correct the code - generate a fix based on the error
|
||||||
|
error_message = result.get("error", "Unknown error")
|
||||||
|
|
||||||
|
if logging_utils:
|
||||||
|
logging_utils.info(f"Attempting to fix code error: {error_message[:200]}...", "execution")
|
||||||
|
|
||||||
|
# Generate corrected code
|
||||||
|
corrected_code, new_requirements = await self._generate_code_correction(
|
||||||
|
current_code,
|
||||||
|
error_message,
|
||||||
|
original_prompt,
|
||||||
|
current_requirements
|
||||||
|
)
|
||||||
|
|
||||||
|
# Update for next attempt
|
||||||
|
if corrected_code:
|
||||||
|
current_code = corrected_code
|
||||||
|
|
||||||
|
# Add any new requirements
|
||||||
|
if new_requirements:
|
||||||
|
for req in new_requirements:
|
||||||
|
if req not in current_requirements:
|
||||||
|
current_requirements.append(req)
|
||||||
|
if logging_utils:
|
||||||
|
logging_utils.info(f"Added new requirement: {req}", "execution")
|
||||||
|
else:
|
||||||
|
# Could not generate correction, break out of the loop
|
||||||
|
if logging_utils:
|
||||||
|
logging_utils.warning("Could not generate code correction, giving up", "execution")
|
||||||
|
break
|
||||||
|
|
||||||
|
# If we get here, all attempts failed - return the last result and attempt info
|
||||||
|
return result, attempts_info
|
||||||
|
|
||||||
|
async def _generate_code_correction(
|
||||||
|
self,
|
||||||
|
code: str,
|
||||||
|
error_message: str,
|
||||||
|
original_prompt: str,
|
||||||
|
current_requirements: List[str] = None
|
||||||
|
) -> Tuple[str, List[str]]:
|
||||||
|
"""
|
||||||
|
Generate a corrected version of code based on error messages.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
code: The code that produced errors
|
||||||
|
error_message: The error message to fix
|
||||||
|
original_prompt: The original task/requirements
|
||||||
|
current_requirements: List of currently required packages
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (corrected code, new requirements list)
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
# Create a detailed prompt for code correction
|
||||||
|
correction_prompt = f"""You need to fix an error in Python code. The code was written for this task:
|
||||||
|
|
||||||
|
ORIGINAL TASK:
|
||||||
|
{original_prompt}
|
||||||
|
|
||||||
|
CURRENT CODE:
|
||||||
|
```python
|
||||||
|
{code}
|
||||||
|
```
|
||||||
|
|
||||||
|
ERROR MESSAGE:
|
||||||
|
```
|
||||||
|
{error_message}
|
||||||
|
```
|
||||||
|
|
||||||
|
CURRENT REQUIREMENTS: {', '.join(current_requirements) if current_requirements else "None"}
|
||||||
|
|
||||||
|
Your task is to analyze the error and provide a corrected version of the code.
|
||||||
|
Focus specifically on fixing the error while preserving the original functionality.
|
||||||
|
|
||||||
|
Common fixes might include:
|
||||||
|
- Fixing syntax errors (missing parentheses, indentation, etc.)
|
||||||
|
- Resolving import errors by adding appropriate requirements
|
||||||
|
- Correcting file paths or handling file not found errors
|
||||||
|
- Adding error handling for specific edge cases
|
||||||
|
- Fixing logical errors in the code
|
||||||
|
|
||||||
|
FORMAT INSTRUCTIONS:
|
||||||
|
1. Provide ONLY the complete fixed Python code without ANY explanation
|
||||||
|
2. DO NOT include code block markers like ```python or ```
|
||||||
|
3. DO NOT explain what the code does before or after it
|
||||||
|
4. DO NOT include any text that is not valid Python code
|
||||||
|
5. Start your response directly with the valid Python code
|
||||||
|
6. End your response with valid Python code
|
||||||
|
|
||||||
|
If you need to add new required packages, place them in a specially formatted comment at the top of your code like this:
|
||||||
|
# REQUIREMENTS: package1,package2,package3
|
||||||
|
|
||||||
|
Your entire response must be valid Python that can be executed without modification.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Create messages for the API
|
||||||
|
messages = [
|
||||||
|
{"role": "system", "content": "You are a Python debugging expert. You provide ONLY clean, fixed Python code without any explanations, markdown formatting, or non-code text. Your response should be nothing but valid, fixed Python code that can be executed directly."},
|
||||||
|
{"role": "user", "content": correction_prompt}
|
||||||
|
]
|
||||||
|
|
||||||
|
# Call the API with very low temperature for deterministic fixes
|
||||||
|
generated_content = await self.ai_service.call_api(
|
||||||
|
messages,
|
||||||
|
temperature=self.correction_temperature,
|
||||||
|
max_tokens=self.ai_max_tokens
|
||||||
|
)
|
||||||
|
|
||||||
|
# Clean the generated content to ensure it's only valid Python code
|
||||||
|
fixed_code = self._clean_code(generated_content)
|
||||||
|
|
||||||
|
# Extract requirements from special comment at the top of the code
|
||||||
|
new_requirements = []
|
||||||
|
for line in fixed_code.split('\n'):
|
||||||
|
if line.strip().startswith("# REQUIREMENTS:"):
|
||||||
|
req_str = line.replace("# REQUIREMENTS:", "").strip()
|
||||||
|
new_requirements = [r.strip() for r in req_str.split(',') if r.strip()]
|
||||||
|
break
|
||||||
|
|
||||||
|
return fixed_code, new_requirements
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logging.error(f"Error generating code correction: {str(e)}", exc_info=True)
|
||||||
|
# Return None to indicate failure
|
||||||
|
return None, []
|
||||||
|
|
||||||
def _clean_code(self, code: str) -> str:
|
def _clean_code(self, code: str) -> str:
|
||||||
"""
|
"""
|
||||||
Clean up code by removing markdown code block markers and other formatting artifacts.
|
Clean up code by removing markdown code block markers and other formatting artifacts.
|
||||||
|
|
@ -728,9 +985,6 @@ class CoderAgent(BaseAgent):
|
||||||
Tuple of (generated Python code, required packages)
|
Tuple of (generated Python code, required packages)
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
# Initialize AI service
|
|
||||||
chat_service = ChatService()
|
|
||||||
|
|
||||||
# Prepare a prompt for code generation
|
# Prepare a prompt for code generation
|
||||||
ai_prompt = f"""Generate Python code to solve the following task:
|
ai_prompt = f"""Generate Python code to solve the following task:
|
||||||
{prompt}
|
{prompt}
|
||||||
|
|
@ -767,7 +1021,7 @@ FORMAT INSTRUCTIONS:
|
||||||
- Start your response directly with valid Python code
|
- Start your response directly with valid Python code
|
||||||
- End your response with valid Python code
|
- End your response with valid Python code
|
||||||
|
|
||||||
For required packages, place them in a specially formatted comment at the top of your code like this:
|
For required packages, place them in a specially formatted comment at the top of your code on one line like this:
|
||||||
# REQUIREMENTS: pandas,numpy,matplotlib,requests
|
# REQUIREMENTS: pandas,numpy,matplotlib,requests
|
||||||
|
|
||||||
Your entire response must be valid Python that can be executed without modification.
|
Your entire response must be valid Python that can be executed without modification.
|
||||||
|
|
@ -781,7 +1035,7 @@ Your entire response must be valid Python that can be executed without modificat
|
||||||
|
|
||||||
# Call the API
|
# Call the API
|
||||||
logging.info(f"Calling AI API to generate code")
|
logging.info(f"Calling AI API to generate code")
|
||||||
generated_content = await chat_service.call_api(messages, temperature=self.ai_temperature, max_tokens=self.ai_max_tokens)
|
generated_content = await self.ai_service.call_api(messages, temperature=self.ai_temperature, max_tokens=self.ai_max_tokens)
|
||||||
|
|
||||||
# Clean the generated content to ensure it's only valid Python code
|
# Clean the generated content to ensure it's only valid Python code
|
||||||
code = self._clean_code(generated_content)
|
code = self._clean_code(generated_content)
|
||||||
|
|
@ -843,7 +1097,8 @@ result = {{"error": "Code generation failed", "message": "{error_str}"}}
|
||||||
timeout=self.executor_timeout,
|
timeout=self.executor_timeout,
|
||||||
max_memory_mb=self.executor_memory_limit,
|
max_memory_mb=self.executor_memory_limit,
|
||||||
requirements=requirements,
|
requirements=requirements,
|
||||||
blocked_packages=blocked_packages
|
blocked_packages=blocked_packages,
|
||||||
|
ai_service = self.ai_service
|
||||||
)
|
)
|
||||||
|
|
||||||
# Prepare input data for the code
|
# Prepare input data for the code
|
||||||
|
|
@ -924,7 +1179,6 @@ result = {{"error": "Code generation failed", "message": "{error_str}"}}
|
||||||
# Log error information
|
# Log error information
|
||||||
error = result.get("error", "Unknown error")
|
error = result.get("error", "Unknown error")
|
||||||
logging_utils.error(f"Error during code execution: {error}", "execution")
|
logging_utils.error(f"Error during code execution: {error}", "execution")
|
||||||
print("DEBUG CODE-ERROR:",code,"#END")
|
|
||||||
|
|
||||||
# Clean up non-persistent environments
|
# Clean up non-persistent environments
|
||||||
if not executor.is_persistent:
|
if not executor.is_persistent:
|
||||||
|
|
|
||||||
|
|
@ -13,7 +13,6 @@ from datetime import datetime
|
||||||
import uuid
|
import uuid
|
||||||
|
|
||||||
from modules.agentservice_base import BaseAgent
|
from modules.agentservice_base import BaseAgent
|
||||||
from connectors.connector_aichat_openai import ChatService
|
|
||||||
from modules.agentservice_utils import WorkflowUtils, MessageUtils, LoggingUtils
|
from modules.agentservice_utils import WorkflowUtils, MessageUtils, LoggingUtils
|
||||||
from modules.agentservice_protocol import AgentMessage, AgentCommunicationProtocol
|
from modules.agentservice_protocol import AgentMessage, AgentCommunicationProtocol
|
||||||
from modules.agentservice_filemanager import FileManager # Import the file manager
|
from modules.agentservice_filemanager import FileManager # Import the file manager
|
||||||
|
|
@ -64,6 +63,10 @@ class DocumentationAgent(BaseAgent):
|
||||||
})
|
})
|
||||||
return info
|
return info
|
||||||
|
|
||||||
|
def set_document_handler(self, document_handler):
|
||||||
|
"""Set the document handler for file operations"""
|
||||||
|
self.document_handler = document_handler
|
||||||
|
|
||||||
async def process_message(self, message: Dict[str, Any], context: Dict[str, Any] = None) -> Dict[str, Any]:
|
async def process_message(self, message: Dict[str, Any], context: Dict[str, Any] = None) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Process a message and create documentation.
|
Process a message and create documentation.
|
||||||
|
|
|
||||||
|
|
@ -5,21 +5,16 @@ Angepasst für das refaktorisierte Core-Modul.
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import random
|
|
||||||
import time
|
import time
|
||||||
import traceback
|
import traceback
|
||||||
from typing import List, Dict, Any, Optional, Union
|
from typing import List, Dict, Any, Optional
|
||||||
import re
|
|
||||||
import uuid
|
|
||||||
from datetime import datetime
|
|
||||||
from urllib.parse import quote_plus, unquote
|
from urllib.parse import quote_plus, unquote
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
import requests
|
import requests
|
||||||
from modules.agentservice_base import BaseAgent
|
from modules.agentservice_base import BaseAgent
|
||||||
from connectors.connector_aichat_openai import ChatService
|
from modules.agentservice_utils import MessageUtils, LoggingUtils
|
||||||
from modules.agentservice_utils import WorkflowUtils, MessageUtils, LoggingUtils
|
from modules.agentservice_protocol import AgentCommunicationProtocol
|
||||||
from modules.agentservice_protocol import AgentMessage, AgentCommunicationProtocol
|
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
@ -42,13 +37,14 @@ class WebcrawlerAgent(BaseAgent):
|
||||||
self.supports_documents = True
|
self.supports_documents = True
|
||||||
self.document_capabilities = ["read", "create"]
|
self.document_capabilities = ["read", "create"]
|
||||||
self.required_context = ["workflow_id"]
|
self.required_context = ["workflow_id"]
|
||||||
|
self.document_handler = None
|
||||||
|
|
||||||
|
# Initialize AI service
|
||||||
|
self.ai_service = None
|
||||||
|
|
||||||
# Initialize protocol
|
# Initialize protocol
|
||||||
self.protocol = AgentCommunicationProtocol()
|
self.protocol = AgentCommunicationProtocol()
|
||||||
|
|
||||||
# Chat-Service initialisieren
|
|
||||||
self.chat_service = ChatService()
|
|
||||||
|
|
||||||
# Utility-Klassen initialisieren
|
# Utility-Klassen initialisieren
|
||||||
self.message_utils = MessageUtils()
|
self.message_utils = MessageUtils()
|
||||||
|
|
||||||
|
|
@ -57,17 +53,6 @@ class WebcrawlerAgent(BaseAgent):
|
||||||
self.max_key = 3
|
self.max_key = 3
|
||||||
self.max_result = 3
|
self.max_result = 3
|
||||||
self.timeout = 10
|
self.timeout = 10
|
||||||
self.headers = {
|
|
||||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
|
||||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
|
||||||
'Accept-Language': 'en-US,en;q=0.5',
|
|
||||||
'Referer': 'https://www.google.com/',
|
|
||||||
'DNT': '1',
|
|
||||||
'Connection': 'keep-alive',
|
|
||||||
'Upgrade-Insecure-Requests': '1',
|
|
||||||
}
|
|
||||||
self.max_urls = 10
|
|
||||||
self.max_content_length = 100000
|
|
||||||
|
|
||||||
def get_agent_info(self) -> Dict[str, Any]:
|
def get_agent_info(self) -> Dict[str, Any]:
|
||||||
"""Get agent information for agent registry"""
|
"""Get agent information for agent registry"""
|
||||||
|
|
@ -81,6 +66,10 @@ class WebcrawlerAgent(BaseAgent):
|
||||||
})
|
})
|
||||||
return info
|
return info
|
||||||
|
|
||||||
|
def set_document_handler(self, document_handler):
|
||||||
|
"""Set the document handler for file operations"""
|
||||||
|
self.document_handler = document_handler
|
||||||
|
|
||||||
async def process_message(self, message: Dict[str, Any], context: Dict[str, Any] = None) -> Dict[str, Any]:
|
async def process_message(self, message: Dict[str, Any], context: Dict[str, Any] = None) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Verarbeitet eine Nachricht und führt eine Web-Recherche durch.
|
Verarbeitet eine Nachricht und führt eine Web-Recherche durch.
|
||||||
|
|
@ -173,29 +162,6 @@ class WebcrawlerAgent(BaseAgent):
|
||||||
|
|
||||||
return response
|
return response
|
||||||
|
|
||||||
def send_document_request(self, document_description: str, sender_id: str, receiver_id: str, filters: Dict[str, Any] = None, context_id: str = None) -> AgentMessage:
|
|
||||||
"""Send a document request using the protocol"""
|
|
||||||
return self.protocol.create_document_request_message(
|
|
||||||
document_description=document_description,
|
|
||||||
sender_id=sender_id,
|
|
||||||
receiver_id=receiver_id,
|
|
||||||
filters=filters,
|
|
||||||
context_id=context_id
|
|
||||||
)
|
|
||||||
|
|
||||||
def send_result_message(self, result_content: str, sender_id: str, receiver_id: str, task_id: str,
|
|
||||||
output_data: Dict[str, Any] = None, context_id: str = None) -> AgentMessage:
|
|
||||||
"""Send a result message using the protocol"""
|
|
||||||
return self.protocol.create_result_message(
|
|
||||||
result_content=result_content,
|
|
||||||
sender_id=sender_id,
|
|
||||||
receiver_id=receiver_id,
|
|
||||||
task_id=task_id,
|
|
||||||
output_data=output_data,
|
|
||||||
result_format="SearchResults",
|
|
||||||
context_id=context_id
|
|
||||||
)
|
|
||||||
|
|
||||||
async def get_prompt(self, message_context: Dict[str, Any]) -> str:
|
async def get_prompt(self, message_context: Dict[str, Any]) -> str:
|
||||||
task = message_context.get("content", "")
|
task = message_context.get("content", "")
|
||||||
return task.strip()
|
return task.strip()
|
||||||
|
|
@ -232,7 +198,7 @@ class WebcrawlerAgent(BaseAgent):
|
||||||
total_tokens += instruction_tokens
|
total_tokens += instruction_tokens
|
||||||
|
|
||||||
# Zusätzliche Anweisungen für Web-Recherche
|
# Zusätzliche Anweisungen für Web-Recherche
|
||||||
content_text = await self.chat_service.call_api(
|
content_text = await self.ai_service.call_api(
|
||||||
messages=[
|
messages=[
|
||||||
{
|
{
|
||||||
"role": "system",
|
"role": "system",
|
||||||
|
|
@ -263,7 +229,7 @@ class WebcrawlerAgent(BaseAgent):
|
||||||
# Limit summary source to ensure we don't exceed API limits
|
# Limit summary source to ensure we don't exceed API limits
|
||||||
summary_src_limited = self.limit_text_for_api(summary_src, max_tokens=10000)
|
summary_src_limited = self.limit_text_for_api(summary_src, max_tokens=10000)
|
||||||
|
|
||||||
summary = await self.chat_service.call_api(
|
summary = await self.ai_service.call_api(
|
||||||
messages=[
|
messages=[
|
||||||
{
|
{
|
||||||
"role": "system",
|
"role": "system",
|
||||||
|
|
@ -280,322 +246,329 @@ class WebcrawlerAgent(BaseAgent):
|
||||||
result = f"## Web-Recherche Ergebnisse\n\n### Zusammenfassung\n{summary}\n\n### Detaillierte Ergebnisse{result_data}"
|
result = f"## Web-Recherche Ergebnisse\n\n### Zusammenfassung\n{summary}\n\n### Detaillierte Ergebnisse{result_data}"
|
||||||
return result
|
return result
|
||||||
|
|
||||||
async def run_web_query(self, prompt: str) -> List[Dict]:
|
async def run_web_query(self, prompt: str) -> List[Dict]:
|
||||||
if prompt=="":
|
if prompt=="":
|
||||||
return []
|
return []
|
||||||
|
|
||||||
ptext=f"""Create a comprehensive web research strategy for the task = '{prompt.replace("'","")}'. Return the results as a Python dictionary with these specific keys. If specific url are provided and the task requires analysis only on the provided url, then leave 'skey' open.
|
ptext=f"""Create a comprehensive web research strategy for the task = '{prompt.replace("'","")}'. Return the results as a Python dictionary with these specific keys. If specific url are provided and the task requires analysis only on the provided url, then leave 'skey' open.
|
||||||
|
|
||||||
'url': A list of maximum {self.max_url} specific URLs extracted from the task string.
|
'url': A list of maximum {self.max_url} specific URLs extracted from the task string.
|
||||||
|
|
||||||
'skey': A list of maximum {self.max_key} key sentences to search for on the web. These should be precise, diverse, and targeted to get the most relevant information.
|
'skey': A list of maximum {self.max_key} key sentences to search for on the web. These should be precise, diverse, and targeted to get the most relevant information.
|
||||||
|
|
||||||
Format your response as a valid json object with these two keys. Do not include any explanatory text or markdown outside of the object definition.
|
Format your response as a valid json object with these two keys. Do not include any explanatory text or markdown outside of the object definition.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
content_text = await self.chat_service.call_api(
|
content_text = await self.ai_service.call_api(
|
||||||
messages=[
|
messages=[
|
||||||
{
|
{
|
||||||
"role": "system",
|
"role": "system",
|
||||||
"content": "Du bist ein Webrecherche-Experte, der präzise Suchstrategien entwickelt."
|
"content": "Du bist ein Webrecherche-Experte, der präzise Suchstrategien entwickelt."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
"content": ptext
|
"content": ptext
|
||||||
}
|
|
||||||
]
|
|
||||||
)
|
|
||||||
# Remove markdown formatting if present
|
|
||||||
if content_text.startswith("```json"):
|
|
||||||
# Find the end of the JSON block
|
|
||||||
end_marker = "```"
|
|
||||||
end_index = content_text.rfind(end_marker)
|
|
||||||
if end_index != -1:
|
|
||||||
# Extract the JSON content without the markdown markers
|
|
||||||
content_text = content_text[7:end_index].strip()
|
|
||||||
|
|
||||||
# Now parse the JSON
|
|
||||||
try:
|
|
||||||
logger.info(f"Valid json received: {str(content_text)}")
|
|
||||||
pjson = json.loads(content_text)
|
|
||||||
# Now call scrape_json with the parsed dictionary
|
|
||||||
result_json = await self.scrape_json(pjson)
|
|
||||||
return result_json
|
|
||||||
except json.JSONDecodeError as e:
|
|
||||||
logger.error(f"Failed to parse JSON: {e}")
|
|
||||||
logger.error(f"Cleaned content: {content_text[:100]}...")
|
|
||||||
return []
|
|
||||||
|
|
||||||
async def scrape_json(self, research_strategy: Dict[str, List]) -> List[Dict]:
|
|
||||||
"""
|
|
||||||
Scrapes web content based on a research strategy JSON.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
research_strategy: A dictionary containing:
|
|
||||||
- 'skey': List of search keywords
|
|
||||||
- 'url': List of direct URLs to scrape
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
List of result dictionaries (search hits carry 'title', 'url', 'snippet' and 'data' keys)
|
|
||||||
"""
|
|
||||||
|
|
||||||
logger.info("Starting JSON-based web scraping")
|
|
||||||
results = []
|
|
||||||
|
|
||||||
# Validate input structure
|
|
||||||
if not isinstance(research_strategy, dict):
|
|
||||||
logger.error("Invalid research_strategy format: not a dictionary")
|
|
||||||
return {"error": "Invalid research_strategy format: not a dictionary"}
|
|
||||||
|
|
||||||
keys = research_strategy.get("skey", [])
|
|
||||||
direct_urls = research_strategy.get("url", [])
|
|
||||||
|
|
||||||
if not isinstance(keys, list) or not isinstance(direct_urls, list):
|
|
||||||
logger.error("Invalid research_strategy format: keys, or url is not a list")
|
|
||||||
return {"error": "Invalid research_strategy format: keys, or url is not a list"}
|
|
||||||
|
|
||||||
# Process search keywords through search engine
|
|
||||||
for keyword in keys:
|
|
||||||
logger.info(f"Processing keyword: {keyword}")
|
|
||||||
found_results = self.search_web(keyword) # List with Dict: title,url,snippet,data
|
|
||||||
logger.info(f"... {len(found_results)} results found")
|
|
||||||
results.extend(found_results)
|
|
||||||
|
|
||||||
# Process direct URLs
|
|
||||||
logger.info(f"Processing {len(direct_urls)} direct URLs")
|
|
||||||
for url in direct_urls:
|
|
||||||
if url in results:
|
|
||||||
logger.info(f"Skipping already scraped URL: {url}")
|
|
||||||
continue
|
|
||||||
soup=self.read_url(url)
|
|
||||||
|
|
||||||
# Extract title from the page if it exists
|
|
||||||
if isinstance(soup, BeautifulSoup):
|
|
||||||
title_tag = soup.find('title')
|
|
||||||
title = title_tag.text.strip() if title_tag else "No title"
|
|
||||||
|
|
||||||
# Alternative: You could also look for h1 tags if the title tag is missing
|
|
||||||
if title == "No title":
|
|
||||||
h1_tag = soup.find('h1')
|
|
||||||
if h1_tag:
|
|
||||||
title = h1_tag.text.strip()
|
|
||||||
else:
|
|
||||||
# Handle the case where soup is an error message string
|
|
||||||
title = "Error fetching page"
|
|
||||||
|
|
||||||
results.append(self.parse_result(soup,"No title",url))
|
|
||||||
logger.info(f"JSON scraping completed. Scraped {len(results)} URLs in total")
|
|
||||||
return results
|
|
||||||
|
|
||||||
def extract_main_content(self, soup: BeautifulSoup, max_chars: int = 30000) -> str:
|
|
||||||
"""
|
|
||||||
Extract the main content from an HTML page while limiting character count.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
soup: BeautifulSoup object containing the page content
|
|
||||||
max_chars: Maximum number of characters to extract
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Extracted main content as string
|
|
||||||
"""
|
|
||||||
if not isinstance(soup, BeautifulSoup):
|
|
||||||
return str(soup)[:max_chars]
|
|
||||||
|
|
||||||
# Try to find main content elements in order of priority
|
|
||||||
main_content = None
|
|
||||||
for selector in ['main', 'article', '#content', '.content', '#main', '.main']:
|
|
||||||
content = soup.select_one(selector)
|
|
||||||
if content:
|
|
||||||
main_content = content
|
|
||||||
break
|
|
||||||
|
|
||||||
# If no main content found, use the body
|
|
||||||
if not main_content:
|
|
||||||
main_content = soup.find('body') or soup
|
|
||||||
|
|
||||||
# Remove script, style, nav, footer elements that don't contribute to main content
|
|
||||||
for element in main_content.select('script, style, nav, footer, header, aside, .sidebar, #sidebar, .comments, #comments, .advertisement, .ads, iframe'):
|
|
||||||
element.extract()
|
|
||||||
|
|
||||||
# Extract text content
|
|
||||||
text_content = main_content.get_text(separator=' ', strip=True)
|
|
||||||
|
|
||||||
# Limit to max_chars
|
|
||||||
return text_content[:max_chars]
|
|
||||||
|
|
||||||
def tokenize_for_counting(self, text: str) -> List[str]:
|
|
||||||
"""
|
|
||||||
Simple token counter for estimating token usage.
|
|
||||||
This is an approximation since the exact tokenization depends on the model.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
text: Input text
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
List of tokens
|
|
||||||
"""
|
|
||||||
# Simple tokenization by splitting on whitespace and punctuation
|
|
||||||
import re
|
|
||||||
return re.findall(r'\w+|[^\w\s]', text)
|
|
||||||
|
|
||||||
def count_tokens(self, text: str) -> int:
|
|
||||||
"""
|
|
||||||
Count the approximate number of tokens in a text.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
text: Input text
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Estimated token count
|
|
||||||
"""
|
|
||||||
tokens = self.tokenize_for_counting(text)
|
|
||||||
return len(tokens)
|
|
||||||
|
|
||||||
def limit_text_for_api(self, text: str, max_tokens: int = 60000) -> str:
|
|
||||||
"""
|
|
||||||
Limit the text to a maximum number of tokens.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
text: Input text
|
|
||||||
max_tokens: Maximum number of tokens allowed
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Limited text
|
|
||||||
"""
|
|
||||||
if not text:
|
|
||||||
return ""
|
|
||||||
|
|
||||||
tokens = self.tokenize_for_counting(text)
|
|
||||||
|
|
||||||
# If text is already under the limit, return as is
|
|
||||||
if len(tokens) <= max_tokens:
|
|
||||||
return text
|
|
||||||
|
|
||||||
# Otherwise, truncate text to max_tokens
|
|
||||||
return " ".join(tokens[:max_tokens]) + "... [content truncated due to length]"
|
|
||||||
|
|
||||||
def search_web(self, query: str) -> List[Dict]:
|
|
||||||
formatted_query = quote_plus(query)
|
|
||||||
url = f"https://html.duckduckgo.com/html/?q={formatted_query}"
|
|
||||||
|
|
||||||
search_results_soup = self.read_url(url)
|
|
||||||
if not search_results_soup or search_results_soup.select('.result') is None or len(search_results_soup.select('.result')) == 0:
|
|
||||||
logger.warning(f"Keine Suchergebnisse gefunden für: {query}")
|
|
||||||
return []
|
|
||||||
|
|
||||||
# Extract search results
|
|
||||||
results = []
|
|
||||||
|
|
||||||
# Find all result containers
|
|
||||||
result_elements = search_results_soup.select('.result')
|
|
||||||
|
|
||||||
for result in result_elements:
|
|
||||||
# Extract title
|
|
||||||
title_element = result.select_one('.result__a')
|
|
||||||
title = title_element.text.strip() if title_element else 'No title'
|
|
||||||
|
|
||||||
# Extract URL (DuckDuckGo uses redirects, need to extract from href param)
|
|
||||||
url_element = title_element.get('href') if title_element else ''
|
|
||||||
extracted_url = 'No URL'
|
|
||||||
|
|
||||||
if url_element:
|
|
||||||
# Extract the actual URL from DuckDuckGo's redirect
|
|
||||||
if url_element.startswith('/d.js?q='):
|
|
||||||
start = url_element.find('?q=') + 3 # Skip '?q='
|
|
||||||
end = url_element.find('&', start) if '&' in url_element[start:] else None
|
|
||||||
extracted_url = unquote(url_element[start:end])
|
|
||||||
|
|
||||||
# Make sure the URL has the correct protocol prefix
|
|
||||||
if not extracted_url.startswith(('http://', 'https://')):
|
|
||||||
if not extracted_url.startswith('//'):
|
|
||||||
extracted_url = 'https://' + extracted_url
|
|
||||||
else:
|
|
||||||
extracted_url = 'https:' + extracted_url
|
|
||||||
else:
|
|
||||||
extracted_url = url_element
|
|
||||||
|
|
||||||
# Extract snippet directly from search results page
|
|
||||||
snippet_element = result.select_one('.result__snippet')
|
|
||||||
snippet = snippet_element.text.strip() if snippet_element else 'No description'
|
|
||||||
|
|
||||||
# Now fetch the actual page content for the data field
|
|
||||||
target_page_soup = self.read_url(extracted_url)
|
|
||||||
|
|
||||||
# Use the new content extraction method to limit content size
|
|
||||||
content = self.extract_main_content(target_page_soup, max_chars=30000)
|
|
||||||
|
|
||||||
results.append({
|
|
||||||
'title': title,
|
|
||||||
'url': extracted_url,
|
|
||||||
'snippet': snippet,
|
|
||||||
'data': content
|
|
||||||
})
|
|
||||||
|
|
||||||
# Limit the number of results if needed
|
|
||||||
if len(results) >= self.max_result:
|
|
||||||
break
|
|
||||||
|
|
||||||
return results
|
|
||||||
|
|
||||||
|
|
||||||
def read_url(self, url: str) -> BeautifulSoup:
|
|
||||||
"""
|
|
||||||
Liest eine URL und gibt einen BeautifulSoup-Parser für den Inhalt zurück.
|
|
||||||
Bei Fehlern wird ein leeres BeautifulSoup-Objekt zurückgegeben.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
url: Die zu lesende URL
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
BeautifulSoup-Objekt mit dem Inhalt oder leer bei Fehlern
|
|
||||||
"""
|
|
||||||
headers = {
|
|
||||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
|
||||||
'Accept': 'text/html,application/xhtml+xml,application/xml',
|
|
||||||
'Accept-Language': 'en-US,en;q=0.9',
|
|
||||||
}
|
}
|
||||||
|
]
|
||||||
|
)
|
||||||
|
# Remove markdown formatting if present
|
||||||
|
if content_text.startswith("```json"):
|
||||||
|
# Find the end of the JSON block
|
||||||
|
end_marker = "```"
|
||||||
|
end_index = content_text.rfind(end_marker)
|
||||||
|
if end_index != -1:
|
||||||
|
# Extract the JSON content without the markdown markers
|
||||||
|
content_text = content_text[7:end_index].strip()
|
||||||
|
|
||||||
try:
|
# Now parse the JSON
|
||||||
import time
|
try:
|
||||||
|
logger.info(f"Valid json received: {str(content_text)}")
|
||||||
|
pjson = json.loads(content_text)
|
||||||
|
# Now call scrape_json with the parsed dictionary
|
||||||
|
result_json = await self.scrape_json(pjson)
|
||||||
|
return result_json
|
||||||
|
except json.JSONDecodeError as e:
|
||||||
|
logger.error(f"Failed to parse JSON: {e}")
|
||||||
|
logger.error(f"Cleaned content: {content_text[:100]}...")
|
||||||
|
return []
|
||||||
|
|
||||||
# Initialer Request
|
async def scrape_json(self, research_strategy: Dict[str, List]) -> List[Dict]:
|
||||||
|
"""
|
||||||
|
Scrapes web content based on a research strategy JSON.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
research_strategy: A dictionary containing:
|
||||||
|
- 'skey': List of search keywords
|
||||||
|
- 'url': List of direct URLs to scrape
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of result dictionaries (search hits carry 'title', 'url', 'snippet' and 'data' keys)
|
||||||
|
"""
|
||||||
|
|
||||||
|
logger.info("Starting JSON-based web scraping")
|
||||||
|
results = []
|
||||||
|
|
||||||
|
# Validate input structure
|
||||||
|
if not isinstance(research_strategy, dict):
|
||||||
|
logger.error("Invalid research_strategy format: not a dictionary")
|
||||||
|
return {"error": "Invalid research_strategy format: not a dictionary"}
|
||||||
|
|
||||||
|
keys = research_strategy.get("skey", [])
|
||||||
|
direct_urls = research_strategy.get("url", [])
|
||||||
|
|
||||||
|
if not isinstance(keys, list) or not isinstance(direct_urls, list):
|
||||||
|
logger.error("Invalid research_strategy format: keys, or url is not a list")
|
||||||
|
return {"error": "Invalid research_strategy format: keys, or url is not a list"}
|
||||||
|
|
||||||
|
# Process search keywords through search engine
|
||||||
|
for keyword in keys:
|
||||||
|
logger.info(f"Processing keyword: {keyword}")
|
||||||
|
found_results = self.search_web(keyword) # List with Dict: title,url,snippet,data
|
||||||
|
logger.info(f"... {len(found_results)} results found")
|
||||||
|
results.extend(found_results)
|
||||||
|
|
||||||
|
# Process direct URLs
|
||||||
|
logger.info(f"Processing {len(direct_urls)} direct URLs")
|
||||||
|
for url in direct_urls:
|
||||||
|
if url in results:
|
||||||
|
logger.info(f"Skipping already scraped URL: {url}")
|
||||||
|
continue
|
||||||
|
soup = self.read_url(url)
|
||||||
|
|
||||||
|
# Extract title from the page if it exists
|
||||||
|
if isinstance(soup, BeautifulSoup):
|
||||||
|
title_tag = soup.find('title')
|
||||||
|
title = title_tag.text.strip() if title_tag else "No title"
|
||||||
|
|
||||||
|
# Alternative: You could also look for h1 tags if the title tag is missing
|
||||||
|
if title == "No title":
|
||||||
|
h1_tag = soup.find('h1')
|
||||||
|
if h1_tag:
|
||||||
|
title = h1_tag.text.strip()
|
||||||
|
else:
|
||||||
|
# Handle the case where soup is an error message string
|
||||||
|
title = "Error fetching page"
|
||||||
|
|
||||||
|
results.append(self.parse_result(soup, title, url))
|
||||||
|
logger.info(f"JSON scraping completed. Scraped {len(results)} URLs in total")
|
||||||
|
return results
|
||||||
|
|
||||||
|
def extract_main_content(self, soup: BeautifulSoup, max_chars: int = 30000) -> str:
|
||||||
|
"""
|
||||||
|
Extract the main content from an HTML page while limiting character count.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
soup: BeautifulSoup object containing the page content
|
||||||
|
max_chars: Maximum number of characters to extract
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Extracted main content as string
|
||||||
|
"""
|
||||||
|
if not isinstance(soup, BeautifulSoup):
|
||||||
|
return str(soup)[:max_chars]
|
||||||
|
|
||||||
|
# Try to find main content elements in order of priority
|
||||||
|
main_content = None
|
||||||
|
for selector in ['main', 'article', '#content', '.content', '#main', '.main']:
|
||||||
|
content = soup.select_one(selector)
|
||||||
|
if content:
|
||||||
|
main_content = content
|
||||||
|
break
|
||||||
|
|
||||||
|
# If no main content found, use the body
|
||||||
|
if not main_content:
|
||||||
|
main_content = soup.find('body') or soup
|
||||||
|
|
||||||
|
# Remove script, style, nav, footer elements that don't contribute to main content
|
||||||
|
for element in main_content.select('script, style, nav, footer, header, aside, .sidebar, #sidebar, .comments, #comments, .advertisement, .ads, iframe'):
|
||||||
|
element.extract()
|
||||||
|
|
||||||
|
# Extract text content
|
||||||
|
text_content = main_content.get_text(separator=' ', strip=True)
|
||||||
|
|
||||||
|
# Limit to max_chars
|
||||||
|
return text_content[:max_chars]
|
||||||
|
|
||||||
|
def tokenize_for_counting(self, text: str) -> List[str]:
|
||||||
|
"""
|
||||||
|
Simple token counter for estimating token usage.
|
||||||
|
This is an approximation since the exact tokenization depends on the model.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
text: Input text
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of tokens
|
||||||
|
"""
|
||||||
|
# Simple tokenization by splitting on whitespace and punctuation
|
||||||
|
import re
|
||||||
|
return re.findall(r'\w+|[^\w\s]', text)
|
||||||
|
|
||||||
|
def count_tokens(self, text: str) -> int:
|
||||||
|
"""
|
||||||
|
Count the approximate number of tokens in a text.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
text: Input text
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Estimated token count
|
||||||
|
"""
|
||||||
|
tokens = self.tokenize_for_counting(text)
|
||||||
|
return len(tokens)
|
||||||
|
|
||||||
|
def limit_text_for_api(self, text: str, max_tokens: int = 60000) -> str:
|
||||||
|
"""
|
||||||
|
Limit the text to a maximum number of tokens.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
text: Input text
|
||||||
|
max_tokens: Maximum number of tokens allowed
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Limited text
|
||||||
|
"""
|
||||||
|
if not text:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
tokens = self.tokenize_for_counting(text)
|
||||||
|
|
||||||
|
# If text is already under the limit, return as is
|
||||||
|
if len(tokens) <= max_tokens:
|
||||||
|
return text
|
||||||
|
|
||||||
|
# Otherwise, truncate text to max_tokens
|
||||||
|
return " ".join(tokens[:max_tokens]) + "... [content truncated due to length]"
|
||||||
|
|
||||||
|
def search_web(self, query: str) -> List[Dict]:
|
||||||
|
formatted_query = quote_plus(query)
|
||||||
|
url = f"https://html.duckduckgo.com/html/?q={formatted_query}"
|
||||||
|
|
||||||
|
search_results_soup = self.read_url(url)
|
||||||
|
if not search_results_soup or search_results_soup.select('.result') is None or len(search_results_soup.select('.result')) == 0:
|
||||||
|
logger.warning(f"Keine Suchergebnisse gefunden für: {query}")
|
||||||
|
return []
|
||||||
|
|
||||||
|
# Extract search results
|
||||||
|
results = []
|
||||||
|
|
||||||
|
# Find all result containers
|
||||||
|
result_elements = search_results_soup.select('.result')
|
||||||
|
|
||||||
|
for result in result_elements:
|
||||||
|
# Extract title
|
||||||
|
title_element = result.select_one('.result__a')
|
||||||
|
title = title_element.text.strip() if title_element else 'No title'
|
||||||
|
|
||||||
|
# Extract URL (DuckDuckGo uses redirects, need to extract from href param)
|
||||||
|
url_element = title_element.get('href') if title_element else ''
|
||||||
|
extracted_url = 'No URL'
|
||||||
|
|
||||||
|
if url_element:
|
||||||
|
# Extract the actual URL from DuckDuckGo's redirect
|
||||||
|
if url_element.startswith('/d.js?q='):
|
||||||
|
start = url_element.find('?q=') + 3 # Skip '?q='
|
||||||
|
end = url_element.find('&', start) if '&' in url_element[start:] else None
|
||||||
|
extracted_url = unquote(url_element[start:end])
|
||||||
|
|
||||||
|
# Make sure the URL has the correct protocol prefix
|
||||||
|
if not extracted_url.startswith(('http://', 'https://')):
|
||||||
|
if not extracted_url.startswith('//'):
|
||||||
|
extracted_url = 'https://' + extracted_url
|
||||||
|
else:
|
||||||
|
extracted_url = 'https:' + extracted_url
|
||||||
|
else:
|
||||||
|
extracted_url = url_element
|
||||||
|
|
||||||
|
# Extract snippet directly from search results page
|
||||||
|
snippet_element = result.select_one('.result__snippet')
|
||||||
|
snippet = snippet_element.text.strip() if snippet_element else 'No description'
|
||||||
|
|
||||||
|
# Now fetch the actual page content for the data field
|
||||||
|
target_page_soup = self.read_url(extracted_url)
|
||||||
|
|
||||||
|
# Use the new content extraction method to limit content size
|
||||||
|
content = self.extract_main_content(target_page_soup, max_chars=30000)
|
||||||
|
|
||||||
|
results.append({
|
||||||
|
'title': title,
|
||||||
|
'url': extracted_url,
|
||||||
|
'snippet': snippet,
|
||||||
|
'data': content
|
||||||
|
})
|
||||||
|
|
||||||
|
# Limit the number of results if needed
|
||||||
|
if len(results) >= self.max_result:
|
||||||
|
break
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
|
def read_url(self, url: str) -> BeautifulSoup:
    """
    Fetch a URL and return a BeautifulSoup parser for its content.

    While the server answers with status 202, the request is repeated
    after each delay of a fixed backoff schedule until a different status
    arrives or the schedule is exhausted. Any failure (network error,
    HTTP error status, parsing problem) is logged and an empty document
    is returned instead of raising.

    Args:
        url: The URL to read.

    Returns:
        BeautifulSoup object with the page content, or an empty
        ``<html><body></body></html>`` document on errors.
    """
    # Function-scope import so the method does not rely on a module-level
    # logger being defined in the surrounding file.
    import logging

    log = logging.getLogger(__name__)
    empty_markup = "<html><body></body></html>"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml',
        'Accept-Language': 'en-US,en;q=0.9',
    }

    # Seconds to wait before each of the (at most four) repeated requests
    # that are sent while the server keeps answering 202.
    backoff_times = [0.5, 1.0, 2.0, 5.0]

    # Nothing to fetch: skip the request (and the exception it would raise).
    if not url:
        return BeautifulSoup(empty_markup, 'html.parser')

    try:
        # Initial request
        response = requests.get(url, headers=headers, timeout=10)

        # Poll while the status is 202, waiting longer before every retry
        for wait_time in backoff_times:
            if response.status_code != 202:
                break
            time.sleep(wait_time)
            response = requests.get(url, headers=headers, timeout=10)

        if response.status_code == 202:
            # raise_for_status() does not treat 202 as an error, so the
            # body is still parsed below; leave a trace for diagnosis.
            log.warning(
                "URL %s still answered 202 after %d retries",
                url, len(backoff_times)
            )

        # Raise for 4xx/5xx statuses so they end up in the handler below
        response.raise_for_status()

        # Parse the HTML
        return BeautifulSoup(response.text, 'html.parser')

    except Exception as e:
        # Deliberate best-effort: callers expect an empty document rather
        # than an exception, but the reason must not be lost.
        log.warning("Could not read URL %s: %s", url, e)
        return BeautifulSoup(empty_markup, 'html.parser')
|
||||||
|
|
||||||
# HTML parsen
|
def parse_result(self, data: BeautifulSoup, title: str, url: str) -> Dict[str, str]:
    """
    Build a search-result dictionary from an already parsed page.

    Args:
        data: BeautifulSoup object containing the page content
        title: Page title
        url: Page URL

    Returns:
        Dictionary with the keys 'title', 'url', 'snippet' and 'data'.
        'snippet' always carries the default text; 'data' is whatever
        extract_main_content returns for the page with max_chars=30000.
    """
    return {
        'title': title,
        'url': url,
        # No snippet is derived from the page here, only the default text
        'snippet': 'No description',
        # Page content is delegated to the main content extraction method
        'data': self.extract_main_content(data, max_chars=30000),
    }
|
||||||
|
|
||||||
|
|
||||||
# Singleton-Instanz
|
# Singleton-Instanz
|
||||||
|
|
|
||||||
|
|
@ -712,7 +712,7 @@ def _extract_document_contents_from_messages(file_id: int, messages: List[Dict[s
|
||||||
# Check if file ID matches (handle both string and int comparison)
|
# Check if file ID matches (handle both string and int comparison)
|
||||||
if (source.get("id") == file_id or
|
if (source.get("id") == file_id or
|
||||||
(isinstance(source.get("id"), str) and source.get("id") == str(file_id)) or
|
(isinstance(source.get("id"), str) and source.get("id") == str(file_id)) or
|
||||||
(isinstance(file_id, str) and source.get("id") == int(file_id))):
|
(isinstance(file_id, str) and source.get("id") == file_id)):
|
||||||
|
|
||||||
# Add contents of the file
|
# Add contents of the file
|
||||||
doc_contents = document.get("contents", [])
|
doc_contents = document.get("contents", [])
|
||||||
|
|
|
||||||
|
|
@ -95,14 +95,18 @@ class AgentRegistry:
|
||||||
self.ai_service = ai_service
|
self.ai_service = ai_service
|
||||||
self.document_handler = document_handler
|
self.document_handler = document_handler
|
||||||
self.lucydom_interface = lucydom_interface
|
self.lucydom_interface = lucydom_interface
|
||||||
|
# Update all registered agents
|
||||||
|
self.update_agent_dependencies()
|
||||||
|
|
||||||
# Update dependencies for all registered agents
|
|
||||||
|
def update_agent_dependencies(self):
    """
    Push the registry's current dependencies to all registered agents.

    Every agent in ``self.agents`` that exposes a ``set_dependencies``
    method receives the registry's ``ai_service``, ``document_handler``
    and ``lucydom_interface`` as keyword arguments. Agents without that
    method are left untouched.
    """
    # Only the agent objects are needed; the registry keys are irrelevant here.
    for agent in self.agents.values():
        if hasattr(agent, 'set_dependencies'):
            agent.set_dependencies(
                ai_service=self.ai_service,
                document_handler=self.document_handler,
                lucydom_interface=self.lucydom_interface
            )
|
||||||
|
|
||||||
def register_agent(self, agent: 'BaseAgent'):
|
def register_agent(self, agent: 'BaseAgent'):
|
||||||
|
|
|
||||||
|
|
@ -11,6 +11,7 @@ from datetime import datetime
|
||||||
from typing import List, Dict, Any, Optional, Tuple, Union
|
from typing import List, Dict, Any, Optional, Tuple, Union
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
logging.getLogger('matplotlib.font_manager').setLevel(logging.INFO)
|
||||||
|
|
||||||
class WorkflowExecution:
|
class WorkflowExecution:
|
||||||
"""
|
"""
|
||||||
|
|
@ -39,10 +40,21 @@ class WorkflowExecution:
|
||||||
|
|
||||||
# Initialize agent registry
|
# Initialize agent registry
|
||||||
self.agent_registry = AgentRegistry.get_instance()
|
self.agent_registry = AgentRegistry.get_instance()
|
||||||
|
# Set dependencies for agents
|
||||||
|
|
||||||
# Initialize file manager
|
# Initialize file manager
|
||||||
self.file_manager = get_workflow_file_manager(workflow_id, lucydom_interface)
|
self.file_manager = get_workflow_file_manager(workflow_id, lucydom_interface)
|
||||||
|
|
||||||
|
# Import and initialize document handler
|
||||||
|
from modules.agentservice_document_handler import get_document_handler
|
||||||
|
self.document_handler = get_document_handler(workflow_id, lucydom_interface, ai_service)
|
||||||
|
|
||||||
|
self.agent_registry.set_dependencies(
|
||||||
|
ai_service=ai_service,
|
||||||
|
document_handler=self.document_handler,
|
||||||
|
lucydom_interface=lucydom_interface
|
||||||
|
)
|
||||||
|
|
||||||
async def execute(self, message: Dict[str, Any], workflow: Dict[str, Any], files: List[Dict[str, Any]] = None, is_user_input: bool = False):
|
async def execute(self, message: Dict[str, Any], workflow: Dict[str, Any], files: List[Dict[str, Any]] = None, is_user_input: bool = False):
|
||||||
"""
|
"""
|
||||||
Execute the workflow with integrated planning and agent selection.
|
Execute the workflow with integrated planning and agent selection.
|
||||||
|
|
@ -67,6 +79,7 @@ class WorkflowExecution:
|
||||||
# 3. Create agent-aware work plan
|
# 3. Create agent-aware work plan
|
||||||
work_plan = await self._create_agent_aware_work_plan(workflow, user_message)
|
work_plan = await self._create_agent_aware_work_plan(workflow, user_message)
|
||||||
self.logging_utils.info(f"Created agent-aware work plan with {len(work_plan)} activities", "planning")
|
self.logging_utils.info(f"Created agent-aware work plan with {len(work_plan)} activities", "planning")
|
||||||
|
self.logging_utils.debug(f"{work_plan}.", "planning")
|
||||||
|
|
||||||
# 4. Execute the activities in the work plan
|
# 4. Execute the activities in the work plan
|
||||||
results = await self._execute_work_plan(workflow, work_plan)
|
results = await self._execute_work_plan(workflow, work_plan)
|
||||||
|
|
@ -165,7 +178,7 @@ class WorkflowExecution:
|
||||||
"content_type": source.get("content_type", "unknown")
|
"content_type": source.get("content_type", "unknown")
|
||||||
})
|
})
|
||||||
|
|
||||||
# Create the planning prompt with agent awareness
|
# Create the planning prompt with agent awareness and document handling information
|
||||||
plan_prompt = f"""
|
plan_prompt = f"""
|
||||||
As an AI workflow manager, create a detailed agent-aware work plan for the following task:
|
As an AI workflow manager, create a detailed agent-aware work plan for the following task:
|
||||||
|
|
||||||
|
|
@ -177,21 +190,25 @@ AVAILABLE AGENTS:
|
||||||
AVAILABLE DOCUMENTS:
|
AVAILABLE DOCUMENTS:
|
||||||
{document_info if document_info else "No documents provided"}
|
{document_info if document_info else "No documents provided"}
|
||||||
|
|
||||||
|
IMPORTANT: Document extraction happens automatically in the workflow. Documents in the message are already available to all agents. DO NOT assign agent_coder or any other agent specifically for just reading or extracting document content. Only assign agents for tasks that require specific processing beyond what the document handler already provides.
|
||||||
|
|
||||||
The work plan should include a structured list of activities. Each activity should have:
|
The work plan should include a structured list of activities. Each activity should have:
|
||||||
1. title - A short descriptive title for the activity
|
1. title - A short descriptive title for the activity
|
||||||
2. description - What needs to be done in this activity
|
2. description - What needs to be done in this activity
|
||||||
3. assigned_agents - List of agent IDs that should handle this activity (can be multiple in sequence)
|
3. assigned_agents - List of agent IDs that should handle this activity (can be multiple in sequence)
|
||||||
4. agent_prompts - Specific instructions for each agent (matched by index to assigned_agents)
|
4. agent_prompts - Specific instructions for each agent (matched by index to assigned_agents)
|
||||||
5. document_requirements - Description of which documents are needed for this activity
|
5. document_requirements - Description of which documents are needed for this activity (these will be automatically extracted)
|
||||||
6. expected_output - The expected output format and content
|
6. expected_output - The expected output format and content
|
||||||
7. dependencies - List of previous activities this depends on (by index)
|
7. dependencies - List of previous activities this depends on (by index)
|
||||||
|
|
||||||
IMPORTANT GUIDELINES:
|
IMPORTANT GUIDELINES:
|
||||||
- Each activity should have clear objectives and be assigned to the most appropriate agent(s)
|
- Each activity should have clear objectives and be assigned to the most appropriate agent(s)
|
||||||
- When multiple agents are assigned to an activity, specify the sequence and how outputs should flow between them
|
- When multiple agents are assigned to an activity, specify the sequence and how outputs should flow between them
|
||||||
- Documents are processed on-demand, so each activity should specify which documents it requires
|
- Documents are processed on-demand by the system's document handler, so only specify which documents are needed, not how to extract them
|
||||||
|
- DO NOT create activities that only read or extract document content - this happens automatically
|
||||||
- Create a logical sequence where later activities can use outputs from earlier ones
|
- Create a logical sequence where later activities can use outputs from earlier ones
|
||||||
- If no specialized agent is needed for a task, use the default "assistant" agent
|
- If no specialized agent is needed for a task, use the default "assistant" agent
|
||||||
|
- Only use the agent_coder for tasks that require actual coding or complex data analysis, not for simply reading documents
|
||||||
|
|
||||||
Return the work plan as a JSON array of activity objects, each with the above properties.
|
Return the work plan as a JSON array of activity objects, each with the above properties.
|
||||||
"""
|
"""
|
||||||
|
|
@ -357,6 +374,20 @@ Return the work plan as a JSON array of activity objects, each with the above pr
|
||||||
for i, agent_id in enumerate(agent_ids):
|
for i, agent_id in enumerate(agent_ids):
|
||||||
# Get the agent
|
# Get the agent
|
||||||
agent = self.agent_registry.get_agent(agent_id)
|
agent = self.agent_registry.get_agent(agent_id)
|
||||||
|
if agent:
|
||||||
|
# Ensure dependencies are set
|
||||||
|
if hasattr(agent, 'set_dependencies'):
|
||||||
|
agent.set_dependencies(
|
||||||
|
ai_service=self.ai_service,
|
||||||
|
document_handler=self.document_handler,
|
||||||
|
lucydom_interface=self.lucydom_interface
|
||||||
|
)
|
||||||
|
|
||||||
|
# Set document handler if agent supports it
|
||||||
|
if hasattr(agent, 'set_document_handler') and hasattr(self, 'document_handler'):
|
||||||
|
agent.set_document_handler(self.document_handler)
|
||||||
|
|
||||||
|
|
||||||
if not agent:
|
if not agent:
|
||||||
self.logging_utils.warning(f"Agent '{agent_id}' not found, using assistant instead", "agents")
|
self.logging_utils.warning(f"Agent '{agent_id}' not found, using assistant instead", "agents")
|
||||||
agent = self.agent_registry.get_agent("assistant")
|
agent = self.agent_registry.get_agent("assistant")
|
||||||
|
|
@ -380,9 +411,18 @@ Return the work plan as a JSON array of activity objects, each with the above pr
|
||||||
agent_message = self._create_message(workflow, "user")
|
agent_message = self._create_message(workflow, "user")
|
||||||
agent_message["content"] = enhanced_prompt
|
agent_message["content"] = enhanced_prompt
|
||||||
|
|
||||||
# Add any documents from previous agent if this is a continuation
|
# IMPORTANT FIX: Document handling logic
|
||||||
|
# First, check if we have documents from previous agent if this is a continuation
|
||||||
if last_documents and i > 0:
|
if last_documents and i > 0:
|
||||||
agent_message["documents"] = last_documents
|
agent_message["documents"] = last_documents
|
||||||
|
# For the first agent, make sure we pass any documents from the most recent user message
|
||||||
|
elif i == 0:
|
||||||
|
# Find the most recent user message with documents
|
||||||
|
for msg in reversed(workflow.get("messages", [])):
|
||||||
|
if msg.get("role") == "user" and msg.get("documents"):
|
||||||
|
agent_message["documents"] = msg.get("documents", [])
|
||||||
|
self.logging_utils.info(f"Passing {len(agent_message['documents'])} documents from user message to {agent_id}", "agents")
|
||||||
|
break
|
||||||
|
|
||||||
# Log agent execution
|
# Log agent execution
|
||||||
self.logging_utils.info(f"Executing agent: {agent_id}", "agents")
|
self.logging_utils.info(f"Executing agent: {agent_id}", "agents")
|
||||||
|
|
@ -402,6 +442,7 @@ Return the work plan as a JSON array of activity objects, each with the above pr
|
||||||
if "documents" in agent_response:
|
if "documents" in agent_response:
|
||||||
response_message["documents"] = agent_response["documents"]
|
response_message["documents"] = agent_response["documents"]
|
||||||
last_documents = agent_response["documents"]
|
last_documents = agent_response["documents"]
|
||||||
|
self.logging_utils.info(f"Agent {agent_id} produced {len(last_documents)} documents", "agents")
|
||||||
|
|
||||||
# Add to workflow
|
# Add to workflow
|
||||||
workflow["messages"].append(response_message)
|
workflow["messages"].append(response_message)
|
||||||
|
|
@ -419,6 +460,7 @@ Return the work plan as a JSON array of activity objects, each with the above pr
|
||||||
"format": "Text"
|
"format": "Text"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
async def _extract_required_documents(self, workflow: Dict[str, Any], doc_requirements: str) -> Dict[str, Any]:
|
async def _extract_required_documents(self, workflow: Dict[str, Any], doc_requirements: str) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Extract required documents based on requirements description.
|
Extract required documents based on requirements description.
|
||||||
|
|
|
||||||
File diff suppressed because one or more lines are too long
Loading…
Reference in a new issue