backend running for mvp1
This commit is contained in:
parent b0c45fb798
commit c75a3b67ce

10 changed files with 2529 additions and 615 deletions

gwserver/modules/BAKcoder.py (new file, 1014 lines)
File diff suppressed because it is too large.

gwserver/modules/BAKwebcrawler.py (new file, 613 lines)

@@ -0,0 +1,613 @@
"""
|
||||
WebCrawler-Agent für die Recherche und Beschaffung von Informationen aus dem Web.
|
||||
Angepasst für das refaktorisierte Core-Modul.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import random
|
||||
import time
|
||||
import traceback
|
||||
from typing import List, Dict, Any, Optional, Union
|
||||
import re
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from urllib.parse import quote_plus, unquote
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
from modules.agentservice_base import BaseAgent
|
||||
from connectors.connector_aichat_openai import ChatService
|
||||
from modules.agentservice_utils import WorkflowUtils, MessageUtils, LoggingUtils
|
||||
from modules.agentservice_protocol import AgentMessage, AgentCommunicationProtocol
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class WebcrawlerAgent(BaseAgent):
|
||||
|
||||
"""Agent für Web-Recherche und Informationsbeschaffung"""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialisiert den WebCrawler-Agenten"""
|
||||
super().__init__()
|
||||
self.id = "webcrawler"
|
||||
self.name = "Webscraper"
|
||||
self.type = "scraper"
|
||||
self.description = "Recherchiert Informationen im Web"
|
||||
self.capabilities = "web_search,information_retrieval,data_collection,source_verification,content_integration"
|
||||
self.result_format = "SearchResults"
|
||||
|
||||
# Add enhanced document capabilities
|
||||
self.supports_documents = True
|
||||
self.document_capabilities = ["read", "create"]
|
||||
self.required_context = ["workflow_id"]
|
||||
self.document_handler = None
|
||||
|
||||
# Initialize protocol
|
||||
self.protocol = AgentCommunicationProtocol()
|
||||
|
||||
# Chat-Service initialisieren
|
||||
self.chat_service = ChatService()
|
||||
|
||||
# Utility-Klassen initialisieren
|
||||
self.message_utils = MessageUtils()
|
||||
|
||||
# Web-Crawling-Konfiguration
|
||||
self.max_url = 3
|
||||
self.max_key = 3
|
||||
self.max_result = 3
|
||||
self.timeout = 10
|
||||
self.headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
||||
'Accept-Language': 'en-US,en;q=0.5',
|
||||
'Referer': 'https://www.google.com/',
|
||||
'DNT': '1',
|
||||
'Connection': 'keep-alive',
|
||||
'Upgrade-Insecure-Requests': '1',
|
||||
}
|
||||
self.max_urls = 10
|
||||
self.max_content_length = 100000
|
||||
|
||||
    def get_agent_info(self) -> Dict[str, Any]:
        """Get agent information for the agent registry"""
        info = super().get_agent_info()
        info.update({
            "metadata": {
                "max_url": self.max_url,
                "max_result": self.max_result,
                "timeout": self.timeout
            }
        })
        return info

    def set_document_handler(self, document_handler):
        """Set the document handler for file operations"""
        self.document_handler = document_handler

    async def process_message(self, message: Dict[str, Any], context: Dict[str, Any] = None) -> Dict[str, Any]:
        """
        Processes a message and performs a web search.

        Args:
            message: The message to process
            context: Additional context

        Returns:
            The generated response containing the web research results
        """
        # Extract workflow_id from context or message
        workflow_id = context.get("workflow_id") if context else message.get("workflow_id", "unknown")

        # Get or create logging_utils
        log_func = context.get("log_func") if context else None
        logging_utils = LoggingUtils(workflow_id, log_func)

        # Send status update using the protocol
        if log_func:
            status_message = self.protocol.create_status_update_message(
                status_description="Starte Web-Recherche",
                sender_id=self.id,
                status="in_progress",
                progress=0.0,
                context_id=workflow_id
            )
            log_func(workflow_id, status_message.content, "info", self.id, self.name)

        # Create response structure
        response = {
            "role": "assistant",
            "content": "",
            "agent_id": self.id,
            "agent_type": self.type,
            "agent_name": self.name,
            "result_format": self.result_format,
            "workflow_id": workflow_id
        }

        try:
            # Get the query from the message
            prompt = await self.get_prompt(message)
            logging_utils.info(f"Web-Recherche für: {prompt[:50]}...", "agents")

            # Update progress using the protocol
            if log_func:
                status_message = self.protocol.create_status_update_message(
                    status_description=f"Recherchiere: {prompt[:30]}...",
                    sender_id=self.id,
                    status="in_progress",
                    progress=0.3,
                    context_id=workflow_id
                )
                log_func(workflow_id, status_message.content, "info", self.id, self.name)

            # Perform the web research
            web_query_result = await self.get_web_query(message)

            # Final status update
            if log_func:
                status_message = self.protocol.create_status_update_message(
                    status_description="Web-Recherche abgeschlossen",
                    sender_id=self.id,
                    status="completed",
                    progress=1.0,
                    context_id=workflow_id
                )
                log_func(workflow_id, status_message.content, "info", self.id, self.name)

            # Set the content in the response
            response["content"] = web_query_result

            return response

        except Exception as e:
            error_msg = f"Fehler bei der Web-Recherche: {str(e)}"
            logging_utils.error(error_msg, "error")

            # Create error response using the protocol
            error_message = self.protocol.create_error_message(
                error_description=error_msg,
                sender_id=self.id,
                error_type="web_search",
                error_details={"traceback": traceback.format_exc()},
                context_id=workflow_id
            )

            response["content"] = f"## Fehler bei der Web-Recherche\n\n{error_msg}\n\n```\n{traceback.format_exc()}\n```"

            return response

    def send_document_request(self, document_description: str, sender_id: str, receiver_id: str,
                              filters: Dict[str, Any] = None, context_id: str = None) -> AgentMessage:
        """Send a document request using the protocol"""
        return self.protocol.create_document_request_message(
            document_description=document_description,
            sender_id=sender_id,
            receiver_id=receiver_id,
            filters=filters,
            context_id=context_id
        )

    def send_result_message(self, result_content: str, sender_id: str, receiver_id: str, task_id: str,
                            output_data: Dict[str, Any] = None, context_id: str = None) -> AgentMessage:
        """Send a result message using the protocol"""
        return self.protocol.create_result_message(
            result_content=result_content,
            sender_id=sender_id,
            receiver_id=receiver_id,
            task_id=task_id,
            output_data=output_data,
            result_format="SearchResults",
            context_id=context_id
        )

    async def get_prompt(self, message_context: Dict[str, Any]) -> str:
        task = message_context.get("content", "")
        return task.strip()

    async def get_web_query(self, message_context: Dict[str, Any]) -> str:
        prompt = await self.get_prompt(message_context)
        result_json = await self.run_web_query(prompt)
        result_data = ""
        summary_src = ""

        logger.info(f"Web analysis prompt '{prompt}' delivers {len(result_json)} results.")
        if isinstance(result_json, list):
            total_tokens = 0

            for i, result in enumerate(result_json, 1):
                # Limit content size for each result (~15000 tokens per result)
                result_data_limited = self.limit_text_for_api(result['data'], max_tokens=15000)

                web_answer_instructions = f"""
Fass das Resultat gemäss dem Auftrag zusammen in maximal rund 2000 Zeichen. Auftrag = '{prompt.replace("'", "")}'
Fasse die wichtigsten Erkenntnisse zusammen und setze sie in Bezug zur ursprünglichen Anfrage. Die Einleitung kannst Du weglassen.
Achte darauf, nur relevante und qualitativ hochwertige Informationen zu extrahieren, welche einen Bezug zum Auftrag haben, und übersichtlich zu präsentieren. Vermittle ein ausgewogenes Bild der recherchierten Informationen.

Dies ist das Resultat:
{result_data_limited}
"""

                # Count tokens in the instructions to ensure we don't exceed API limits
                instruction_tokens = self.count_tokens(web_answer_instructions)
                if total_tokens + instruction_tokens > 60000:
                    logger.warning(f"Skipping result {i} to avoid exceeding token limit")
                    break

                total_tokens += instruction_tokens

                # Summarize each result with the chat service
                content_text = await self.chat_service.call_api(
                    messages=[
                        {
                            "role": "system",
                            "content": "Du bist ein Informationsanalyst, der Webinhalte präzise und relevant zusammenfasst."
                        },
                        {
                            "role": "user",
                            "content": web_answer_instructions
                        }
                    ]
                )

                # Create a summary but ensure we stay within token limits
                content_summary = content_text[:2000]  # Limit to ~2000 characters
                result_data += f"\n\n[{i}] {result['title']}\nURL: {result['url']}\nSnippet: {result['snippet']}\nContent: {content_summary}"
                summary_src += f"\n{content_summary}"

                # Update token count (buffer of 100 tokens for formatting)
                total_tokens += self.count_tokens(content_summary) + 100
        else:
            result_data = "no data received"

        logger.info(f"Web analysis result sent {len(result_data)}B")

        # Additional overall summary
        summary = ""
        if len(summary_src) > 1:
            # Limit summary source to ensure we don't exceed API limits
            summary_src_limited = self.limit_text_for_api(summary_src, max_tokens=10000)

            summary = await self.chat_service.call_api(
                messages=[
                    {
                        "role": "system",
                        "content": "Du erstellst prägnante Zusammenfassungen von Rechercheergebnissen."
                    },
                    {
                        "role": "user",
                        "content": f"Bitte fasse diese Erkenntnisse in maximal 5-6 Sätzen zusammen: {summary_src_limited}\n"
                    }
                ]
            )

        # Format the final result
        result = f"## Web-Recherche Ergebnisse\n\n### Zusammenfassung\n{summary}\n\n### Detaillierte Ergebnisse{result_data}"
        return result

    async def run_web_query(self, prompt: str) -> List[Dict]:
        if prompt == "":
            return []

        ptext = f"""Create a comprehensive web research strategy for the task = '{prompt.replace("'", "")}'. Return the results as a Python dictionary with these specific keys. If specific URLs are provided and the task requires analysis only of the provided URLs, then leave 'skey' empty.

'url': A list of at most {self.max_url} specific URLs extracted from the task string.

'skey': A list of at most {self.max_key} key sentences to search for on the web. These should be precise, diverse, and targeted to get the most relevant information.

Format your response as a valid JSON object with these two keys. Do not include any explanatory text or markdown outside of the object definition.
"""

        content_text = await self.chat_service.call_api(
            messages=[
                {
                    "role": "system",
                    "content": "Du bist ein Webrecherche-Experte, der präzise Suchstrategien entwickelt."
                },
                {
                    "role": "user",
                    "content": ptext
                }
            ]
        )
        # Remove markdown formatting if present
        if content_text.startswith("```json"):
            # Find the end of the JSON block
            end_marker = "```"
            end_index = content_text.rfind(end_marker)
            if end_index != -1:
                # Extract the JSON content without the markdown markers
                content_text = content_text[7:end_index].strip()

        # Now parse the JSON
        try:
            logger.info(f"Valid json received: {str(content_text)}")
            pjson = json.loads(content_text)
            # Call scrape_json with the parsed dictionary
            result_json = await self.scrape_json(pjson)
            return result_json
        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse JSON: {e}")
            logger.error(f"Cleaned content: {content_text[:100]}...")
            return []
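
    # A hedged sketch of the strategy object this method expects back from the
    # model (the URL and search sentences are made-up example values):
    #
    #     {
    #         "url": ["https://example.org/annual-report-2024"],
    #         "skey": ["annual report 2024 key findings", "annual report 2024 revenue"]
    #     }
    #
    # scrape_json() then fetches the direct URLs, runs the search sentences
    # through the search engine, and returns a flat list of result dicts.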

    async def scrape_json(self, research_strategy: Dict[str, List]) -> List[Dict]:
        """
        Scrapes web content based on a research strategy JSON.

        Args:
            research_strategy: A dictionary containing:
                - 'skey': List of search keywords
                - 'url': List of direct URLs to scrape

        Returns:
            List of result dictionaries (title, url, snippet, data)
        """

        logger.info("Starting JSON-based web scraping")
        results = []

        # Validate input structure
        if not isinstance(research_strategy, dict):
            logger.error("Invalid research_strategy format: not a dictionary")
            return []

        keys = research_strategy.get("skey", [])
        direct_urls = research_strategy.get("url", [])

        if not isinstance(keys, list) or not isinstance(direct_urls, list):
            logger.error("Invalid research_strategy format: 'skey' or 'url' is not a list")
            return []

        # Process search keywords through the search engine
        for keyword in keys:
            logger.info(f"Processing keyword: {keyword}")
            found_results = self.search_web(keyword)  # List of dicts: title, url, snippet, data
            logger.info(f"... {len(found_results)} results found")
            results.extend(found_results)

        # Process direct URLs (compare against the URLs already scraped,
        # not against the result dicts themselves)
        logger.info(f"Processing {len(direct_urls)} direct URLs")
        scraped_urls = {result.get('url') for result in results}
        for url in direct_urls:
            if url in scraped_urls:
                logger.info(f"Skipping already scraped URL: {url}")
                continue
            soup = self.read_url(url)

            # Extract the title from the page if it exists
            if isinstance(soup, BeautifulSoup):
                title_tag = soup.find('title')
                title = title_tag.text.strip() if title_tag else "No title"

                # Fall back to an h1 tag if the title tag is missing
                if title == "No title":
                    h1_tag = soup.find('h1')
                    if h1_tag:
                        title = h1_tag.text.strip()
            else:
                # Handle the case where soup is an error message string
                title = "Error fetching page"

            results.append(self.parse_result(soup, title, url))
        logger.info(f"JSON scraping completed. Scraped {len(results)} URLs in total")
        return results

    def extract_main_content(self, soup: BeautifulSoup, max_chars: int = 30000) -> str:
        """
        Extract the main content from an HTML page while limiting the character count.

        Args:
            soup: BeautifulSoup object containing the page content
            max_chars: Maximum number of characters to extract

        Returns:
            Extracted main content as string
        """
        if not isinstance(soup, BeautifulSoup):
            return str(soup)[:max_chars]

        # Try to find main content elements in order of priority
        main_content = None
        for selector in ['main', 'article', '#content', '.content', '#main', '.main']:
            content = soup.select_one(selector)
            if content:
                main_content = content
                break

        # If no main content is found, use the body
        if not main_content:
            main_content = soup.find('body') or soup

        # Remove script, style, nav, footer and similar elements that don't contribute to the main content
        for element in main_content.select('script, style, nav, footer, header, aside, .sidebar, #sidebar, .comments, #comments, .advertisement, .ads, iframe'):
            element.extract()

        # Extract text content
        text_content = main_content.get_text(separator=' ', strip=True)

        # Limit to max_chars
        return text_content[:max_chars]

    def tokenize_for_counting(self, text: str) -> List[str]:
        """
        Simple tokenizer for estimating token usage.
        This is an approximation, since the exact tokenization depends on the model.

        Args:
            text: Input text

        Returns:
            List of tokens
        """
        # Simple tokenization by splitting on whitespace and punctuation
        return re.findall(r'\w+|[^\w\s]', text)

    def count_tokens(self, text: str) -> int:
        """
        Count the approximate number of tokens in a text.

        Args:
            text: Input text

        Returns:
            Estimated token count
        """
        tokens = self.tokenize_for_counting(text)
        return len(tokens)

    def limit_text_for_api(self, text: str, max_tokens: int = 60000) -> str:
        """
        Limit the text to a maximum number of tokens.

        Args:
            text: Input text
            max_tokens: Maximum number of tokens allowed

        Returns:
            Limited text
        """
        if not text:
            return ""

        tokens = self.tokenize_for_counting(text)

        # If the text is already under the limit, return it as is
        if len(tokens) <= max_tokens:
            return text

        # Otherwise, truncate the text to max_tokens
        return " ".join(tokens[:max_tokens]) + "... [content truncated due to length]"

    def search_web(self, query: str) -> List[Dict]:
        formatted_query = quote_plus(query)
        url = f"https://html.duckduckgo.com/html/?q={formatted_query}"

        search_results_soup = self.read_url(url)
        if not search_results_soup or len(search_results_soup.select('.result')) == 0:
            logger.warning(f"Keine Suchergebnisse gefunden für: {query}")
            return []

        # Extract search results
        results = []

        # Find all result containers
        result_elements = search_results_soup.select('.result')

        for result in result_elements:
            # Extract title
            title_element = result.select_one('.result__a')
            title = title_element.text.strip() if title_element else 'No title'

            # Extract URL (DuckDuckGo uses redirects, so the target must be extracted from the href parameter)
            url_element = title_element.get('href') if title_element else ''
            extracted_url = 'No URL'

            if url_element:
                # Extract the actual URL from DuckDuckGo's redirect
                if url_element.startswith('/d.js?q='):
                    start = url_element.find('?q=') + 3  # Skip '?q='
                    end = url_element.find('&', start) if '&' in url_element[start:] else None
                    extracted_url = unquote(url_element[start:end])

                    # Make sure the URL has the correct protocol prefix
                    if not extracted_url.startswith(('http://', 'https://')):
                        if not extracted_url.startswith('//'):
                            extracted_url = 'https://' + extracted_url
                        else:
                            extracted_url = 'https:' + extracted_url
                else:
                    extracted_url = url_element

            # Extract the snippet directly from the search results page
            snippet_element = result.select_one('.result__snippet')
            snippet = snippet_element.text.strip() if snippet_element else 'No description'

            # Now fetch the actual page content for the data field
            target_page_soup = self.read_url(extracted_url)

            # Use the content extraction method to limit the content size
            content = self.extract_main_content(target_page_soup, max_chars=30000)

            results.append({
                'title': title,
                'url': extracted_url,
                'snippet': snippet,
                'data': content
            })

            # Limit the number of results if needed
            if len(results) >= self.max_result:
                break

        return results
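
    # Sketch of the redirect handling above; the href value is a made-up example
    # of the DuckDuckGo HTML endpoint's redirect format:
    #
    #     href = '/d.js?q=https%3A%2F%2Fexample.com%2Fpage&rut=abc'
    #     start = href.find('?q=') + 3   # index just past '?q='
    #     end = href.find('&', start)    # cut before the next parameter
    #     unquote(href[start:end])       # -> 'https://example.com/page'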

    def read_url(self, url: str) -> BeautifulSoup:
        """
        Reads a URL and returns a BeautifulSoup parser for its content.
        On errors, an empty BeautifulSoup object is returned.

        Args:
            url: The URL to read

        Returns:
            BeautifulSoup object with the content, or an empty one on errors
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml',
            'Accept-Language': 'en-US,en;q=0.9',
        }

        try:
            # Initial request
            response = requests.get(url, headers=headers, timeout=self.timeout)

            # Poll while the server answers with status 202 (accepted but not ready)
            if response.status_code == 202:
                # Up to four retries with increasing intervals
                backoff_times = [0.5, 1.0, 2.0, 5.0]

                for wait_time in backoff_times:
                    time.sleep(wait_time)  # Wait with an increasing interval
                    response = requests.get(url, headers=headers, timeout=self.timeout)

                    # Stop polling as soon as the status is no longer 202
                    if response.status_code != 202:
                        break

            # Raise an error for other error status codes
            response.raise_for_status()

            # Parse the HTML
            return BeautifulSoup(response.text, 'html.parser')

        except Exception:
            # Return an empty BeautifulSoup object on any failure
            return BeautifulSoup("<html><body></body></html>", 'html.parser')
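
    # Design note, as a small sketch: callers never need to catch exceptions from
    # read_url(); a failed fetch simply yields an empty document (illustrative):
    #
    #     soup = agent.read_url("https://does-not-resolve.invalid/")
    #     soup.find('title')         # -> None
    #     soup.get_text(strip=True)  # -> ''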

    def parse_result(self, data: BeautifulSoup, title: str, url: str) -> Dict[str, str]:
        # Extract snippet/description
        snippet_element = data.select_one('.result__snippet')
        snippet = snippet_element.text.strip() if snippet_element else 'No description'

        result = {
            'title': title,
            'url': url,
            'snippet': snippet,
            'data': data.prettify()
        }
        return result


# Singleton instance
_webcrawler_agent = None


def get_webcrawler_agent():
    """Returns a singleton instance of the WebCrawler agent"""
    global _webcrawler_agent
    if _webcrawler_agent is None:
        _webcrawler_agent = WebcrawlerAgent()
    return _webcrawler_agent
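
# A minimal usage sketch, assuming an asyncio entry point (the query string and
# workflow id are made-up example values):
#
#     import asyncio
#
#     agent = get_webcrawler_agent()
#     message = {"content": "Recherchiere aktuelle Studien zu E-Fuels"}
#     context = {"workflow_id": "wf-demo"}
#     response = asyncio.run(agent.process_message(message, context))
#     print(response["content"])  # markdown report with summary and sources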
@@ -38,11 +38,15 @@ class AnalystAgent(BaseAgent):
        self.description = "Analyzes and interprets data"
        self.capabilities = "data_analysis,pattern_recognition,statistics,visualization,data_interpretation"
        self.result_format = "AnalysisReport"
+
+       # Initialize AI service
+       self.ai_service = None
+
        # Document capabilities
        self.supports_documents = True
        self.document_capabilities = ["read", "analyze", "extract"]
        self.required_context = ["data_source", "analysis_objectives"]
        self.document_handler = None

        # Initialize protocol
        self.protocol = AgentCommunicationProtocol()

@@ -68,7 +72,11 @@ class AnalystAgent(BaseAgent):
            }
        })
        return info

+   def set_document_handler(self, document_handler):
+       """Set the document handler for file operations"""
+       self.document_handler = document_handler
+
    async def process_message(self, message: Dict[str, Any], context: Dict[str, Any] = None) -> Dict[str, Any]:
        """
        Process a message and perform data analysis.

@@ -277,7 +285,7 @@ class AnalystAgent(BaseAgent):
        for document in message.get("documents", []):
            source = document.get("source", {})
            filename = source.get("name", "")
-           file_id = source.get("id", "")
+           file_id = source.get("id", 0)
            content_type = source.get("content_type", "")

            # Skip if not a recognizable data file

@@ -288,8 +296,8 @@ class AnalystAgent(BaseAgent):
            # Try to get file content through document handler first
            file_content = None
            if self.document_handler:
-               file_content = await self.document_handler.get_file_content(file_id)
+               file_content = self.document_handler.get_file_content_from_message(message, file_id=file_id)

            # Process based on file type
            if filename.lower().endswith('.csv'):
                df = self._process_csv(file_content, filename)

@@ -323,33 +331,42 @@ class AnalystAgent(BaseAgent):

        return False

-   def _process_csv(self, file_content: bytes, filename: str) -> Optional[pd.DataFrame]:
+   def _process_csv(self, file_content: Union[bytes, str], filename: str) -> Optional[pd.DataFrame]:
        """Process CSV file content into a pandas DataFrame"""
        if file_content is None:
            return None

        try:
-           # Try various encodings
-           for encoding in ['utf-8', 'latin1', 'cp1252']:
-               try:
-                   # Use StringIO to create a file-like object
-                   text_content = file_content.decode(encoding)
-                   df = pd.read_csv(io.StringIO(text_content))
-
-                   # Basic preprocessing
-                   df = self._preprocess_dataframe(df)
-                   return df
-               except UnicodeDecodeError:
-                   continue
-               except Exception as e:
-                   logger.error(f"Error processing CSV with {encoding} encoding: {str(e)}")
-
-           # If all encodings fail, try one more time with errors='replace'
-           text_content = file_content.decode('utf-8', errors='replace')
-           df = pd.read_csv(io.StringIO(text_content))
-           df = self._preprocess_dataframe(df)
-           return df
+           # Handle the case where file_content is already a string
+           if isinstance(file_content, str):
+               text_content = file_content
+               df = pd.read_csv(io.StringIO(text_content))
+               df = self._preprocess_dataframe(df)
+               return df
+
+           # Handle the case where file_content is bytes
+           else:
+               # Try various encodings
+               for encoding in ['utf-8', 'latin1', 'cp1252']:
+                   try:
+                       # Use StringIO to create a file-like object
+                       text_content = file_content.decode(encoding)
+                       df = pd.read_csv(io.StringIO(text_content))
+
+                       # Basic preprocessing
+                       df = self._preprocess_dataframe(df)
+                       return df
+                   except UnicodeDecodeError:
+                       continue
+                   except Exception as e:
+                       logger.error(f"Error processing CSV with {encoding} encoding: {str(e)}")
+
+               # If all encodings fail, try one more time with errors='replace'
+               text_content = file_content.decode('utf-8', errors='replace')
+               df = pd.read_csv(io.StringIO(text_content))
+               df = self._preprocess_dataframe(df)
+               return df

        except Exception as e:
            logger.error(f"Failed to process CSV file {filename}: {str(e)}")
            return None

@@ -1482,6 +1499,7 @@ class AnalystAgent(BaseAgent):
            Generated analysis
        """
        if not self.ai_service:
            logging.warning("AI service not available for analysis generation")
            return f"## Data Analysis ({analysis_type})\n\nUnable to generate analysis: AI service not available."

        # Create specialized prompt based on analysis type
@@ -1,6 +1,7 @@
"""
CoderAgent - A unified agent for developing and executing Python code.
Includes code execution capabilities previously in separate modules.
+Enhanced with auto-correction loop for handling execution errors.
"""

import logging

@@ -13,6 +14,7 @@ import subprocess
import tempfile
import shutil
import sys
+import pandas as pd
from datetime import datetime
from typing import List, Dict, Any, Optional, Tuple

@@ -23,7 +25,9 @@ from modules.agentservice_protocol import AgentMessage, AgentCommunicationProtocol

logger = logging.getLogger(__name__)

+# Existing SimpleCodeExecutor class remains unchanged
class SimpleCodeExecutor:
+   # ... existing code ...
    """
    A simplified executor that runs Python code in isolated virtual environments.
    """

@@ -36,7 +40,8 @@ class SimpleCodeExecutor:
                 timeout: int = 30,
                 max_memory_mb: int = 512,
                 requirements: List[str] = None,
-                blocked_packages: List[str] = None):
+                blocked_packages: List[str] = None,
+                ai_service=None):
        """
        Initialize the SimpleCodeExecutor.

@@ -57,6 +62,7 @@ class SimpleCodeExecutor:
            "tensorflow", "pytorch", "scikit-learn"  # Resource intensive
        ]
        self.is_persistent = workflow_id is not None
+       self.ai_service = ai_service

    @classmethod
    def get_workflow_environment(cls, workflow_id: str) -> Optional[str]:

@@ -67,7 +73,7 @@ class SimpleCodeExecutor:
    def set_workflow_environment(cls, workflow_id: str, env_path: str) -> None:
        """Store a workflow environment path."""
        cls._workflow_environments[workflow_id] = env_path


    def _create_venv(self) -> str:
        """Creates a virtual environment and returns the path."""
        # Check for existing environment if using workflow_id

@@ -176,50 +182,16 @@ class SimpleCodeExecutor:
        return False

    def _extract_required_packages(self, code: str) -> List[str]:
        """Extract required packages from import statements and requirements comments in the code."""
        import re
        # Extract required packages from requirements comments in the first code line
        packages = set()

        # Check for a special REQUIREMENTS comment
        requirements_match = re.search(r'# REQUIREMENTS:\s*([^\n]+)', code)
        if requirements_match:
            req_str = requirements_match.group(1).strip()
            for pkg in req_str.split(','):
                if pkg.strip():
                    packages.add(pkg.strip())

        # Add common base packages
        base_packages = [
            "requests", "urllib3", "pydantic",
            "pandas", "numpy", "matplotlib"
        ]

        for pkg in base_packages:
            packages.add(pkg)

        # Detect pip install comments
        pip_comments = re.findall(r'#\s*pip\s+install\s+([^#\n]+)', code)
        for comment in pip_comments:
            for pkg in comment.split():
                if pkg and not pkg.startswith('-'):
                    packages.add(pkg.strip())

        # Analyze import statements
        import_lines = re.findall(r'^(?:import|from)\s+([^\s.]+)(?:\s+import|\s*$|\.)', code, re.MULTILINE)

        # Standard modules that don't need installation
        std_modules = {
            'os', 'sys', 'time', 'datetime', 'math', 're', 'random', 'json',
            'collections', 'itertools', 'functools', 'pathlib', 'shutil',
            'tempfile', 'uuid', 'subprocess', 'threading', 'logging',
            'traceback', 'io', 'copy', 'typing', 'asyncio'
        }

        # Process all imports
        for module in import_lines:
            if module not in std_modules:
                packages.add(module)

        # Check for the special REQUIREMENTS comment in the first few lines (specific format)
        first_lines = code.split('\n')[:5]
        for line in first_lines:
            if line.strip().startswith("# REQUIREMENTS:"):
                req_str = line.replace("# REQUIREMENTS:", "").strip()
                for pkg in req_str.split(','):
                    if pkg.strip():
                        packages.add(pkg.strip())
        return list(packages)
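
    # A hedged sketch of the extraction behaviour (illustrative input; output
    # order is unspecified because a set is used internally):
    #
    #     code = "# REQUIREMENTS: beautifulsoup4\nimport requests\nprint('ok')"
    #     executor._extract_required_packages(code)
    #     # -> the base packages plus "beautifulsoup4" and "requests"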

@@ -247,8 +219,8 @@ class SimpleCodeExecutor:
        all_requirements = []

        # Add explicitly provided requirements
-       if self.requirements:
-           all_requirements.extend(self.requirements)
+       # if self.requirements:
+       #     all_requirements.extend(self.requirements)

        # Extract requirements from code
        extracted_requirements = self._extract_required_packages(code)

@@ -340,12 +312,13 @@ class SimpleCodeExecutor:
            # Run the code from the project root
            working_dir = os.path.dirname(code_file)  # This should be the project root
            logger.info(f"DEBUG PATH Root: {os.getcwd()} Code: {code_file} Working Dir: {working_dir}")
            logger.debug(f"|{code}|")
            process = subprocess.run(
                [python_executable, code_file],
                timeout=self.timeout,
                capture_output=True,
                text=True,
-               cwd=self.temp_dir
+               cwd=working_dir
            )

            # Process the output

@@ -393,7 +366,7 @@ class SimpleCodeExecutor:
            execution_result = {
                "success": False,
                "output": "",
-               "error": f"Execution error: {str(e)}",
+               "error": f"Execution error: {str(e)} for code {code}",
                "result": None,
                "exit_code": -1
            }

@@ -426,7 +399,7 @@ class SimpleCodeExecutor:
        """Clean up during garbage collection."""
        self.cleanup()


# Unchanged error recommendation function
def get_error_recommendation(error_message: str) -> str:
    """Generate recommendations based on error message."""
    if "ImportError" in error_message or "ModuleNotFoundError" in error_message:

@@ -460,7 +433,7 @@ To fix the error:


class CoderAgent(BaseAgent):
-   """Agent for developing and executing Python code"""
+   """Agent for developing and executing Python code with auto-correction capabilities"""

    def __init__(self):
        """Initialize the coder agent with proper type and capabilities"""

@@ -473,11 +446,15 @@ class CoderAgent(BaseAgent):
        self.description = "Develops and executes Python code"
        self.capabilities = "code_development,data_processing,file_processing,automation"
        self.result_format = "python_code"

+       # Initialize AI service
+       self.ai_service = None
+
        # Add document capabilities
        self.supports_documents = True
        self.document_capabilities = ["read", "reference", "create"]
        self.required_context = ["workflow_id"]
        self.document_handler = None

        # Initialize protocol
        self.protocol = AgentCommunicationProtocol()

@@ -491,8 +468,12 @@ class CoderAgent(BaseAgent):
        self.executor_memory_limit = 512  # MB

        # AI service settings
-       self.ai_temperature = 0.2  # Lower temperature for more deterministic code generation
+       self.ai_temperature = 0.1  # Lower temperature for more deterministic code generation
        self.ai_max_tokens = 2000  # Enough tokens for complex code

+       # Auto-correction settings (new)
+       self.max_correction_attempts = 3   # Maximum number of correction attempts
+       self.correction_temperature = 0.1  # Even lower temperature for corrections
+
    def get_agent_info(self) -> Dict[str, Any]:
        """Get agent information for agent registry"""

@@ -500,14 +481,20 @@ class CoderAgent(BaseAgent):
        info.update({
            "metadata": {
                "timeout": self.executor_timeout,
-               "memory_limit": self.executor_memory_limit
+               "memory_limit": self.executor_memory_limit,
+               "max_correction_attempts": self.max_correction_attempts
            }
        })
        return info

+   def set_document_handler(self, document_handler):
+       """Set the document handler for file operations"""
+       self.document_handler = document_handler
+
    async def process_message(self, message: Dict[str, Any], context: Dict[str, Any] = None) -> Dict[str, Any]:
        """
-       Process a message to develop and execute Python code.
+       Process a message to develop and execute Python code with auto-correction.

        Args:
            message: The message to process

@@ -539,34 +526,23 @@ class CoderAgent(BaseAgent):
            content = message.get("content", "")
            documents = message.get("documents", [])

-           # Extract code from message content
-           code_blocks = re.findall(r'```(?:python)?\s*([\s\S]*?)```', content)
-           code_to_execute = None
            requirements = []

-           if code_blocks:
-               # Use the first code block found
-               code_to_execute = code_blocks[0]
-               # Clean the code to remove any markdown formatting
-               code_to_execute = self._clean_code(code_to_execute)
-               logging_utils.info(f"Code extracted from message ({len(code_to_execute)} characters)", "agents")
-           else:
-               # Generate code based on the message content using AI
-               logging_utils.info("No code found in message, generating new code with AI", "agents")
-
-               # Generate code using AI
-               code_to_execute, requirements = await self._generate_code_from_prompt(content, documents)
-               if not code_to_execute:
-                   logging_utils.warning("AI could not generate code", "agents")
-                   response["content"] = "I couldn't generate executable code based on your request. Please provide more detailed instructions."
-                   self.message_utils.finalize_message(response)
-                   return response
-               logging_utils.info(f"Code generated with AI ({len(code_to_execute)} characters)", "agents")
+           # Generate code based on the message content using AI
+           logging_utils.info("Generating new code with AI", "agents")
+
+           # Generate code using AI
+           code_to_execute, requirements = await self._generate_code_from_prompt(content, documents)
+           if not code_to_execute:
+               logging_utils.warning("AI could not generate code", "agents")
+               response["content"] = "I couldn't generate executable code based on your request. Please provide more detailed instructions."
+               self.message_utils.finalize_message(response)
+               return response
+           logging_utils.info(f"Code generated with AI ({len(code_to_execute)} characters)", "agents")

-           # Execute the code
+           # Execute the code with auto-correction loop
            if code_to_execute:
                logging_utils.info("Executing code", "execution")

                # Prepare execution context
                execution_context = {
                    "workflow_id": workflow_id,

@@ -575,21 +551,16 @@ class CoderAgent(BaseAgent):
                    "log_func": log_func
                }

-               # Send a status update
-               if log_func:
-                   status_message = self.protocol.create_status_update_message(
-                       status_description="Processing code execution request",
-                       sender_id=self.id,
-                       status="in_progress",
-                       progress=0.5,
-                       context_id=workflow_id
-                   )
-                   log_func(workflow_id, status_message.content, "info", self.id, self.name)
-
-               # Execute code
-               result = await self._execute_code(code_to_execute, requirements, execution_context)
+               # Enhanced execution with auto-correction
+               result, attempts_info = await self._execute_with_auto_correction(
+                   code_to_execute,
+                   requirements,
+                   execution_context,
+                   content,  # Original prompt/message
+                   logging_utils
+               )

-               # Prepare response
+               # Prepare response based on the final result (success or failure)
                if result.get("success", False):
                    # Code execution successful
                    output = result.get("output", "")

@@ -597,20 +568,39 @@ class CoderAgent(BaseAgent):
                    logging_utils.info("Code executed successfully", "execution")

                    # Format response content
-                   response_content = f"## Code executed successfully\n\n"
+                   response_content = f"## Code executed successfully"
+
+                   # Add correction attempts info if any corrections were made
+                   if attempts_info and len(attempts_info) > 1:
+                       response_content += f" (after {len(attempts_info)-1} correction attempts)"
+
+                   response_content += "\n\n"

                    # Include the executed code
-                   response_content += f"### Executed Code\n\n```python\n{code_to_execute}\n```\n\n"
+                   response_content += f"### Final Executed Code\n\n```python\n{attempts_info[-1]['code']}\n```\n\n"

                    # Include the output if available
                    if output:
                        response_content += f"### Output\n\n```\n{output}\n```\n\n"

+                   # Create document with results
+                   data_document = self._create_document_from_result(execution_result)
+                   if data_document:
+                       response["documents"].append(data_document)
+
                    # Include the execution result if available
                    if execution_result:
                        result_str = json.dumps(execution_result, indent=2) if isinstance(execution_result, (dict, list)) else str(execution_result)
                        response_content += f"### Result\n\n```\n{result_str}\n```\n\n"

+                   # Include correction history if any corrections were made
+                   if attempts_info and len(attempts_info) > 1:
+                       response_content += f"### Code Correction History\n\n"
+                       for i, attempt in enumerate(attempts_info[:-1], 1):
+                           response_content += f"**Attempt {i}:**\n\n"
+                           response_content += f"```python\n{attempt['code']}\n```\n\n"
+                           response_content += f"**Error:**\n\n```\n{attempt['error']}\n```\n\n"
+
                    response["content"] = response_content

                    # Process any files created by the code

@@ -628,17 +618,38 @@ class CoderAgent(BaseAgent):
                        }
                        response["documents"].append(doc)
                else:
-                   # Code execution failed
+                   # Code execution failed after all attempts
                    error = result.get("error", "Unknown error")
-                   logging_utils.error(f"Error during code execution: {error}", "execution")
+                   logging_utils.error(f"Error during code execution after all correction attempts: {error}", "execution")

                    # Format error response
                    response_content = f"## Error during code execution\n\n"
-                   response_content += f"### Executed Code\n\n```python\n{code_to_execute}\n```\n\n"
-                   response_content += f"### Error\n\n```\n{error}\n```\n\n"

-                   # Add recommendation based on error
-                   response_content += get_error_recommendation(error)
+                   # Include correction attempts information
+                   if attempts_info:
+                       response_content += f"I made {len(attempts_info)} attempts to correct the code, but couldn't resolve all issues.\n\n"
+
+                       # Add the final attempt
+                       response_content += f"### Final Code Attempt\n\n```python\n{attempts_info[-1]['code']}\n```\n\n"
+                       response_content += f"### Final Error\n\n```\n{attempts_info[-1]['error']}\n```\n\n"
+
+                       # Add recommendation based on error
+                       response_content += get_error_recommendation(error)
+
+                       # Add correction history
+                       if len(attempts_info) > 1:
+                           response_content += f"\n### Code Correction History\n\n"
+                           for i, attempt in enumerate(attempts_info[:-1], 1):
+                               response_content += f"**Attempt {i}:**\n\n"
+                               response_content += f"```python\n{attempt['code']}\n```\n\n"
+                               response_content += f"**Error:**\n\n```\n{attempt['error']}\n```\n\n"
+                   else:
+                       # Just show the code and error
+                       response_content += f"### Executed Code\n\n```python\n{code_to_execute}\n```\n\n"
+                       response_content += f"### Error\n\n```\n{error}\n```\n\n"
+
+                       # Add recommendation based on error
+                       response_content += get_error_recommendation(error)

                    response["content"] = response_content
            else:

@@ -650,7 +661,7 @@ class CoderAgent(BaseAgent):

            # Log success
            logging_utils.info("CoderAgent has successfully processed the request", "agents")


            return response

        except Exception as e:

@@ -662,7 +673,253 @@ class CoderAgent(BaseAgent):
            self.message_utils.finalize_message(response)

            return response

    def _create_document_from_result(self, execution_result, output_format="json"):
        """
        Create a document object from execution results.

        Args:
            execution_result: The data returned from code execution
            output_format: Desired format (json, csv, etc.)

        Returns:
            Document object for passing to other agents
        """
        if not execution_result:
            return None

        doc_id = f"data_{uuid.uuid4()}"

        # Determine filename and content type based on the data
        if isinstance(execution_result, pd.DataFrame):
            # Handle DataFrame result
            filename = "processed_data.csv"
            content_type = "text/csv"
            content = execution_result.to_csv(index=False)
        elif isinstance(execution_result, (dict, list)):
            # Handle dictionary or list result
            filename = "processed_data.json"
            content_type = "application/json"
            content = json.dumps(execution_result)
        elif isinstance(execution_result, str):
            # Try to determine whether the string is JSON, CSV, or plain text
            if execution_result.strip().startswith('{') or execution_result.strip().startswith('['):
                filename = "processed_data.json"
                content_type = "application/json"
            elif ',' in execution_result and '\n' in execution_result:
                filename = "processed_data.csv"
                content_type = "text/csv"
            else:
                filename = "processed_data.txt"
                content_type = "text/plain"
            content = str(execution_result)
        else:
            # Default case for other types
            filename = "processed_data.txt"
            content_type = "text/plain"
            content = str(execution_result)

        # Create document object
        document = {
            "id": doc_id,
            "source": {
                "type": "generated",
                "id": doc_id,
                "name": filename,
                "content_type": content_type,
            },
            "contents": [{
                "type": "text",
                "text": content,
                "is_extracted": True
            }]
        }

        return document
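
    # Sketch (illustrative): a dict result becomes a JSON document other agents
    # can consume; the "data_" id prefix plus a uuid4 is generated above.
    #
    #     doc = agent._create_document_from_result({"rows": 3, "mean": 4.2})
    #     doc["source"]["name"]          # -> "processed_data.json"
    #     doc["source"]["content_type"]  # -> "application/json"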

    async def _execute_with_auto_correction(
        self,
        initial_code: str,
        requirements: List[str],
        context: Dict[str, Any],
        original_prompt: str,
        logging_utils: LoggingUtils = None
    ) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]:
        """
        Execute code with automatic error correction and retries.

        Args:
            initial_code: The initial Python code to execute
            requirements: List of required packages
            context: Additional context for execution
            original_prompt: The original user request/prompt
            logging_utils: Optional logging utility

        Returns:
            Tuple of (final execution result, list of attempt info dictionaries)
        """
        # Initialize tracking data
        current_code = initial_code
        current_requirements = requirements.copy() if requirements else []
        attempts_info = []

        # Execute with correction loop
        for attempt in range(1, self.max_correction_attempts + 1):
            if logging_utils:
                if attempt == 1:
                    logging_utils.info(f"Executing code (attempt {attempt}/{self.max_correction_attempts})", "execution")
                else:
                    logging_utils.info(f"Executing corrected code (attempt {attempt}/{self.max_correction_attempts})", "execution")

            # Execute the current code version
            result = await self._execute_code(current_code, current_requirements, context)

            # Record attempt information
            attempts_info.append({
                "attempt": attempt,
                "code": current_code,
                "error": result.get("error", ""),
                "success": result.get("success", False)
            })

            # Check if execution was successful
            if result.get("success", False):
                # Success! Return the result and attempt info
                return result, attempts_info

            # Failed execution - check if we've reached the maximum attempt limit
            if attempt >= self.max_correction_attempts:
                if logging_utils:
                    logging_utils.warning(f"Maximum correction attempts ({self.max_correction_attempts}) reached, giving up", "execution")
                break

            # Need to correct the code - generate a fix based on the error
            error_message = result.get("error", "Unknown error")

            if logging_utils:
                logging_utils.info(f"Attempting to fix code error: {error_message[:200]}...", "execution")

            # Generate corrected code
            corrected_code, new_requirements = await self._generate_code_correction(
                current_code,
                error_message,
                original_prompt,
                current_requirements
            )

            # Update for the next attempt
            if corrected_code:
                current_code = corrected_code

                # Add any new requirements
                if new_requirements:
                    for req in new_requirements:
                        if req not in current_requirements:
                            current_requirements.append(req)
                            if logging_utils:
                                logging_utils.info(f"Added new requirement: {req}", "execution")
            else:
                # Could not generate a correction, break out of the loop
                if logging_utils:
                    logging_utils.warning("Could not generate code correction, giving up", "execution")
                break

        # If we get here, all attempts failed - return the last result and attempt info
        return result, attempts_info
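
    # Shape of the returned attempts_info list (illustrative; one entry is
    # recorded per execution attempt, in order):
    #
    #     [
    #         {"attempt": 1, "code": "...", "error": "NameError: name 'df' is not defined", "success": False},
    #         {"attempt": 2, "code": "...", "error": "", "success": True},
    #     ]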

    async def _generate_code_correction(
        self,
        code: str,
        error_message: str,
        original_prompt: str,
        current_requirements: List[str] = None
    ) -> Tuple[str, List[str]]:
        """
        Generate a corrected version of code based on error messages.

        Args:
            code: The code that produced errors
            error_message: The error message to fix
            original_prompt: The original task/requirements
            current_requirements: List of currently required packages

        Returns:
            Tuple of (corrected code, new requirements list)
        """
        try:
            # Create a detailed prompt for code correction
            correction_prompt = f"""You need to fix an error in Python code. The code was written for this task:

ORIGINAL TASK:
{original_prompt}

CURRENT CODE:
```python
{code}
```

ERROR MESSAGE:
```
{error_message}
```

CURRENT REQUIREMENTS: {', '.join(current_requirements) if current_requirements else "None"}

Your task is to analyze the error and provide a corrected version of the code.
Focus specifically on fixing the error while preserving the original functionality.

Common fixes might include:
- Fixing syntax errors (missing parentheses, indentation, etc.)
- Resolving import errors by adding appropriate requirements
- Correcting file paths or handling file-not-found errors
- Adding error handling for specific edge cases
- Fixing logical errors in the code

FORMAT INSTRUCTIONS:
1. Provide ONLY the complete fixed Python code without ANY explanation
2. DO NOT include code block markers like ```python or ```
3. DO NOT explain what the code does before or after it
4. DO NOT include any text that is not valid Python code
5. Start your response directly with the valid Python code
6. End your response with valid Python code

If you need to add new required packages, place them in a specially formatted comment at the top of your code like this:
# REQUIREMENTS: package1,package2,package3

Your entire response must be valid Python that can be executed without modification.
"""

            # Create messages for the API
            messages = [
                {"role": "system", "content": "You are a Python debugging expert. You provide ONLY clean, fixed Python code without any explanations, markdown formatting, or non-code text. Your response should be nothing but valid, fixed Python code that can be executed directly."},
                {"role": "user", "content": correction_prompt}
            ]

            # Call the API with a very low temperature for deterministic fixes
            generated_content = await self.ai_service.call_api(
                messages,
                temperature=self.correction_temperature,
                max_tokens=self.ai_max_tokens
            )

            # Clean the generated content to ensure it's only valid Python code
            fixed_code = self._clean_code(generated_content)

            # Extract requirements from the special comment at the top of the code
            new_requirements = []
            for line in fixed_code.split('\n'):
                if line.strip().startswith("# REQUIREMENTS:"):
                    req_str = line.replace("# REQUIREMENTS:", "").strip()
                    new_requirements = [r.strip() for r in req_str.split(',') if r.strip()]
                    break

            return fixed_code, new_requirements

        except Exception as e:
            logging.error(f"Error generating code correction: {str(e)}", exc_info=True)
            # Return None to indicate failure
            return None, []

    def _clean_code(self, code: str) -> str:
        """
        Clean up code by removing markdown code block markers and other formatting artifacts.

@@ -728,9 +985,6 @@ class CoderAgent(BaseAgent):
            Tuple of (generated Python code, required packages)
        """
        try:
-           # Initialize AI service
-           chat_service = ChatService()
-
            # Prepare a prompt for code generation
            ai_prompt = f"""Generate Python code to solve the following task:
{prompt}

@@ -767,7 +1021,7 @@ FORMAT INSTRUCTIONS:
- Start your response directly with valid Python code
- End your response with valid Python code

-For required packages, place them in a specially formatted comment at the top of your code like this:
+For required packages, place them in a specially formatted comment at the top of your code on one line, like this:
# REQUIREMENTS: pandas,numpy,matplotlib,requests

Your entire response must be valid Python that can be executed without modification.

@@ -781,7 +1035,7 @@ Your entire response must be valid Python that can be executed without modification.

            # Call the API
            logging.info(f"Calling AI API to generate code")
-           generated_content = await chat_service.call_api(messages, temperature=self.ai_temperature, max_tokens=self.ai_max_tokens)
+           generated_content = await self.ai_service.call_api(messages, temperature=self.ai_temperature, max_tokens=self.ai_max_tokens)

            # Clean the generated content to ensure it's only valid Python code
            code = self._clean_code(generated_content)

@@ -843,7 +1097,8 @@ result = {{"error": "Code generation failed", "message": "{error_str}"}}
            timeout=self.executor_timeout,
            max_memory_mb=self.executor_memory_limit,
            requirements=requirements,
-           blocked_packages=blocked_packages
+           blocked_packages=blocked_packages,
+           ai_service=self.ai_service
        )

        # Prepare input data for the code

@@ -924,7 +1179,6 @@ class CoderAgent(BaseAgent):
            # Log error information
            error = result.get("error", "Unknown error")
            logging_utils.error(f"Error during code execution: {error}", "execution")
-           print("DEBUG CODE-ERROR:", code, "#END")

            # Clean up non-persistent environments
            if not executor.is_persistent:
@@ -13,7 +13,6 @@ from datetime import datetime
import uuid

from modules.agentservice_base import BaseAgent
from connectors.connector_aichat_openai import ChatService
from modules.agentservice_utils import WorkflowUtils, MessageUtils, LoggingUtils
from modules.agentservice_protocol import AgentMessage, AgentCommunicationProtocol
from modules.agentservice_filemanager import FileManager  # Import the file manager

@@ -63,7 +62,11 @@ class DocumentationAgent(BaseAgent):
            }
        })
        return info

+   def set_document_handler(self, document_handler):
+       """Set the document handler for file operations"""
+       self.document_handler = document_handler
+
    async def process_message(self, message: Dict[str, Any], context: Dict[str, Any] = None) -> Dict[str, Any]:
        """
        Process a message and create documentation.
@@ -5,21 +5,16 @@ Angepasst für das refaktorisierte Core-Modul.

import json
import logging
-import random
-import time
import traceback
-from typing import List, Dict, Any, Optional, Union
-import re
-import uuid
from datetime import datetime
+from typing import List, Dict, Any, Optional
from urllib.parse import quote_plus, unquote

from bs4 import BeautifulSoup
import requests
from modules.agentservice_base import BaseAgent
-from connectors.connector_aichat_openai import ChatService
-from modules.agentservice_utils import WorkflowUtils, MessageUtils, LoggingUtils
-from modules.agentservice_protocol import AgentMessage, AgentCommunicationProtocol
+from modules.agentservice_utils import MessageUtils, LoggingUtils
+from modules.agentservice_protocol import AgentCommunicationProtocol


logger = logging.getLogger(__name__)
@@ -42,13 +37,14 @@ class WebcrawlerAgent(BaseAgent):
        self.supports_documents = True
        self.document_capabilities = ["read", "create"]
        self.required_context = ["workflow_id"]
+
        self.document_handler = None

+       # Initialize AI service
+       self.ai_service = None
+
        # Initialize protocol
        self.protocol = AgentCommunicationProtocol()

-       # Initialize chat service
-       self.chat_service = ChatService()
-
        # Initialize utility classes
        self.message_utils = MessageUtils()
@@ -57,17 +53,6 @@ class WebcrawlerAgent(BaseAgent):
        self.max_key = 3
        self.max_result = 3
        self.timeout = 10
-       self.headers = {
-           'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-           'Accept-Language': 'en-US,en;q=0.5',
-           'Referer': 'https://www.google.com/',
-           'DNT': '1',
-           'Connection': 'keep-alive',
-           'Upgrade-Insecure-Requests': '1',
-       }
-       self.max_urls = 10
-       self.max_content_length = 100000

    def get_agent_info(self) -> Dict[str, Any]:
        """Get agent information for agent registry"""
@@ -80,7 +65,11 @@ class WebcrawlerAgent(BaseAgent):
            }
        })
        return info

+   def set_document_handler(self, document_handler):
+       """Set the document handler for file operations"""
+       self.document_handler = document_handler
+
    async def process_message(self, message: Dict[str, Any], context: Dict[str, Any] = None) -> Dict[str, Any]:
        """
        Processes a message and performs web research.
@@ -172,29 +161,6 @@ class WebcrawlerAgent(BaseAgent):
            response["content"] = f"## Fehler bei der Web-Recherche\n\n{error_msg}\n\n```\n{traceback.format_exc()}\n```"

        return response

-   def send_document_request(self, document_description: str, sender_id: str, receiver_id: str, filters: Dict[str, Any] = None, context_id: str = None) -> AgentMessage:
-       """Send a document request using the protocol"""
-       return self.protocol.create_document_request_message(
-           document_description=document_description,
-           sender_id=sender_id,
-           receiver_id=receiver_id,
-           filters=filters,
-           context_id=context_id
-       )
-
-   def send_result_message(self, result_content: str, sender_id: str, receiver_id: str, task_id: str,
-                           output_data: Dict[str, Any] = None, context_id: str = None) -> AgentMessage:
-       """Send a result message using the protocol"""
-       return self.protocol.create_result_message(
-           result_content=result_content,
-           sender_id=sender_id,
-           receiver_id=receiver_id,
-           task_id=task_id,
-           output_data=output_data,
-           result_format="SearchResults",
-           context_id=context_id
-       )
-
    async def get_prompt(self, message_context: Dict[str, Any]) -> str:
        task = message_context.get("content", "")
@@ -232,7 +198,7 @@ class WebcrawlerAgent(BaseAgent):
        total_tokens += instruction_tokens

        # Additional instructions for web research
-       content_text = await self.chat_service.call_api(
+       content_text = await self.ai_service.call_api(
            messages=[
                {
                    "role": "system",
@@ -263,7 +229,7 @@ class WebcrawlerAgent(BaseAgent):
        # Limit summary source to ensure we don't exceed API limits
        summary_src_limited = self.limit_text_for_api(summary_src, max_tokens=10000)

-       summary = await self.chat_service.call_api(
+       summary = await self.ai_service.call_api(
            messages=[
                {
                    "role": "system",
@@ -280,322 +246,329 @@ class WebcrawlerAgent(BaseAgent):
        result = f"## Web-Recherche Ergebnisse\n\n### Zusammenfassung\n{summary}\n\n### Detaillierte Ergebnisse{result_data}"
        return result

    async def run_web_query(self, prompt: str) -> List[Dict]:
        if prompt == "":
            return []

        ptext = f"""Create a comprehensive web research strategy for the task = '{prompt.replace("'", "")}'. Return the results as a Python dictionary with these specific keys. If specific URLs are provided and the task requires analysis of the provided URLs only, then leave 'skey' empty.

'url': A list of maximum {self.max_url} specific URLs extracted from the task string.

'skey': A list of maximum {self.max_key} key sentences to search for on the web. These should be precise, diverse, and targeted to get the most relevant information.

Format your response as a valid JSON object with these two keys. Do not include any explanatory text or markdown outside of the object definition.
"""

-       content_text = await self.chat_service.call_api(
+       content_text = await self.ai_service.call_api(
            messages=[
                {
                    "role": "system",
                    "content": "Du bist ein Webrecherche-Experte, der präzise Suchstrategien entwickelt."
                },
                {
                    "role": "user",
                    "content": ptext
                }
            ]
        )
        # Remove markdown formatting if present
        if content_text.startswith("```json"):
            # Find the end of the JSON block
            end_marker = "```"
            end_index = content_text.rfind(end_marker)
            if end_index != -1:
                # Extract the JSON content without the markdown markers
                content_text = content_text[7:end_index].strip()

        # Now parse the JSON
        try:
            logger.info(f"Parsing strategy JSON: {content_text}")
            pjson = json.loads(content_text)
            # Call scrape_json with the parsed dictionary
            result_json = await self.scrape_json(pjson)
            return result_json
        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse JSON: {e}")
            logger.error(f"Cleaned content: {content_text[:100]}...")
            return []
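run_web_query expects the model to answer with a two-key JSON object, which scrape_json then consumes. A plausible response for a task like "Compare Python web frameworks, see https://fastapi.tiangolo.com" would be (values invented for illustration):

{
    "url": ["https://fastapi.tiangolo.com"],
    "skey": [
        "Python web framework comparison",
        "FastAPI vs Flask vs Django performance",
        "async Python web framework benchmarks"
    ]
}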
    async def scrape_json(self, research_strategy: Dict[str, List]) -> List[Dict]:
        """
        Scrapes web content based on a research strategy JSON.

        Args:
            research_strategy: A dictionary containing:
                - 'skey': List of search keywords
                - 'url': List of direct URLs to scrape

        Returns:
            List of result dictionaries with title, url, snippet and scraped content
        """
        logger.info("Starting JSON-based web scraping")
        results = []

        # Validate input structure
        if not isinstance(research_strategy, dict):
            logger.error("Invalid research_strategy format: not a dictionary")
            return {"error": "Invalid research_strategy format: not a dictionary"}

        keys = research_strategy.get("skey", [])
        direct_urls = research_strategy.get("url", [])

        if not isinstance(keys, list) or not isinstance(direct_urls, list):
            logger.error("Invalid research_strategy format: 'skey' or 'url' is not a list")
            return {"error": "Invalid research_strategy format: 'skey' or 'url' is not a list"}

        # Process search keywords through the search engine
        for keyword in keys:
            logger.info(f"Processing keyword: {keyword}")
            found_results = self.search_web(keyword)  # List of dicts: title, url, snippet, data
            logger.info(f"... {len(found_results)} results found")
            results.extend(found_results)

        # Process direct URLs
        logger.info(f"Processing {len(direct_urls)} direct URLs")
        for url in direct_urls:
            if any(r.get('url') == url for r in results):
                logger.info(f"Skipping already scraped URL: {url}")
                continue
            soup = self.read_url(url)

            # Extract title from the page if it exists
            if isinstance(soup, BeautifulSoup):
                title_tag = soup.find('title')
                title = title_tag.text.strip() if title_tag else "No title"

                # Fall back to an h1 tag if the title tag is missing
                if title == "No title":
                    h1_tag = soup.find('h1')
                    if h1_tag:
                        title = h1_tag.text.strip()
            else:
                # Handle the case where soup is an error message string
                title = "Error fetching page"

-           results.append(self.parse_result(soup, "No title", url))
+           results.append(self.parse_result(soup, title, url))
        logger.info(f"JSON scraping completed. Scraped {len(results)} URLs in total")
        return results

    def extract_main_content(self, soup: BeautifulSoup, max_chars: int = 30000) -> str:
        """
        Extract the main content from an HTML page while limiting character count.

        Args:
            soup: BeautifulSoup object containing the page content
            max_chars: Maximum number of characters to extract

        Returns:
            Extracted main content as string
        """
        if not isinstance(soup, BeautifulSoup):
            return str(soup)[:max_chars]

        # Try to find main content elements in order of priority
        main_content = None
        for selector in ['main', 'article', '#content', '.content', '#main', '.main']:
            content = soup.select_one(selector)
            if content:
                main_content = content
                break

        # If no main content found, use the body
        if not main_content:
            main_content = soup.find('body') or soup

        # Remove script, style, nav, footer elements that don't contribute to main content
        for element in main_content.select('script, style, nav, footer, header, aside, .sidebar, #sidebar, .comments, #comments, .advertisement, .ads, iframe'):
            element.extract()

        # Extract text content
        text_content = main_content.get_text(separator=' ', strip=True)

        # Limit to max_chars
        return text_content[:max_chars]

    def tokenize_for_counting(self, text: str) -> List[str]:
        """
        Simple tokenizer for estimating token usage.
        This is an approximation since the exact tokenization depends on the model.

        Args:
            text: Input text

        Returns:
            List of tokens
        """
        # Simple tokenization by splitting on whitespace and punctuation
        import re
        return re.findall(r'\w+|[^\w\s]', text)

    def count_tokens(self, text: str) -> int:
        """
        Count the approximate number of tokens in a text.

        Args:
            text: Input text

        Returns:
            Estimated token count
        """
        tokens = self.tokenize_for_counting(text)
        return len(tokens)

    def limit_text_for_api(self, text: str, max_tokens: int = 60000) -> str:
        """
        Limit the text to a maximum number of tokens.

        Args:
            text: Input text
            max_tokens: Maximum number of tokens allowed

        Returns:
            Limited text
        """
        if not text:
            return ""

        tokens = self.tokenize_for_counting(text)

        # If text is already under the limit, return as is
        if len(tokens) <= max_tokens:
            return text

        # Otherwise, truncate text to max_tokens
        return " ".join(tokens[:max_tokens]) + "... [content truncated due to length]"

    def search_web(self, query: str) -> List[Dict]:
        formatted_query = quote_plus(query)
        url = f"https://html.duckduckgo.com/html/?q={formatted_query}"

        search_results_soup = self.read_url(url)
        if not search_results_soup or len(search_results_soup.select('.result')) == 0:
            logger.warning(f"Keine Suchergebnisse gefunden für: {query}")
            return []

        # Extract search results
        results = []

        # Find all result containers
        result_elements = search_results_soup.select('.result')

        for result in result_elements:
            # Extract title
            title_element = result.select_one('.result__a')
            title = title_element.text.strip() if title_element else 'No title'

            # Extract URL (DuckDuckGo uses redirects, need to extract from href param)
            url_element = title_element.get('href') if title_element else ''
            extracted_url = 'No URL'

            if url_element:
                # Extract the actual URL from DuckDuckGo's redirect
                if url_element.startswith('/d.js?q='):
                    start = url_element.find('?q=') + 3  # Skip '?q='
                    end = url_element.find('&', start) if '&' in url_element[start:] else None
                    extracted_url = unquote(url_element[start:end])

                    # Make sure the URL has the correct protocol prefix
                    if not extracted_url.startswith(('http://', 'https://')):
                        if not extracted_url.startswith('//'):
                            extracted_url = 'https://' + extracted_url
                        else:
                            extracted_url = 'https:' + extracted_url
                else:
                    extracted_url = url_element

            # Extract snippet directly from the search results page
            snippet_element = result.select_one('.result__snippet')
            snippet = snippet_element.text.strip() if snippet_element else 'No description'

            # Now fetch the actual page content for the data field
            target_page_soup = self.read_url(extracted_url)

            # Use the content extraction method to limit content size
            content = self.extract_main_content(target_page_soup, max_chars=30000)

            results.append({
                'title': title,
                'url': extracted_url,
                'snippet': snippet,
                'data': content
            })

            # Limit the number of results if needed
            if len(results) >= self.max_result:
                break

        return results

    def read_url(self, url: str) -> BeautifulSoup:
        """
        Reads a URL and returns a BeautifulSoup parser for the content.
        On errors, an empty BeautifulSoup object is returned.

        Args:
            url: The URL to read

        Returns:
            BeautifulSoup object with the content, or empty on errors
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml',
            'Accept-Language': 'en-US,en;q=0.9',
        }

        try:
            import time

            # Initial request
            response = requests.get(url, headers=headers, timeout=10)

            # Poll on status 202
            if response.status_code == 202:
                # Up to four retries with increasing intervals
                backoff_times = [0.5, 1.0, 2.0, 5.0]

                for wait_time in backoff_times:
                    time.sleep(wait_time)  # Wait with increasing delay
                    response = requests.get(url, headers=headers, timeout=10)

                    # Stop polling once the status is no longer 202
                    if response.status_code != 202:
                        break

            # Raise an error for other error statuses
            response.raise_for_status()

            # Parse the HTML
            return BeautifulSoup(response.text, 'html.parser')

        except Exception as e:
            # Return an empty BeautifulSoup object
            return BeautifulSoup("<html><body></body></html>", 'html.parser')

-   def parse_result(self, data: BeautifulSoup, title: str, url: str) -> Dict[str, str]:
-       # Extract snippet/description
-       snippet_element = data.select_one('.result__snippet')
-       snippet = snippet_element.text.strip() if snippet_element else 'No description'
-
-       result = {
-           'title': title,
-           'url': url,
-           'snippet': snippet,
-           'data': data.prettify()
-       }
-       return result
+   def parse_result(self, data: BeautifulSoup, title: str, url: str) -> Dict[str, str]:
+       """
+       Parse a BeautifulSoup object into a result dictionary.
+
+       Args:
+           data: BeautifulSoup object containing the page content
+           title: Page title
+           url: Page URL
+
+       Returns:
+           Dictionary with result data
+       """
+       # Extract content using the main content extraction method
+       content = self.extract_main_content(data, max_chars=30000)
+
+       result = {
+           'title': title,
+           'url': url,
+           'snippet': 'No description',  # Default value
+           'data': content
+       }
+       return result


# Singleton instance
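Taken together, the pipeline is: run_web_query asks the model for a strategy, scrape_json fans out over keywords and direct URLs, and every fetched page is reduced by extract_main_content. A minimal driver, with the import path assumed and a stub in place of the injected AI service:

import asyncio

from modules.webcrawler import WebcrawlerAgent  # import path assumed

class StubAIService:
    """Stand-in for the injected ai_service; returns a canned strategy."""
    async def call_api(self, messages, **kwargs):
        return '{"url": ["https://example.com"], "skey": []}'

async def main():
    agent = WebcrawlerAgent()
    agent.ai_service = StubAIService()
    results = await agent.run_web_query("Analyse https://example.com")
    for r in results:
        print(r["url"], "-", r["title"])

asyncio.run(main())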
@@ -712,7 +712,7 @@ def _extract_document_contents_from_messages(file_id: int, messages: List[Dict[str, Any]]):
                # Check if file ID matches (handle both string and int comparison)
                if (source.get("id") == file_id or
                    (isinstance(source.get("id"), str) and source.get("id") == str(file_id)) or
-                   (isinstance(file_id, str) and source.get("id") == int(file_id))):
+                   (isinstance(file_id, str) and source.get("id") == file_id)):

                    # Add contents of the file
                    doc_contents = document.get("contents", [])
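The removed branch could raise ValueError whenever file_id is a non-numeric string; the replacement compares directly, although it is then redundant with the first condition. Normalizing both sides to str covers every int/str combination in one expression; a sketch (ids_match is a hypothetical helper, not in this commit):

def ids_match(source_id, file_id) -> bool:
    """True when both ids denote the same file, regardless of int/str typing."""
    if source_id is None or file_id is None:
        return False
    return str(source_id) == str(file_id)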
@@ -95,14 +95,18 @@ class AgentRegistry:
        self.ai_service = ai_service
        self.document_handler = document_handler
        self.lucydom_interface = lucydom_interface

-       # Update dependencies for all registered agents
+       # Update all registered agents
+       self.update_agent_dependencies()
+
+   def update_agent_dependencies(self):
+       """Update dependencies for all registered agents"""
        for agent_id, agent in self.agents.items():
            if hasattr(agent, 'set_dependencies'):
                agent.set_dependencies(
-                   ai_service=ai_service,
-                   document_handler=document_handler,
-                   lucydom_interface=lucydom_interface
+                   ai_service=self.ai_service,
+                   document_handler=self.document_handler,
+                   lucydom_interface=self.lucydom_interface
                )

    def register_agent(self, agent: 'BaseAgent'):
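Extracting update_agent_dependencies makes the dependency push re-runnable on demand. The contract is duck-typed: any registered agent exposing set_dependencies receives the three references. A sketch of that contract (EchoAgent and the object() placeholders are invented for illustration):

class EchoAgent:
    """Illustrative stand-in for a BaseAgent subclass."""
    id = "echo"

    def set_dependencies(self, ai_service=None, document_handler=None, lucydom_interface=None):
        self.ai_service = ai_service
        self.document_handler = document_handler
        self.lucydom_interface = lucydom_interface

registry = AgentRegistry.get_instance()
registry.register_agent(EchoAgent())  # assumes register_agent accepts this duck type
registry.set_dependencies(ai_service=object(), document_handler=object(), lucydom_interface=object())
# update_agent_dependencies() has now pushed the same three references into EchoAgent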
@@ -11,6 +11,7 @@ from datetime import datetime
from typing import List, Dict, Any, Optional, Tuple, Union

logger = logging.getLogger(__name__)
+logging.getLogger('matplotlib.font_manager').setLevel(logging.INFO)

class WorkflowExecution:
    """
@@ -39,10 +40,21 @@ class WorkflowExecution:

        # Initialize agent registry
        self.agent_registry = AgentRegistry.get_instance()

-       # Set dependencies for agents
        # Initialize file manager
        self.file_manager = get_workflow_file_manager(workflow_id, lucydom_interface)

+
+       # Import and initialize document handler
+       from modules.agentservice_document_handler import get_document_handler
+       self.document_handler = get_document_handler(workflow_id, lucydom_interface, ai_service)
+
+       self.agent_registry.set_dependencies(
+           ai_service=ai_service,
+           document_handler=self.document_handler,
+           lucydom_interface=lucydom_interface
+       )

    async def execute(self, message: Dict[str, Any], workflow: Dict[str, Any], files: List[Dict[str, Any]] = None, is_user_input: bool = False):
        """
        Execute the workflow with integrated planning and agent selection.
@@ -67,6 +79,7 @@ class WorkflowExecution:
        # 3. Create agent-aware work plan
        work_plan = await self._create_agent_aware_work_plan(workflow, user_message)
        self.logging_utils.info(f"Created agent-aware work plan with {len(work_plan)} activities", "planning")
+       self.logging_utils.debug(f"{work_plan}.", "planning")

        # 4. Execute the activities in the work plan
        results = await self._execute_work_plan(workflow, work_plan)
@@ -165,7 +178,7 @@ class WorkflowExecution:
                "content_type": source.get("content_type", "unknown")
            })

-       # Create the planning prompt with agent awareness
+       # Create the planning prompt with agent awareness and document handling information
        plan_prompt = f"""
As an AI workflow manager, create a detailed agent-aware work plan for the following task:

@@ -177,21 +190,25 @@ AVAILABLE AGENTS:
AVAILABLE DOCUMENTS:
{document_info if document_info else "No documents provided"}

+IMPORTANT: Document extraction happens automatically in the workflow. Documents in the message are already available to all agents. DO NOT assign agent_coder or any other agent specifically for just reading or extracting document content. Only assign agents for tasks that require specific processing beyond what the document handler already provides.
+
The work plan should include a structured list of activities. Each activity should have:
1. title - A short descriptive title for the activity
2. description - What needs to be done in this activity
3. assigned_agents - List of agent IDs that should handle this activity (can be multiple in sequence)
4. agent_prompts - Specific instructions for each agent (matched by index to assigned_agents)
-5. document_requirements - Description of which documents are needed for this activity
+5. document_requirements - Description of which documents are needed for this activity (these will be automatically extracted)
6. expected_output - The expected output format and content
7. dependencies - List of previous activities this depends on (by index)

IMPORTANT GUIDELINES:
- Each activity should have clear objectives and be assigned to the most appropriate agent(s)
- When multiple agents are assigned to an activity, specify the sequence and how outputs should flow between them
-- Documents are processed on-demand, so each activity should specify which documents it requires
+- Documents are processed on-demand by the system's document handler, so only specify which documents are needed, not how to extract them
+- DO NOT create activities that only read or extract document content - this happens automatically
- Create a logical sequence where later activities can use outputs from earlier ones
- If no specialized agent is needed for a task, use the default "assistant" agent
+- Only use the agent_coder for tasks that require actual coding or complex data analysis, not for simply reading documents

Return the work plan as a JSON array of activity objects, each with the above properties.
"""
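The schema the planner is asked for maps directly onto what the execution loop consumes (assigned_agents and agent_prompts are read index-aligned in the next hunk). One plausible activity array satisfying the listed properties (content invented for illustration):

[
    {
        "title": "Summarize uploaded report",
        "description": "Produce an executive summary of the quarterly report",
        "assigned_agents": ["assistant"],
        "agent_prompts": ["Summarize the key findings in five bullet points."],
        "document_requirements": "The uploaded quarterly report",
        "expected_output": "Markdown summary with five bullet points",
        "dependencies": []
    }
]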
@@ -357,6 +374,20 @@ Return the work plan as a JSON array of activity objects, each with the above properties.
        for i, agent_id in enumerate(agent_ids):
            # Get the agent
            agent = self.agent_registry.get_agent(agent_id)
+           if agent:
+               # Ensure dependencies are set
+               if hasattr(agent, 'set_dependencies'):
+                   agent.set_dependencies(
+                       ai_service=self.ai_service,
+                       document_handler=self.document_handler,
+                       lucydom_interface=self.lucydom_interface
+                   )
+
+               # Set document handler if agent supports it
+               if hasattr(agent, 'set_document_handler') and hasattr(self, 'document_handler'):
+                   agent.set_document_handler(self.document_handler)
+
            if not agent:
                self.logging_utils.warning(f"Agent '{agent_id}' not found, using assistant instead", "agents")
                agent = self.agent_registry.get_agent("assistant")
@@ -380,9 +411,18 @@ Return the work plan as a JSON array of activity objects, each with the above properties.
            agent_message = self._create_message(workflow, "user")
            agent_message["content"] = enhanced_prompt

-           # Add any documents from previous agent if this is a continuation
+           # IMPORTANT FIX: Document handling logic
+           # First, check if we have documents from the previous agent if this is a continuation
            if last_documents and i > 0:
                agent_message["documents"] = last_documents
+           # For the first agent, make sure we pass any documents from the most recent user message
+           elif i == 0:
+               # Find the most recent user message with documents
+               for msg in reversed(workflow.get("messages", [])):
+                   if msg.get("role") == "user" and msg.get("documents"):
+                       agent_message["documents"] = msg.get("documents", [])
+                       self.logging_utils.info(f"Passing {len(agent_message['documents'])} documents from user message to {agent_id}", "agents")
+                       break

            # Log agent execution
            self.logging_utils.info(f"Executing agent: {agent_id}", "agents")
@@ -402,6 +442,7 @@ Return the work plan as a JSON array of activity objects, each with the above properties.
            if "documents" in agent_response:
                response_message["documents"] = agent_response["documents"]
                last_documents = agent_response["documents"]
+               self.logging_utils.info(f"Agent {agent_id} produced {len(last_documents)} documents", "agents")

            # Add to workflow
            workflow["messages"].append(response_message)
@@ -418,7 +459,8 @@ Return the work plan as a JSON array of activity objects, each with the above properties.
                "content": "No agent response was generated.",
                "format": "Text"
            }

    async def _extract_required_documents(self, workflow: Dict[str, Any], doc_requirements: str) -> Dict[str, Any]:
        """
        Extract required documents based on requirements description.

File diff suppressed because one or more lines are too long