"""
WebCrawler-Agent for research and retrieval of information from the web.
Adapted for the refactored Core-Module with language-agnostic detection.
"""

import json
import logging
import re
import time
import traceback
from typing import List, Dict, Any, Optional
from urllib.parse import parse_qs, quote_plus, unquote, urlparse

from bs4 import BeautifulSoup
import requests

from modules.agentservice_base import BaseAgent
from modules.agentservice_utils import MessageUtils, LoggingUtils
from modules.agentservice_protocol import AgentCommunicationProtocol
from modules.utility import APP_CONFIG

logger = logging.getLogger(__name__)

# APP_CONFIG keys that are read at call time (not cached on the instance).
_CFG_MAX_TOKENS = "Connector_AiWebscraping_MAX_TOKENS"
_CFG_SEARCH_ENGINE = "Connector_AiWebscraping_SEARCH_ENGINE"
_CFG_USER_AGENT = "Connector_AiWebscraping_USER_AGENT"

# Upper bound on the accumulated (estimated) tokens sent by get_web_query.
_MAX_TOTAL_QUERY_TOKENS = 60000

# Maximum length of a single per-result summary, in characters.
_MAX_SUMMARY_CHARS = 2000

# Words, or single non-space/non-word characters (punctuation).
_TOKEN_PATTERN = re.compile(r'\w+|[^\w\s]')

# First "{" to last "}" in a text, used to pull a JSON object out of prose.
_JSON_OBJECT_PATTERN = re.compile(r'\{.*\}', re.DOTALL)


def _max_tokens_from_config() -> int:
    """Return the configured token/character budget as an int.

    Read on every call so that the value is not frozen at import time
    (the original evaluated it inside default-argument expressions).
    """
    return int(APP_CONFIG.get(_CFG_MAX_TOKENS))


class WebcrawlerAgent(BaseAgent):
    """Agent for Web Research and Information Retrieval"""

    def __init__(self):
        """Initialize the WebCrawler Agent"""
        super().__init__()
        self.id = "webcrawler"
        self.name = "Webscraper"
        self.type = "scraper"
        self.description = "Researches information on the web"
        self.capabilities = "web_search,information_retrieval,data_collection,source_verification,content_integration"
        self.result_format = "SearchResults"

        # Add enhanced document capabilities
        self.supports_documents = True
        self.document_capabilities = ["read", "create"]
        self.required_context = ["workflow_id"]
        self.document_handler = None

        # Initialize AI service (injected from outside; None until then)
        self.ai_service = None

        # Initialize protocol
        self.protocol = AgentCommunicationProtocol()

        # Initialize utility classes
        self.message_utils = MessageUtils()

        # Web-Crawling configuration
        self.max_url = int(APP_CONFIG.get("Connector_AiWebscraping_MAX_URLS"))
        self.max_key = int(APP_CONFIG.get("Connector_AiWebscraping_MAX_SEARCH_KEYWORDS"))
        self.max_result = int(APP_CONFIG.get("Connector_AiWebscraping_MAX_SEARCH_RESULTS"))
        self.timeout = int(APP_CONFIG.get("Connector_AiWebscraping_TIMEOUT"))

    def get_agent_info(self) -> Dict[str, Any]:
        """Get agent information for agent registry"""
        info = super().get_agent_info()
        info.update({
            "metadata": {
                "max_url": self.max_url,
                "max_result": self.max_result,
                "timeout": self.timeout
            }
        })
        return info

    def set_document_handler(self, document_handler):
        """Set the document handler for file operations"""
        self.document_handler = document_handler

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _report_status(self, log_func, workflow_id: str, description: str,
                       status: str = "in_progress", progress: float = 0.0,
                       level: str = "info") -> None:
        """Send one protocol status update through log_func (no-op if log_func is falsy)."""
        if not log_func:
            return
        status_message = self.protocol.create_status_update_message(
            status_description=description,
            sender_id=self.id,
            status=status,
            progress=progress,
            context_id=workflow_id
        )
        log_func(workflow_id, status_message.content, level, self.id, self.name)

    @staticmethod
    def _strip_json_fence(text: str) -> str:
        """Remove a surrounding Markdown code fence (``` or ```json) from an AI reply."""
        cleaned = (text or "").strip()
        if not cleaned.startswith("```"):
            return cleaned
        cleaned = cleaned[3:]
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:]
        # Search for the closing fence only AFTER the opening one has been
        # removed; otherwise a reply without a closing fence would match the
        # opening fence and produce an empty string.
        end_index = cleaned.rfind("```")
        if end_index != -1:
            cleaned = cleaned[:end_index]
        return cleaned.strip()

    @staticmethod
    def _as_str_list(value: Any) -> List[str]:
        """Coerce a strategy field (list, single string or null) to a list of non-empty strings."""
        if isinstance(value, str):
            return [value] if value.strip() else []
        if isinstance(value, (list, tuple)):
            return [str(item) for item in value if item]
        return []

    async def _request_search_strategy(self, prompt: str) -> str:
        """Ask the AI service for a search strategy and return its reply without Markdown fences."""
        task = prompt.replace("'", "")
        strategy_prompt = (
            f"Create a comprehensive web research strategy for the task = '{task}'.\n"
            "Return the results as a Python dictionary with these specific keys. "
            "If specific url are provided and the task requires analysis only on the provided url, "
            "then leave 'skey' open.\n"
            f"'url': A list of maximum {self.max_url} specific URLs extracted from the task string.\n"
            f"'skey': A list of maximum {self.max_key} key sentences to search for on the web. "
            "These should be precise, diverse, and targeted to get the most relevant information.\n"
            "Format your response as a valid json object with these two keys. "
            "Do not include any explanatory text or markdown outside of the object definition.\n"
        )
        content_text = await self.ai_service.call_api(
            messages=[
                {
                    "role": "system",
                    "content": "You are a web research expert who develops precise search strategies."
                },
                {
                    "role": "user",
                    "content": strategy_prompt
                }
            ]
        )
        return self._strip_json_fence(content_text)

    @staticmethod
    def _build_summary_instructions(prompt: str, result_data_limited: str) -> str:
        """Build the user prompt that asks for a ~2000 character summary of one search result."""
        request = prompt.replace("'", "")
        return (
            "\nSummarize this search result according to the original request "
            "in approximately 2000 characters.\n"
            f"Original request = '{request}'\n"
            "Focus on the most important insights and connect them to the original request. "
            "You can skip any introduction.\n"
            "Extract only relevant and high-quality information related to the request, "
            "and present it in a clear format.\n"
            "Provide a balanced view of the researched information.\n"
            "\nHere is the search result:\n"
            f"{result_data_limited}\n"
        )

    async def _call_summarizer(self, instructions: str) -> str:
        """Run the per-result summary prompt and cap the reply at _MAX_SUMMARY_CHARS characters."""
        content_text = await self.ai_service.call_api(
            messages=[
                {
                    "role": "system",
                    "content": "You are an information analyst who precisely and relevantly summarizes web content."
                },
                {
                    "role": "user",
                    "content": instructions
                }
            ]
        )
        return content_text[:_MAX_SUMMARY_CHARS]

    async def _summarize_findings(self, findings: str) -> str:
        """Condense the concatenated per-result summaries into a 5-6 sentence overall summary."""
        findings_limited = self.limit_text_for_api(findings, max_tokens=_max_tokens_from_config())
        return await self.ai_service.call_api(
            messages=[
                {
                    "role": "system",
                    "content": "You create concise summaries of research findings."
                },
                {
                    "role": "user",
                    "content": f"Please summarize these findings in 5-6 sentences: {findings_limited}\n"
                }
            ]
        )

    def _scrape_direct_url(self, url: str) -> Dict[str, str]:
        """Fetch one URL and return a result dict titled by <title>, else <h1>, else "No title"."""
        soup = self.read_url(url)
        if isinstance(soup, BeautifulSoup):
            title_tag = soup.find('title')
            title = title_tag.text.strip() if title_tag else "No title"
            # Fall back to the first h1 when the title tag is missing
            if title == "No title":
                h1_tag = soup.find('h1')
                if h1_tag:
                    title = h1_tag.text.strip()
        else:
            # Defensive: read_url is documented to always return BeautifulSoup
            title = "Error fetching page"
        return self.parse_result(soup, title, url)

    @staticmethod
    def _format_fallback_results(results: List[Dict]) -> str:
        """Render raw search results as Markdown for the fallback (no strategy) path."""
        if not results:
            return "## Web Research Results\n\nNo relevant results were found."
        parts = ["## Web Research Results (Fallback Mode)\n\n"]
        for i, result in enumerate(results, 1):
            parts.append(f"### [{i}] {result['title']}\n")
            parts.append(f"URL: {result['url']}\n")
            parts.append(f"Snippet: {result['snippet']}\n\n")
        return "".join(parts)

    @staticmethod
    def _resolve_result_url(href: Optional[str]) -> str:
        """Turn the href of a search-result link into a URL that can be fetched.

        Returns 'No URL' when href is empty.
        """
        if not href:
            return 'No URL'

        if href.startswith('/d.js?q='):
            # Redirect form handled by the original code: target is the q parameter
            start = href.find('?q=') + 3  # Skip '?q='
            end = href.find('&', start)
            target = unquote(href[start:] if end == -1 else href[start:end])
            # Make sure the URL has the correct protocol prefix
            if not target.startswith(('http://', 'https://')):
                target = ('https:' + target) if target.startswith('//') else ('https://' + target)
            return target

        # NOTE(review): DuckDuckGo's HTML results appear to wrap targets as
        # "//duckduckgo.com/l/?uddg=<encoded url>" - confirm against the
        # configured search engine. Links without a uddg parameter are untouched.
        redirect_target = parse_qs(urlparse(href).query).get('uddg')
        if redirect_target:
            href = redirect_target[0]  # parse_qs already percent-decodes

        # Protocol-relative links cannot be fetched by requests without a scheme
        if href.startswith('//'):
            href = 'https:' + href
        return href

    # ------------------------------------------------------------------
    # Message processing
    # ------------------------------------------------------------------

    async def process_message(self, message: Dict[str, Any], context: Dict[str, Any] = None) -> Dict[str, Any]:
        """
        Process a message and conduct web research if appropriate.

        Args:
            message: The message to process
            context: Additional context (may carry "workflow_id" and "log_func")

        Returns:
            The generated response, or a response with status "rejected" if
            the message is not a web research request. Errors are reported
            inside the response content instead of being raised.
        """
        # Extract workflow_id from context or message
        workflow_id = context.get("workflow_id") if context else message.get("workflow_id", "unknown")

        # Get or create logging_utils
        log_func = context.get("log_func") if context else None
        logging_utils = LoggingUtils(workflow_id, log_func)

        # Create response structure
        response = {
            "role": "assistant",
            "content": "",
            "agent_id": self.id,
            "agent_type": self.type,
            "agent_name": self.name,
            "result_format": self.result_format,
            "workflow_id": workflow_id
        }

        def report(description: str, progress: float, status: str = "in_progress", level: str = "info") -> None:
            self._report_status(log_func, workflow_id, description, status, progress, level)

        try:
            # Get the query from the message
            prompt = await self.get_prompt(message)

            # Check if this is explicitly a web research request using AI
            if not await self._is_web_research_request_ai(prompt):
                # Reject non-web research requests
                logging_utils.info("Request rejected: not a web research task", "agents")
                response["content"] = "This request doesn't appear to require web research. Redirecting to a more appropriate agent."
                response["status"] = "rejected"
                response["rejection_reason"] = "not_web_research"
                return response

            # Continue with web research process
            logging_utils.info(f"Web research for: {prompt[:50]}...", "agents")
            report("Starting web research", 0.0)
            report(f"Analyzing search strategy for: {prompt[:30]}...", 0.1)
            report("Creating search strategy", 0.2)

            # Get the query strategy
            strategy_text = await self._request_search_strategy(prompt)

            try:
                strategy = json.loads(strategy_text)
            except json.JSONDecodeError as e:
                logging_utils.error(f"Error parsing JSON data: {e}", "error")
                report(f"Error parsing search strategy: {str(e)}", 0.0, status="error", level="error")

                # Use a simple fallback approach: search with the original query
                logging_utils.info("Using fallback search strategy with direct query", "agents")
                response["content"] = self._format_fallback_results(self.search_web(prompt))
                return response

            logger.info(f"Valid json received: {str(strategy_text)}")
            if not isinstance(strategy, dict):
                raise ValueError("Search strategy is not a JSON object")

            # The model may return null or a bare string for an "open" field
            search_keys = self._as_str_list(strategy.get("skey"))
            search_urls = self._as_str_list(strategy.get("url"))

            if search_keys:
                logging_utils.info(f"Searching for {len(search_keys)} key terms: {', '.join(search_keys[:2])}...", "agents")
            if search_urls:
                logging_utils.info(f"Searching in {len(search_urls)} direct URLs: {', '.join(search_urls[:2])}...", "agents")

            report("Starting web search", 0.3)

            # Execute the search; progress moves from 30% to 80% across all tasks.
            # total_tasks is only used inside the loops, so it is never zero there.
            results = []
            total_tasks = len(search_keys) + len(search_urls)
            tasks_completed = 0

            # Process search keywords
            for keyword in search_keys:
                logging_utils.info(f"Searching web for: '{keyword}'", "agents")
                report(f"Searching for: '{keyword}'", 0.3 + (0.5 * (tasks_completed / total_tasks)))

                keyword_results = self.search_web(keyword)
                results.extend(keyword_results)
                logging_utils.info(f"Found: {len(keyword_results)} results for '{keyword}'", "agents")
                tasks_completed += 1

            # Process direct URLs
            for url in search_urls:
                logging_utils.info(f"Extracting content from: {url}", "agents")
                report(f"Reading URL: {url}", 0.3 + (0.5 * (tasks_completed / total_tasks)))

                result = self._scrape_direct_url(url)
                results.append(result)
                logging_utils.info(f"Extracted: '{result['title']}' from {url}", "agents")
                tasks_completed += 1

            report(f"Analyzing {len(results)} search results", 0.8)
            logging_utils.info(f"Analyzing {len(results)} web results", "agents")

            # Generate summaries for each result
            processed_results = []
            for i, result in enumerate(results):
                result_data_limited = self.limit_text_for_api(result['data'], max_tokens=_max_tokens_from_config())
                logging_utils.info(f"Analyzing result {i+1}/{len(results)}: {result['title'][:30]}...", "agents")

                content_summary = await self._call_summarizer(
                    self._build_summary_instructions(prompt, result_data_limited)
                )
                processed_results.append({
                    "title": result['title'],
                    "url": result['url'],
                    "snippet": result['snippet'],
                    "summary": content_summary
                })

            report("Creating overall summary", 0.9)

            # Create the final combined summary
            all_summaries = "\n\n".join(r["summary"] for r in processed_results)
            logging_utils.info("Creating overall summary of web research", "agents")
            final_summary = await self._summarize_findings(all_summaries)

            # Get the language of the request to use for result headers
            request_language_analysis = await self.ai_service.call_api(
                messages=[
                    {
                        "role": "system",
                        "content": "You determine the language of a text and return only the language name."
                    },
                    {
                        "role": "user",
                        "content": f"What language is this text in? Only respond with the language name: {prompt}"
                    }
                ]
            )

            # Get headers in the right language
            headers = await self._get_localized_headers(request_language_analysis.strip())

            # Format the final result
            sections = [
                f"## {headers['web_research_results']}\n\n### {headers['summary']}\n{final_summary}\n\n### {headers['detailed_results']}\n"
            ]
            for i, result in enumerate(processed_results, 1):
                sections.append(
                    f"\n\n[{i}] {result['title']}\n{headers['url']}: {result['url']}\n{headers['snippet']}: {result['snippet']}\n{headers['content']}: {result['summary']}"
                )
            response["content"] = "".join(sections)

            report("Web research completed", 1.0, status="completed")
            logging_utils.info("Web research successfully completed", "agents")
            return response

        except Exception as e:
            # Top-level boundary: the caller always receives a response dict
            error_msg = f"Error during web research: {str(e)}"
            logging_utils.error(error_msg, "error")
            report(error_msg, 1.0, status="error", level="error")
            response["content"] = f"## Error during web research\n\n{error_msg}\n\n```\n{traceback.format_exc()}\n```"
            return response

    async def _is_web_research_request_ai(self, prompt: str) -> bool:
        """
        Uses AI to determine if a prompt requires web research, making it language-agnostic.

        Args:
            prompt: The user prompt

        Returns:
            True if this is explicitly a web research request, False otherwise
        """
        if not self.ai_service:
            # Fallback to simpler detection if AI service isn't available
            return self._simple_web_detection(prompt)

        try:
            # Create a prompt to analyze whether this is a web research request
            analysis_prompt = (
                "\nAnalyze the following request and determine if it explicitly requires "
                "web research or online information.\n"
                f"\nREQUEST: {prompt}\n"
                "\nA request requires web research if:\n"
                "1. It explicitly asks to search for information online\n"
                "2. It contains URLs or references to websites\n"
                "3. It requests current information that would be available on the web\n"
                "4. It asks to find information from web sources\n"
                "5. It implicitly requires up-to-date information from the internet\n"
                '\nONLY respond with a single word - either "YES" if web research is required, '
                'or "NO" if it is not.\n'
                "DO NOT include any explanation, just the answer YES or NO.\n"
            )

            # Call AI to analyze
            response = await self.ai_service.call_api(
                messages=[
                    {
                        "role": "system",
                        "content": "You determine if a request requires web research. Always answer with only YES or NO."
                    },
                    {
                        "role": "user",
                        "content": analysis_prompt
                    }
                ]
            )

            # A reply containing YES (in any casing) counts as a web research task
            return "YES" in response.strip().upper()

        except Exception as e:
            # Log error but don't fail, fall back to simpler detection
            logger.warning(f"Error using AI to detect web research request: {str(e)}")
            return self._simple_web_detection(prompt)

    def _simple_web_detection(self, prompt: str) -> bool:
        """
        Simpler fallback method to detect web research requests based on URLs.

        Args:
            prompt: The user prompt

        Returns:
            True if there are clear URL indicators, False otherwise
        """
        # URLs in the prompt strongly indicate web research
        url_indicators = ["http://", "https://", "www.", ".com", ".org", ".net", ".edu", ".gov"]
        lowered = prompt.lower()
        return any(indicator in lowered for indicator in url_indicators)

    async def _get_localized_headers(self, language: str) -> Dict[str, str]:
        """
        Get localized headers for the web research results based on detected language.

        Args:
            language: The detected language

        Returns:
            Dictionary with localized headers. Always contains all six keys;
            any key the translation does not provide keeps its English text.
        """
        # Default English headers
        headers = {
            "web_research_results": "Web Research Results",
            "summary": "Summary",
            "detailed_results": "Detailed Results",
            "url": "URL",
            "snippet": "Snippet",
            "content": "Content"
        }

        # If language detection failed or is English, return defaults
        if not language or language.lower() in ["english", "en"]:
            return headers

        try:
            # Use AI to translate headers to the detected language
            translation_prompt = (
                f"\nTranslate these web research result headers to {language}:\n"
                "Web Research Results\n"
                "Summary\n"
                "Detailed Results\n"
                "URL\n"
                "Snippet\n"
                "Content\n"
                "\nReturn a JSON object with these keys: "
                "web_research_results, summary, detailed_results, url, snippet, content\n"
            )

            # Call AI for translation
            response = await self.ai_service.call_api(
                messages=[
                    {
                        "role": "system",
                        "content": "You translate headers to the specified language and return them as JSON."
                    },
                    {
                        "role": "user",
                        "content": translation_prompt
                    }
                ]
            )

            # Extract JSON
            json_match = _JSON_OBJECT_PATTERN.search(response)
            if json_match:
                translated_headers = json.loads(json_match.group(0))
                if isinstance(translated_headers, dict):
                    # Merge over the defaults so a partial translation cannot
                    # cause a KeyError when the caller formats the result.
                    return {
                        key: str(translated_headers.get(key) or default)
                        for key, default in headers.items()
                    }

        except Exception as e:
            # Log error but continue with English headers
            logger.warning(f"Error translating headers to {language}: {str(e)}")

        return headers

    async def get_prompt(self, message_context: Dict[str, Any]) -> str:
        """Return the stripped "content" of a message ("" when missing or None)."""
        task = message_context.get("content", "") or ""
        return task.strip()

    async def get_web_query(self, message_context: Dict[str, Any]) -> str:
        """
        Run the full web query for a message and return a Markdown report.

        Args:
            message_context: The message whose "content" is the research request

        Returns:
            Markdown text with an overall summary and one entry per result
        """
        prompt = await self.get_prompt(message_context)
        result_json = await self.run_web_query(prompt)
        logger.info(f"Web analysis prompt '{prompt}' delivers {len(result_json)} results.")

        detail_parts = []
        summary_parts = []

        if isinstance(result_json, list):
            total_tokens = 0
            for i, result in enumerate(result_json, 1):
                # Limit content size for each result
                result_data_limited = self.limit_text_for_api(result['data'], max_tokens=_max_tokens_from_config())
                web_answer_instructions = self._build_summary_instructions(prompt, result_data_limited)

                # Count tokens in the instructions to ensure we don't exceed API limits
                instruction_tokens = self.count_tokens(web_answer_instructions)
                if total_tokens + instruction_tokens > _MAX_TOTAL_QUERY_TOKENS:
                    logger.warning(f"Skipping result {i} to avoid exceeding token limit")
                    break
                total_tokens += instruction_tokens

                content_summary = await self._call_summarizer(web_answer_instructions)

                detail_parts.append(
                    f"\n\n[{i}] {result['title']}\nURL: {result['url']}\nSnippet: {result['snippet']}\nContent: {content_summary}"
                )
                summary_parts.append(f"\n{content_summary}")

                # Update token count (+100 as a buffer for formatting)
                total_tokens += self.count_tokens(content_summary) + 100
            result_data = "".join(detail_parts)
        else:
            # scrape_json returns an error dict for a malformed strategy
            result_data = "no data received"

        summary_src = "".join(summary_parts)
        logger.info(f"Web analysis result sent {len(result_data)}B")

        # Additional summary
        summary = ""
        if len(summary_src) > 1:
            summary = await self._summarize_findings(summary_src)

        # Format the final result
        return f"## Web Research Results\n\n### Summary\n{summary}\n\n### Detailed Results{result_data}"

    async def run_web_query(self, prompt: str) -> List[Dict]:
        """
        Create a search strategy for the prompt and scrape according to it.

        Args:
            prompt: The research request

        Returns:
            List of result dicts; [] for an empty prompt or an unparsable
            strategy. (Passes through whatever scrape_json returns.)
        """
        if not prompt:
            return []

        content_text = await self._request_search_strategy(prompt)

        try:
            pjson = json.loads(content_text)
        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse JSON: {e}")
            logger.error(f"Cleaned content: {content_text[:100]}...")
            return []

        logger.info(f"Valid json received: {str(content_text)}")
        return await self.scrape_json(pjson)

    async def scrape_json(self, research_strategy: Dict[str, List]) -> List[Dict]:
        """
        Scrapes web content based on a research strategy JSON.

        Args:
            research_strategy: A dictionary containing:
                - 'skey': List of search keywords
                - 'url': List of direct URLs to scrape

        Returns:
            List of result dicts (title, url, snippet, data). For a malformed
            strategy a dict {"error": ...} is returned instead; callers
            distinguish the two cases with isinstance(..., list).
        """
        logger.info("Starting JSON-based web scraping")
        results = []

        # Validate input structure
        if not isinstance(research_strategy, dict):
            logger.error("Invalid research_strategy format: not a dictionary")
            return {"error": "Invalid research_strategy format: not a dictionary"}

        # A null / empty field means "nothing to do" for that part
        keys = research_strategy.get("skey") or []
        direct_urls = research_strategy.get("url") or []

        if not isinstance(keys, list) or not isinstance(direct_urls, list):
            logger.error("Invalid research_strategy format: keys, or url is not a list")
            return {"error": "Invalid research_strategy format: keys, or url is not a list"}

        # Process search keywords through search engine
        for keyword in keys:
            logger.info(f"Processing keyword: {keyword}")
            found_results = self.search_web(keyword)  # List with Dict: title,url,snippet,data
            logger.info(f"... {len(found_results)} results found")
            results.extend(found_results)

        # Process direct URLs, skipping the ones the keyword search already fetched
        logger.info(f"Processing {len(direct_urls)} direct URLs")
        scraped_urls = {result.get('url') for result in results}
        for url in direct_urls:
            if not isinstance(url, str):
                logger.warning(f"Skipping non-string URL entry: {url!r}")
                continue
            if url in scraped_urls:
                logger.info(f"Skipping already scraped URL: {url}")
                continue
            results.append(self._scrape_direct_url(url))
            scraped_urls.add(url)

        logger.info(f"JSON scraping completed. Scraped {len(results)} URLs in total")
        return results

    # ------------------------------------------------------------------
    # Content extraction and token accounting
    # ------------------------------------------------------------------

    def extract_main_content(self, soup: BeautifulSoup, max_chars: Optional[int] = None) -> str:
        """
        Extract the main content from an HTML page while limiting character count.

        Note: elements such as scripts and navigation are removed from the
        passed soup in place.

        Args:
            soup: BeautifulSoup object containing the page content
            max_chars: Maximum number of characters to extract; defaults to
                the configured Connector_AiWebscraping_MAX_TOKENS value

        Returns:
            Extracted main content as string
        """
        if max_chars is None:
            max_chars = _max_tokens_from_config()

        if not isinstance(soup, BeautifulSoup):
            return str(soup)[:max_chars]

        # Try to find main content elements in order of priority
        main_content = None
        for selector in ['main', 'article', '#content', '.content', '#main', '.main']:
            main_content = soup.select_one(selector)
            if main_content is not None:
                break

        # If no main content found, use the body
        if main_content is None:
            main_content = soup.find('body') or soup

        # Remove script, style, nav, footer elements that don't contribute to main content
        for element in main_content.select('script, style, nav, footer, header, aside, .sidebar, #sidebar, .comments, #comments, .advertisement, .ads, iframe'):
            element.extract()

        # Extract text content and limit to max_chars
        text_content = main_content.get_text(separator=' ', strip=True)
        return text_content[:max_chars]

    def tokenize_for_counting(self, text: str) -> List[str]:
        """
        Simple tokenizer for estimating token usage.
        This is an approximation since the exact tokenization depends on the model.

        Args:
            text: Input text

        Returns:
            List of tokens (words and single punctuation characters)
        """
        return _TOKEN_PATTERN.findall(text)

    def count_tokens(self, text: str) -> int:
        """
        Count the approximate number of tokens in a text.

        Args:
            text: Input text

        Returns:
            Estimated token count
        """
        return len(self.tokenize_for_counting(text))

    def limit_text_for_api(self, text: str, max_tokens: Optional[int] = None) -> str:
        """
        Limit the text to a maximum number of tokens.

        Args:
            text: Input text
            max_tokens: Maximum number of tokens allowed; defaults to the
                configured Connector_AiWebscraping_MAX_TOKENS value

        Returns:
            The text unchanged when it fits; otherwise the first max_tokens
            tokens joined by single spaces, followed by a truncation marker
        """
        if not text:
            return ""

        if max_tokens is None:
            max_tokens = _max_tokens_from_config()

        tokens = self.tokenize_for_counting(text)

        # If text is already under the limit, return as is
        if len(tokens) <= max_tokens:
            return text

        # Otherwise, truncate text to max_tokens
        return " ".join(tokens[:max_tokens]) + "... [content truncated due to length]"

    # ------------------------------------------------------------------
    # Web access
    # ------------------------------------------------------------------

    def search_web(self, query: str) -> List[Dict]:
        """
        Run a search-engine query and fetch the content of each hit.

        Args:
            query: The search phrase

        Returns:
            Up to self.max_result dicts with title, url, snippet and data;
            [] when the result page contains no '.result' elements
        """
        formatted_query = quote_plus(query)
        search_url = f"{APP_CONFIG.get(_CFG_SEARCH_ENGINE)}{formatted_query}"

        search_results_soup = self.read_url(search_url)
        result_elements = search_results_soup.select('.result')
        if not result_elements:
            logger.warning(f"No search results found for: {query}")
            return []

        results = []
        for result in result_elements:
            # Extract title
            title_element = result.select_one('.result__a')
            title = title_element.text.strip() if title_element else 'No title'

            # Extract URL (the search engine may wrap targets in a redirect link)
            href = title_element.get('href') if title_element else ''
            extracted_url = self._resolve_result_url(href)

            # Extract snippet directly from search results page
            snippet_element = result.select_one('.result__snippet')
            snippet = snippet_element.text.strip() if snippet_element else 'No description'

            # Fetch the actual page content for the data field; there is
            # nothing to fetch when the result carries no link.
            if extracted_url == 'No URL':
                content = ''
            else:
                target_page_soup = self.read_url(extracted_url)
                content = self.extract_main_content(target_page_soup, max_chars=_max_tokens_from_config())

            results.append({
                'title': title,
                'url': extracted_url,
                'snippet': snippet,
                'data': content
            })

            # Limit the number of results if needed
            if len(results) >= self.max_result:
                break

        return results

    def read_url(self, url: str) -> BeautifulSoup:
        """
        Reads a URL and returns a BeautifulSoup parser for the content.
        Returns an empty BeautifulSoup object for errors.

        Args:
            url: The URL to read

        Returns:
            BeautifulSoup object with the content or empty for errors
        """
        headers = {
            'User-Agent': APP_CONFIG.get(_CFG_USER_AGENT),
            'Accept': 'text/html,application/xhtml+xml,application/xml',
            'Accept-Language': 'en-US,en;q=0.9',
        }

        try:
            # Initial request
            response = requests.get(url, headers=headers, timeout=self.timeout)

            # Polling for status 202: up to 4 retries with increasing waits
            if response.status_code == 202:
                backoff_times = [0.5, 1.0, 2.0, 5.0]
                for wait_time in backoff_times:
                    time.sleep(wait_time)
                    response = requests.get(url, headers=headers, timeout=self.timeout)
                    # If no 202 anymore, then break
                    if response.status_code != 202:
                        break

            # For other error statuses, raise an error
            response.raise_for_status()

            # Parse HTML
            return BeautifulSoup(response.text, 'html.parser')

        except Exception as e:
            # Best effort: callers rely on always getting a soup back, but the
            # failure must not vanish without a trace.
            logger.warning(f"Could not read URL '{url}': {str(e)}")
            return BeautifulSoup("", 'html.parser')

    def parse_result(self, data: BeautifulSoup, title: str, url: str) -> Dict[str, str]:
        """
        Parse a BeautifulSoup object into a result dictionary.

        Args:
            data: BeautifulSoup object containing the page content
            title: Page title
            url: Page URL

        Returns:
            Dictionary with result data
        """
        # Extract content using the main content extraction method
        content = self.extract_main_content(data, max_chars=_max_tokens_from_config())

        return {
            'title': title,
            'url': url,
            'snippet': 'No description',  # Default value
            'data': content
        }


# Singleton instance
_webcrawler_agent = None


def get_webcrawler_agent():
    """Returns a singleton instance of the WebCrawler Agent"""
    global _webcrawler_agent
    if _webcrawler_agent is None:
        _webcrawler_agent = WebcrawlerAgent()
    return _webcrawler_agent