import json
import logging
import re
from typing import Any, Dict, List

from modules.chat.methodBase import MethodBase, action
from modules.interfaces.interfaceChatModel import ActionResult
from modules.interfaces.interface_web_objects import WebInterface
from modules.interfaces.interface_web_model import (
    WebSearchRequest,
    WebCrawlRequest,
    WebScrapeRequest,
)

logger = logging.getLogger(__name__)


def _extract_urls_from_text(content: str) -> List[str]:
    """Extract http(s) URLs from plain text separated by newlines, commas, or semicolons."""
    candidates = re.split(r"[\n,;]+", content)
    return [
        u.strip()
        for u in candidates
        if u.strip().startswith(("http://", "https://"))
    ]


class MethodWeb(MethodBase):
    """Web method implementation for web operations."""

    def __init__(self, serviceCenter: Any):
        super().__init__(serviceCenter)
        self.name = "web"
        self.description = "Web search, crawling, and scraping operations using Tavily"

    @action
    async def search(self, parameters: Dict[str, Any]) -> ActionResult:
        """Performs a web search and outputs a .json file with a list of found URLs.
        Each result contains "title" and "url".

        Parameters:
            query (str): Search query to perform
            maxResults (int, optional): Maximum number of results (default: 10)
        """
        # TODO: Fix docstrings - do we need that format for parsing?
        try:
            query = parameters.get("query")
            if not query:
                return ActionResult(success=False, error="Search query is required")

            # Prepare request data
            web_search_request = WebSearchRequest(
                query=query,
                max_results=parameters.get("maxResults", 10),
            )

            # Perform request
            web_interface = await WebInterface.create()
            web_search_result = await web_interface.search(web_search_request)

            return web_search_result
        except Exception as e:
            return ActionResult(success=False, error=str(e))
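    # Expected shape of the search-results document that `crawl` consumes below,
    # inferred from the parsing logic in this module. The exact schema emitted by
    # WebSearchActionResult is an assumption; only the "documentData"/"results"/
    # "url" keys are confirmed by the code:
    #
    #   {
    #       "documentData": {
    #           "results": [
    #               {"title": "Example Page", "url": "https://example.com"}
    #           ]
    #       }
    #   }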
    @action
    async def crawl(self, parameters: Dict[str, Any]) -> ActionResult:
        """Crawls a list of URLs and extracts information from them.

        Parameters:
            document (str): Document reference containing the URL list from search results
            expectedDocumentFormats (list, optional): Expected document formats with
                extension, mimeType, description
        """
        try:
            document_ref = parameters.get("document")
            if not document_ref:
                return ActionResult(
                    success=False, error="No document reference provided."
                )

            # Resolve document reference to ChatDocument objects
            chat_documents = self.service.getChatDocumentsFromDocumentList(
                [document_ref]
            )
            if not chat_documents:
                return ActionResult(
                    success=False,
                    error=f"No documents found for reference: {document_ref}",
                )

            # Get the first document (search results)
            search_doc = chat_documents[0]

            # Get file data using the service center
            file_data = self.service.getFileData(search_doc.fileId)
            if not file_data:
                return ActionResult(
                    success=False, error="Could not retrieve file data for document"
                )

            content = file_data.decode("utf-8")

            # Parse JSON to extract URLs from the search-results document
            urls: List[str] = []
            try:
                # The document structure comes from WebSearchActionResult
                search_data = json.loads(content)

                if isinstance(search_data, dict):
                    # documentData wraps the actual search results
                    doc_data = search_data.get("documentData", search_data)

                    if "results" in doc_data and isinstance(doc_data["results"], list):
                        urls = [
                            result["url"]
                            for result in doc_data["results"]
                            if isinstance(result, dict) and "url" in result
                        ]
                    elif "urls" in doc_data and isinstance(doc_data["urls"], list):
                        # Fallback: URLs stored directly in a 'urls' field
                        urls = [url for url in doc_data["urls"] if isinstance(url, str)]

                if not urls:
                    # Fallback: parse as plain text (for backward compatibility)
                    logger.warning(
                        "Could not extract URLs from JSON structure, "
                        "trying plain text parsing"
                    )
                    urls = _extract_urls_from_text(content)
            except json.JSONDecodeError:
                # Fallback to plain text parsing if the document is not valid JSON
                logger.warning("Document is not valid JSON, trying plain text parsing")
                urls = _extract_urls_from_text(content)

            if not urls:
                return ActionResult(
                    success=False, error="No valid URLs found in the document."
                )

            logger.info(f"Extracted {len(urls)} URLs from document: {urls}")

            # Prepare request data
            web_crawl_request = WebCrawlRequest(urls=urls)

            # Perform request
            web_interface = await WebInterface.create()
            web_crawl_result = await web_interface.crawl(web_crawl_request)

            return web_crawl_result
        except Exception as e:
            logger.error(f"Error in crawl method: {e}")
            return ActionResult(success=False, error=str(e))

    @action
    async def scrape(self, parameters: Dict[str, Any]) -> ActionResult:
        """Scrapes web content by searching for URLs and then extracting their content.
        Combines the search and crawl operations in one step.

        Parameters:
            query (str): Search query to perform
            maxResults (int, optional): Maximum number of results (default: 10)
        """
        try:
            query = parameters.get("query")
            max_results = parameters.get("maxResults", 10)

            if not query:
                return ActionResult(success=False, error="Search query is required")

            # Prepare request data
            web_scrape_request = WebScrapeRequest(
                query=query,
                max_results=max_results,
            )

            # Perform request
            web_interface = await WebInterface.create()
            web_scrape_result = await web_interface.scrape(web_scrape_request)

            return web_scrape_result
        except Exception as e:
            return ActionResult(success=False, error=str(e))
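
# Usage sketch (illustrative only): how a caller might invoke one of the actions
# above. The `serviceCenter` wiring and the direct-call dispatch shown here are
# assumptions; the real integration lives in the surrounding framework:
#
#   method = MethodWeb(serviceCenter)
#   result = await method.search({"query": "python asyncio", "maxResults": 5})
#   if result.success:
#       ...  # result references a document whose entries carry "title" and "url"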