From 4c3592d7d9aaa7c811a5afa54283a75ecec14ce8 Mon Sep 17 00:00:00 2001
From: Christopher Gondek
Date: Mon, 1 Sep 2025 11:21:37 +0200
Subject: [PATCH] feat: switch to single file approach

---
 modules/connectors/connector_tavily.py    | 115 +++++++++-------
 modules/interfaces/interface_web_model.py |  42 +++++++-
 modules/methods/method_web.py             | 160 +++++++++++++++++++++-
 3 files changed, 261 insertions(+), 56 deletions(-)

diff --git a/modules/connectors/connector_tavily.py b/modules/connectors/connector_tavily.py
index 786dc1f1..af802790 100644
--- a/modules/connectors/connector_tavily.py
+++ b/modules/connectors/connector_tavily.py
@@ -7,16 +7,19 @@ from modules.interfaces.interface_web_model import (
     WebCrawlBase,
     WebCrawlDocumentData,
     WebCrawlRequest,
+    WebCrawlResultItem,
     WebScrapeActionDocument,
     WebScrapeActionResult,
     WebScrapeBase,
     WebScrapeDocumentData,
     WebScrapeRequest,
+    WebScrapeResultItem,
     WebSearchBase,
     WebSearchRequest,
     WebSearchActionResult,
     WebSearchActionDocument,
     WebSearchDocumentData,
+    WebSearchResultItem,
     WebCrawlActionDocument,
     WebCrawlActionResult,
 )
@@ -62,7 +65,7 @@ class ConnectorTavily(WebSearchBase, WebCrawlBase, WebScrapeBase):
 
         # Step 2: Build ActionResult
         try:
-            result = self._build_search_action_result(search_results)
+            result = self._build_search_action_result(search_results, request.query)
         except Exception as e:
             return WebSearchActionResult(success=False, error=str(e))
 
@@ -78,7 +81,7 @@ class ConnectorTavily(WebSearchBase, WebCrawlBase, WebScrapeBase):
 
         # Step 2: Build ActionResult
         try:
-            result = self._build_crawl_action_result(crawl_results)
+            result = self._build_crawl_action_result(crawl_results, request.urls)
         except Exception as e:
             return WebCrawlActionResult(success=False, error=str(e))
 
@@ -101,7 +104,7 @@ class ConnectorTavily(WebSearchBase, WebCrawlBase, WebScrapeBase):
 
         # Step 3: Build ActionResult
         try:
-            result = self._build_scrape_action_result(crawl_results)
+            result = self._build_scrape_action_result(crawl_results, request.query)
         except Exception as e:
             return WebScrapeActionResult(success=False, error=str(e))
 
@@ -124,26 +127,32 @@ class ConnectorTavily(WebSearchBase, WebCrawlBase, WebScrapeBase):
         ]
 
     def _build_search_action_result(
-        self, search_results: list[TavilySearchResult]
+        self, search_results: list[TavilySearchResult], query: str = ""
     ) -> WebSearchActionResult:
         """Builds the ActionResult from the search results."""
-        documents = []
-        for result in search_results:
-            document_name = f"web_search_{get_utc_timestamp()}.txt"
-            document_data = WebSearchDocumentData(title=result.title, url=result.url)
-            mime_type = "application/json"
-            doc = WebSearchActionDocument(
-                documentName=document_name,
-                documentData=document_data,
-                mimeType=mime_type,
-            )
-            documents.append(doc)
+        # Convert to result items
+        result_items = [
+            WebSearchResultItem(title=result.title, url=result.url)
+            for result in search_results
+        ]
 
-        return WebSearchActionResult(
-            success=True, documents=documents, resultLabel="web_search_results"
+        # Create document data with all results
+        document_data = WebSearchDocumentData(
+            query=query, results=result_items, total_count=len(result_items)
         )
 
-    async def _crawl(self, urls: list) -> list[str]:
+        # Create single document
+        document = WebSearchActionDocument(
+            documentName=f"web_search_results_{get_utc_timestamp()}.json",
+            documentData=document_data,
+            mimeType="application/json",
+        )
+
+        return WebSearchActionResult(
+            success=True, documents=[document], resultLabel="web_search_results"
+        )
+
+    async def _crawl(self, urls: list) -> list[TavilyCrawlResult]:
         """Calls the Tavily API to extract text content from URLs."""
         response = await self.client.extract(
             urls=urls, extract_depth="advanced", format="text"
         )
         ...
         ]
 
@@ -154,43 +163,57 @@ class ConnectorTavily(WebSearchBase, WebCrawlBase, WebScrapeBase):
 
     def _build_crawl_action_result(
-        self, crawl_results: list[TavilyCrawlResult]
+        self, crawl_results: list[TavilyCrawlResult], urls: list[str] = None
     ) -> WebCrawlActionResult:
         """Builds the ActionResult from the crawl results."""
-        documents = []
-        for result in crawl_results:
-            document_name = f"web_crawl_{get_utc_timestamp()}.txt"
-            doc_data = WebCrawlDocumentData(
-                url=result["url"], content=result["raw_content"]
-            )
-            mime_type = "application/json"
-            doc = WebCrawlActionDocument(
-                documentName=document_name,
-                documentData=doc_data,
-                mimeType=mime_type,
-            )
-            documents.append(doc)
+        # Convert to result items
+        result_items = [
+            WebCrawlResultItem(url=result.url, content=result.content)
+            for result in crawl_results
+        ]
+
+        # Create document data with all results
+        document_data = WebCrawlDocumentData(
+            urls=urls or [result.url for result in crawl_results],
+            results=result_items,
+            total_count=len(result_items),
+        )
+
+        # Create single document
+        document = WebCrawlActionDocument(
+            documentName=f"web_crawl_results_{get_utc_timestamp()}.json",
+            documentData=document_data,
+            mimeType="application/json",
+        )
 
         return WebCrawlActionResult(
-            success=True, documents=documents, resultLabel="web_crawl_results"
+            success=True, documents=[document], resultLabel="web_crawl_results"
         )
 
     def _build_scrape_action_result(
-        self, crawl_results: list[TavilyCrawlResult]
+        self, crawl_results: list[TavilyCrawlResult], query: str = ""
    ) -> WebScrapeActionResult:
         """Builds the ActionResult from the scrape results."""
-        documents = []
-        for result in crawl_results:
-            document_name = f"web_scrape_{get_utc_timestamp()}.txt"
-            doc_data = WebScrapeDocumentData(url=result.url, content=result.content)
-            mime_type = "application/json"
-            doc = WebScrapeActionDocument(
-                documentName=document_name,
-                documentData=doc_data,
-                mimeType=mime_type,
-            )
-            documents.append(doc)
+        # Convert to result items
+        result_items = [
+            WebScrapeResultItem(url=result.url, content=result.content)
+            for result in crawl_results
+        ]
+
+        # Create document data with all results
+        document_data = WebScrapeDocumentData(
+            query=query,
+            results=result_items,
+            total_count=len(result_items),
+        )
+
+        # Create single document
+        document = WebScrapeActionDocument(
+            documentName=f"web_scrape_results_{get_utc_timestamp()}.json",
+            documentData=document_data,
+            mimeType="application/json",
+        )
 
         return WebScrapeActionResult(
-            success=True, documents=documents, resultLabel="web_scrape_results"
+            success=True, documents=[document], resultLabel="web_scrape_results"
         )
diff --git a/modules/interfaces/interface_web_model.py b/modules/interfaces/interface_web_model.py
index 389cd7ed..86f19e08 100644
--- a/modules/interfaces/interface_web_model.py
+++ b/modules/interfaces/interface_web_model.py
@@ -2,8 +2,6 @@
 from abc import ABC, abstractmethod
 
 from modules.interfaces.interfaceChatModel import ActionDocument, ActionResult
-
-
 from pydantic import BaseModel, Field
 from typing import List
 
@@ -18,11 +16,21 @@ class WebSearchRequest(BaseModel):
     max_results: int
 
 
-class WebSearchDocumentData(BaseModel):
+class WebSearchResultItem(BaseModel):
+    """Individual search result"""
+
     title: str
     url: str
 
 
+class WebSearchDocumentData(BaseModel):
+    """Complete search results document"""
+
+    query: str
+    results: List[WebSearchResultItem]
+    total_count: int
+
+
 class WebSearchActionDocument(ActionDocument):
     documentData: WebSearchDocumentData
 
@@ -45,14 +53,24 @@ class WebCrawlRequest(BaseModel):
     urls: List[str]
 
 
-class WebCrawlDocumentData(BaseModel):
+class WebCrawlResultItem(BaseModel):
+    """Individual crawl result"""
+
     url: str
     content: str
 
 
+class WebCrawlDocumentData(BaseModel):
+    """Complete crawl results document"""
+
+    urls: List[str]
+    results: List[WebCrawlResultItem]
+    total_count: int
+
+
 class WebCrawlActionDocument(ActionDocument):
     documentData: WebCrawlDocumentData = Field(
-        description="The data extracted from a single crawled URL"
+        description="The data extracted from crawled URLs"
     )
 
@@ -75,14 +93,24 @@ class WebScrapeRequest(BaseModel):
     max_results: int
 
 
-class WebScrapeDocumentData(BaseModel):
+class WebScrapeResultItem(BaseModel):
+    """Individual scrape result"""
+
     url: str
     content: str
 
 
+class WebScrapeDocumentData(BaseModel):
+    """Complete scrape results document"""
+
+    query: str
+    results: List[WebScrapeResultItem]
+    total_count: int
+
+
 class WebScrapeActionDocument(ActionDocument):
     documentData: WebScrapeDocumentData = Field(
-        description="The data extracted from a single scraped URL"
+        description="The data extracted from scraped URLs"
     )
diff --git a/modules/methods/method_web.py b/modules/methods/method_web.py
index 27b82ba5..ccb0f185 100644
--- a/modules/methods/method_web.py
+++ b/modules/methods/method_web.py
@@ -3,7 +3,11 @@
 from typing import Any, Dict
 from modules.chat.methodBase import MethodBase, action
 from modules.interfaces.interfaceChatModel import ActionResult
 from modules.interfaces.interface_web_objects import WebInterface
-from modules.interfaces.interface_web_model import WebSearchRequest
+from modules.interfaces.interface_web_model import (
+    WebSearchRequest,
+    WebCrawlRequest,
+    WebScrapeRequest,
+)
 
 logger = logging.getLogger(__name__)
@@ -14,11 +18,14 @@ class MethodWeb(MethodBase):
 
     def __init__(self, serviceCenter: Any):
         super().__init__(serviceCenter)
+        self.name = "web"
+        self.description = "Web search, crawling, and scraping operations using Tavily"
 
     @action
     async def search(self, parameters: Dict[str, Any]) -> ActionResult:
-        """
-        Perform a web search and output a .txt file with a plain list of URLs (one per line).
+        """Performs a web search and outputs a .json file with a list of found URLs.
+
+        Each result contains "title" and "url".
 
         Parameters:
             query (str): Search query to perform
@@ -41,3 +48,150 @@
 
         except Exception as e:
             return ActionResult(success=False, error=str(e))
+
+    @action
+    async def crawl(self, parameters: Dict[str, Any]) -> ActionResult:
+        """Crawls a list of URLs and extracts information from them.
+
+        Parameters:
+            document (str): Document reference containing URL list from search results
+            expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description
+        """
+        try:
+            document_ref = parameters.get("document")
+
+            if not document_ref:
+                return ActionResult(
+                    success=False, error="No document reference provided."
+                )
+
+            # Resolve document reference to ChatDocument objects
+            chat_documents = self.service.getChatDocumentsFromDocumentList(
+                [document_ref]
+            )
+
+            if not chat_documents:
+                return ActionResult(
+                    success=False,
+                    error=f"No documents found for reference: {document_ref}",
+                )
+
+            # Get the first document (search results)
+            search_doc = chat_documents[0]
+
+            # Get file data using the service center
+            file_data = self.service.getFileData(search_doc.fileId)
+            if not file_data:
+                return ActionResult(
+                    success=False, error="Could not retrieve file data for document"
+                )
+
+            content = file_data.decode("utf-8")
+
+            # Parse JSON to extract URLs from search results
+            import json
+
+            try:
+                # The document structure from WebSearchActionResult
+                search_data = json.loads(content)
+
+                # Extract URLs from the search results structure
+                urls = []
+                if isinstance(search_data, dict):
+                    # Handle the document structure: documentData contains the actual search results
+                    doc_data = search_data.get("documentData", search_data)
+                    if "results" in doc_data and isinstance(doc_data["results"], list):
+                        urls = [
+                            result["url"]
+                            for result in doc_data["results"]
+                            if isinstance(result, dict) and "url" in result
+                        ]
+                    elif "urls" in doc_data and isinstance(doc_data["urls"], list):
+                        # Fallback: if URLs are stored directly in a 'urls' field
+                        urls = [url for url in doc_data["urls"] if isinstance(url, str)]
+
+                # Fallback: try to parse as plain text with regex (for backward compatibility)
+                if not urls:
+                    logger.warning(
+                        "Could not extract URLs from JSON structure, trying plain text parsing"
+                    )
+                    import re
+
+                    urls = re.split(r"[\n,;]+", content)
+                    urls = [
+                        u.strip()
+                        for u in urls
+                        if u.strip()
+                        and (
+                            u.strip().startswith("http://")
+                            or u.strip().startswith("https://")
+                        )
+                    ]
+
+            except json.JSONDecodeError:
+                # Fallback to plain text parsing if JSON parsing fails
+                logger.warning("Document is not valid JSON, trying plain text parsing")
+                import re
+
+                urls = re.split(r"[\n,;]+", content)
+                urls = [
+                    u.strip()
+                    for u in urls
+                    if u.strip()
+                    and (
+                        u.strip().startswith("http://")
+                        or u.strip().startswith("https://")
+                    )
+                ]
+
+            if not urls:
+                return ActionResult(
+                    success=False, error="No valid URLs found in the document."
+                )
+
+            logger.info(f"Extracted {len(urls)} URLs from document: {urls}")
+
+            # Prepare request data
+            web_crawl_request = WebCrawlRequest(urls=urls)
+
+            # Perform request
+            web_interface = await WebInterface.create()
+            web_crawl_result = await web_interface.crawl(web_crawl_request)
+
+            return web_crawl_result
+
+        except Exception as e:
+            logger.error(f"Error in crawl method: {str(e)}")
+            return ActionResult(success=False, error=str(e))
+
+    @action
+    async def scrape(self, parameters: Dict[str, Any]) -> ActionResult:
+        """Scrapes web content by searching for URLs and then extracting their content.
+
+        Combines search and crawl operations in one step.
+
+        Parameters:
+            query (str): Search query to perform
+            maxResults (int, optional): Maximum number of results (default: 10)
+        """
+        try:
+            query = parameters.get("query")
+            max_results = parameters.get("maxResults", 10)
+
+            if not query:
+                return ActionResult(success=False, error="Search query is required")
+
+            # Prepare request data
+            web_scrape_request = WebScrapeRequest(
+                query=query,
+                max_results=max_results,
+            )
+
+            # Perform request
+            web_interface = await WebInterface.create()
+            web_scrape_result = await web_interface.scrape(web_scrape_request)
+
+            return web_scrape_result
+
+        except Exception as e:
+            return ActionResult(success=False, error=str(e))
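
A quick illustration of the new document shape (not part of the patch itself): with the single-file approach, a search action now emits one JSON document whose documentData bundles every hit. The sketch below builds that payload from the Pydantic models added above; the query and URLs are invented, and serialization assumes pydantic v2 (on v1, data.json() would be used instead).

    # Hypothetical example: what the consolidated web_search_results_<timestamp>.json payload holds.
    from modules.interfaces.interface_web_model import (
        WebSearchDocumentData,
        WebSearchResultItem,
    )

    items = [
        WebSearchResultItem(title="Tavily docs", url="https://docs.tavily.com"),
        WebSearchResultItem(title="Tavily blog", url="https://blog.tavily.com"),
    ]
    data = WebSearchDocumentData(query="tavily extract api", results=items, total_count=len(items))

    # Roughly: {"query": "...", "results": [{"title": "...", "url": "..."}, ...], "total_count": 2}
    print(data.model_dump_json(indent=2))  # pydantic v2; on v1 use data.json(indent=2)

The crawl and scrape documents follow the same pattern, with WebCrawlDocumentData keyed by urls and WebScrapeDocumentData keyed by query, which is what the crawl action's JSON parsing in method_web.py relies on when it reads documentData.results back out of a search document.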