diff --git a/modules/connectors/connector_tavily.py b/modules/connectors/connector_tavily.py
index 783fea8c..786dc1f1 100644
--- a/modules/connectors/connector_tavily.py
+++ b/modules/connectors/connector_tavily.py
@@ -7,6 +7,11 @@ from modules.interfaces.interface_web_model import (
     WebCrawlBase,
     WebCrawlDocumentData,
     WebCrawlRequest,
+    WebScrapeActionDocument,
+    WebScrapeActionResult,
+    WebScrapeBase,
+    WebScrapeDocumentData,
+    WebScrapeRequest,
     WebSearchBase,
     WebSearchRequest,
     WebSearchActionResult,
@@ -25,7 +30,19 @@ logger = logging.getLogger(__name__)
 
 
 @dataclass
-class ConnectorTavily(WebSearchBase, WebCrawlBase):
+class TavilySearchResult:
+    title: str
+    url: str
+
+
+@dataclass
+class TavilyCrawlResult:
+    url: str
+    content: str
+
+
+@dataclass
+class ConnectorTavily(WebSearchBase, WebCrawlBase, WebScrapeBase):
     client: AsyncTavilyClient = None
 
     @classmethod
@@ -67,7 +84,30 @@ class ConnectorTavily(WebSearchBase, WebCrawlBase):
 
         return result
 
-    async def _search(self, query: str, max_results: int) -> WebSearchActionResult:
+    async def scrape(self, request: WebScrapeRequest) -> WebScrapeActionResult:
+        """Turns a query into a list of URLs with extracted content."""
+        # Step 1: Search
+        try:
+            search_results = await self._search(request.query, request.max_results)
+        except Exception as e:
+            return WebScrapeActionResult(success=False, error=str(e))
+
+        # Step 2: Crawl
+        try:
+            urls = [result.url for result in search_results]
+            crawl_results = await self._crawl(urls)
+        except Exception as e:
+            return WebScrapeActionResult(success=False, error=str(e))
+
+        # Step 3: Build ActionResult
+        try:
+            result = self._build_scrape_action_result(crawl_results)
+        except Exception as e:
+            return WebScrapeActionResult(success=False, error=str(e))
+
+        return result
+
+    async def _search(self, query: str, max_results: int) -> list[TavilySearchResult]:
         """Calls the Tavily API to perform a web search."""
         # Make sure max_results is within the allowed range
         if max_results < 0 or max_results > 20:
@@ -78,18 +118,19 @@ class ConnectorTavily(WebSearchBase, WebCrawlBase):
 
         logger.info(f"Tavily API response:\n{response}")
 
-        return response["results"]
+        return [
+            TavilySearchResult(title=result["title"], url=result["url"])
+            for result in response["results"]
+        ]
 
     def _build_search_action_result(
-        self, search_results: list
+        self, search_results: list[TavilySearchResult]
     ) -> WebSearchActionResult:
         """Builds the ActionResult from the search results."""
         documents = []
         for result in search_results:
             document_name = f"web_search_{get_utc_timestamp()}.txt"
-            document_data = WebSearchDocumentData(
-                title=result["title"], url=result["url"]
-            )
+            document_data = WebSearchDocumentData(title=result.title, url=result.url)
             mime_type = "application/json"
             doc = WebSearchActionDocument(
                 documentName=document_name,
@@ -107,9 +148,14 @@ class ConnectorTavily(WebSearchBase, WebCrawlBase):
         response = await self.client.extract(
             urls=urls, extract_depth="advanced", format="text"
         )
-        return response["results"]
+        return [
+            TavilyCrawlResult(url=result["url"], content=result["raw_content"])
+            for result in response["results"]
+        ]
 
-    def _build_crawl_action_result(self, crawl_results: list) -> WebCrawlActionResult:
+    def _build_crawl_action_result(
+        self, crawl_results: list[TavilyCrawlResult]
+    ) -> WebCrawlActionResult:
         """Builds the ActionResult from the crawl results."""
         documents = []
         for result in crawl_results:
@@ -128,3 +174,23 @@ class ConnectorTavily(WebSearchBase, WebCrawlBase):
         return WebCrawlActionResult(
             success=True, documents=documents, resultLabel="web_crawl_results"
         )
+
+    def _build_scrape_action_result(
+        self, crawl_results: list[TavilyCrawlResult]
+    ) -> WebScrapeActionResult:
+        """Builds the ActionResult from the scrape results."""
+        documents = []
+        for result in crawl_results:
+            document_name = f"web_scrape_{get_utc_timestamp()}.txt"
+            doc_data = WebScrapeDocumentData(url=result.url, content=result.content)
+            mime_type = "application/json"
+            doc = WebScrapeActionDocument(
+                documentName=document_name,
+                documentData=doc_data,
+                mimeType=mime_type,
+            )
+            documents.append(doc)
+
+        return WebScrapeActionResult(
+            success=True, documents=documents, resultLabel="web_scrape_results"
+        )
diff --git a/modules/interfaces/interface_web_model.py b/modules/interfaces/interface_web_model.py
index 0a258623..389cd7ed 100644
--- a/modules/interfaces/interface_web_model.py
+++ b/modules/interfaces/interface_web_model.py
@@ -65,6 +65,31 @@ class WebCrawlBase(ABC):
     @abstractmethod
     async def crawl_urls(self, request: WebCrawlRequest) -> WebCrawlActionResult: ...
 
-# --- Web query ---
+# --- Web scrape ---
 
-# query -> list of extracted text; combines web search and crawl in one step
+# scrape -> list of extracted text; combines web search and crawl in one step
+
+
+class WebScrapeRequest(BaseModel):
+    query: str
+    max_results: int
+
+
+class WebScrapeDocumentData(BaseModel):
+    url: str
+    content: str
+
+
+class WebScrapeActionDocument(ActionDocument):
+    documentData: WebScrapeDocumentData = Field(
+        description="The data extracted from a single scraped URL"
+    )
+
+
+class WebScrapeActionResult(ActionResult):
+    documents: List[WebScrapeActionDocument] = Field(default_factory=list)
+
+
+class WebScrapeBase(ABC):
+    @abstractmethod
+    async def scrape(self, request: WebScrapeRequest) -> WebScrapeActionResult: ...
diff --git a/modules/interfaces/interface_web_objects.py b/modules/interfaces/interface_web_objects.py
index b38db6a3..0ea43bd7 100644
--- a/modules/interfaces/interface_web_objects.py
+++ b/modules/interfaces/interface_web_objects.py
@@ -1,6 +1,10 @@
 from modules.interfaces.interface_web_model import (
+    WebCrawlActionResult,
+    WebCrawlRequest,
+    WebScrapeActionResult,
+    WebScrapeRequest,
     WebSearchActionResult,
     WebSearchRequest,
 )
 
 from dataclasses import dataclass
@@ -22,3 +26,13 @@ class WebInterface:
     ) -> WebSearchActionResult:
         # NOTE: Add connectors here
         return await self.connector_tavily.search_urls(web_search_request)
+
+    async def crawl(self, web_crawl_request: WebCrawlRequest) -> WebCrawlActionResult:
+        # NOTE: Add connectors here
+        return await self.connector_tavily.crawl_urls(web_crawl_request)
+
+    async def scrape(
+        self, web_scrape_request: WebScrapeRequest
+    ) -> WebScrapeActionResult:
+        # NOTE: Add connectors here
+        return await self.connector_tavily.scrape(web_scrape_request)
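
For reviewers, a minimal sketch of how the new scrape path might be exercised end to end. It assumes ConnectorTavily can be constructed directly with an AsyncTavilyClient (the diff shows a @classmethod constructor, but its name is not visible, so direct construction is used here) and that WebInterface accepts the connector as its connector_tavily field; the API key and query are placeholders.

# Hypothetical usage sketch; not part of the diff above.
import asyncio

from tavily import AsyncTavilyClient

from modules.connectors.connector_tavily import ConnectorTavily
from modules.interfaces.interface_web_model import WebScrapeRequest
from modules.interfaces.interface_web_objects import WebInterface


async def main() -> None:
    # Direct construction is an assumption; the connector's client field
    # defaults to None, so the Tavily client is supplied explicitly.
    connector = ConnectorTavily(client=AsyncTavilyClient(api_key="tvly-..."))
    web = WebInterface(connector_tavily=connector)

    # scrape = search + crawl in one step, per the new interface comment.
    result = await web.scrape(WebScrapeRequest(query="python asyncio", max_results=3))
    if result.success:
        for doc in result.documents:
            print(doc.documentData.url, len(doc.documentData.content))
    else:
        print("scrape failed:", result.error)


if __name__ == "__main__":
    asyncio.run(main())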
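The new _build_scrape_action_result is pure data shaping, so it can be checked without any network access. A test sketch along these lines, assuming pytest-style asserts; the client field is left at its None default, which the builder never touches.

# Hypothetical unit check for the new builder; not part of the diff above.
from modules.connectors.connector_tavily import ConnectorTavily, TavilyCrawlResult


def test_build_scrape_action_result() -> None:
    connector = ConnectorTavily()  # client stays None; the builder never uses it
    crawl_results = [
        TavilyCrawlResult(url="https://example.com", content="extracted text"),
    ]

    result = connector._build_scrape_action_result(crawl_results)

    assert result.success
    assert result.resultLabel == "web_scrape_results"
    assert len(result.documents) == 1
    assert result.documents[0].documentData.url == "https://example.com"
    assert result.documents[0].documentName.startswith("web_scrape_")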