feat: finish implementing web interface w/ tavily connector (untested)

Christopher Gondek 2025-09-01 10:15:10 +02:00
parent 31177063de
commit 0816e7c45c
3 changed files with 116 additions and 11 deletions


@@ -7,6 +7,11 @@ from modules.interfaces.interface_web_model import (
     WebCrawlBase,
     WebCrawlDocumentData,
     WebCrawlRequest,
+    WebScrapeActionDocument,
+    WebScrapeActionResult,
+    WebScrapeBase,
+    WebScrapeDocumentData,
+    WebScrapeRequest,
     WebSearchBase,
     WebSearchRequest,
     WebSearchActionResult,
@@ -25,7 +30,19 @@ logger = logging.getLogger(__name__)
 @dataclass
-class ConnectorTavily(WebSearchBase, WebCrawlBase):
+class TavilySearchResult:
+    title: str
+    url: str
+
+
+@dataclass
+class TavilyCrawlResult:
+    url: str
+    content: str
+
+
+@dataclass
+class ConnectorTavily(WebSearchBase, WebCrawlBase, WebScrapeBase):
     client: AsyncTavilyClient = None

     @classmethod
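
Note: the two dataclasses above replace the raw response dicts that the Tavily client returns (the conversion happens in the later hunks), so downstream code gets attribute access and type hints instead of string keys. A small illustration; the raw payload below is illustrative, not a recorded API response:

from dataclasses import dataclass

@dataclass
class TavilySearchResult:
    title: str
    url: str

raw = {"title": "Example Domain", "url": "https://example.com", "score": 0.93}
result = TavilySearchResult(title=raw["title"], url=raw["url"])  # only title/url are kept
print(result.url)  # attribute access; a typo here fails loudly, unlike dict keys
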
@@ -67,7 +84,30 @@ class ConnectorTavily(WebSearchBase, WebCrawlBase):
         return result

-    async def _search(self, query: str, max_results: int) -> WebSearchActionResult:
+    async def scrape(self, request: WebScrapeRequest) -> WebScrapeActionResult:
+        """Turns a query into a list of URLs with extracted content."""
+        # Step 1: Search
+        try:
+            search_results = await self._search(request.query, request.max_results)
+        except Exception as e:
+            return WebScrapeActionResult(success=False, error=str(e))
+
+        # Step 2: Crawl
+        try:
+            urls = [result.url for result in search_results]
+            crawl_results = await self._crawl(urls)
+        except Exception as e:
+            return WebScrapeActionResult(success=False, error=str(e))
+
+        # Step 3: Build ActionResult
+        try:
+            result = self._build_scrape_action_result(crawl_results)
+        except Exception as e:
+            return WebScrapeActionResult(success=False, error=str(e))
+
+        return result
+
+    async def _search(self, query: str, max_results: int) -> list[TavilySearchResult]:
         """Calls the Tavily API to perform a web search."""
         # Make sure max_results is within the allowed range
         if max_results < 0 or max_results > 20:
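
Note on the pattern above: scrape() fails soft, converting any step's exception into a structured failure instead of raising. A standalone sketch of that pattern; every name below is a hypothetical stand-in, not from this codebase:

import asyncio
from dataclasses import dataclass, field

@dataclass
class PipelineResult:
    success: bool
    error: str | None = None
    pages: list[str] = field(default_factory=list)

async def flaky_fetch(fail: bool) -> list[str]:
    # Hypothetical stand-in for a search or crawl step that may raise.
    if fail:
        raise RuntimeError("network down")
    return ["page one", "page two"]

async def run_pipeline(fail: bool) -> PipelineResult:
    try:
        pages = await flaky_fetch(fail)
    except Exception as e:
        # The caller gets a structured failure; nothing propagates as a raise.
        return PipelineResult(success=False, error=str(e))
    return PipelineResult(success=True, pages=pages)

print(asyncio.run(run_pipeline(fail=True)))
print(asyncio.run(run_pipeline(fail=False)))

This keeps the action surface exception-free, so an orchestrator only ever branches on the success flag.
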
@@ -78,18 +118,19 @@ class ConnectorTavily(WebSearchBase, WebCrawlBase):
         logger.info(f"Tavily API response:\n{response}")
-        return response["results"]
+        return [
+            TavilySearchResult(title=result["title"], url=result["url"])
+            for result in response["results"]
+        ]

     def _build_search_action_result(
-        self, search_results: list
+        self, search_results: list[TavilySearchResult]
     ) -> WebSearchActionResult:
         """Builds the ActionResult from the search results."""
         documents = []
         for result in search_results:
             document_name = f"web_search_{get_utc_timestamp()}.txt"
-            document_data = WebSearchDocumentData(
-                title=result["title"], url=result["url"]
-            )
+            document_data = WebSearchDocumentData(title=result.title, url=result.url)
             mime_type = "application/json"
             doc = WebSearchActionDocument(
                 documentName=document_name,
@@ -107,9 +148,14 @@ class ConnectorTavily(WebSearchBase, WebCrawlBase):
         response = await self.client.extract(
             urls=urls, extract_depth="advanced", format="text"
         )
-        return response["results"]
+        return [
+            TavilyCrawlResult(url=result["url"], content=result["raw_content"])
+            for result in response["results"]
+        ]

-    def _build_crawl_action_result(self, crawl_results: list) -> WebCrawlActionResult:
+    def _build_crawl_action_result(
+        self, crawl_results: list[TavilyCrawlResult]
+    ) -> WebCrawlActionResult:
         """Builds the ActionResult from the crawl results."""
         documents = []
         for result in crawl_results:
@@ -128,3 +174,23 @@ class ConnectorTavily(WebSearchBase, WebCrawlBase):
         return WebCrawlActionResult(
             success=True, documents=documents, resultLabel="web_crawl_results"
         )
+
+    def _build_scrape_action_result(
+        self, crawl_results: list[TavilyCrawlResult]
+    ) -> WebScrapeActionResult:
+        """Builds the ActionResult from the scrape results."""
+        documents = []
+        for result in crawl_results:
+            document_name = f"web_scrape_{get_utc_timestamp()}.txt"
+            doc_data = WebScrapeDocumentData(url=result.url, content=result.content)
+            mime_type = "application/json"
+            doc = WebScrapeActionDocument(
+                documentName=document_name,
+                documentData=doc_data,
+                mimeType=mime_type,
+            )
+            documents.append(doc)
+        return WebScrapeActionResult(
+            success=True, documents=documents, resultLabel="web_scrape_results"
+        )
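
Since the commit message flags the feature as untested, the pure builder above is the easiest place to start: it never touches the client, so ConnectorTavily can be constructed bare. A minimal test sketch; the connector's module path is an assumption, the rest comes from the hunks above:

from modules.connectors.connector_tavily import (  # assumed module path
    ConnectorTavily,
    TavilyCrawlResult,
)

def test_build_scrape_action_result():
    connector = ConnectorTavily()  # client stays None; the builder never uses it
    results = [TavilyCrawlResult(url="https://example.com", content="hello")]
    action_result = connector._build_scrape_action_result(results)
    assert action_result.success
    assert action_result.resultLabel == "web_scrape_results"
    assert action_result.documents[0].documentData.url == "https://example.com"
    assert action_result.documents[0].documentData.content == "hello"
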


@@ -65,6 +65,31 @@ class WebCrawlBase(ABC):
     async def crawl_urls(self, request: WebCrawlRequest) -> WebCrawlActionResult: ...

-# --- Web query ---
-# query -> list of extracted text; combines web search and crawl in one step
+# --- Web scrape ---
+# scrape -> list of extracted text; combines web search and crawl in one step
+class WebScrapeRequest(BaseModel):
+    query: str
+    max_results: int
+
+
+class WebScrapeDocumentData(BaseModel):
+    url: str
+    content: str
+
+
+class WebScrapeActionDocument(ActionDocument):
+    documentData: WebScrapeDocumentData = Field(
+        description="The data extracted from a single scraped URL"
+    )
+
+
+class WebScrapeActionResult(ActionResult):
+    documents: List[WebScrapeActionDocument] = Field(default_factory=list)
+
+
+class WebScrapeBase(ABC):
+    @abstractmethod
+    async def scrape(self, request: WebScrapeRequest) -> WebScrapeActionResult: ...
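
The new models are plain Pydantic classes, so request validation and document serialization come for free. A quick sketch, assuming Pydantic v2 (model_dump_json):

from modules.interfaces.interface_web_model import (
    WebScrapeDocumentData,
    WebScrapeRequest,
)

# Requests are validated on construction; a non-integer max_results raises.
request = WebScrapeRequest(query="site reliability engineering", max_results=3)

# Document payloads serialize to JSON, matching the "application/json" mime type.
data = WebScrapeDocumentData(url="https://example.com", content="Example text")
print(data.model_dump_json())  # {"url":"https://example.com","content":"Example text"}
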


@@ -1,6 +1,10 @@
 from modules.interfaces.interface_web_model import (
+    WebCrawlActionResult,
     WebSearchActionResult,
     WebSearchRequest,
+    WebCrawlRequest,
+    WebScrapeActionResult,
+    WebScrapeRequest,
 )
 from dataclasses import dataclass
@@ -22,3 +26,13 @@ class WebInterface:
     ) -> WebSearchActionResult:
         # NOTE: Add connectors here
         return await self.connector_tavily.search_urls(web_search_request)
+
+    async def crawl(self, web_crawl_request: WebCrawlRequest) -> WebCrawlActionResult:
+        # NOTE: Add connectors here
+        return await self.connector_tavily.crawl_urls(web_crawl_request)
+
+    async def scrape(
+        self, web_scrape_request: WebScrapeRequest
+    ) -> WebScrapeActionResult:
+        # NOTE: Add connectors here
+        return await self.connector_tavily.scrape(web_scrape_request)
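
For completeness, a hedged end-to-end wiring sketch. WebInterface's own fields and ConnectorTavily's @classmethod factory are outside these hunks, so both module paths and the constructor calls below are assumptions:

import asyncio
import os

from tavily import AsyncTavilyClient

from modules.connectors.connector_tavily import ConnectorTavily  # assumed path
from modules.interfaces.interface_web import WebInterface  # assumed path
from modules.interfaces.interface_web_model import WebScrapeRequest

async def main() -> None:
    # Assumed wiring: inject a ready client rather than going through the factory.
    client = AsyncTavilyClient(api_key=os.environ["TAVILY_API_KEY"])
    web = WebInterface(connector_tavily=ConnectorTavily(client=client))
    result = await web.scrape(WebScrapeRequest(query="tavily extract api", max_results=2))
    print(result.success, [doc.documentData.url for doc in result.documents])

asyncio.run(main())
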