feat: finish implementing web interface w/ tavily connector (untested)
parent 31177063de
commit 0816e7c45c

3 changed files with 116 additions and 11 deletions
@@ -7,6 +7,11 @@ from modules.interfaces.interface_web_model import (
     WebCrawlBase,
     WebCrawlDocumentData,
     WebCrawlRequest,
+    WebScrapeActionDocument,
+    WebScrapeActionResult,
+    WebScrapeBase,
+    WebScrapeDocumentData,
+    WebScrapeRequest,
     WebSearchBase,
     WebSearchRequest,
     WebSearchActionResult,
@@ -25,7 +30,19 @@ logger = logging.getLogger(__name__)


 @dataclass
-class ConnectorTavily(WebSearchBase, WebCrawlBase):
+class TavilySearchResult:
+    title: str
+    url: str
+
+
+@dataclass
+class TavilyCrawlResult:
+    url: str
+    content: str
+
+
+@dataclass
+class ConnectorTavily(WebSearchBase, WebCrawlBase, WebScrapeBase):
     client: AsyncTavilyClient = None

     @classmethod
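
The two dataclasses introduced above give the connector typed internal shapes for Tavily's raw JSON, so later code can use attribute access instead of dict indexing. A minimal sketch of the mapping; the sample payload is invented from the keys this diff actually reads (results[i]["title"] and ["url"]), while the real response comes from AsyncTavilyClient:

    from dataclasses import dataclass

    @dataclass
    class TavilySearchResult:
        title: str
        url: str

    # Illustrative payload only, shaped like the fields the diff reads.
    sample_response = {
        "results": [
            {"title": "Example Domain", "url": "https://example.com"},
        ]
    }

    parsed = [
        TavilySearchResult(title=r["title"], url=r["url"])
        for r in sample_response["results"]
    ]
    print(parsed[0].url)  # https://example.com
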
@@ -67,7 +84,30 @@ class ConnectorTavily(WebSearchBase, WebCrawlBase):

         return result

-    async def _search(self, query: str, max_results: int) -> WebSearchActionResult:
+    async def scrape(self, request: WebScrapeRequest) -> WebScrapeActionResult:
+        """Turns a query into a list of URLs with extracted content."""
+        # Step 1: Search
+        try:
+            search_results = await self._search(request.query, request.max_results)
+        except Exception as e:
+            return WebScrapeActionResult(success=False, error=str(e))
+
+        # Step 2: Crawl
+        try:
+            urls = [result.url for result in search_results]
+            crawl_results = await self._crawl(urls)
+        except Exception as e:
+            return WebScrapeActionResult(success=False, error=str(e))
+
+        # Step 3: Build ActionResult
+        try:
+            result = self._build_scrape_action_result(crawl_results)
+        except Exception as e:
+            return WebScrapeActionResult(success=False, error=str(e))
+
+        return result
+
+    async def _search(self, query: str, max_results: int) -> list[TavilySearchResult]:
         """Calls the Tavily API to perform a web search."""
         # Make sure max_results is within the allowed range
         if max_results < 0 or max_results > 20:
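
Each stage of the new scrape() pipeline catches its exception and returns a failed WebScrapeActionResult instead of raising, so callers branch on success. A hedged sketch of that failure path; constructing ConnectorTavily() bare (client left as None) is an assumption here, since the commit's @classmethod factory falls outside this hunk:

    import asyncio

    async def demo_failure() -> None:
        connector = ConnectorTavily()  # client stays None, so the search call should raise
        result = await connector.scrape(WebScrapeRequest(query="anything", max_results=1))
        assert result.success is False
        print("error captured:", result.error)

    asyncio.run(demo_failure())

Since the three except blocks are identical, which stage failed is only recoverable from the error string; tagging the stage in the error message may be worth considering.
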
@@ -78,18 +118,19 @@ class ConnectorTavily(WebSearchBase, WebCrawlBase):

         logger.info(f"Tavily API response:\n{response}")

-        return response["results"]
+        return [
+            TavilySearchResult(title=result["title"], url=result["url"])
+            for result in response["results"]
+        ]

     def _build_search_action_result(
-        self, search_results: list
+        self, search_results: list[TavilySearchResult]
     ) -> WebSearchActionResult:
         """Builds the ActionResult from the search results."""
         documents = []
         for result in search_results:
             document_name = f"web_search_{get_utc_timestamp()}.txt"
-            document_data = WebSearchDocumentData(
-                title=result["title"], url=result["url"]
-            )
+            document_data = WebSearchDocumentData(title=result.title, url=result.url)
             mime_type = "application/json"
             doc = WebSearchActionDocument(
                 documentName=document_name,
@@ -107,9 +148,14 @@ class ConnectorTavily(WebSearchBase, WebCrawlBase):
         response = await self.client.extract(
             urls=urls, extract_depth="advanced", format="text"
         )
-        return response["results"]
+        return [
+            TavilyCrawlResult(url=result["url"], content=result["raw_content"])
+            for result in response["results"]
+        ]

-    def _build_crawl_action_result(self, crawl_results: list) -> WebCrawlActionResult:
+    def _build_crawl_action_result(
+        self, crawl_results: list[TavilyCrawlResult]
+    ) -> WebCrawlActionResult:
         """Builds the ActionResult from the crawl results."""
         documents = []
         for result in crawl_results:
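
The crawl path wraps AsyncTavilyClient.extract with extract_depth="advanced" and format="text", then narrows the response to the two fields the builders need. The same call in isolation, with the API key as a placeholder assumption:

    import asyncio
    from tavily import AsyncTavilyClient

    async def demo_extract() -> None:
        client = AsyncTavilyClient(api_key="tvly-...")  # placeholder key
        response = await client.extract(
            urls=["https://example.com"], extract_depth="advanced", format="text"
        )
        for item in response["results"]:
            print(item["url"], len(item["raw_content"]), "chars extracted")

    asyncio.run(demo_extract())
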
@@ -128,3 +174,23 @@ class ConnectorTavily(WebSearchBase, WebCrawlBase):
         return WebCrawlActionResult(
             success=True, documents=documents, resultLabel="web_crawl_results"
         )
+
+    def _build_scrape_action_result(
+        self, crawl_results: list[TavilyCrawlResult]
+    ) -> WebScrapeActionResult:
+        """Builds the ActionResult from the scrape results."""
+        documents = []
+        for result in crawl_results:
+            document_name = f"web_scrape_{get_utc_timestamp()}.txt"
+            doc_data = WebScrapeDocumentData(url=result.url, content=result.content)
+            mime_type = "application/json"
+            doc = WebScrapeActionDocument(
+                documentName=document_name,
+                documentData=doc_data,
+                mimeType=mime_type,
+            )
+            documents.append(doc)
+
+        return WebScrapeActionResult(
+            success=True, documents=documents, resultLabel="web_scrape_results"
+        )
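
With _build_scrape_action_result in place, the connector has three builders that differ only in document-name prefix, data model, and result label. Not part of the commit, but the shared shape could collapse into one hypothetical helper along these lines:

    def build_action_result(results, make_data, doc_cls, result_cls, prefix, label):
        """Generic form of the three _build_*_action_result methods (sketch only)."""
        documents = [
            doc_cls(
                documentName=f"{prefix}_{get_utc_timestamp()}.txt",
                documentData=make_data(result),
                mimeType="application/json",
            )
            for result in results
        ]
        return result_cls(success=True, documents=documents, resultLabel=label)

One detail all three builders share: .txt document names paired with an application/json MIME type, which may deserve a look before this leaves "untested". The next hunk extends the shared web-model module with the scrape types used above.
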
@@ -65,6 +65,31 @@ class WebCrawlBase(ABC):
     async def crawl_urls(self, request: WebCrawlRequest) -> WebCrawlActionResult: ...


-# --- Web query ---
+# --- Web scrape ---

-# query -> list of extracted text; combines web search and crawl in one step
+# scrape -> list of extracted text; combines web search and crawl in one step
+
+
+class WebScrapeRequest(BaseModel):
+    query: str
+    max_results: int
+
+
+class WebScrapeDocumentData(BaseModel):
+    url: str
+    content: str
+
+
+class WebScrapeActionDocument(ActionDocument):
+    documentData: WebScrapeDocumentData = Field(
+        description="The data extracted from a single scraped URL"
+    )
+
+
+class WebScrapeActionResult(ActionResult):
+    documents: List[WebScrapeActionDocument] = Field(default_factory=list)
+
+
+class WebScrapeBase(ABC):
+    @abstractmethod
+    async def scrape(self, request: WebScrapeRequest) -> WebScrapeActionResult: ...
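
A quick instantiation check of the new Pydantic models; this assumes Pydantic v2 (model_dump_json) and that ActionDocument and ActionResult require no fields beyond those the connector sets:

    doc = WebScrapeActionDocument(
        documentName="web_scrape_20250101T000000Z.txt",  # illustrative name
        documentData=WebScrapeDocumentData(url="https://example.com", content="..."),
        mimeType="application/json",
    )
    result = WebScrapeActionResult(
        success=True, documents=[doc], resultLabel="web_scrape_results"
    )
    print(result.model_dump_json(indent=2))

The final hunks wire the scrape pathway through the WebInterface facade.
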
@@ -1,6 +1,10 @@
 from modules.interfaces.interface_web_model import (
+    WebCrawlActionResult,
     WebSearchActionResult,
     WebSearchRequest,
+    WebCrawlRequest,
+    WebScrapeActionResult,
+    WebScrapeRequest,
 )

 from dataclasses import dataclass
@@ -22,3 +26,13 @@ class WebInterface:
     ) -> WebSearchActionResult:
         # NOTE: Add connectors here
         return await self.connector_tavily.search_urls(web_search_request)
+
+    async def crawl(self, web_crawl_request: WebCrawlRequest) -> WebCrawlActionResult:
+        # NOTE: Add connectors here
+        return await self.connector_tavily.crawl_urls(web_crawl_request)
+
+    async def scrape(
+        self, web_scrape_request: WebScrapeRequest
+    ) -> WebScrapeActionResult:
+        # NOTE: Add connectors here
+        return await self.connector_tavily.scrape(web_scrape_request)
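
An end-to-end sketch of the facade as wired above; WebInterface's keyword construction is assumed from the dataclass import and the connector_tavily attribute these methods use, and the query string is arbitrary:

    import asyncio
    from tavily import AsyncTavilyClient

    async def main() -> None:
        connector = ConnectorTavily(client=AsyncTavilyClient(api_key="tvly-..."))
        web = WebInterface(connector_tavily=connector)
        scraped = await web.scrape(
            WebScrapeRequest(query="site reliability engineering", max_results=2)
        )
        print(scraped.resultLabel, len(scraped.documents))

    asyncio.run(main())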