"""Web-related modules.

Request/result data models and abstract provider interfaces for the web
search, web crawl and web scrape actions.
"""

from abc import ABC, abstractmethod
from typing import List, Literal, Optional

from pydantic import BaseModel, Field, HttpUrl

from modules.shared.configuration import APP_CONFIG
from modules.datamodels.datamodelWorkflow import ActionDocument, ActionResult


def _int_setting(key: str, fallback: str) -> int:
    """Return ``int(APP_CONFIG.get(key, fallback))``."""
    return int(APP_CONFIG.get(key, fallback))


# Limits applied to the `query` and `max_results` fields of the request models.
WEB_SEARCH_MAX_QUERY_LENGTH: int = _int_setting("Web_Search_MAX_QUERY_LENGTH", "400")
WEB_SEARCH_MAX_RESULTS: int = _int_setting("Web_Search_MAX_RESULTS", "20")
WEB_SEARCH_MIN_RESULTS: int = _int_setting("Web_Search_MIN_RESULTS", "1")

# Closed value sets shared by several request models below.
_Depth = Literal["basic", "advanced"]
_TimeRange = Literal["d", "w", "m", "y"]
_Topic = Literal["general", "news", "academic"]
_ContentFormat = Literal["text", "markdown"]

# NOTE: pydantic surfaces a model's class docstring in its generated schema,
# so models that had no docstring are documented with `#` comments instead.


# --- Web search ---


# Parameters of a web search: the query, a bounded result count and optional
# provider tuning flags (all tuning flags default to None).
class WebSearchRequest(BaseModel):
    query: str = Field(..., min_length=1, max_length=WEB_SEARCH_MAX_QUERY_LENGTH)
    max_results: int = Field(
        ..., ge=WEB_SEARCH_MIN_RESULTS, le=WEB_SEARCH_MAX_RESULTS
    )

    # Tavily tuning options
    search_depth: Optional[_Depth] = None
    time_range: Optional[_TimeRange] = Field(
        default=None,
        description="Limit results to last day/week/month/year",
    )
    topic: Optional[_Topic] = None
    include_domains: Optional[List[str]] = None
    exclude_domains: Optional[List[str]] = None
    language: Optional[str] = Field(
        default=None,
        description="ISO language code like 'en', 'de'",
    )
    include_answer: Optional[bool] = None
    include_raw_content: Optional[bool] = None


class WebSearchResultItem(BaseModel):
    """Individual search result"""

    title: str
    url: HttpUrl


class WebSearchDocumentData(BaseModel):
    """Complete search (and scrape) results document"""

    query: str = Field(..., min_length=1, max_length=WEB_SEARCH_MAX_QUERY_LENGTH)
    # Deliberately loosely typed: entries may be WebSearchResultItem or
    # WebScrapeResultItem instances.
    results: List[object]
    total_count: int


# Action document whose payload is a WebSearchDocumentData.
class WebSearchActionDocument(ActionDocument):
    documentData: WebSearchDocumentData


# Action result carrying search documents; `documents` defaults to a new list.
class WebSearchActionResult(ActionResult):
    documents: List[WebSearchActionDocument] = Field(default_factory=list)


class WebSearchBase(ABC):
    """Abstract interface for web search implementations."""

    @abstractmethod
    async def search_urls(self, request: WebSearchRequest) -> WebSearchActionResult:
        """Handle *request* and return a WebSearchActionResult (subclass hook)."""


# --- Web crawl ---


# Parameters of a crawl: the URLs plus optional extraction settings.
class WebCrawlRequest(BaseModel):
    urls: List[HttpUrl]

    # Tavily extract options
    extract_depth: Optional[_Depth] = None
    format: Optional[_ContentFormat] = None


class WebCrawlResultItem(BaseModel):
    """Individual crawl result"""

    url: HttpUrl
    content: str


class WebCrawlDocumentData(BaseModel):
    """Complete crawl results document"""

    urls: List[HttpUrl]
    results: List[WebCrawlResultItem]
    total_count: int


# Action document whose payload is a WebCrawlDocumentData.
class WebCrawlActionDocument(ActionDocument):
    documentData: WebCrawlDocumentData = Field(
        ...,
        description="The data extracted from crawled URLs",
    )


# Action result carrying crawl documents; `documents` defaults to a new list.
class WebCrawlActionResult(ActionResult):
    documents: List[WebCrawlActionDocument] = Field(default_factory=list)


class WebCrawlBase(ABC):
    """Abstract interface for web crawl implementations."""

    @abstractmethod
    async def crawl_urls(self, request: WebCrawlRequest) -> WebCrawlActionResult:
        """Handle *request* and return a WebCrawlActionResult (subclass hook)."""


# --- Web scrape ---


# Parameters of a scrape: the same fields as WebSearchRequest followed by the
# extraction settings of WebCrawlRequest.
class WebScrapeRequest(BaseModel):
    query: str = Field(..., min_length=1, max_length=WEB_SEARCH_MAX_QUERY_LENGTH)
    max_results: int = Field(
        ..., ge=WEB_SEARCH_MIN_RESULTS, le=WEB_SEARCH_MAX_RESULTS
    )

    # Pass-through search options
    search_depth: Optional[_Depth] = None
    time_range: Optional[_TimeRange] = None
    topic: Optional[_Topic] = None
    include_domains: Optional[List[str]] = None
    exclude_domains: Optional[List[str]] = None
    language: Optional[str] = None
    include_answer: Optional[bool] = None
    include_raw_content: Optional[bool] = None

    # Extract options
    extract_depth: Optional[_Depth] = None
    format: Optional[_ContentFormat] = None


class WebScrapeResultItem(BaseModel):
    """Individual scrape result"""

    url: HttpUrl
    content: str


# Action document for scrape output; it reuses WebSearchDocumentData as payload.
class WebScrapeActionDocument(ActionDocument):
    documentData: WebSearchDocumentData = Field(
        ...,
        description="The data extracted from scraped URLs",
    )


# Action result carrying scrape documents; `documents` defaults to a new list.
class WebScrapeActionResult(ActionResult):
    documents: List[WebScrapeActionDocument] = Field(default_factory=list)


class WebScrapeBase(ABC):
    """Abstract interface for web scrape implementations."""

    @abstractmethod
    async def scrape(self, request: WebScrapeRequest) -> WebScrapeActionResult:
        """Handle *request* and return a WebScrapeActionResult (subclass hook)."""