"""Web-related modules"""
from abc import ABC, abstractmethod
from typing import List, Literal, Optional

from pydantic import BaseModel, Field, HttpUrl

from modules.shared.configuration import APP_CONFIG
from modules.datamodels.datamodelWorkflow import ActionDocument, ActionResult

WEB_SEARCH_MAX_QUERY_LENGTH: int = int(APP_CONFIG.get("Web_Search_MAX_QUERY_LENGTH", "400"))
|
|
WEB_SEARCH_MAX_RESULTS: int = int(APP_CONFIG.get("Web_Search_MAX_RESULTS", "20"))
|
|
WEB_SEARCH_MIN_RESULTS: int = int(APP_CONFIG.get("Web_Search_MIN_RESULTS", "1"))
|
|
|
|
|
|
class WebSearchRequest(BaseModel):
|
|
query: str = Field(min_length=1, max_length=WEB_SEARCH_MAX_QUERY_LENGTH)
|
|
max_results: int = Field(ge=WEB_SEARCH_MIN_RESULTS, le=WEB_SEARCH_MAX_RESULTS)
|
|
# Tavily tuning options
|
|
search_depth: Optional[Literal["basic", "advanced"]] = Field(default=None)
|
|
time_range: Optional[Literal["d", "w", "m", "y"]] = Field(
|
|
default=None, description="Limit results to last day/week/month/year"
|
|
)
|
|
topic: Optional[Literal["general", "news", "academic"]] = Field(default=None)
|
|
include_domains: Optional[List[str]] = Field(default=None)
|
|
exclude_domains: Optional[List[str]] = Field(default=None)
|
|
language: Optional[str] = Field(default=None, description="ISO language code like 'en', 'de'")
|
|
include_answer: Optional[bool] = Field(default=None)
|
|
include_raw_content: Optional[bool] = Field(default=None)
|
|
|
|
|
|
class WebSearchResultItem(BaseModel):
|
|
"""Individual search result"""
|
|
|
|
title: str
|
|
url: HttpUrl
|
|
|
|
|
|
class WebSearchDocumentData(BaseModel):
|
|
"""Complete search (and scrape) results document"""
|
|
|
|
query: str = Field(min_length=1, max_length=WEB_SEARCH_MAX_QUERY_LENGTH)
|
|
# Allow both WebSearchResultItem and WebScrapeResultItem to be stored here
|
|
results: List[object]
|
|
total_count: int
|
|
|
|
|
|
class WebSearchActionDocument(ActionDocument):
|
|
documentData: WebSearchDocumentData
|
|
|
|
|
|
class WebSearchActionResult(ActionResult):
|
|
documents: List[WebSearchActionDocument] = Field(default_factory=list)
|
|
|
|
|
|
class WebSearchBase(ABC):
|
|
@abstractmethod
|
|
async def search_urls(self, request: WebSearchRequest) -> WebSearchActionResult: ...
|
|
|
|
|
|
# --- Web crawl ---
class WebCrawlRequest(BaseModel):
|
|
urls: List[HttpUrl]
|
|
# Tavily extract options
|
|
extract_depth: Optional[Literal["basic", "advanced"]] = Field(default=None)
|
|
format: Optional[Literal["text", "markdown"]] = Field(default=None)
|
|
|
|
|
|
class WebCrawlResultItem(BaseModel):
|
|
"""Individual crawl result"""
|
|
|
|
url: HttpUrl
|
|
content: str
|
|
|
|
|
|
class WebCrawlDocumentData(BaseModel):
|
|
"""Complete crawl results document"""
|
|
|
|
urls: List[HttpUrl]
|
|
results: List[WebCrawlResultItem]
|
|
total_count: int
|
|
|
|
|
|
class WebCrawlActionDocument(ActionDocument):
|
|
documentData: WebCrawlDocumentData = Field(
|
|
description="The data extracted from crawled URLs"
|
|
)
|
|
|
|
|
|
class WebCrawlActionResult(ActionResult):
|
|
documents: List[WebCrawlActionDocument] = Field(default_factory=list)
|
|
|
|
|
|
class WebCrawlBase(ABC):
|
|
@abstractmethod
|
|
async def crawl_urls(self, request: WebCrawlRequest) -> WebCrawlActionResult: ...
|
|
|
|
|
|
# --- Web scrape ---
class WebScrapeRequest(BaseModel):
|
|
query: str = Field(min_length=1, max_length=WEB_SEARCH_MAX_QUERY_LENGTH)
|
|
max_results: int = Field(ge=WEB_SEARCH_MIN_RESULTS, le=WEB_SEARCH_MAX_RESULTS)
|
|
# Pass-through search options
|
|
search_depth: Optional[Literal["basic", "advanced"]] = Field(default=None)
|
|
time_range: Optional[Literal["d", "w", "m", "y"]] = Field(default=None)
|
|
topic: Optional[Literal["general", "news", "academic"]] = Field(default=None)
|
|
include_domains: Optional[List[str]] = Field(default=None)
|
|
exclude_domains: Optional[List[str]] = Field(default=None)
|
|
language: Optional[str] = Field(default=None)
|
|
include_answer: Optional[bool] = Field(default=None)
|
|
include_raw_content: Optional[bool] = Field(default=None)
|
|
# Extract options
|
|
extract_depth: Optional[Literal["basic", "advanced"]] = Field(default=None)
|
|
format: Optional[Literal["text", "markdown"]] = Field(default=None)
|
|
|
|
|
|
class WebScrapeResultItem(BaseModel):
|
|
"""Individual scrape result"""
|
|
|
|
url: HttpUrl
|
|
content: str
|
|
|
|
|
|
class WebScrapeActionDocument(ActionDocument):
|
|
documentData: WebSearchDocumentData = Field(
|
|
description="The data extracted from scraped URLs"
|
|
)
|
|
|
|
|
|
class WebScrapeActionResult(ActionResult):
|
|
documents: List[WebScrapeActionDocument] = Field(default_factory=list)
|
|
|
|
|
|
class WebScrapeBase(ABC):
|
|
@abstractmethod
|
|
async def scrape(self, request: WebScrapeRequest) -> WebScrapeActionResult: ...
|
|
|
|
|