# gateway/modules/datamodels/datamodelWeb.py
# 2025-10-02 21:29:21 +02:00
#
# 157 lines
# 6.1 KiB
# Python

"""Web-related modules"""
from abc import ABC, abstractmethod
from pydantic import BaseModel, Field, HttpUrl
from typing import List, Optional, Literal, Dict, Any
from modules.shared.configuration import APP_CONFIG
from modules.datamodels.datamodelWorkflow import ActionDocument, ActionResult
# Web-search limits sourced from app configuration; values arrive as strings
# (hence the string fallbacks) and are coerced to int at import time.
WEB_SEARCH_MAX_QUERY_LENGTH: int = int(APP_CONFIG.get("Web_Search_MAX_QUERY_LENGTH", "400"))  # max chars in a search query
WEB_SEARCH_MAX_RESULTS: int = int(APP_CONFIG.get("Web_Search_MAX_RESULTS", "20"))  # upper bound on requested results
WEB_SEARCH_MIN_RESULTS: int = int(APP_CONFIG.get("Web_Search_MIN_RESULTS", "1"))  # lower bound on requested results
class WebResearchOptions(BaseModel):
    """Advanced options for the web research workflow.

    Crawl sizing (max_pages, pages_search_depth) and content shaping
    (format, return_report) have concrete defaults; the remaining search
    tuning fields default to None, i.e. "let the backend decide".
    """
    # Crawl sizing and output shaping — concrete defaults.
    max_pages: int = Field(default=10, ge=1, le=50, description="Maximum pages to crawl")
    search_depth: Literal["basic", "advanced"] = Field(default="basic", description="Tavily search depth")
    extract_depth: Literal["basic", "advanced"] = Field(default="advanced", description="Tavily extract depth")
    format: Literal["text", "markdown"] = Field(default="markdown", description="Content format")
    return_report: bool = Field(default=True, description="Return formatted report or raw data")
    pages_search_depth: int = Field(default=1, ge=1, le=5, description="How deep to crawl: 1=main pages only, 2=main+sub-pages, 3=main+sub+sub-sub, etc.")
    # Optional search tuning — None means "not specified" and is presumably
    # omitted when building the backend request (confirm in the connector).
    country: Optional[str] = Field(default=None, description="Country code for search bias")
    time_range: Optional[Literal["d", "w", "m", "y"]] = Field(default=None, description="Time range for search")
    topic: Optional[Literal["general", "news", "academic"]] = Field(default=None, description="Search topic")
    language: Optional[str] = Field(default=None, description="Language code")
    include_answer: Optional[bool] = Field(default=None, description="Include AI answer")
    include_raw_content: Optional[bool] = Field(default=None, description="Include raw content")
class WebResearchRequest(BaseModel):
    """Main web research request.

    Validates the user's query against the configured length limit and
    bounds the requested result count by the configured min/max.
    """
    search_query: str = Field(min_length=1, max_length=WEB_SEARCH_MAX_QUERY_LENGTH, description="User's research question")
    urls: Optional[List[str]] = Field(default=None, description="Specific URLs to crawl (optional)")
    # Fix: the lower bound was hard-coded to 1, silently ignoring the
    # WEB_SEARCH_MIN_RESULTS configuration read at module level. With the
    # default config value ("1") behavior is unchanged.
    max_results: int = Field(default=5, ge=WEB_SEARCH_MIN_RESULTS, le=WEB_SEARCH_MAX_RESULTS, description="Max search results")
    options: WebResearchOptions = Field(default_factory=WebResearchOptions, description="Advanced options")
class WebSearchResultItem(BaseModel):
    """Individual search result returned by the search backend."""
    title: str  # page title as reported by the search provider
    url: HttpUrl  # validated absolute URL of the result
    raw_content: Optional[str] = Field(default=None, description="Raw HTML content")
class WebCrawlResultItem(BaseModel):
    """Individual crawl result: one URL and its extracted content."""
    # NOTE(review): structurally identical to WebScrapeResultItem below —
    # consider consolidating if the two are interchangeable for callers.
    url: HttpUrl
    content: str
class WebResearchDocumentData(BaseModel):
    """Complete web research results for one research request."""
    search_query: str  # the query that produced these results
    websites_analyzed: int  # count of pages actually processed
    additional_links_found: int  # count of links discovered but not crawled
    analysis_result: str  # synthesized analysis text (report or raw, per options)
    sources: List[WebSearchResultItem]  # the analyzed search results
    additional_links: List[str]  # discovered-but-uncrawled links
    individual_content: Optional[Dict[str, str]] = None  # URL -> content mapping
    debug_info: Optional[Dict[str, Any]] = None  # free-form diagnostics, shape not fixed here
class WebResearchActionDocument(ActionDocument):
    """Workflow ActionDocument carrying web research results as its payload."""
    documentData: WebResearchDocumentData  # camelCase follows the ActionDocument convention
class WebResearchActionResult(ActionResult):
    """Workflow ActionResult holding zero or more web research documents."""
    documents: List[WebResearchActionDocument] = Field(default_factory=list)
class WebResearchBase(ABC):
    """Interface for web research providers; implementations perform the
    actual search/crawl and return a populated WebResearchActionResult."""
    @abstractmethod
    async def web_research(self, request: WebResearchRequest) -> WebResearchActionResult: ...
# Legacy models for connector compatibility
class WebSearchDocumentData(BaseModel):
    """Search results document data (legacy connector shape)."""
    query: str  # the search query that was executed
    results: List[WebSearchResultItem]
    total_count: int  # total results reported; may exceed len(results)
class WebSearchActionDocument(ActionDocument):
    """Legacy ActionDocument wrapping search results."""
    documentData: WebSearchDocumentData
class WebSearchActionResult(ActionResult):
    """Legacy ActionResult holding search documents."""
    documents: List[WebSearchActionDocument] = Field(default_factory=list)
class WebCrawlDocumentData(BaseModel):
    """Crawl results document data (legacy connector shape)."""
    urls: List[HttpUrl]  # the URLs that were requested for crawling
    results: List[WebCrawlResultItem]
    total_count: int  # number of crawl results
class WebCrawlActionDocument(ActionDocument):
    """Legacy ActionDocument wrapping crawl results."""
    documentData: WebCrawlDocumentData
class WebCrawlActionResult(ActionResult):
    """Legacy ActionResult holding crawl documents."""
    documents: List[WebCrawlActionDocument] = Field(default_factory=list)
class WebScrapeDocumentData(BaseModel):
    """Scrape results document data (legacy connector shape).

    Mirrors WebSearchDocumentData: scrape results reuse the search result
    item type rather than WebScrapeResultItem.
    """
    query: str
    results: List[WebSearchResultItem]
    total_count: int
class WebScrapeActionDocument(ActionDocument):
    """Legacy ActionDocument wrapping scrape results."""
    documentData: WebScrapeDocumentData
class WebScrapeActionResult(ActionResult):
    """Legacy ActionResult holding scrape documents."""
    documents: List[WebScrapeActionDocument] = Field(default_factory=list)
class WebSearchRequest(BaseModel):
    """Search request for Tavily.

    Only `query` is required; every other field is an optional tuning
    parameter where None means "not specified". Note that unlike
    WebResearchRequest, `max_results` carries no validation bounds here.
    """
    query: str
    max_results: int = 5
    search_depth: Optional[Literal["basic", "advanced"]] = None
    time_range: Optional[Literal["d", "w", "m", "y"]] = None
    topic: Optional[Literal["general", "news", "academic"]] = None
    include_domains: Optional[List[str]] = None  # restrict results to these domains
    exclude_domains: Optional[List[str]] = None  # drop results from these domains
    language: Optional[str] = None
    include_answer: Optional[bool] = None
    include_raw_content: Optional[bool] = None
    auto_parameters: Optional[bool] = None  # let the backend auto-tune search parameters
    country: Optional[str] = None
class WebCrawlRequest(BaseModel):
    """Crawl request for Tavily: URLs to fetch plus extraction tuning."""
    urls: List[HttpUrl]  # pydantic validates each entry as an absolute URL
    extract_depth: Optional[Literal["basic", "advanced"]] = None
    format: Optional[Literal["text", "markdown"]] = None
class WebScrapeRequest(BaseModel):
    """Scrape request for Tavily.

    Combines all WebSearchRequest fields with the WebCrawlRequest
    extraction fields (extract_depth, format).
    """
    # NOTE(review): field-for-field duplicate of WebSearchRequest plus two
    # crawl fields — keep the two in sync, or consider inheriting from
    # WebSearchRequest if the connector allows it.
    query: str
    max_results: int = 5
    search_depth: Optional[Literal["basic", "advanced"]] = None
    time_range: Optional[Literal["d", "w", "m", "y"]] = None
    topic: Optional[Literal["general", "news", "academic"]] = None
    include_domains: Optional[List[str]] = None
    exclude_domains: Optional[List[str]] = None
    language: Optional[str] = None
    include_answer: Optional[bool] = None
    include_raw_content: Optional[bool] = None
    auto_parameters: Optional[bool] = None
    country: Optional[str] = None
    extract_depth: Optional[Literal["basic", "advanced"]] = None
    format: Optional[Literal["text", "markdown"]] = None
class WebScrapeResultItem(BaseModel):
    """Individual scrape result: one URL and its extracted content."""
    # NOTE(review): structurally identical to WebCrawlResultItem — not
    # referenced by WebScrapeDocumentData (which uses WebSearchResultItem);
    # confirm this type is still used by callers.
    url: HttpUrl
    content: str