142 lines
5.9 KiB
Python
142 lines
5.9 KiB
Python
"""Web-related modules"""
|
|
from pydantic import BaseModel, Field, HttpUrl
|
|
from typing import List, Optional, Literal, Dict, Any
|
|
from modules.shared.configuration import APP_CONFIG
|
|
from modules.datamodels.datamodelChat import ActionDocument, ActionResult
|
|
|
|
|
|
# Limits for web search requests. Each value is read from APP_CONFIG at import
# time; the string fallback is converted with int(), so a non-numeric
# configured value raises ValueError when this module is imported.
WEB_SEARCH_MAX_QUERY_LENGTH: int = int(
    APP_CONFIG.get("Web_Search_MAX_QUERY_LENGTH", "400")
)
WEB_SEARCH_MAX_RESULTS: int = int(
    APP_CONFIG.get("Web_Search_MAX_RESULTS", "20")
)
WEB_SEARCH_MIN_RESULTS: int = int(
    APP_CONFIG.get("Web_Search_MIN_RESULTS", "1")
)
|
|
|
|
|
|
class WebResearchOptions(BaseModel):
    """Advanced options for web research workflow"""

    # --- Crawl / extraction tuning: every field here has a concrete default ---
    max_pages: int = Field(
        default=10,
        ge=1,
        le=50,
        description="Maximum pages to crawl",
    )
    search_depth: Literal["basic", "advanced"] = Field(
        default="basic",
        description="Tavily search depth",
    )
    extract_depth: Literal["basic", "advanced"] = Field(
        default="advanced",
        description="Tavily extract depth",
    )
    format: Literal["text", "markdown"] = Field(
        default="markdown",
        description="Content format",
    )
    return_report: bool = Field(
        default=True,
        description="Return formatted report or raw data",
    )
    pages_search_depth: int = Field(
        default=1,
        ge=1,
        le=5,
        description="How deep to crawl: 1=main pages only, 2=main+sub-pages, 3=main+sub+sub-sub, etc.",
    )

    # --- Optional search filters: all default to None ---
    # NOTE(review): presumably a None value means "do not send this parameter";
    # confirm against the connector that consumes these options.
    country: Optional[str] = Field(
        default=None,
        description="Country code for search bias",
    )
    time_range: Optional[Literal["d", "w", "m", "y"]] = Field(
        default=None,
        description="Time range for search",
    )
    topic: Optional[Literal["general", "news", "academic"]] = Field(
        default=None,
        description="Search topic",
    )
    language: Optional[str] = Field(
        default=None,
        description="Language code",
    )
    include_answer: Optional[bool] = Field(
        default=None,
        description="Include AI answer",
    )
    include_raw_content: Optional[bool] = Field(
        default=None,
        description="Include raw content",
    )
|
|
|
|
class WebResearchRequest(BaseModel):
    """Main web research request"""

    # Research question; must be non-empty and no longer than the configured
    # WEB_SEARCH_MAX_QUERY_LENGTH.
    user_prompt: str = Field(
        min_length=1,
        max_length=WEB_SEARCH_MAX_QUERY_LENGTH,
        description="User's research question or prompt",
    )

    # Explicit URLs to crawl; None when the caller supplies none.
    urls: Optional[List[str]] = Field(
        default=None,
        description="Specific URLs to crawl (optional)",
    )

    # Both bounds come from configuration (previously the lower bound was a
    # hard-coded 1, leaving WEB_SEARCH_MIN_RESULTS unused). The default stays 5
    # under the stock configuration but is clamped into the configured range,
    # because pydantic does not validate default values: an unclamped 5 could
    # silently exceed a lower configured maximum.
    max_results: int = Field(
        default=max(WEB_SEARCH_MIN_RESULTS, min(5, WEB_SEARCH_MAX_RESULTS)),
        ge=WEB_SEARCH_MIN_RESULTS,
        le=WEB_SEARCH_MAX_RESULTS,
        description="Max search results",
    )

    # A fresh WebResearchOptions instance is built per request via the factory.
    options: WebResearchOptions = Field(
        default_factory=WebResearchOptions,
        description="Advanced options",
    )
|
|
|
|
class WebSearchResultItem(BaseModel):
    """Individual search result"""

    # Required: result title and its URL (validated as an HTTP(S) URL).
    title: str
    url: HttpUrl

    # Optional page body; None when no raw content is attached.
    raw_content: Optional[str] = Field(
        default=None,
        description="Raw HTML content",
    )
|
|
|
|
class WebCrawlResultItem(BaseModel):
    """Individual crawl result"""

    # Both fields are required: the crawled URL (validated as an HTTP(S) URL)
    # and the content string associated with it.
    url: HttpUrl
    content: str
|
|
|
|
class WebResearchDocumentData(BaseModel):
    """Complete web research results"""

    # --- Required fields (no defaults) ---
    user_prompt: str
    websites_analyzed: int
    additional_links_found: int
    analysis_result: str
    sources: List[WebSearchResultItem]
    additional_links: List[str]

    # --- Optional extras, None unless provided ---
    individual_content: Optional[Dict[str, str]] = None  # URL -> content mapping
    debug_info: Optional[Dict[str, Any]] = None
|
|
|
|
# ActionDocument specialisation whose `documentData` payload is typed as
# WebResearchDocumentData.
class WebResearchActionDocument(ActionDocument):
    documentData: WebResearchDocumentData
|
|
|
|
# ActionResult specialisation carrying WebResearchActionDocument entries.
class WebResearchActionResult(ActionResult):
    # default_factory gives every instance its own empty list.
    documents: List[WebResearchActionDocument] = Field(
        default_factory=list,
    )
|
|
|
|
# Legacy models for connector compatibility
|
|
|
|
class WebSearchDocumentData(BaseModel):
    """Search results document data"""

    # All three fields are required.
    query: str
    results: List[WebSearchResultItem]
    # NOTE(review): presumably the number of entries in `results`; nothing in
    # this model enforces that — confirm with the producer.
    total_count: int
|
|
|
|
# ActionDocument specialisation whose `documentData` payload is typed as
# WebSearchDocumentData.
class WebSearchActionDocument(ActionDocument):
    documentData: WebSearchDocumentData
|
|
|
|
# ActionResult specialisation carrying WebSearchActionDocument entries.
class WebSearchActionResult(ActionResult):
    # default_factory gives every instance its own empty list.
    documents: List[WebSearchActionDocument] = Field(
        default_factory=list,
    )
|
|
|
|
class WebCrawlDocumentData(BaseModel):
    """Crawl results document data"""

    # All three fields are required.
    urls: List[HttpUrl]
    results: List[WebCrawlResultItem]
    # NOTE(review): presumably the number of entries in `results`; nothing in
    # this model enforces that — confirm with the producer.
    total_count: int
|
|
|
|
# ActionDocument specialisation whose `documentData` payload is typed as
# WebCrawlDocumentData.
class WebCrawlActionDocument(ActionDocument):
    documentData: WebCrawlDocumentData
|
|
|
|
# ActionResult specialisation carrying WebCrawlActionDocument entries.
class WebCrawlActionResult(ActionResult):
    # default_factory gives every instance its own empty list.
    documents: List[WebCrawlActionDocument] = Field(
        default_factory=list,
    )
|
|
|
|
class WebScrapeDocumentData(BaseModel):
    """Scrape results document data"""

    # All three fields are required. Note that results are typed as
    # WebSearchResultItem, the same item model the search document uses.
    query: str
    results: List[WebSearchResultItem]
    # NOTE(review): presumably the number of entries in `results`; nothing in
    # this model enforces that — confirm with the producer.
    total_count: int
|
|
|
|
# ActionDocument specialisation whose `documentData` payload is typed as
# WebScrapeDocumentData.
class WebScrapeActionDocument(ActionDocument):
    documentData: WebScrapeDocumentData
|
|
|
|
# ActionResult specialisation carrying WebScrapeActionDocument entries.
class WebScrapeActionResult(ActionResult):
    # default_factory gives every instance its own empty list.
    documents: List[WebScrapeActionDocument] = Field(
        default_factory=list,
    )
|
|
|
|
class WebSearchRequest(BaseModel):
    """Search request for Tavily"""

    # --- Core query ---
    # Unlike WebResearchRequest, this model applies no length or range
    # constraints to `query` / `max_results`.
    query: str
    max_results: int = 5

    # --- Optional search parameters, all None by default ---
    # NOTE(review): presumably None means "omit from the outgoing request";
    # confirm in the connector that serialises this model.
    search_depth: Optional[Literal["basic", "advanced"]] = None
    time_range: Optional[Literal["d", "w", "m", "y"]] = None
    topic: Optional[Literal["general", "news", "academic"]] = None

    include_domains: Optional[List[str]] = None
    exclude_domains: Optional[List[str]] = None

    language: Optional[str] = None
    include_answer: Optional[bool] = None
    include_raw_content: Optional[bool] = None
    auto_parameters: Optional[bool] = None
    country: Optional[str] = None
|
|
|
|
class WebCrawlRequest(BaseModel):
    """Crawl request for Tavily"""

    # Required list of targets; each entry is validated as an HTTP(S) URL.
    urls: List[HttpUrl]

    # Optional extraction settings, None unless the caller sets them.
    extract_depth: Optional[Literal["basic", "advanced"]] = None
    format: Optional[Literal["text", "markdown"]] = None
|
|
|
|
class WebScrapeRequest(BaseModel):
    """Scrape request for Tavily"""

    # --- Search part: same fields, in the same order, as WebSearchRequest ---
    query: str
    max_results: int = 5

    search_depth: Optional[Literal["basic", "advanced"]] = None
    time_range: Optional[Literal["d", "w", "m", "y"]] = None
    topic: Optional[Literal["general", "news", "academic"]] = None

    include_domains: Optional[List[str]] = None
    exclude_domains: Optional[List[str]] = None

    language: Optional[str] = None
    include_answer: Optional[bool] = None
    include_raw_content: Optional[bool] = None
    auto_parameters: Optional[bool] = None
    country: Optional[str] = None

    # --- Extraction part: same optional settings as WebCrawlRequest ---
    extract_depth: Optional[Literal["basic", "advanced"]] = None
    format: Optional[Literal["text", "markdown"]] = None
|
|
|
|
class WebScrapeResultItem(BaseModel):
    """Individual scrape result"""

    # Both fields are required; the shape mirrors WebCrawlResultItem
    # (an HTTP(S)-validated URL plus its content string).
    url: HttpUrl
    content: str
|