Merge branch 'int' of https://github.com/valueonag/gateway into int
This commit is contained in:
commit f661b67580
16 changed files with 1136 additions and 0 deletions
0
modules/__init__.py
Normal file
223
modules/connectors/connector_tavily.py
Normal file
@@ -0,0 +1,223 @@
"""Tavily web search class."""

import logging
import os
from dataclasses import dataclass
from modules.interfaces.interface_web_model import (
    WebCrawlBase,
    WebCrawlDocumentData,
    WebCrawlRequest,
    WebCrawlResultItem,
    WebScrapeActionDocument,
    WebScrapeActionResult,
    WebScrapeBase,
    WebScrapeDocumentData,
    WebScrapeRequest,
    WebScrapeResultItem,
    WebSearchBase,
    WebSearchRequest,
    WebSearchActionResult,
    WebSearchActionDocument,
    WebSearchDocumentData,
    WebSearchResultItem,
    WebCrawlActionDocument,
    WebCrawlActionResult,
)

# from modules.interfaces.interfaceChatModel import ActionResult, ActionDocument
from tavily import AsyncTavilyClient
from modules.shared.timezoneUtils import get_utc_timestamp


logger = logging.getLogger(__name__)


@dataclass
class TavilySearchResult:
    title: str
    url: str


@dataclass
class TavilyCrawlResult:
    url: str
    content: str


@dataclass
class ConnectorTavily(WebSearchBase, WebCrawlBase, WebScrapeBase):
    client: AsyncTavilyClient = None

    @classmethod
    async def create(cls):
        return cls(client=AsyncTavilyClient(api_key=os.getenv("TAVILY_API_KEY")))

    async def search_urls(self, request: WebSearchRequest) -> WebSearchActionResult:
        """Handles the web search request.

        Takes a query and returns a list of URLs.
        """
        # Step 1: Search
        try:
            search_results = await self._search(request.query, request.max_results)
        except Exception as e:
            return WebSearchActionResult(success=False, error=str(e))

        # Step 2: Build ActionResult
        try:
            result = self._build_search_action_result(search_results, request.query)
        except Exception as e:
            return WebSearchActionResult(success=False, error=str(e))

        return result

    async def crawl_urls(self, request: WebCrawlRequest) -> WebCrawlActionResult:
        """Crawls the given URLs and returns the extracted text content."""
        # Step 1: Crawl
        try:
            crawl_results = await self._crawl(request.urls)
        except Exception as e:
            return WebCrawlActionResult(success=False, error=str(e))

        # Step 2: Build ActionResult
        try:
            result = self._build_crawl_action_result(crawl_results, request.urls)
        except Exception as e:
            return WebCrawlActionResult(success=False, error=str(e))

        return result

    async def scrape(self, request: WebScrapeRequest) -> WebScrapeActionResult:
        """Turns a query into a list of URLs with extracted content."""
        # Step 1: Search
        try:
            search_results = await self._search(request.query, request.max_results)
        except Exception as e:
            return WebScrapeActionResult(success=False, error=str(e))

        # Step 2: Crawl
        try:
            urls = [result.url for result in search_results]
            crawl_results = await self._crawl(urls)
        except Exception as e:
            return WebScrapeActionResult(success=False, error=str(e))

        # Step 3: Build ActionResult
        try:
            result = self._build_scrape_action_result(crawl_results, request.query)
        except Exception as e:
            return WebScrapeActionResult(success=False, error=str(e))

        return result

    async def _search(self, query: str, max_results: int) -> list[TavilySearchResult]:
        """Calls the Tavily API to perform a web search."""
        # Make sure max_results is within the allowed range
        if max_results < 0 or max_results > 20:
            raise ValueError("max_results must be between 0 and 20")

        # Perform actual API call
        response = await self.client.search(query=query, max_results=max_results)

        logger.info(f"Tavily API search response:\n{response}")

        return [
            TavilySearchResult(title=result["title"], url=result["url"])
            for result in response["results"]
        ]

    def _build_search_action_result(
        self, search_results: list[TavilySearchResult], query: str = ""
    ) -> WebSearchActionResult:
        """Builds the ActionResult from the search results."""
        # Convert to result items
        result_items = [
            WebSearchResultItem(title=result.title, url=result.url)
            for result in search_results
        ]

        # Create document data with all results
        document_data = WebSearchDocumentData(
            query=query, results=result_items, total_count=len(result_items)
        )

        # Create single document
        document = WebSearchActionDocument(
            documentName=f"web_search_results_{get_utc_timestamp()}.json",
            documentData=document_data,
            mimeType="application/json",
        )

        return WebSearchActionResult(
            success=True, documents=[document], resultLabel="web_search_results"
        )

    async def _crawl(self, urls: list) -> list[TavilyCrawlResult]:
        """Calls the Tavily API to extract text content from URLs."""
        response = await self.client.extract(
            urls=urls, extract_depth="advanced", format="text"
        )

        # Log the result
        logger.info(f"Tavily API extract (crawl) response:\n{response}")

        return [
            TavilyCrawlResult(url=result["url"], content=result["raw_content"])
            for result in response["results"]
        ]

    def _build_crawl_action_result(
        self, crawl_results: list[TavilyCrawlResult], urls: list[str] = None
    ) -> WebCrawlActionResult:
        """Builds the ActionResult from the crawl results."""
        # Convert to result items
        result_items = [
            WebCrawlResultItem(url=result.url, content=result.content)
            for result in crawl_results
        ]

        # Create document data with all results
        document_data = WebCrawlDocumentData(
            urls=urls or [result.url for result in crawl_results],
            results=result_items,
            total_count=len(result_items),
        )

        # Create single document
        document = WebCrawlActionDocument(
            documentName=f"web_crawl_results_{get_utc_timestamp()}.json",
            documentData=document_data,
            mimeType="application/json",
        )

        return WebCrawlActionResult(
            success=True, documents=[document], resultLabel="web_crawl_results"
        )

    def _build_scrape_action_result(
        self, crawl_results: list[TavilyCrawlResult], query: str = ""
    ) -> WebScrapeActionResult:
        """Builds the ActionResult from the scrape results."""
        # Convert to result items
        result_items = [
            WebScrapeResultItem(url=result.url, content=result.content)
            for result in crawl_results
        ]

        # Create document data with all results
        document_data = WebScrapeDocumentData(
            query=query,
            results=result_items,
            total_count=len(result_items),
        )

        # Create single document
        document = WebScrapeActionDocument(
            documentName=f"web_scrape_results_{get_utc_timestamp()}.json",
            documentData=document_data,
            mimeType="application/json",
        )

        return WebScrapeActionResult(
            success=True, documents=[document], resultLabel="web_scrape_results"
        )
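Not part of the diff: a minimal usage sketch of the connector above. It assumes tavily-python is installed and TAVILY_API_KEY is set in the environment; all class, method, and field names come from the file shown above.

# Hypothetical usage sketch for ConnectorTavily (illustration only, not part of this commit).
import asyncio

from modules.connectors.connector_tavily import ConnectorTavily
from modules.interfaces.interface_web_model import WebSearchRequest


async def main() -> None:
    # create() reads TAVILY_API_KEY from the environment (assumed to be set)
    connector = await ConnectorTavily.create()

    request = WebSearchRequest(query="How old is the Earth?", max_results=5)
    result = await connector.search_urls(request)

    # On success, a single JSON document named web_search_results_<timestamp>.json
    # is returned; on failure, success is False and error holds the message.
    print(result.success, result.error)
    for doc in result.documents:
        print(doc.documentName, doc.documentData.total_count)


if __name__ == "__main__":
    asyncio.run(main())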
123
modules/interfaces/interface_web_model.py
Normal file
@@ -0,0 +1,123 @@
"""Base classes and models for the web classes."""

from abc import ABC, abstractmethod
from modules.interfaces.interfaceChatModel import ActionDocument, ActionResult
from pydantic import BaseModel, Field, HttpUrl
from typing import List


# --- Web search ---
# query -> list of URLs


class WebSearchRequest(BaseModel):
    query: str = Field(min_length=1, max_length=400)
    max_results: int = Field(ge=1, le=20)


class WebSearchResultItem(BaseModel):
    """Individual search result"""

    title: str
    url: HttpUrl


class WebSearchDocumentData(BaseModel):
    """Complete search results document"""

    query: str = Field(min_length=1, max_length=400)
    results: List[WebSearchResultItem]
    total_count: int


class WebSearchActionDocument(ActionDocument):
    documentData: WebSearchDocumentData


class WebSearchActionResult(ActionResult):
    documents: List[WebSearchActionDocument] = Field(default_factory=list)


class WebSearchBase(ABC):
    @abstractmethod
    async def search_urls(self, request: WebSearchRequest) -> WebSearchActionResult: ...


# --- Web crawl ---
# list of URLs -> list of extracted HTML content


class WebCrawlRequest(BaseModel):
    urls: List[HttpUrl]


class WebCrawlResultItem(BaseModel):
    """Individual crawl result"""

    url: HttpUrl
    content: str


class WebCrawlDocumentData(BaseModel):
    """Complete crawl results document"""

    urls: List[HttpUrl]
    results: List[WebCrawlResultItem]
    total_count: int


class WebCrawlActionDocument(ActionDocument):
    documentData: WebCrawlDocumentData = Field(
        description="The data extracted from crawled URLs"
    )


class WebCrawlActionResult(ActionResult):
    documents: List[WebCrawlActionDocument] = Field(default_factory=list)


class WebCrawlBase(ABC):
    @abstractmethod
    async def crawl_urls(self, request: WebCrawlRequest) -> WebCrawlActionResult: ...


# --- Web scrape ---
# scrape -> list of extracted text; combines web search and crawl in one step


class WebScrapeRequest(BaseModel):
    query: str = Field(min_length=1, max_length=400)
    max_results: int = Field(ge=1, le=20)


class WebScrapeResultItem(BaseModel):
    """Individual scrape result"""

    url: HttpUrl
    content: str


class WebScrapeDocumentData(BaseModel):
    """Complete scrape results document"""

    query: str = Field(min_length=1, max_length=400)
    results: List[WebScrapeResultItem]
    total_count: int


class WebScrapeActionDocument(ActionDocument):
    documentData: WebScrapeDocumentData = Field(
        description="The data extracted from scraped URLs"
    )


class WebScrapeActionResult(ActionResult):
    documents: List[WebScrapeActionDocument] = Field(default_factory=list)


class WebScrapeBase(ABC):
    @abstractmethod
    async def scrape(self, request: WebScrapeRequest) -> WebScrapeActionResult: ...
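Not part of the diff: a short sketch of the request validation these models provide, mirroring the Field(min_length=1, max_length=400) and Field(ge=1, le=20) constraints above (illustrative only).

# Illustrative check of the WebSearchRequest constraints (not part of this commit).
from pydantic import ValidationError

from modules.interfaces.interface_web_model import WebSearchRequest

# Valid: query is non-empty and max_results is within 1..20
request = WebSearchRequest(query="How old is the Earth?", max_results=5)
print(request.max_results)

# Invalid: max_results above the le=20 bound raises a ValidationError
try:
    WebSearchRequest(query="How old is the Earth?", max_results=50)
except ValidationError as exc:
    print(exc)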
46
modules/interfaces/interface_web_objects.py
Normal file
@@ -0,0 +1,46 @@
from typing import Optional
from modules.interfaces.interface_web_model import (
    WebCrawlActionResult,
    WebSearchActionResult,
    WebSearchRequest,
    WebCrawlRequest,
    WebScrapeActionResult,
    WebScrapeRequest,
)

from dataclasses import dataclass
from modules.connectors.connector_tavily import ConnectorTavily


@dataclass(slots=True)
class WebInterface:
    connector_tavily: ConnectorTavily

    def __post_init__(self) -> None:
        if self.connector_tavily is None:
            raise TypeError(
                "connector_tavily must be provided. "
                "Use `await WebInterface.create()` or pass a ConnectorTavily."
            )

    @classmethod
    async def create(cls) -> "WebInterface":
        connector_tavily = await ConnectorTavily.create()

        return WebInterface(connector_tavily=connector_tavily)

    async def search(
        self, web_search_request: WebSearchRequest
    ) -> WebSearchActionResult:
        # NOTE: Add connectors here
        return await self.connector_tavily.search_urls(web_search_request)

    async def crawl(self, web_crawl_request: WebCrawlRequest) -> WebCrawlActionResult:
        # NOTE: Add connectors here
        return await self.connector_tavily.crawl_urls(web_crawl_request)

    async def scrape(
        self, web_scrape_request: WebScrapeRequest
    ) -> WebScrapeActionResult:
        # NOTE: Add connectors here
        return await self.connector_tavily.scrape(web_scrape_request)
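Not part of the diff: a minimal end-to-end sketch of how WebInterface is intended to be used, assuming TAVILY_API_KEY is configured; all names are taken from the files above.

# Hypothetical end-to-end sketch for WebInterface (illustration only, not part of this commit).
import asyncio

from modules.interfaces.interface_web_objects import WebInterface
from modules.interfaces.interface_web_model import WebSearchRequest, WebCrawlRequest


async def main() -> None:
    # create() builds the ConnectorTavily dependency internally
    web = await WebInterface.create()

    # Search: query -> list of URLs
    search_result = await web.search(
        WebSearchRequest(query="age of the Earth", max_results=3)
    )
    if not search_result.success:
        print(search_result.error)
        return

    # Crawl: list of URLs -> extracted text content
    urls = [str(item.url) for item in search_result.documents[0].documentData.results]
    crawl_result = await web.crawl(WebCrawlRequest(urls=urls))
    print(crawl_result.success)


asyncio.run(main())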
197
modules/methods/method_web.py
Normal file
@@ -0,0 +1,197 @@
import logging
from typing import Any, Dict
from modules.chat.methodBase import MethodBase, action
from modules.interfaces.interfaceChatModel import ActionResult
from modules.interfaces.interface_web_objects import WebInterface
from modules.interfaces.interface_web_model import (
    WebSearchRequest,
    WebCrawlRequest,
    WebScrapeRequest,
)


logger = logging.getLogger(__name__)


class MethodWeb(MethodBase):
    """Web method implementation for web operations."""

    def __init__(self, serviceCenter: Any):
        super().__init__(serviceCenter)
        self.name = "web"
        self.description = "Web search, crawling, and scraping operations using Tavily"

    @action
    async def search(self, parameters: Dict[str, Any]) -> ActionResult:
        """Performs a web search and outputs a .json file with a list of found URLs.

        Each result contains "title" and "url".

        Parameters:
            query (str): Search query to perform
            maxResults (int, optional): Maximum number of results (default: 10)
        """
        # TODO: Fix docstrings - do we need that format for parsing?

        try:
            # Prepare request data
            web_search_request = WebSearchRequest(
                query=parameters.get("query"),
                max_results=parameters.get("maxResults", 10),
            )

            # Perform request
            web_interface = await WebInterface.create()
            web_search_result = await web_interface.search(web_search_request)

            return web_search_result

        except Exception as e:
            return ActionResult(success=False, error=str(e))

    @action
    async def crawl(self, parameters: Dict[str, Any]) -> ActionResult:
        """Crawls a list of URLs and extracts information from them.

        Parameters:
            document (str): Document reference containing URL list from search results
            expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description
        """
        try:
            document_ref = parameters.get("document")

            if not document_ref:
                return ActionResult(
                    success=False, error="No document reference provided."
                )

            # Resolve document reference to ChatDocument objects
            chat_documents = self.service.getChatDocumentsFromDocumentList(
                [document_ref]
            )

            if not chat_documents:
                return ActionResult(
                    success=False,
                    error=f"No documents found for reference: {document_ref}",
                )

            # Get the first document (search results)
            search_doc = chat_documents[0]

            # Get file data using the service center
            file_data = self.service.getFileData(search_doc.fileId)
            if not file_data:
                return ActionResult(
                    success=False, error="Could not retrieve file data for document"
                )

            content = file_data.decode("utf-8")

            # Parse JSON to extract URLs from search results
            import json

            try:
                # The document structure from WebSearchActionResult
                search_data = json.loads(content)

                # Extract URLs from the search results structure
                urls = []
                if isinstance(search_data, dict):
                    # Handle the document structure: documentData contains the actual search results
                    doc_data = search_data.get("documentData", search_data)
                    if "results" in doc_data and isinstance(doc_data["results"], list):
                        urls = [
                            result["url"]
                            for result in doc_data["results"]
                            if isinstance(result, dict) and "url" in result
                        ]
                    elif "urls" in doc_data and isinstance(doc_data["urls"], list):
                        # Fallback: if URLs are stored directly in a 'urls' field
                        urls = [url for url in doc_data["urls"] if isinstance(url, str)]

                # Fallback: try to parse as plain text with regex (for backward compatibility)
                if not urls:
                    logger.warning(
                        "Could not extract URLs from JSON structure, trying plain text parsing"
                    )
                    import re

                    urls = re.split(r"[\n,;]+", content)
                    urls = [
                        u.strip()
                        for u in urls
                        if u.strip()
                        and (
                            u.strip().startswith("http://")
                            or u.strip().startswith("https://")
                        )
                    ]

            except json.JSONDecodeError:
                # Fallback to plain text parsing if JSON parsing fails
                logger.warning("Document is not valid JSON, trying plain text parsing")
                import re

                urls = re.split(r"[\n,;]+", content)
                urls = [
                    u.strip()
                    for u in urls
                    if u.strip()
                    and (
                        u.strip().startswith("http://")
                        or u.strip().startswith("https://")
                    )
                ]

            if not urls:
                return ActionResult(
                    success=False, error="No valid URLs found in the document."
                )

            logger.info(f"Extracted {len(urls)} URLs from document: {urls}")

            # Prepare request data
            web_crawl_request = WebCrawlRequest(urls=urls)

            # Perform request
            web_interface = await WebInterface.create()
            web_crawl_result = await web_interface.crawl(web_crawl_request)

            return web_crawl_result

        except Exception as e:
            logger.error(f"Error in crawl method: {str(e)}")
            return ActionResult(success=False, error=str(e))

    @action
    async def scrape(self, parameters: Dict[str, Any]) -> ActionResult:
        """Scrapes web content by searching for URLs and then extracting their content.

        Combines search and crawl operations in one step.

        Parameters:
            query (str): Search query to perform
            maxResults (int, optional): Maximum number of results (default: 10)
        """
        try:
            query = parameters.get("query")
            max_results = parameters.get("maxResults", 10)

            if not query:
                return ActionResult(success=False, error="Search query is required")

            # Prepare request data
            web_scrape_request = WebScrapeRequest(
                query=query,
                max_results=max_results,
            )

            # Perform request
            web_interface = await WebInterface.create()
            web_scrape_result = await web_interface.scrape(web_scrape_request)

            return web_scrape_result

        except Exception as e:
            return ActionResult(success=False, error=str(e))
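Not part of the diff: a sketch of the parameter dictionaries the three actions above expect, and of the search-results JSON shape the crawl action parses; all field names come from the code above, all values are hypothetical.

# Hypothetical action parameters for MethodWeb (illustration only, not part of this commit).
search_params = {"query": "How old is the Earth?", "maxResults": 5}
scrape_params = {"query": "How old is the Earth?", "maxResults": 3}
# crawl expects a document reference that resolves to a previous web_search_results document
crawl_params = {"document": "<reference to a web_search_results document>"}

# Illustrative shape of the JSON payload the crawl action parses
# (mirrors the structure produced by WebSearchActionResult above).
example_search_document = {
    "documentName": "web_search_results_example.json",
    "mimeType": "application/json",
    "documentData": {
        "query": "How old is the Earth?",
        "results": [
            {"title": "Age of Earth", "url": "https://en.wikipedia.org/wiki/Age_of_Earth"},
        ],
        "total_count": 1,
    },
}
# Fallbacks accepted by the parser: a {"documentData": {"urls": [...]}} payload, or a plain
# newline/comma/semicolon-separated list of http(s) URLs.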
31
modules/methods/web/web_search/web_search_base.py
Normal file
@@ -0,0 +1,31 @@
"""Base class for web search classes."""

from abc import ABC, abstractmethod
from modules.interfaces.interfaceChatModel import ActionDocument, ActionResult

from pydantic import BaseModel, Field
from typing import List


class WebSearchRequest(BaseModel):
    query: str
    max_results: int


class WebSearchDocumentData(BaseModel):
    title: str
    url: str


class WebSearchActionDocument(ActionDocument):
    documentData: List[WebSearchDocumentData]


class WebSearchActionResult(ActionResult):
    documents: List[WebSearchActionDocument] = Field(default_factory=list)


class WebSearchBase(ABC):
    @abstractmethod
    async def __call__(self, request: WebSearchRequest) -> WebSearchActionResult: ...
70
modules/methods/web/web_search/web_search_tavily.py
Normal file
@@ -0,0 +1,70 @@
"""Tavily web search class."""

import os
from dataclasses import dataclass
from web_search_base import (
    WebSearchBase,
    WebSearchRequest,
    WebSearchActionResult,
    WebSearchActionDocument,
    WebSearchDocumentData,
)

# from modules.interfaces.interfaceChatModel import ActionResult, ActionDocument
from tavily import AsyncTavilyClient
from modules.shared.timezoneUtils import get_utc_timestamp


@dataclass
class WebSearchTavily(WebSearchBase):
    client: AsyncTavilyClient = None

    @classmethod
    async def create(cls):
        return cls(client=AsyncTavilyClient(api_key=os.getenv("TAVILY_API_KEY")))

    async def __call__(self, request: WebSearchRequest) -> WebSearchActionResult:
        """Handles the web search request."""
        # Step 1: Search
        try:
            search_results = await self._search(request.query, request.max_results)
        except Exception as e:
            return WebSearchActionResult(success=False, error=str(e))

        # Step 2: Build ActionResult
        try:
            result = self._build_action_result(search_results)
        except Exception as e:
            return WebSearchActionResult(success=False, error=str(e))

        return result

    async def _search(self, query: str, max_results: int) -> list:
        """Calls the Tavily API to perform a web search."""
        # Make sure max_results is within the allowed range
        if max_results < 0 or max_results > 20:
            raise ValueError("max_results must be between 0 and 20")

        # Perform actual API call
        response = await self.client.search(query=query, max_results=max_results)
        return response["results"]

    def _build_action_result(self, search_results: list) -> WebSearchActionResult:
        """Builds the ActionResult from the search results."""
        documents = []
        for result in search_results:
            document_name = f"web_search_{get_utc_timestamp()}.txt"
            document_data = WebSearchDocumentData(
                title=result["title"], url=result["url"]
            )
            mime_type = "text/plain"
            doc = WebSearchActionDocument(
                documentName=document_name,
                documentData=[document_data],
                mimeType=mime_type,
            )
            documents.append(doc)

        return WebSearchActionResult(
            success=True, documents=documents, resultLabel="web_search_results"
        )
13
pytest.ini
Normal file
@@ -0,0 +1,13 @@
[pytest]
testpaths = tests
python_paths = .
python_files = test_*.py
python_classes = Test*
python_functions = test_*
log_file = logs/test_logs.log
log_file_level = INFO
log_file_format = %(asctime)s %(levelname)s %(message)s
log_file_date_format = %Y-%m-%d %H:%M:%S
# Only run non-expensive tests by default, verbose log, short traceback
# Use 'pytest -m ""' to run ALL tests.
addopts = -v --tb=short -m 'not expensive'
@@ -42,6 +42,7 @@ requests==2.31.0
chardet>=5.0.0 # For character-set detection in web content
aiohttp>=3.8.0 # Required for SharePoint operations (async HTTP)
selenium>=4.15.0 # Required for web automation and JavaScript-heavy pages
tavily-python==0.7.11 # Tavily SDK

## Image Processing
Pillow>=10.0.0 # For image processing (imported as PIL)

@@ -67,3 +68,7 @@ PyPDF2>=3.0.0
PyMuPDF>=1.20.0
beautifulsoup4>=4.11.0
chardet>=4.0.0 # For encoding detection

## Testing Dependencies
pytest>=8.0.0
pytest-asyncio>=0.21.0
1
tests/__init__.py
Normal file
@@ -0,0 +1 @@
# noqa
0
tests/connectors/__init__.py
Normal file
108
tests/connectors/test_connector_tavily.py
Normal file
@@ -0,0 +1,108 @@
"""Tests for Tavily web search."""

import pytest
import logging

from modules.interfaces.interfaceChatModel import ActionResult
from modules.interfaces.interface_web_model import (
    WebSearchRequest,
    WebCrawlRequest,
    WebScrapeRequest,
)
from modules.connectors.connector_tavily import ConnectorTavily

logger = logging.getLogger(__name__)


@pytest.mark.asyncio
@pytest.mark.expensive
async def test_tavily_connector_search_test_live_api():
    logger.info("Testing Tavily connector search with live API calls")

    # Test request
    request = WebSearchRequest(query="How old is the Earth?", max_results=5)

    # Tavily instance
    connector_tavily = await ConnectorTavily.create()

    # Search test
    action_result = await connector_tavily.search_urls(request=request)

    # Check results
    assert isinstance(action_result, ActionResult)

    logger.info("=" * 20)
    logger.info(f"Action result success status: {action_result.success}")
    logger.info(f"Action result error: {action_result.error}")
    logger.info(f"Action result label: {action_result.resultLabel}")

    logger.info("Documents:")
    for doc in action_result.documents:
        logger.info("-" * 10)
        logger.info(f" - Document Name: {doc.documentName}")
        logger.info(f" - Document Mime Type: {doc.mimeType}")
        logger.info(f" - Document Data: {doc.documentData}")


@pytest.mark.asyncio
@pytest.mark.expensive
async def test_tavily_connector_crawl_test_live_api():
    logger.info("Testing Tavily connector crawl with live API calls")

    # Test request
    urls = [
        "https://en.wikipedia.org/wiki/Earth",
        "https://valueon.ch",
    ]
    request = WebCrawlRequest(urls=urls)

    # Tavily instance
    connector_tavily = await ConnectorTavily.create()

    # Crawl test
    action_result = await connector_tavily.crawl_urls(request=request)

    # Check results
    assert isinstance(action_result, ActionResult)

    logger.info("=" * 20)
    logger.info(f"Action result success status: {action_result.success}")
    logger.info(f"Action result error: {action_result.error}")
    logger.info(f"Action result label: {action_result.resultLabel}")

    logger.info("Documents:")
    for doc in action_result.documents:
        logger.info("-" * 10)
        logger.info(f" - Document Name: {doc.documentName}")
        logger.info(f" - Document Mime Type: {doc.mimeType}")
        logger.info(f" - Document Data: {doc.documentData}")


@pytest.mark.asyncio
@pytest.mark.expensive
async def test_tavily_connector_scrape_test_live_api():
    logger.info("Testing Tavily connector scrape with live API calls")

    # Test request with query
    request = WebScrapeRequest(query="How old is the Earth?", max_results=3)

    # Tavily instance
    connector_tavily = await ConnectorTavily.create()

    # Scrape test
    action_result = await connector_tavily.scrape(request=request)

    # Check results
    assert isinstance(action_result, ActionResult)

    logger.info("=" * 20)
    logger.info(f"Action result success status: {action_result.success}")
    logger.info(f"Action result error: {action_result.error}")
    logger.info(f"Action result label: {action_result.resultLabel}")

    logger.info("Documents:")
    for doc in action_result.documents:
        logger.info("-" * 10)
        logger.info(f" - Document Name: {doc.documentName}")
        logger.info(f" - Document Mime Type: {doc.mimeType}")
        logger.info(f" - Document Data: {doc.documentData}")
0
tests/fixtures/__init__.py
vendored
Normal file
71
tests/fixtures/tavily_responses.py
vendored
Normal file
File diff suppressed because one or more lines are too long
0
tests/methods/__init__.py
Normal file
248
tests/methods/test_method_web.py
Normal file
@@ -0,0 +1,248 @@
"""Tests for method web.py"""

import json
import logging

import pytest
from unittest.mock import patch
from modules.methods.method_web import MethodWeb
from tests.fixtures.tavily_responses import (
    RESPONSE_SEARCH_HOW_OLD_IS_EARTH_NO_ANSWER,
    RESPONSE_EXTRACT_HOW_OLD_IS_EARTH_NO_ANSWER,
)

logger = logging.getLogger(__name__)


@pytest.mark.asyncio
@pytest.mark.expensive
async def test_method_web_search_live():
    """Tests method web search with live API calls."""

    logger.info("=" * 50)
    logger.info("==> Test: Method Web Search Live")

    method_web = MethodWeb(serviceCenter=None)

    # Actual request
    action_result = await method_web.search(
        {"query": "How old is the earth", "maxResults": 5}
    )

    # Evaluate results
    assert action_result.success
    assert len(action_result.documents) > 0

    logger.info(f"Action result success status: {action_result.success}")
    logger.info(f"Action result error: {action_result.error}")
    logger.info(f"Action result label: {action_result.resultLabel}")

    logger.info("Documents:")
    for doc in action_result.documents:
        logger.info(f" - Document Name: {doc.documentName}")
        logger.info(f" --> Document Mime Type: {doc.mimeType}")
        logger.info(f" --> Document Data: {doc.documentData}")


@pytest.mark.asyncio
async def test_method_web_search_dummy():
    """Tests method web search with dummy response data - no external API calls."""

    logger.info("=" * 50)
    logger.info("==> Test: Method Web Search Dummy")

    method_web = MethodWeb(serviceCenter=None)

    # Mock the Tavily API response
    with patch(
        "tavily.AsyncTavilyClient.search",
        return_value=RESPONSE_SEARCH_HOW_OLD_IS_EARTH_NO_ANSWER,
    ) as mock_client:
        action_result = await method_web.search(
            {"query": "How old is the earth", "maxResults": 5}
        )
        mock_client.assert_called_once()

    # Evaluate results
    assert action_result.success
    assert len(action_result.documents) > 0

    logger.info(f"Action result success status: {action_result.success}")
    logger.info(f"Action result error: {action_result.error}")
    logger.info(f"Action result label: {action_result.resultLabel}")

    logger.info("Documents:")
    for doc in action_result.documents:
        logger.info(f" - Document Name: {doc.documentName}")
        logger.info(f" --> Document Mime Type: {doc.mimeType}")
        logger.info(f" --> Document Data: {doc.documentData}")


@pytest.mark.asyncio
@pytest.mark.expensive
async def test_method_web_crawl_live():
    """Tests method web crawl with live API calls."""

    logger.info("=" * 50)
    logger.info("==> Test: Method Web Crawl Live")

    method_web = MethodWeb(serviceCenter=None)

    # Create mock document data with URLs from search results
    search_results_json = {
        "documentData": {
            "results": [
                {"url": "https://en.wikipedia.org/wiki/Age_of_Earth"},
                {"url": "https://www.planetary.org/articles/how-old-is-the-earth"},
            ]
        }
    }

    # Mock the service center methods
    with patch.object(method_web, "service") as mock_service:
        mock_service.getChatDocumentsFromDocumentList.return_value = [
            type("MockDoc", (), {"fileId": "test-file-id"})()
        ]
        mock_service.getFileData.return_value = json.dumps(search_results_json).encode(
            "utf-8"
        )

        # Actual request
        action_result = await method_web.crawl({"document": "test-document-ref"})

    # Evaluate results
    assert action_result.success
    assert len(action_result.documents) > 0

    logger.info(f"Action result success status: {action_result.success}")
    logger.info(f"Action result error: {action_result.error}")
    logger.info(f"Action result label: {action_result.resultLabel}")

    logger.info("Documents:")
    for doc in action_result.documents:
        logger.info(f" - Document Name: {doc.documentName}")
        logger.info(f" --> Document Mime Type: {doc.mimeType}")
        logger.info(f" --> Document Data: {doc.documentData}")


@pytest.mark.asyncio
async def test_method_web_crawl_dummy():
    """Tests method web crawl with dummy response data - no external API calls."""

    logger.info("=" * 50)
    logger.info("==> Test: Method Web Crawl Dummy")

    method_web = MethodWeb(serviceCenter=None)

    # Create mock document data with URLs from search results
    search_results_json = {
        "documentData": {
            "results": [
                {"url": "https://en.wikipedia.org/wiki/Age_of_Earth"},
                {"url": "https://www.planetary.org/articles/how-old-is-the-earth"},
            ]
        }
    }

    # Mock both the service center and Tavily API
    with (
        patch.object(method_web, "service") as mock_service,
        patch(
            "tavily.AsyncTavilyClient.extract",
            return_value=RESPONSE_EXTRACT_HOW_OLD_IS_EARTH_NO_ANSWER,
        ) as mock_client,
    ):
        mock_service.getChatDocumentsFromDocumentList.return_value = [
            type("MockDoc", (), {"fileId": "test-file-id"})()
        ]
        mock_service.getFileData.return_value = json.dumps(search_results_json).encode(
            "utf-8"
        )

        action_result = await method_web.crawl({"document": "test-document-ref"})
        mock_client.assert_called_once()

    # Evaluate results
    assert action_result.success
    assert len(action_result.documents) > 0

    logger.info(f"Action result success status: {action_result.success}")
    logger.info(f"Action result error: {action_result.error}")
    logger.info(f"Action result label: {action_result.resultLabel}")

    logger.info("Documents:")
    for doc in action_result.documents:
        logger.info(f" - Document Name: {doc.documentName}")
        logger.info(f" --> Document Mime Type: {doc.mimeType}")
        logger.info(f" --> Document Data: {doc.documentData}")


@pytest.mark.asyncio
@pytest.mark.expensive
async def test_method_web_scrape_live():
    """Tests method web scrape with live API calls."""

    logger.info("=" * 50)
    logger.info("==> Test: Method Web Scrape Live")

    method_web = MethodWeb(serviceCenter=None)

    # Actual request
    action_result = await method_web.scrape(
        {"query": "How old is the earth", "maxResults": 3}
    )

    # Evaluate results
    assert action_result.success
    assert len(action_result.documents) > 0

    logger.info(f"Action result success status: {action_result.success}")
    logger.info(f"Action result error: {action_result.error}")
    logger.info(f"Action result label: {action_result.resultLabel}")

    logger.info("Documents:")
    for doc in action_result.documents:
        logger.info(f" - Document Name: {doc.documentName}")
        logger.info(f" --> Document Mime Type: {doc.mimeType}")
        logger.info(f" --> Document Data: {doc.documentData}")


@pytest.mark.asyncio
async def test_method_web_scrape_dummy():
    """Tests method web scrape with dummy response data - no external API calls."""

    logger.info("=" * 50)
    logger.info("==> Test: Method Web Scrape Dummy")

    method_web = MethodWeb(serviceCenter=None)

    # Mock both Tavily API responses (search + extract)
    with (
        patch(
            "tavily.AsyncTavilyClient.search",
            return_value=RESPONSE_SEARCH_HOW_OLD_IS_EARTH_NO_ANSWER,
        ) as mock_search,
        patch(
            "tavily.AsyncTavilyClient.extract",
            return_value=RESPONSE_EXTRACT_HOW_OLD_IS_EARTH_NO_ANSWER,
        ) as mock_extract,
    ):
        action_result = await method_web.scrape(
            {"query": "How old is the earth", "maxResults": 3}
        )
        mock_search.assert_called_once()
        mock_extract.assert_called_once()

    # Evaluate results
    assert action_result.success
    assert len(action_result.documents) > 0

    logger.info(f"Action result success status: {action_result.success}")
    logger.info(f"Action result error: {action_result.error}")
    logger.info(f"Action result label: {action_result.resultLabel}")

    logger.info("Documents:")
    for doc in action_result.documents:
        logger.info(f" - Document Name: {doc.documentName}")
        logger.info(f" --> Document Mime Type: {doc.mimeType}")
        logger.info(f" --> Document Data: {doc.documentData}")