gateway/modules/methods/method_web.py

import json
import logging
import re
from typing import Any, Dict

from modules.chat.methodBase import MethodBase, action
from modules.interfaces.interfaceChatModel import ActionResult
from modules.interfaces.interface_web_objects import WebInterface
from modules.interfaces.interface_web_model import (
    WebSearchRequest,
    WebCrawlRequest,
    WebScrapeRequest,
)

logger = logging.getLogger(__name__)


class MethodWeb(MethodBase):
    """Web method implementation providing search, crawl, and scrape operations."""

    def __init__(self, serviceCenter: Any):
        super().__init__(serviceCenter)
        self.name = "web"
        self.description = "Web search, crawling, and scraping operations using Tavily"

    @action
    async def search(self, parameters: Dict[str, Any]) -> ActionResult:
        """Perform a web search and output a .json file with a list of found URLs.

        Each result contains "title" and "url".

        Parameters:
            query (str): Search query to perform
            maxResults (int, optional): Maximum number of results (default: 10)
        """
        # TODO: Fix docstrings - do we need that format for parsing?
        try:
            query = parameters.get("query")
            if not query:
                return ActionResult(success=False, error="Search query is required")
            # Prepare request data
            web_search_request = WebSearchRequest(
                query=query,
                max_results=parameters.get("maxResults", 10),
            )
            # Perform request
            web_interface = await WebInterface.create()
            web_search_result = await web_interface.search(web_search_request)
            return web_search_result
        except Exception as e:
            return ActionResult(success=False, error=str(e))
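
    # A minimal invocation sketch, assuming an instance named `method_web` and a
    # dispatcher that passes the parameters dict (both are assumptions; only the
    # parameter names come from the docstring above):
    #
    #   result = await method_web.search({"query": "tavily python sdk", "maxResults": 5})
    #   if result.success:
    #       ...  # the result references a .json document of {"title", "url"} entries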

    @action
    async def crawl(self, parameters: Dict[str, Any]) -> ActionResult:
        """Crawl a list of URLs and extract information from them.

        Parameters:
            document (str): Document reference containing URL list from search results
            expectedDocumentFormats (list, optional): Expected document formats with
                extension, mimeType, description
        """
        try:
            document_ref = parameters.get("document")
            if not document_ref:
                return ActionResult(
                    success=False, error="No document reference provided."
                )
            # Resolve document reference to ChatDocument objects
            chat_documents = self.service.getChatDocumentsFromDocumentList(
                [document_ref]
            )
            if not chat_documents:
                return ActionResult(
                    success=False,
                    error=f"No documents found for reference: {document_ref}",
                )
            # Get the first document (search results)
            search_doc = chat_documents[0]
            # Get file data using the service center
            file_data = self.service.getFileData(search_doc.fileId)
            if not file_data:
                return ActionResult(
                    success=False, error="Could not retrieve file data for document"
                )
            content = file_data.decode("utf-8")
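
            # Shape of the search-result document consumed below. This is a sketch
            # inferred from the parsing logic, not a guaranteed schema; the envelope
            # produced by WebSearchActionResult may carry additional fields:
            #
            #   {
            #       "documentData": {
            #           "results": [
            #               {"title": "Example", "url": "https://example.com"},
            #               ...
            #           ]
            #       }
            #   }
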
            # Parse JSON to extract URLs from the search results
            urls = []
            try:
                # The document structure from WebSearchActionResult
                search_data = json.loads(content)
                # Extract URLs from the search results structure
                if isinstance(search_data, dict):
                    # documentData contains the actual search results
                    doc_data = search_data.get("documentData", search_data)
                    if "results" in doc_data and isinstance(doc_data["results"], list):
                        urls = [
                            result["url"]
                            for result in doc_data["results"]
                            if isinstance(result, dict) and "url" in result
                        ]
                    elif "urls" in doc_data and isinstance(doc_data["urls"], list):
                        # Fallback: URLs stored directly in a 'urls' field
                        urls = [
                            url for url in doc_data["urls"] if isinstance(url, str)
                        ]
                if not urls:
                    logger.warning(
                        "Could not extract URLs from JSON structure, trying plain text parsing"
                    )
            except json.JSONDecodeError:
                logger.warning("Document is not valid JSON, trying plain text parsing")
            if not urls:
                # Fallback: parse the document as plain text, splitting on newlines,
                # commas, and semicolons and keeping only http(s) URLs
                # (for backward compatibility)
                candidates = (u.strip() for u in re.split(r"[\n,;]+", content))
                urls = [u for u in candidates if u.startswith(("http://", "https://"))]
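            # Fallback example: a plain-text document such as
            #   "https://a.example\nhttps://b.example, not-a-url"
            # would yield ["https://a.example", "https://b.example"].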
            if not urls:
                return ActionResult(
                    success=False, error="No valid URLs found in the document."
                )
            logger.info(f"Extracted {len(urls)} URLs from document: {urls}")
            # Prepare request data
            web_crawl_request = WebCrawlRequest(urls=urls)
            # Perform request
            web_interface = await WebInterface.create()
            web_crawl_result = await web_interface.crawl(web_crawl_request)
            return web_crawl_result
        except Exception as e:
            logger.error(f"Error in crawl method: {e}")
            return ActionResult(success=False, error=str(e))
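
    # Hypothetical chaining of the two actions above (the dispatcher, instance
    # name, and the way a document reference is read off the search result are
    # assumptions for illustration, not part of this API):
    #
    #   search_result = await method_web.search({"query": "tavily api"})
    #   doc_ref = search_result.documents[0]  # hypothetical accessor
    #   crawl_result = await method_web.crawl({"document": doc_ref})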

    @action
    async def scrape(self, parameters: Dict[str, Any]) -> ActionResult:
        """Scrape web content by searching for URLs and then extracting their content.

        Combines the search and crawl operations in one step.

        Parameters:
            query (str): Search query to perform
            maxResults (int, optional): Maximum number of results (default: 10)
        """
        try:
            query = parameters.get("query")
            max_results = parameters.get("maxResults", 10)
            if not query:
                return ActionResult(success=False, error="Search query is required")
            # Prepare request data
            web_scrape_request = WebScrapeRequest(
                query=query,
                max_results=max_results,
            )
            # Perform request
            web_interface = await WebInterface.create()
            web_scrape_result = await web_interface.scrape(web_scrape_request)
            return web_scrape_result
        except Exception as e:
            return ActionResult(success=False, error=str(e))
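
# scrape() above is roughly search() followed by crawl() in a single call; a
# sketch (parameter names per the docstring, `method_web` instance assumed):
#
#   result = await method_web.scrape({"query": "tavily pricing", "maxResults": 3})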