import json
import logging
import re
from typing import Any, Dict

from modules.chat.methodBase import MethodBase, action
from modules.interfaces.interfaceChatModel import ActionResult
from modules.interfaces.interface_web_objects import WebInterface
from modules.interfaces.interface_web_model import (
    WebSearchRequest,
    WebCrawlRequest,
    WebScrapeRequest,
)

logger = logging.getLogger(__name__)


class MethodWeb(MethodBase):
    """Web method implementation for web operations."""

    def __init__(self, serviceCenter: Any):
        super().__init__(serviceCenter)
        self.name = "web"
        self.description = "Web search, crawling, and scraping operations using Tavily"

    @action
    async def search(self, parameters: Dict[str, Any]) -> ActionResult:
        """Perform a web search and output a .json file with a list of found URLs.

        Each result contains "title" and "url".

        Parameters:
            query (str): Search query to perform
            maxResults (int, optional): Maximum number of results (default: 10)
        """
        # TODO: Fix docstrings - do we need that format for parsing?
        try:
            query = parameters.get("query")
            if not query:
                # Guard mirrors scrape(): fail early instead of building a
                # request with a missing query
                return ActionResult(success=False, error="Search query is required")

            # Prepare request data
            web_search_request = WebSearchRequest(
                query=query,
                max_results=parameters.get("maxResults", 10),
            )

            # Perform request
            web_interface = await WebInterface.create()
            web_search_result = await web_interface.search(web_search_request)

            return web_search_result

        except Exception as e:
            return ActionResult(success=False, error=str(e))
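
    # Minimal usage sketch (an assumption: the exact call path depends on how
    # MethodBase and the @action decorator dispatch actions; `service_center`
    # stands in for a configured instance):
    #
    #   method = MethodWeb(service_center)
    #   result = await method.search({"query": "tavily python sdk", "maxResults": 5})
    #   if result.success:
    #       ...  # result references a .json document of {"title", "url"} entries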

    @action
    async def crawl(self, parameters: Dict[str, Any]) -> ActionResult:
        """Crawl a list of URLs and extract information from them.

        Parameters:
            document (str): Document reference containing the URL list from search results
            expectedDocumentFormats (list, optional): Expected document formats with extension, mimeType, description
        """
        try:
            document_ref = parameters.get("document")

            if not document_ref:
                return ActionResult(
                    success=False, error="No document reference provided."
                )

            # Resolve document reference to ChatDocument objects
            chat_documents = self.service.getChatDocumentsFromDocumentList(
                [document_ref]
            )

            if not chat_documents:
                return ActionResult(
                    success=False,
                    error=f"No documents found for reference: {document_ref}",
                )

            # Get the first document (search results)
            search_doc = chat_documents[0]

            # Get file data using the service center
            file_data = self.service.getFileData(search_doc.fileId)
            if not file_data:
                return ActionResult(
                    success=False, error="Could not retrieve file data for document"
                )

            content = file_data.decode("utf-8")
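
            # Expected document shape (an assumption inferred from the parsing
            # below; the search action stores its results in this form):
            #
            #   {"documentData": {"results": [{"title": "...", "url": "https://..."}]}}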

            def extract_urls_from_text(text: str) -> list:
                """Plain-text fallback shared by both parsing paths: split on
                newlines, commas, and semicolons and keep absolute HTTP(S) URLs."""
                return [
                    u.strip()
                    for u in re.split(r"[\n,;]+", text)
                    if u.strip().startswith(("http://", "https://"))
                ]

            # Parse JSON to extract URLs from search results
            try:
                # The document structure from WebSearchActionResult
                search_data = json.loads(content)

                # Extract URLs from the search results structure
                urls = []
                if isinstance(search_data, dict):
                    # Handle the document structure: documentData contains the
                    # actual search results
                    doc_data = search_data.get("documentData", search_data)
                    if "results" in doc_data and isinstance(doc_data["results"], list):
                        urls = [
                            result["url"]
                            for result in doc_data["results"]
                            if isinstance(result, dict) and "url" in result
                        ]
                    elif "urls" in doc_data and isinstance(doc_data["urls"], list):
                        # Fallback: URLs stored directly in a 'urls' field
                        urls = [url for url in doc_data["urls"] if isinstance(url, str)]

                # Fallback: parse as plain text (kept for backward compatibility)
                if not urls:
                    logger.warning(
                        "Could not extract URLs from JSON structure, trying plain text parsing"
                    )
                    urls = extract_urls_from_text(content)

            except json.JSONDecodeError:
                # Fallback to plain text parsing if JSON parsing fails
                logger.warning("Document is not valid JSON, trying plain text parsing")
                urls = extract_urls_from_text(content)

            if not urls:
                return ActionResult(
                    success=False, error="No valid URLs found in the document."
                )

            logger.info(f"Extracted {len(urls)} URLs from document: {urls}")

            # Prepare request data
            web_crawl_request = WebCrawlRequest(urls=urls)

            # Perform request
            web_interface = await WebInterface.create()
            web_crawl_result = await web_interface.crawl(web_crawl_request)

            return web_crawl_result

        except Exception as e:
            logger.error(f"Error in crawl method: {e}")
            return ActionResult(success=False, error=str(e))
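
    # Usage sketch for crawl() (hedged: the document reference format is
    # defined by the surrounding chat framework and is assumed here):
    #
    #   await method.crawl({"document": "<reference-to-search-results-document>"})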

    @action
    async def scrape(self, parameters: Dict[str, Any]) -> ActionResult:
        """Scrape web content by searching for URLs and then extracting their content.

        Combines the search and crawl operations in one step.

        Parameters:
            query (str): Search query to perform
            maxResults (int, optional): Maximum number of results (default: 10)
        """
        try:
            query = parameters.get("query")
            max_results = parameters.get("maxResults", 10)

            if not query:
                return ActionResult(success=False, error="Search query is required")

            # Prepare request data
            web_scrape_request = WebScrapeRequest(
                query=query,
                max_results=max_results,
            )

            # Perform request
            web_interface = await WebInterface.create()
            web_scrape_result = await web_interface.scrape(web_scrape_request)

            return web_scrape_result

        except Exception as e:
            return ActionResult(success=False, error=str(e))
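
    # scrape() is a convenience that, per the docstring above, combines the
    # search and crawl steps in a single request, e.g. (sketch):
    #
    #   result = await method.scrape({"query": "site reliability", "maxResults": 3})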