# gateway/modules/workflows/methods/methodWeb.py

import logging
import csv
import io
from typing import Any, Dict
from modules.workflows.methods.methodBase import MethodBase, action
from modules.interfaces.interfaceChatModel import ActionResult, ActionDocument
from modules.interfaces.interfaceWebObjects import WebInterface
from modules.interfaces.interfaceWebModel import (
WebSearchRequest,
WebCrawlRequest,
WebScrapeRequest,
)
logger = logging.getLogger(__name__)
class MethodWeb(MethodBase):
"""Web method implementation for web operations."""
def __init__(self, serviceCenter: Any):
super().__init__(serviceCenter)
self.name = "web"
self.description = "Web search, crawling, and scraping operations using Tavily"
# Centralized services interface (for AI)
from modules.services import getInterface as getServices
self.services = getServices(self.service.user, self.service.workflow)
@action
async def search(self, parameters: Dict[str, Any]) -> ActionResult:
"""Perform a web search and output a CSV with the found URLs. Each result row contains columns "url" and "title".
Parameters:
query (str, required): Search query.
maxResults (int, optional): Max number of results. Default: 10.
searchDepth ("basic"|"advanced", optional): Search depth. Default: provider default.
timeRange ("d"|"w"|"m"|"y", optional): Limit to last day/week/month/year.
topic ("general"|"news"|"academic", optional): Result domain preference.
includeDomains (list[str], optional): Only include these domains.
excludeDomains (list[str], optional): Exclude these domains.
language (str, optional): ISO code like "de", "en" to bias results.
includeAnswer (bool, optional): Ask provider to generate a short answer.
includeRawContent (bool, optional): Include raw content where possible.
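Example (illustrative parameter dict; values are assumptions, not defaults):
{"query": "electric vehicle market share 2024", "maxResults": 5, "timeRange": "m"}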
"""
try:
# Prepare the search request from the supplied parameters
raw_query = parameters.get("query")
max_results = parameters.get("maxResults", 10)
if not raw_query or not isinstance(raw_query, str):
return ActionResult(success=False, error="Search query is required")
web_search_request = WebSearchRequest(
query=raw_query.strip(),
max_results=max_results,
search_depth=parameters.get("searchDepth"),
time_range=parameters.get("timeRange"),
topic=parameters.get("topic"),
include_domains=parameters.get("includeDomains"),
exclude_domains=parameters.get("excludeDomains"),
language=parameters.get("language"),
include_answer=parameters.get("includeAnswer"),
include_raw_content=parameters.get("includeRawContent"),
)
# Perform request
web_interface = await WebInterface.create()
web_search_result = await web_interface.search(web_search_request)
# Convert search results to CSV format (generic)
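# The resulting CSV has one row per hit with "url" and "title" columns, e.g.
# (the delimiter depends on the WebInterface implementation; ";" shown here):
#   url;title
#   https://example.com/article;Example article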
if web_search_result.success and web_search_result.documents:
csv_content = web_interface.convert_web_search_result_to_csv(web_search_result)
csv_document = web_interface.create_csv_action_document(
csv_content, "web_search_results.csv"
)
return ActionResult(success=True, documents=[csv_document])
else:
return web_search_result
except Exception as e:
logger.error(f"Error in search method: {str(e)}")
return ActionResult(success=False, error=str(e))
def _read_csv_with_urls(self, csv_content: str) -> list:
"""Read CSV content and extract URLs from url,title or title,url format (both ; and , delimiters)"""
urls = []
# Try both semicolon and comma delimiters
for delimiter in [';', ',']:
try:
reader = csv.DictReader(io.StringIO(csv_content), delimiter=delimiter)
for row in reader:
# Look for url column (case insensitive)
url = None
for key in row.keys():
if key.lower() == 'url':
url = (row[key] or "").strip()  # guard against None values from short rows
break
if url and (url.startswith('http://') or url.startswith('https://')):
urls.append(url)
# If we found URLs with this delimiter, return them
if urls:
return urls
except Exception:
# Try next delimiter
continue
# If no valid CSV found, try simple text parsing as fallback
lines = csv_content.split('\n')
for line in lines:
line = line.strip()
if line and (line.startswith('http://') or line.startswith('https://')):
urls.append(line)
return urls
@action
async def crawl(self, parameters: Dict[str, Any]) -> ActionResult:
"""Crawl a list of URLs and extract text content.
Parameters:
documentList (list[str]|str, required): Reference(s) to documents containing URLs (e.g., CSV from search). Can be a single ref or list.
expectedDocumentFormats (list, optional): Hint for downstream handling.
extractDepth ("basic"|"advanced", optional): Extraction depth. Default: "advanced".
format ("text"|"markdown", optional): Output format. Default: "text".
"""
try:
document_list = parameters.get("documentList")
# Normalize to list if a single string reference is provided
if isinstance(document_list, str):
document_list = [document_list]
if not document_list:
return ActionResult(
success=False, error="No document list reference provided."
)
# Resolve document list reference to ChatDocument objects
chat_documents = self.service.getChatDocumentsFromDocumentList(document_list)
if not chat_documents:
return ActionResult(
success=False,
error=f"No documents found for reference: {document_list}",
)
# Extract URLs from all documents and combine them
all_urls = []
import json
import re
for i, doc in enumerate(chat_documents):
logger.info(f"Processing document {i+1}/{len(chat_documents)}: {doc.fileName}")
# Get file data using the service center
file_data = self.service.getFileData(doc.fileId)
if not file_data:
logger.warning(f"Could not retrieve file data for document: {doc.fileName}")
continue
content = file_data.decode("utf-8")
# Prefer CSV parsing when the document is (or claims to be) a CSV file
if doc.fileName.lower().endswith('.csv') or 'csv' in (doc.mimeType or '').lower():
logger.info(f"Processing CSV file: {doc.fileName}")
doc_urls = self._read_csv_with_urls(content)
else:
# Parse JSON to extract URLs from search results
try:
# The document structure from WebSearchActionResult
search_data = json.loads(content)
# Extract URLs from the search results structure
doc_urls = []
if isinstance(search_data, dict):
# Handle the document structure: documentData contains the actual search results
doc_data = search_data.get("documentData", search_data)
if "results" in doc_data and isinstance(doc_data["results"], list):
doc_urls = [
result["url"]
for result in doc_data["results"]
if isinstance(result, dict) and "url" in result
]
elif "urls" in doc_data and isinstance(doc_data["urls"], list):
# Fallback: if URLs are stored directly in a 'urls' field
doc_urls = [url for url in doc_data["urls"] if isinstance(url, str)]
# Fallback: try to parse as plain text with regex (for backward compatibility)
if not doc_urls:
logger.warning(
f"Could not extract URLs from JSON structure in {doc.fileName}, trying plain text parsing"
)
doc_urls = re.split(r"[\n,;]+", content)
doc_urls = [
u.strip()
for u in doc_urls
if u.strip()
and (
u.strip().startswith("http://")
or u.strip().startswith("https://")
)
]
except json.JSONDecodeError:
# Fallback to plain text parsing if JSON parsing fails
logger.warning(f"Document {doc.fileName} is not valid JSON, trying plain text parsing")
doc_urls = re.split(r"[\n,;]+", content)
doc_urls = [
u.strip()
for u in doc_urls
if u.strip()
and (
u.strip().startswith("http://")
or u.strip().startswith("https://")
)
]
if doc_urls:
all_urls.extend(doc_urls)
logger.info(f"Extracted {len(doc_urls)} URLs from {doc.fileName}")
else:
logger.warning(f"No valid URLs found in document: {doc.fileName}")
if not all_urls:
return ActionResult(
success=False, error="No valid URLs found in any of the documents."
)
# Remove duplicates while preserving order
unique_urls = list(dict.fromkeys(all_urls))
logger.info(f"Extracted {len(unique_urls)} unique URLs from {len(chat_documents)} documents")
# Prepare request data with normalization
allowed_extract_depth = {"basic", "advanced"}
allowed_formats = {"text", "markdown"}
extract_depth = parameters.get("extractDepth")
if extract_depth and extract_depth not in allowed_extract_depth:
logger.warning(f"Invalid extractDepth '{extract_depth}' provided. Falling back to 'advanced'.")
extract_depth = "advanced"
fmt = parameters.get("format")
if fmt and fmt not in allowed_formats:
logger.warning(f"Invalid format '{fmt}' provided. Falling back to 'text'.")
fmt = "text"
web_crawl_request = WebCrawlRequest(
urls=unique_urls,
extract_depth=extract_depth,
format=fmt,
)
# Perform request
web_interface = await WebInterface.create()
web_crawl_result = await web_interface.crawl(web_crawl_request)
# Convert and enrich with concise summaries per URL for better context
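# Each enriched entry has the shape (values illustrative):
#   {"url": "https://example.com", "summary": "- point one\n- point two", "snippet": "<first 500 characters>"}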
if web_crawl_result.success:
try:
# Use a distinct name so the `doc` variable from the URL-extraction loop is not reused
crawl_doc = web_crawl_result.documents[0]
results = getattr(crawl_doc.documentData, "results", [])
enriched = []
# Summarize each result briefly using AI for added context
for item in results:
url = str(getattr(item, "url", ""))
content = str(getattr(item, "content", ""))
summary = ""
try:
if content:
prompt = (
"Summarize the following webpage content in 3-5 concise bullet points. "
"Focus on key points, figures, named entities (companies/institutions), and location context. "
"Return only bullet points without any preface."
)
context = content[:4000]
# Centralized AI summary (balanced analyse_content); the truncated page content
# is appended to the prompt so the model actually receives the text to summarize
summary = await self.services.ai.callAi(
prompt=f"{prompt}\n\n{context}",
documents=None,
options={
"process_type": "text",
"operation_type": "analyse_content",
"priority": "balanced",
"compress_prompt": True,
"compress_documents": False,
"processing_mode": "advanced",
"max_cost": 0.05,
"max_processing_time": 30
}
)
summary = summary.strip()
except Exception:
summary = ""
enriched.append({
"url": url,
"summary": summary,
"snippet": content[:500]
})
# json was already imported earlier in this method; no separate alias is needed
payload = {
"success": True,
"total_count": len(enriched),
"results": enriched,
}
json_content = json.dumps(payload, ensure_ascii=False, indent=2)
except Exception:
# Fallback to original conversion
json_content = web_interface.convert_web_result_to_json(web_crawl_result)
json_document = web_interface.create_json_action_document(
json_content, "web_crawl_results.json"
)
return ActionResult(success=True, documents=[json_document])
else:
return web_crawl_result
except Exception as e:
logger.error(f"Error in crawl method: {str(e)}")
return ActionResult(success=False, error=str(e))
@action
async def scrape(self, parameters: Dict[str, Any]) -> ActionResult:
"""Search and then crawl the found URLs in one step. To use for market analysis, web research, internet searches
Parameters:
query (str, required): Search query.
maxResults (int, optional): Max number of results. Default: 10.
searchDepth ("basic"|"advanced", optional): Search depth.
timeRange ("d"|"w"|"m"|"y", optional): Time window.
topic ("general"|"news"|"academic", optional): Result domain preference.
includeDomains (list[str], optional): Only include these domains.
excludeDomains (list[str], optional): Exclude these domains.
language (str, optional): ISO language bias.
includeAnswer (bool, optional): Ask provider to include an answer.
includeRawContent (bool, optional): Include raw content where possible.
extractDepth ("basic"|"advanced", optional): Crawl extraction depth. Default: "advanced".
format ("text"|"markdown", optional): Crawl output format. Default: "text".
"""
try:
query = parameters.get("query")
max_results = parameters.get("maxResults", 10)
# Normalize optional enums to avoid validation errors
allowed_search_depth = {"basic", "advanced"}
allowed_extract_depth = {"basic", "advanced"}
allowed_formats = {"text", "markdown"}
search_depth = parameters.get("searchDepth")
if search_depth and search_depth not in allowed_search_depth:
logger.warning(f"Invalid searchDepth '{search_depth}' provided. Falling back to None.")
search_depth = None
extract_depth = parameters.get("extractDepth")
if extract_depth and extract_depth not in allowed_extract_depth:
logger.warning(f"Invalid extractDepth '{extract_depth}' provided. Falling back to 'advanced'.")
extract_depth = "advanced"
fmt = parameters.get("format")
if fmt and fmt not in allowed_formats:
logger.warning(f"Invalid format '{fmt}' provided. Falling back to 'text'.")
fmt = "text"
if not query or not isinstance(query, str):
return ActionResult(success=False, error="Search query is required")
# Prepare request data
web_scrape_request = WebScrapeRequest(
query=query.strip(),
max_results=max_results,
search_depth=search_depth,
time_range=parameters.get("timeRange"),
topic=parameters.get("topic"),
include_domains=parameters.get("includeDomains"),
exclude_domains=parameters.get("excludeDomains"),
language=parameters.get("language"),
include_answer=parameters.get("includeAnswer"),
include_raw_content=parameters.get("includeRawContent"),
extract_depth=extract_depth,
format=fmt,
)
# Perform request
web_interface = await WebInterface.create()
web_scrape_result = await web_interface.scrape(web_scrape_request)
# Convert to proper JSON format
if web_scrape_result.success:
json_content = web_interface.convert_web_result_to_json(web_scrape_result)
json_document = web_interface.create_json_action_document(
json_content,
f"web_scrape_results.json"
)
return ActionResult(
success=True,
documents=[json_document]
)
else:
return web_scrape_result
except Exception as e:
logger.error(f"Error in scrape method: {str(e)}")
return ActionResult(success=False, error=str(e))