import csv
import io
import json as _json
import logging
import re
from typing import Any, Dict

from modules.workflows.methods.methodBase import MethodBase, action
from modules.datamodels.datamodelWorkflow import ActionResult, ActionDocument
from modules.datamodels.datamodelAi import AiCallOptions, OperationType, Priority
from modules.datamodels.datamodelWeb import (
    WebSearchRequest,
    WebCrawlRequest,
    WebScrapeRequest,
)

logger = logging.getLogger(__name__)


class MethodWeb(MethodBase):
    """Web method implementation for web operations."""

    def __init__(self, services):
        super().__init__(services)
        self.name = "web"
        self.description = "Web search, crawling, and scraping operations using Tavily"

    @action
    async def search(self, parameters: Dict[str, Any]) -> ActionResult:
        """Perform a web search and output a CSV with the found URLs.

        Each result row contains the columns "url" and "title".

        Parameters:
            query (str, required): Search query.
            maxResults (int, optional): Max number of results. Default: 10.
            searchDepth ("basic"|"advanced", optional): Search depth. Default: provider default.
            timeRange ("d"|"w"|"m"|"y", optional): Limit to last day/week/month/year.
            topic ("general"|"news"|"academic", optional): Result domain preference.
            includeDomains (list[str], optional): Only include these domains.
            excludeDomains (list[str], optional): Exclude these domains.
            language (str, optional): ISO code like "de", "en" to bias results.
            includeAnswer (bool, optional): Ask provider to generate a short answer.
            includeRawContent (bool, optional): Include raw content where possible.
        """
        try:
            # Prepare request data (generic, no region/language bias)
            raw_query = parameters.get("query")
            max_results = parameters.get("maxResults", 10)

            if not raw_query or not isinstance(raw_query, str):
                return ActionResult(success=False, error="Search query is required")

            web_search_request = WebSearchRequest(
                query=raw_query.strip(),
                max_results=max_results,
                search_depth=parameters.get("searchDepth"),
                time_range=parameters.get("timeRange"),
                topic=parameters.get("topic"),
                include_domains=parameters.get("includeDomains"),
                exclude_domains=parameters.get("excludeDomains"),
                language=parameters.get("language"),
                include_answer=parameters.get("includeAnswer"),
                include_raw_content=parameters.get("includeRawContent"),
            )

            # Perform request via centralized service wrappers
            web_search_result = await self.services.web.webSearch(web_search_request)

            # Convert search results to CSV format (generic)
            if web_search_result.success and web_search_result.documents:
                csv_content = self._convert_web_result_to_csv(web_search_result)
                csv_document = ActionDocument(
                    documentName="web_search_results.csv",
                    documentData=csv_content,
                    mimeType="text/csv",
                )
                return ActionResult(success=True, documents=[csv_document])
            else:
                return web_search_result

        except Exception as e:
            return ActionResult(success=False, error=str(e))
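
    # Illustrative layout of the semicolon-delimited CSV that `search` emits
    # (the URLs and titles below are examples only):
    #
    #   url;title
    #   https://example.com/article-1;Example article 1
    #   https://example.com/article-2;Example article 2
    #
    # `_read_csv_with_urls` below also accepts comma-delimited files and any
    # column order, as long as a "url" column (case-insensitive) is present.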

    def _read_csv_with_urls(self, csv_content: str) -> list:
        """Read CSV content and extract URLs from url,title or title,url format (both ; and , delimiters)."""
        urls = []

        # Try both semicolon and comma delimiters
        for delimiter in [';', ',']:
            try:
                reader = csv.DictReader(io.StringIO(csv_content), delimiter=delimiter)
                for row in reader:
                    # Look for url column (case insensitive)
                    url = None
                    for key in row.keys():
                        if key.lower() == 'url':
                            url = row[key].strip()
                            break
                    if url and (url.startswith('http://') or url.startswith('https://')):
                        urls.append(url)

                # If we found URLs with this delimiter, return them
                if urls:
                    return urls
            except Exception:
                # Try next delimiter
                continue

        # If no valid CSV found, try simple text parsing as fallback
        lines = csv_content.split('\n')
        for line in lines:
            line = line.strip()
            if line and (line.startswith('http://') or line.startswith('https://')):
                urls.append(line)

        return urls
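
    # For non-CSV inputs, `crawl` below falls back to parsing search-result
    # JSON. Illustrative shape it understands (values are examples only):
    #
    #   {
    #     "documentData": {
    #       "results": [{"url": "https://example.com/article-1", "title": "Example"}]
    #     }
    #   }
    #
    # Either a "results" list whose entries carry a "url" key or a plain
    # "urls" list is accepted; anything else is scanned for http(s) URLs as
    # plain text.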

    @action
    async def crawl(self, parameters: Dict[str, Any]) -> ActionResult:
        """Crawl a list of URLs and extract text content.

        Parameters:
            documentList (list[str]|str, required): Reference(s) to documents containing URLs
                (e.g., CSV from search). Can be a single ref or a list.
            expectedDocumentFormats (list, optional): Hint for downstream handling.
            extractDepth ("basic"|"advanced", optional): Extraction depth. Default: "advanced".
            format ("text"|"markdown", optional): Output format. Default: "text".
        """
        try:
            document_list = parameters.get("documentList")

            # Normalize to list if a single string reference is provided
            if isinstance(document_list, str):
                document_list = [document_list]

            if not document_list:
                return ActionResult(
                    success=False,
                    error="No document list reference provided."
                )

            # Resolve document list reference to ChatDocument objects
            chat_documents = self.services.workflow.getChatDocumentsFromDocumentList(document_list)
            if not chat_documents:
                return ActionResult(
                    success=False,
                    error=f"No documents found for reference: {document_list}",
                )

            # Extract URLs from all documents and combine them
            all_urls = []

            for i, doc in enumerate(chat_documents):
                logger.info(f"Processing document {i+1}/{len(chat_documents)}: {doc.fileName}")

                # Get file data using the service center
                file_data = self.services.workflow.getFileData(doc.fileId)
                if not file_data:
                    logger.warning(f"Could not retrieve file data for document: {doc.fileName}")
                    continue

                content = file_data.decode("utf-8")

                # Try to parse as CSV first (for the new CSV format)
                if doc.fileName.lower().endswith('.csv') or 'csv' in doc.mimeType.lower():
                    logger.info(f"Processing CSV file: {doc.fileName}")
                    doc_urls = self._read_csv_with_urls(content)
                else:
                    # Parse JSON to extract URLs from search results
                    try:
                        # The document structure from WebSearchActionResult
                        search_data = _json.loads(content)

                        # Extract URLs from the search results structure
                        doc_urls = []
                        if isinstance(search_data, dict):
                            # documentData contains the actual search results
                            doc_data = search_data.get("documentData", search_data)
                            if "results" in doc_data and isinstance(doc_data["results"], list):
                                doc_urls = [
                                    result["url"]
                                    for result in doc_data["results"]
                                    if isinstance(result, dict) and "url" in result
                                ]
                            elif "urls" in doc_data and isinstance(doc_data["urls"], list):
                                # Fallback: if URLs are stored directly in a 'urls' field
                                doc_urls = [url for url in doc_data["urls"] if isinstance(url, str)]

                        # Fallback: try to parse as plain text with regex (for backward compatibility)
                        if not doc_urls:
                            logger.warning(
                                f"Could not extract URLs from JSON structure in {doc.fileName}, trying plain text parsing"
                            )
                            doc_urls = re.split(r"[\n,;]+", content)
                            doc_urls = [
                                u.strip()
                                for u in doc_urls
                                if u.strip()
                                and (
                                    u.strip().startswith("http://")
                                    or u.strip().startswith("https://")
                                )
                            ]
                    except _json.JSONDecodeError:
                        # Fallback to plain text parsing if JSON parsing fails
                        logger.warning(f"Document {doc.fileName} is not valid JSON, trying plain text parsing")
                        doc_urls = re.split(r"[\n,;]+", content)
                        doc_urls = [
                            u.strip()
                            for u in doc_urls
                            if u.strip()
                            and (
                                u.strip().startswith("http://")
                                or u.strip().startswith("https://")
                            )
                        ]

                if doc_urls:
                    all_urls.extend(doc_urls)
                    logger.info(f"Extracted {len(doc_urls)} URLs from {doc.fileName}")
                else:
                    logger.warning(f"No valid URLs found in document: {doc.fileName}")

            if not all_urls:
                return ActionResult(
                    success=False,
                    error="No valid URLs found in any of the documents."
                )

            # Remove duplicates while preserving order
            unique_urls = list(dict.fromkeys(all_urls))
            logger.info(f"Extracted {len(unique_urls)} unique URLs from {len(chat_documents)} documents")

            # Prepare request data with normalization
            allowed_extract_depth = {"basic", "advanced"}
            allowed_formats = {"text", "markdown"}

            extract_depth = parameters.get("extractDepth")
            if extract_depth and extract_depth not in allowed_extract_depth:
                logger.warning(f"Invalid extractDepth '{extract_depth}' provided. Falling back to 'advanced'.")
                extract_depth = "advanced"

            fmt = parameters.get("format")
            if fmt and fmt not in allowed_formats:
                logger.warning(f"Invalid format '{fmt}' provided. Falling back to 'text'.")
                fmt = "text"

            web_crawl_request = WebCrawlRequest(
                urls=unique_urls,
                extract_depth=extract_depth,
                format=fmt,
            )

            # Perform request via centralized service wrappers
            web_crawl_result = await self.services.web.webCrawl(web_crawl_request)

            # Convert and enrich with concise summaries per URL for better context
            if web_crawl_result.success:
                try:
                    doc = web_crawl_result.documents[0]
                    results = getattr(doc.documentData, "results", [])
                    enriched = []

                    # Summarize each result briefly using AI for added context
                    for item in results:
                        url = str(getattr(item, "url", ""))
                        content = str(getattr(item, "content", ""))
                        summary = ""
                        try:
                            if content:
                                prompt = (
                                    "Summarize the following webpage content in 3-5 concise bullet points. "
                                    "Focus on key points, figures, named entities (companies/institutions), and location context. "
                                    "Return only bullet points without any preface."
                                )
                                # Truncate the page content and append it to the prompt
                                # so the model actually receives the text to summarize
                                context = content[:4000]
                                # Centralized AI summary (balanced analyse_content)
                                summary = await self.services.ai.callAi(
                                    prompt=f"{prompt}\n\n{context}",
                                    documents=None,
                                    options=AiCallOptions(
                                        operationType=OperationType.ANALYSE_CONTENT,
                                        priority=Priority.BALANCED,
                                        compressPrompt=True,
                                        compressContext=False,
                                        processingMode="advanced",
                                        maxCost=0.05,
                                        maxProcessingTime=30
                                    )
                                )
                                summary = summary.strip()
                        except Exception:
                            summary = ""

                        enriched.append({
                            "url": url,
                            "summary": summary,
                            "snippet": content[:500]
                        })

                    payload = {
                        "success": True,
                        "total_count": len(enriched),
                        "results": enriched,
                    }
                    json_content = _json.dumps(payload, ensure_ascii=False, indent=2)
                except Exception:
                    # Fallback to original conversion
                    json_content = self._convert_web_result_to_json(web_crawl_result)

                json_document = ActionDocument(
                    documentName="web_crawl_results.json",
                    documentData=json_content,
                    mimeType="application/json"
                )
                return ActionResult(success=True, documents=[json_document])
            else:
                return web_crawl_result

        except Exception as e:
            logger.error(f"Error in crawl method: {str(e)}")
            return ActionResult(success=False, error=str(e))
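
    # Illustrative shape of the enriched JSON document that `crawl` returns
    # (summaries come from the AI service; the values are examples only):
    #
    #   {
    #     "success": true,
    #     "total_count": 1,
    #     "results": [
    #       {
    #         "url": "https://example.com/article-1",
    #         "summary": "- key point one\n- key point two",
    #         "snippet": "First 500 characters of the extracted page content ..."
    #       }
    #     ]
    #   }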

    @action
    async def scrape(self, parameters: Dict[str, Any]) -> ActionResult:
        """Search and then crawl the found URLs in one step.

        Use for market analysis, web research, and general internet searches.

        Parameters:
            query (str, required): Search query.
            maxResults (int, optional): Max number of results. Default: 10.
            searchDepth ("basic"|"advanced", optional): Search depth.
            timeRange ("d"|"w"|"m"|"y", optional): Time window.
            topic ("general"|"news"|"academic", optional): Result domain preference.
            includeDomains (list[str], optional): Only include these domains.
            excludeDomains (list[str], optional): Exclude these domains.
            language (str, optional): ISO language bias.
            includeAnswer (bool, optional): Ask provider to include an answer.
            includeRawContent (bool, optional): Include raw content where possible.
            extractDepth ("basic"|"advanced", optional): Crawl extraction depth. Default: "advanced".
            format ("text"|"markdown", optional): Crawl output format. Default: "text".
        """
        try:
            query = parameters.get("query")
            max_results = parameters.get("maxResults", 10)

            # Normalize optional enums to avoid validation errors
            allowed_search_depth = {"basic", "advanced"}
            allowed_extract_depth = {"basic", "advanced"}
            allowed_formats = {"text", "markdown"}

            search_depth = parameters.get("searchDepth")
            if search_depth and search_depth not in allowed_search_depth:
                logger.warning(f"Invalid searchDepth '{search_depth}' provided. Falling back to None.")
                search_depth = None

            extract_depth = parameters.get("extractDepth")
            if extract_depth and extract_depth not in allowed_extract_depth:
                logger.warning(f"Invalid extractDepth '{extract_depth}' provided. Falling back to 'advanced'.")
                extract_depth = "advanced"

            fmt = parameters.get("format")
            if fmt and fmt not in allowed_formats:
                logger.warning(f"Invalid format '{fmt}' provided. Falling back to 'text'.")
                fmt = "text"

            if not query:
                return ActionResult(success=False, error="Search query is required")

            # Prepare request data
            web_scrape_request = WebScrapeRequest(
                query=query,
                max_results=max_results,
                search_depth=search_depth,
                time_range=parameters.get("timeRange"),
                topic=parameters.get("topic"),
                include_domains=parameters.get("includeDomains"),
                exclude_domains=parameters.get("excludeDomains"),
                language=parameters.get("language"),
                include_answer=parameters.get("includeAnswer"),
                include_raw_content=parameters.get("includeRawContent"),
                extract_depth=extract_depth,
                format=fmt,
            )

            # Perform request via centralized service wrappers
            web_scrape_result = await self.services.web.webScrape(web_scrape_request)

            # Convert to proper JSON format
            if web_scrape_result.success:
                json_content = self._convert_web_result_to_json(web_scrape_result)
                json_document = ActionDocument(
                    documentName="web_scrape_results.json",
                    documentData=json_content,
                    mimeType="application/json"
                )
                return ActionResult(
                    success=True,
                    documents=[json_document]
                )
            else:
                return web_scrape_result

        except Exception as e:
            return ActionResult(success=False, error=str(e))

    # Helpers

    def _convert_web_result_to_json(self, web_result):
        if not getattr(web_result, 'success', False) or not getattr(web_result, 'documents', None):
            return _json.dumps({
                "success": getattr(web_result, 'success', False),
                "error": getattr(web_result, 'error', None),
            })

        document_data = web_result.documents[0].documentData
        result_dict = {
            "success": True,
            "results": [
                {
                    "url": str(getattr(result, 'url', "")),
                    "content": getattr(result, 'content', "")
                }
                for result in getattr(document_data, 'results', [])
            ],
            "total_count": getattr(document_data, 'total_count', 0)
        }

        if hasattr(document_data, 'urls'):
            result_dict["urls"] = [str(url) for url in getattr(document_data, 'urls', [])]
        elif hasattr(document_data, 'query'):
            result_dict["query"] = getattr(document_data, 'query', None)

        return _json.dumps(result_dict, indent=2, ensure_ascii=False)

    def _convert_web_result_to_csv(self, web_search_result):
        if not getattr(web_search_result, 'success', False) or not getattr(web_search_result, 'documents', None):
            return ""

        output = io.StringIO()
        writer = csv.writer(output, delimiter=';')
        writer.writerow(['url', 'title'])

        document_data = web_search_result.documents[0].documentData
        for result in getattr(document_data, 'results', []):
            writer.writerow([str(getattr(result, 'url', "")), getattr(result, 'title', "")])

        return output.getvalue()
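

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not executed on import). In production the
# workflow engine constructs MethodWeb with a configured services container;
# the `services` argument and the document-list reference below are
# hypothetical placeholders, not a documented entry point.
# ---------------------------------------------------------------------------
if __name__ == "__main__":  # pragma: no cover
    import asyncio

    async def _demo(services) -> None:
        # `services` must expose the web, workflow, and ai wrappers used above.
        method = MethodWeb(services)

        # Step 1: search produces a CSV document with url/title columns.
        search_result = await method.search({"query": "open data portals", "maxResults": 5})

        # Step 2: crawl resolves the document reference, extracts the URLs,
        # and returns an enriched JSON document with per-URL summaries.
        if search_result.success:
            crawl_result = await method.crawl({"documentList": ["web_search_results.csv"]})
            print(crawl_result.success)

    # asyncio.run(_demo(services))  # requires a configured services container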