Refactored web research to be generic and two-step: URL search, then crawl

This commit is contained in:
ValueOn AG 2025-10-26 14:02:44 +01:00
parent e8c3052176
commit 72e0687826
9 changed files with 1079 additions and 2169 deletions

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -21,11 +21,8 @@ class OperationTypeEnum(str, Enum):
IMAGE_GENERATE = "imageGenerate"
# Web Operations
WEB_SEARCH = "webSearch" # Returns list of URLs only
WEB_CRAWL = "webCrawl" # Returns content from given URLs
WEB_RESEARCH = "webResearch" # WEB_SEARCH + WEB_CRAWL combined (scrape function)
WEB_QUESTIONS = "webQuestions" # Question-answering web research
WEB_NEWS = "webNews" # News-specific web research
WEB_SEARCH = "webSearch" # Returns list of URLs only
WEB_CRAWL = "webCrawl" # Web crawl for a given URL
# Operation Type Rating - Helper class for capability ratings
@ -49,8 +46,8 @@ def createOperationTypeRatings(*ratings: Tuple[OperationTypeEnum, int]) -> List[
Usage:
operationTypes = createOperationTypeRatings(
(OperationTypeEnum.DATA_ANALYSE, 8),
(OperationTypeEnum.WEB_RESEARCH, 10),
(OperationTypeEnum.WEB_NEWS, 7)
(OperationTypeEnum.WEB_SEARCH, 10),
(OperationTypeEnum.WEB_CRAWL, 9)
)
"""
return [OperationTypeRating(operationType=ot, rating=rating) for ot, rating in ratings]
@ -195,3 +192,42 @@ class AiModelResponse(BaseModel):
class Config:
arbitraryTypesAllowed = True
# Structured prompt models for specialized operations
class AiCallPromptWebSearch(BaseModel):
    """Structured prompt format for WEB_SEARCH operation - returns list of URLs.

    Serialized (via model_dump_json) into the prompt sent to the web-search
    provider; field descriptions double as instructions for callers.
    """

    instruction: str = Field(description="Search instruction/query for finding relevant URLs")
    country: Optional[str] = Field(default=None, description="Two-digit country code (lowercase, e.g., ch, us, de, fr)")
    maxNumberPages: Optional[int] = Field(default=10, description="Maximum number of pages to search (default: 10)")
    timeRange: Optional[str] = Field(default=None, description="Time range filter (d, w, m, y)")
    language: Optional[str] = Field(default=None, description="Language code (lowercase, e.g., de, en, fr)")
    researchDepth: Optional[str] = Field(default="general", description="Research depth: fast (maxDepth=1), general (maxDepth=2), deep (maxDepth=3)")
    # NOTE(review): removed the empty `class Config: pass` — an inner Config with
    # no attributes has no effect in pydantic and was dead code.
class AiCallPromptWebCrawl(BaseModel):
    """Structured prompt format for WEB_CRAWL operation - crawls ONE specific URL and returns content.

    One instance per URL; callers loop over URLs and serialize this model as the
    crawl prompt.
    """

    instruction: str = Field(description="Instruction for what content to extract from URL")
    url: str = Field(description="Single URL to crawl")
    maxDepth: Optional[int] = Field(default=2, description="Maximum number of hops from starting page (default: 2)")
    maxWidth: Optional[int] = Field(default=10, description="Maximum pages to crawl per level (default: 10)")
    # NOTE(review): removed the empty `class Config: pass` — a Config without
    # attributes is a no-op in pydantic and was dead code.
class AiCallPromptImage(BaseModel):
    """Structured prompt format for image generation.

    Size/quality/style defaults mirror the common DALL-E-style options; the
    accepted values are listed in each field description.
    """

    prompt: str = Field(description="Text description of the image to generate")
    size: Optional[str] = Field(default="1024x1024", description="Image size (1024x1024, 1792x1024, 1024x1792)")
    quality: Optional[str] = Field(default="standard", description="Image quality (standard, hd)")
    style: Optional[str] = Field(default="vivid", description="Image style (vivid, natural)")
    # NOTE(review): removed the empty `class Config: pass` — it configured
    # nothing and was dead code.

View file

@ -0,0 +1,225 @@
"""
Utility data models and classes for common tools and mappings.
"""
class CountryCodes:
    """
    Centralized country code mapping for different services.

    Maps ISO-2 country codes to service-specific country names.
    Each service may have different requirements for country names:
    Tavily expects lowercase names, Perplexity expects title-case names.
    Unknown codes are passed through unchanged by the getters.
    """

    # Named tuple indices into _COUNTRY_MAP values, for readable lookups.
    _TAVILY_INDEX = 0
    _PERPLEXITY_INDEX = 1

    # Mapping: ISO-2 code -> (Tavily country name, Perplexity country name)
    _COUNTRY_MAP = {
        "AF": ("afghanistan", "Afghanistan"),
        "AL": ("albania", "Albania"),
        "DZ": ("algeria", "Algeria"),
        "AD": ("andorra", "Andorra"),
        "AO": ("angola", "Angola"),
        "AR": ("argentina", "Argentina"),
        "AM": ("armenia", "Armenia"),
        "AU": ("australia", "Australia"),
        "AT": ("austria", "Austria"),
        "AZ": ("azerbaijan", "Azerbaijan"),
        "BS": ("bahamas", "Bahamas"),
        "BH": ("bahrain", "Bahrain"),
        "BD": ("bangladesh", "Bangladesh"),
        "BB": ("barbados", "Barbados"),
        "BY": ("belarus", "Belarus"),
        "BE": ("belgium", "Belgium"),
        "BZ": ("belize", "Belize"),
        "BJ": ("benin", "Benin"),
        "BT": ("bhutan", "Bhutan"),
        "BO": ("bolivia", "Bolivia"),
        "BA": ("bosnia and herzegovina", "Bosnia and Herzegovina"),
        "BW": ("botswana", "Botswana"),
        "BR": ("brazil", "Brazil"),
        "BN": ("brunei", "Brunei"),
        "BG": ("bulgaria", "Bulgaria"),
        "BF": ("burkina faso", "Burkina Faso"),
        "BI": ("burundi", "Burundi"),
        "KH": ("cambodia", "Cambodia"),
        "CM": ("cameroon", "Cameroon"),
        "CA": ("canada", "Canada"),
        "CV": ("cape verde", "Cape Verde"),
        "CF": ("central african republic", "Central African Republic"),
        "TD": ("chad", "Chad"),
        "CL": ("chile", "Chile"),
        "CN": ("china", "China"),
        "CO": ("colombia", "Colombia"),
        "KM": ("comoros", "Comoros"),
        "CG": ("congo", "Congo"),
        "CR": ("costa rica", "Costa Rica"),
        "HR": ("croatia", "Croatia"),
        "CU": ("cuba", "Cuba"),
        "CY": ("cyprus", "Cyprus"),
        "CZ": ("czech republic", "Czech Republic"),
        "DK": ("denmark", "Denmark"),
        "DJ": ("djibouti", "Djibouti"),
        "DO": ("dominican republic", "Dominican Republic"),
        "EC": ("ecuador", "Ecuador"),
        "EG": ("egypt", "Egypt"),
        "SV": ("el salvador", "El Salvador"),
        "GQ": ("equatorial guinea", "Equatorial Guinea"),
        "ER": ("eritrea", "Eritrea"),
        "EE": ("estonia", "Estonia"),
        "ET": ("ethiopia", "Ethiopia"),
        "FJ": ("fiji", "Fiji"),
        "FI": ("finland", "Finland"),
        "FR": ("france", "France"),
        "GA": ("gabon", "Gabon"),
        "GM": ("gambia", "Gambia"),
        "GE": ("georgia", "Georgia"),
        "DE": ("germany", "Germany"),
        "GH": ("ghana", "Ghana"),
        "GR": ("greece", "Greece"),
        "GT": ("guatemala", "Guatemala"),
        "GN": ("guinea", "Guinea"),
        "HT": ("haiti", "Haiti"),
        "HN": ("honduras", "Honduras"),
        "HU": ("hungary", "Hungary"),
        "IS": ("iceland", "Iceland"),
        "IN": ("india", "India"),
        "ID": ("indonesia", "Indonesia"),
        "IR": ("iran", "Iran"),
        "IQ": ("iraq", "Iraq"),
        "IE": ("ireland", "Ireland"),
        "IL": ("israel", "Israel"),
        "IT": ("italy", "Italy"),
        "JM": ("jamaica", "Jamaica"),
        "JP": ("japan", "Japan"),
        "JO": ("jordan", "Jordan"),
        "KZ": ("kazakhstan", "Kazakhstan"),
        "KE": ("kenya", "Kenya"),
        "KW": ("kuwait", "Kuwait"),
        "KG": ("kyrgyzstan", "Kyrgyzstan"),
        "LV": ("latvia", "Latvia"),
        "LB": ("lebanon", "Lebanon"),
        "LS": ("lesotho", "Lesotho"),
        "LR": ("liberia", "Liberia"),
        "LY": ("libya", "Libya"),
        "LI": ("liechtenstein", "Liechtenstein"),
        "LT": ("lithuania", "Lithuania"),
        "LU": ("luxembourg", "Luxembourg"),
        "MG": ("madagascar", "Madagascar"),
        "MW": ("malawi", "Malawi"),
        "MY": ("malaysia", "Malaysia"),
        "MV": ("maldives", "Maldives"),
        "ML": ("mali", "Mali"),
        "MT": ("malta", "Malta"),
        "MR": ("mauritania", "Mauritania"),
        "MU": ("mauritius", "Mauritius"),
        "MX": ("mexico", "Mexico"),
        "MD": ("moldova", "Moldova"),
        "MC": ("monaco", "Monaco"),
        "MN": ("mongolia", "Mongolia"),
        "ME": ("montenegro", "Montenegro"),
        "MA": ("morocco", "Morocco"),
        "MZ": ("mozambique", "Mozambique"),
        "MM": ("myanmar", "Myanmar"),
        "NA": ("namibia", "Namibia"),
        "NP": ("nepal", "Nepal"),
        "NL": ("netherlands", "Netherlands"),
        "NZ": ("new zealand", "New Zealand"),
        "NI": ("nicaragua", "Nicaragua"),
        "NE": ("niger", "Niger"),
        "NG": ("nigeria", "Nigeria"),
        "KP": ("north korea", "North Korea"),
        "MK": ("north macedonia", "North Macedonia"),
        "NO": ("norway", "Norway"),
        "OM": ("oman", "Oman"),
        "PK": ("pakistan", "Pakistan"),
        "PA": ("panama", "Panama"),
        "PG": ("papua new guinea", "Papua New Guinea"),
        "PY": ("paraguay", "Paraguay"),
        "PE": ("peru", "Peru"),
        "PH": ("philippines", "Philippines"),
        "PL": ("poland", "Poland"),
        "PT": ("portugal", "Portugal"),
        "QA": ("qatar", "Qatar"),
        "RO": ("romania", "Romania"),
        "RU": ("russia", "Russia"),
        "RW": ("rwanda", "Rwanda"),
        "SA": ("saudi arabia", "Saudi Arabia"),
        "SN": ("senegal", "Senegal"),
        "RS": ("serbia", "Serbia"),
        "SG": ("singapore", "Singapore"),
        "SK": ("slovakia", "Slovakia"),
        "SI": ("slovenia", "Slovenia"),
        "SO": ("somalia", "Somalia"),
        "ZA": ("south africa", "South Africa"),
        "KR": ("south korea", "South Korea"),
        "SS": ("south sudan", "South Sudan"),
        "ES": ("spain", "Spain"),
        "LK": ("sri lanka", "Sri Lanka"),
        "SD": ("sudan", "Sudan"),
        "SE": ("sweden", "Sweden"),
        "CH": ("switzerland", "Switzerland"),
        "SY": ("syria", "Syria"),
        "TW": ("taiwan", "Taiwan"),
        "TJ": ("tajikistan", "Tajikistan"),
        "TZ": ("tanzania", "Tanzania"),
        "TH": ("thailand", "Thailand"),
        "TG": ("togo", "Togo"),
        "TT": ("trinidad and tobago", "Trinidad and Tobago"),
        "TN": ("tunisia", "Tunisia"),
        "TR": ("turkey", "Turkey"),
        "TM": ("turkmenistan", "Turkmenistan"),
        "UG": ("uganda", "Uganda"),
        "UA": ("ukraine", "Ukraine"),
        "AE": ("united arab emirates", "United Arab Emirates"),
        "GB": ("united kingdom", "United Kingdom"),
        "US": ("united states", "United States"),
        "UY": ("uruguay", "Uruguay"),
        "UZ": ("uzbekistan", "Uzbekistan"),
        "VE": ("venezuela", "Venezuela"),
        "VN": ("vietnam", "Vietnam"),
        "YE": ("yemen", "Yemen"),
        "ZM": ("zambia", "Zambia"),
        "ZW": ("zimbabwe", "Zimbabwe"),
    }

    @classmethod
    def _lookup(cls, isoCode: str, index: int) -> str:
        """
        Shared lookup for the per-service getters.

        Args:
            isoCode: ISO-2 country code (case-insensitive)
            index: Tuple index selecting the service-specific name

        Returns:
            The service-specific country name, or the original isoCode
            unchanged when the code is unknown (pass-through fallback).
        """
        mapping = cls._COUNTRY_MAP.get(isoCode.upper())
        return mapping[index] if mapping else isoCode

    @classmethod
    def getForTavily(cls, isoCode: str) -> str:
        """
        Get Tavily-compatible country name from ISO-2 code.

        Args:
            isoCode: ISO-2 country code (e.g., "CH", "US")

        Returns:
            Country name in lowercase as required by Tavily (e.g., "switzerland", "united states")
        """
        return cls._lookup(isoCode, cls._TAVILY_INDEX)

    @classmethod
    def getForPerplexity(cls, isoCode: str) -> str:
        """
        Get Perplexity-compatible country name from ISO-2 code.

        Args:
            isoCode: ISO-2 country code (e.g., "CH", "US")

        Returns:
            Full country name as required by Perplexity (e.g., "Switzerland", "United States")
        """
        return cls._lookup(isoCode, cls._PERPLEXITY_INDEX)

    @classmethod
    def isValid(cls, isoCode: str) -> bool:
        """
        Check if ISO-2 code is valid.

        Args:
            isoCode: ISO-2 country code to check

        Returns:
            True if valid, False otherwise
        """
        return isoCode.upper() in cls._COUNTRY_MAP

View file

@ -81,6 +81,9 @@ class Services:
from .serviceUtils.mainServiceUtils import UtilsService
self.utils = PublicService(UtilsService(self))
from .serviceWeb.mainServiceWeb import WebService
self.web = PublicService(WebService(self))
def getInterface(user: User, workflow: ChatWorkflow) -> Services:
    """Create a Services facade bound to the given user and chat workflow."""
    return Services(user, workflow)

View file

@ -3,7 +3,6 @@ from typing import Dict, Any, List, Optional, Union
from modules.datamodels.datamodelChat import PromptPlaceholder, ChatDocument
from modules.services.serviceExtraction.mainServiceExtraction import ExtractionService
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum
from modules.aicore.aicorePluginTavily import WebResearchRequest, WebResearchResult
from modules.interfaces.interfaceAiObjects import AiObjects
from modules.services.serviceAi.subCoreAi import SubCoreAi
from modules.services.serviceAi.subDocumentProcessing import SubDocumentProcessing

View file

@ -0,0 +1,314 @@
"""
Web crawl service for handling web research operations.
Manages the two-step process: WEB_SEARCH then WEB_CRAWL.
"""
import json
import logging
from typing import Dict, Any, List, Optional
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum, AiCallPromptWebSearch, AiCallPromptWebCrawl
logger = logging.getLogger(__name__)
class WebcrawlService:
    """Service for web search and crawling operations.

    Orchestrates the two-step web research flow: an AI intent analysis,
    an optional WEB_SEARCH to collect URLs, then a WEB_CRAWL per URL.
    """

    def __init__(self, services):
        """Initialize webcrawl service with service center access."""
        self.services = services

    async def performWebResearch(
        self,
        prompt: str,
        urls: List[str],
        country: Optional[str],
        language: Optional[str],
        researchDepth: str = "general",
        operationId: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Perform web research in two steps:
        1. Use AI to analyze prompt and extract parameters + URLs
        2. Call WEB_SEARCH to get URLs (if needed)
        3. Combine URLs and filter to maxNumberPages
        4. Call WEB_CRAWL for each URL
        5. Return consolidated result

        Args:
            prompt: Natural language research prompt
            urls: Optional list of URLs provided by user
            country: Optional country code
            language: Optional language code
            researchDepth: Depth hint (fast/general/deep); the AI analysis may override it
            operationId: Operation ID for progress tracking

        Returns:
            Consolidated research results as dictionary

        Raises:
            Re-raises any exception after logging it.
        """
        try:
            # Step 1: AI intention analysis - extract URLs and parameters from prompt
            self.services.workflow.progressLogUpdate(operationId, 0.1, "Analyzing research intent")
            analysisResult = await self._analyzeResearchIntent(prompt, urls, country, language, researchDepth)

            # Extract parameters from AI analysis. Use "or" fallbacks so that an
            # explicit JSON null (or empty string) from the model falls back to
            # the caller-provided value instead of propagating None — a null
            # researchDepth previously crashed on .lower() below.
            instruction = analysisResult.get("instruction") or prompt
            extractedUrls = analysisResult.get("urls") or []
            needsSearch = analysisResult.get("needsSearch", True)  # Default to True
            maxNumberPages = analysisResult.get("maxNumberPages") or 10
            timeRange = analysisResult.get("timeRange")
            countryCode = analysisResult.get("country") or country
            languageCode = analysisResult.get("language") or language
            finalResearchDepth = analysisResult.get("researchDepth") or researchDepth

            logger.info(f"AI Analysis: instruction='{instruction[:100]}...', urls={len(extractedUrls)}, needsSearch={needsSearch}, maxNumberPages={maxNumberPages}, researchDepth={finalResearchDepth}")

            # Combine URLs (from user + from prompt extraction)
            allUrls = []
            if urls:
                allUrls.extend(urls)
            if extractedUrls:
                allUrls.extend(extractedUrls)

            # Step 2: Search for URLs if needed (based on needsSearch flag).
            # The guard keeps maxNumberPages - len(allUrls) strictly positive.
            if needsSearch and (not allUrls or len(allUrls) < maxNumberPages):
                self.services.workflow.progressLogUpdate(operationId, 0.3, "Searching for URLs")
                searchUrls = await self._performWebSearch(
                    instruction=instruction,
                    maxNumberPages=maxNumberPages - len(allUrls),
                    timeRange=timeRange,
                    country=countryCode,
                    language=languageCode
                )
                # Add search URLs to the list
                allUrls.extend(searchUrls)
                self.services.workflow.progressLogUpdate(operationId, 0.5, f"Found {len(allUrls)} total URLs")

            # Step 3: Filter to maxNumberPages (simple cut, no intelligent filtering)
            if len(allUrls) > maxNumberPages:
                allUrls = allUrls[:maxNumberPages]
                logger.info(f"Limited URLs to {maxNumberPages}")

            if not allUrls:
                return {"error": "No URLs found to crawl"}

            # Step 4: Translate researchDepth to maxDepth (unknown values -> "general" depth 2)
            depthMap = {"fast": 1, "general": 2, "deep": 3}
            maxDepth = depthMap.get(str(finalResearchDepth).lower(), 2)

            # Step 5: Crawl all URLs
            self.services.workflow.progressLogUpdate(operationId, 0.6, f"Crawling {len(allUrls)} URLs")
            crawlResult = await self._performWebCrawl(
                instruction=instruction,
                urls=allUrls,
                maxDepth=maxDepth
            )

            self.services.workflow.progressLogUpdate(operationId, 0.9, "Consolidating results")

            # Return consolidated result
            return {
                "instruction": instruction,
                "urls_crawled": allUrls,
                "total_urls": len(allUrls),
                "results": crawlResult,
                "total_results": len(crawlResult) if isinstance(crawlResult, list) else 1
            }
        except Exception as e:
            logger.error(f"Error in web research: {str(e)}")
            raise

    async def _analyzeResearchIntent(
        self,
        prompt: str,
        urls: List[str],
        country: Optional[str],
        language: Optional[str],
        researchDepth: str = "general"
    ) -> Dict[str, Any]:
        """
        Use AI to analyze prompt and extract:
        - URLs from the prompt text
        - Research instruction
        - maxNumberPages, timeRange, country, language from context

        Returns a dict parsed from the model's JSON answer; on any failure a
        conservative fallback dict (search everything, defaults) is returned
        so research can proceed without the analysis.
        """
        # Build analysis prompt for AI
        analysisPrompt = f"""Analyze this web research request and extract structured information.

RESEARCH REQUEST:
{prompt}

USER PROVIDED:
- URLs: {json.dumps(urls) if urls else "None"}
- Country: {country or "Not specified"}
- Language: {language or "Not specified"}

Extract and provide a JSON response with:
1. instruction: The core research instruction (cleaned prompt without URLs)
2. urls: List of URLs found in the prompt text
3. needsSearch: true if web search is needed to identify url's to crawl, false if only crawling of provided URLs is wanted
4. maxNumberPages: Recommended number of URLs to crawl (based on research scope, typical: 2-20)
5. timeRange: Time range if mentioned (d, w, m, y, or null)
6. country: Country code if specified (2-digit lowercase, e.g., ch, us, de)
7. language: Language code if specified (lowercase, e.g., de, en, fr)
8. researchDepth: Research depth based on instruction complexity - "fast" (quick overview, maxDepth=1), "general" (standard research, maxDepth=2), or "deep" (comprehensive research, maxDepth=3)

Return ONLY valid JSON, no additional text:
{{
"instruction": "cleaned research instruction",
"urls": ["url1", "url2"],
"needsSearch": true,
"maxNumberPages": 10,
"timeRange": null,
"country": "ch",
"language": "de",
"researchDepth": "general"
}}"""
        try:
            # Call AI planning to analyze intent
            analysisJson = await self.services.ai.callAiPlanning(analysisPrompt)
            # Parse JSON response
            result = json.loads(analysisJson)
            logger.info(f"Intent analysis result: {result}")
            return result
        except Exception as e:
            logger.warning(f"Error in AI intent analysis: {str(e)}")
            # Fallback to basic extraction
            return {
                "instruction": prompt,
                "urls": [],
                "needsSearch": True,
                "maxNumberPages": 10,
                "timeRange": None,
                "country": country,
                "language": language,
                "researchDepth": researchDepth
            }

    async def _performWebSearch(
        self,
        instruction: str,
        maxNumberPages: int,
        timeRange: Optional[str],
        country: Optional[str],
        language: Optional[str]
    ) -> List[str]:
        """Perform web search to find URLs.

        Returns a (possibly empty) list of URL strings; errors are logged and
        swallowed so the research flow can continue with user-provided URLs.
        """
        try:
            # Build search prompt model
            searchPromptModel = AiCallPromptWebSearch(
                instruction=instruction,
                country=country,
                maxNumberPages=maxNumberPages,
                timeRange=timeRange,
                language=language
            )
            searchPrompt = searchPromptModel.model_dump_json(exclude_none=True, indent=2)

            # Call AI with WEB_SEARCH operation
            searchOptions = AiCallOptions(
                operationType=OperationTypeEnum.WEB_SEARCH,
                resultFormat="json"
            )
            searchResult = await self.services.ai.callAiDocuments(
                prompt=searchPrompt,
                documents=None,
                options=searchOptions,
                outputFormat="json"
            )

            # Parse and extract URLs
            if isinstance(searchResult, str):
                searchData = json.loads(searchResult)
            else:
                searchData = searchResult

            # Extract URLs from response - providers return either a flat
            # "urls" list, a "results" list of objects, or a bare list.
            urls = []
            if isinstance(searchData, dict):
                if "urls" in searchData:
                    urls = searchData["urls"]
                elif "results" in searchData:
                    urls = [r.get("url") for r in searchData["results"] if r.get("url")]
            elif isinstance(searchData, list):
                urls = [item.get("url") for item in searchData if item.get("url")]

            logger.info(f"Web search returned {len(urls)} URLs")
            return urls
        except Exception as e:
            logger.error(f"Error in web search: {str(e)}")
            return []

    async def _performWebCrawl(
        self,
        instruction: str,
        urls: List[str],
        maxDepth: int = 2
    ) -> List[Dict[str, Any]]:
        """Perform web crawl on list of URLs - calls plugin for each URL individually.

        Per-URL failures are recorded as {"url": ..., "error": ...} entries
        instead of aborting the whole crawl.
        """
        crawlResults = []

        # Loop over each URL and crawl one at a time
        for url in urls:
            try:
                logger.info(f"Crawling URL: {url}")

                # Build crawl prompt model for single URL
                crawlPromptModel = AiCallPromptWebCrawl(
                    instruction=instruction,
                    url=url,  # Single URL
                    maxDepth=maxDepth,
                    maxWidth=10
                )
                crawlPrompt = crawlPromptModel.model_dump_json(exclude_none=True, indent=2)

                # Call AI with WEB_CRAWL operation
                crawlOptions = AiCallOptions(
                    operationType=OperationTypeEnum.WEB_CRAWL,
                    resultFormat="json"
                )
                crawlResult = await self.services.ai.callAiDocuments(
                    prompt=crawlPrompt,
                    documents=None,
                    options=crawlOptions,
                    outputFormat="json"
                )

                # Parse crawl result; non-JSON text is kept as raw content.
                # (Narrowed from a bare `except:` which also swallowed
                # KeyboardInterrupt/SystemExit.)
                if isinstance(crawlResult, str):
                    try:
                        crawlData = json.loads(crawlResult)
                    except json.JSONDecodeError:
                        crawlData = {"url": url, "content": crawlResult}
                else:
                    crawlData = crawlResult

                # Ensure it's a list of results
                if isinstance(crawlData, list):
                    crawlResults.extend(crawlData)
                elif isinstance(crawlData, dict):
                    if "results" in crawlData:
                        crawlResults.extend(crawlData["results"])
                    else:
                        crawlResults.append(crawlData)
                else:
                    crawlResults.append({"url": url, "content": str(crawlData)})
            except Exception as e:
                logger.error(f"Error crawling URL {url}: {str(e)}")
                crawlResults.append({"url": url, "error": str(e)})

        return crawlResults

View file

@ -10,9 +10,7 @@ from datetime import datetime, UTC
from modules.workflows.methods.methodBase import MethodBase, action
from modules.datamodels.datamodelChat import ActionResult
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum
from modules.datamodels.datamodelChat import ChatDocument
from modules.aicore.aicorePluginTavily import WebResearchRequest
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum, AiCallPromptImage
logger = logging.getLogger(__name__)
@ -163,272 +161,50 @@ class MethodAi(MethodBase):
)
@action
async def webSearch(self, parameters: Dict[str, Any]) -> ActionResult:
"""
GENERAL:
- Purpose: Search the web and return a list of relevant URLs only.
- Input requirements: searchPrompt (required); optional maxResults, timeRange, country, language.
- Output format: JSON with search results and URLs.
Parameters:
- searchPrompt (str, required): Natural language search prompt describing what to search for.
- maxResults (int, optional): Maximum number of search results. Default: 5.
- timeRange (str, optional): d | w | m | y for time filtering.
- country (str, optional): Country name for localized results.
- language (str, optional): Language code (e.g., de, en, fr).
"""
try:
searchPrompt = parameters.get("searchPrompt")
if not searchPrompt:
return ActionResult.isFailure(error="Search prompt is required")
# Extract optional parameters
maxResults = parameters.get("maxResults", 5)
timeRange = parameters.get("timeRange")
country = parameters.get("country")
language = parameters.get("language")
# Build AI call options for web search
options = AiCallOptions(
operationType=OperationTypeEnum.WEB_SEARCH,
resultFormat="json"
)
# Create unified prompt JSON that both Tavily and Perplexity can understand
promptData = {
"searchPrompt": searchPrompt,
"maxResults": maxResults,
"timeRange": timeRange,
"country": country,
"language": language,
"instructions": "Search the web and return a JSON response with a 'results' array containing objects with 'title', 'url', and optionally 'content' fields. Focus on finding relevant URLs for the search prompt."
}
import json
prompt = json.dumps(promptData, indent=2)
# Call AI service through unified path
result = await self.services.ai.callAiDocuments(
prompt=prompt,
documents=None,
options=options,
outputFormat="json"
)
# Process result to ensure consistent format
processedResult = self._processWebSearchResult(result)
# Create meaningful filename
meaningfulName = self._generateMeaningfulFileName(
base_name="web_search",
extension="json",
action_name="search"
)
from modules.datamodels.datamodelChat import ActionDocument
actionDocument = ActionDocument(
documentName=meaningfulName,
documentData=processedResult,
mimeType="application/json"
)
return ActionResult.isSuccess(documents=[actionDocument])
except Exception as e:
logger.error(f"Error in web search: {str(e)}")
return ActionResult.isFailure(error=str(e))
def _processWebSearchResult(self, result: str) -> str:
"""
Process web search result to ensure consistent JSON format with URL list.
Both Tavily and Perplexity now return proper JSON format.
"""
try:
import json
data = json.loads(result)
# If it's already a proper search result format, return as-is
if isinstance(data, dict) and "results" in data:
return result
# If it's a different JSON format, try to extract URLs
if isinstance(data, dict):
# Look for URL patterns in the JSON
urls = self._extractUrlsFromJson(data)
if urls:
processedData = {
"query": data.get("query", "web search"),
"results": [{"title": f"Result {i+1}", "url": url} for i, url in enumerate(urls)],
"total_count": len(urls)
}
return json.dumps(processedData, indent=2)
# No URLs found, return original result in a structured format
processedData = {
"query": "web search",
"results": [],
"total_count": 0,
"raw_response": result
}
return json.dumps(processedData, indent=2)
except Exception as e:
logger.warning(f"Error processing web search result: {str(e)}")
# Return original result wrapped in error format
errorData = {
"query": "web search",
"results": [],
"total_count": 0,
"error": f"Failed to process result: {str(e)}",
"raw_response": result
}
return json.dumps(errorData, indent=2)
def _extractUrlsFromJson(self, data: Dict[str, Any]) -> List[str]:
"""Extract URLs from JSON data structure."""
urls = []
def _extractFromValue(value):
if isinstance(value, str):
# Check if it's a URL
if value.startswith(('http://', 'https://')):
urls.append(value)
elif isinstance(value, dict):
for v in value.values():
_extractFromValue(v)
elif isinstance(value, list):
for item in value:
_extractFromValue(item)
_extractFromValue(data)
return list(set(urls)) # Remove duplicates
@action
async def webCrawl(self, parameters: Dict[str, Any]) -> ActionResult:
"""
GENERAL:
- Purpose: Extract content from specific URLs.
- Input requirements: urls (required); optional extractDepth, format.
- Output format: JSON with extracted content from URLs.
Parameters:
- urls (list, required): List of URLs to crawl and extract content from.
- extractDepth (str, optional): basic | advanced. Default: advanced.
- format (str, optional): markdown | html | text. Default: markdown.
"""
try:
urls = parameters.get("urls")
if not urls or not isinstance(urls, list):
return ActionResult.isFailure(error="URLs list is required")
# Extract optional parameters
extractDepth = parameters.get("extractDepth", "advanced")
formatType = parameters.get("format", "markdown")
# Build AI call options for web crawling
options = AiCallOptions(
operationType=OperationTypeEnum.WEB_CRAWL,
resultFormat="json"
)
# Create unified prompt JSON for web crawling
promptData = {
"urls": urls,
"extractDepth": extractDepth,
"format": formatType,
"instructions": "Extract content from the provided URLs and return a JSON response with 'results' array containing objects with 'url', 'title', 'content', and 'extractedAt' fields."
}
import json
prompt = json.dumps(promptData, indent=2)
# Call AI service through unified path
result = await self.services.ai.callAiDocuments(
prompt=prompt,
documents=None,
options=options,
outputFormat="json"
)
# Create meaningful filename
meaningfulName = self._generateMeaningfulFileName(
base_name="web_crawl",
extension="json",
action_name="crawl"
)
from modules.datamodels.datamodelChat import ActionDocument
actionDocument = ActionDocument(
documentName=meaningfulName,
documentData=result,
mimeType="application/json"
)
return ActionResult.isSuccess(documents=[actionDocument])
except Exception as e:
logger.error(f"Error in web crawl: {str(e)}")
return ActionResult.isFailure(error=str(e))
@action
async def webResearch(self, parameters: Dict[str, Any]) -> ActionResult:
"""
GENERAL:
- Purpose: Comprehensive web research combining search and content extraction.
- Input requirements: researchPrompt (required); optional maxResults, urls, timeRange, country, language.
- Output format: JSON with research results, sources, and analysis.
- Purpose: Web research with two-step process: search for URLs, then crawl content.
- Input requirements: prompt (required); optional list(url), country, language, researchDepth.
- Output format: JSON with research results including URLs and content.
Parameters:
- researchPrompt (str, required): Natural language research prompt describing what to research.
- maxResults (int, optional): Maximum search results. Default: 5.
- urls (list, optional): Specific URLs to include in research.
- timeRange (str, optional): d | w | m | y for time filtering.
- country (str, optional): Country name for localized results.
- language (str, optional): Language code (e.g., de, en, fr).
- prompt (str, required): Natural language research instruction, including time range if relevant.
- list(url) (list, optional): Specific URLs to crawl, if needed.
- country (str, optional): Two-digit country code (lowercase, e.g., ch, us, de).
- language (str, optional): Language code (lowercase, e.g., de, en, fr).
- researchDepth (str, optional): Research depth - fast, general, or deep. Default: general.
"""
try:
researchPrompt = parameters.get("researchPrompt")
if not researchPrompt:
prompt = parameters.get("prompt")
if not prompt:
return ActionResult.isFailure(error="Research prompt is required")
# Extract optional parameters
maxResults = parameters.get("maxResults", 5)
urls = parameters.get("urls")
timeRange = parameters.get("timeRange")
country = parameters.get("country")
language = parameters.get("language")
# Init progress logger
operationId = f"web_research_{self.services.currentWorkflow.id}_{int(time.time())}"
# Build AI call options for web research
options = AiCallOptions(
operationType=OperationTypeEnum.WEB_RESEARCH,
resultFormat="json"
# Start progress tracking
self.services.workflow.progressLogStart(
operationId,
"Web Research",
"Searching and Crawling",
"Extracting URLs and Content"
)
# Create unified prompt JSON for web research
promptData = {
"researchPrompt": researchPrompt,
"maxResults": maxResults,
"urls": urls,
"timeRange": timeRange,
"country": country,
"language": language,
"instructions": "Conduct comprehensive web research and return a JSON response with 'results' array containing objects with 'title', 'url', 'content', and 'analysis' fields. Provide detailed analysis and insights."
}
import json
prompt = json.dumps(promptData, indent=2)
# Call AI service through unified path
result = await self.services.ai.callAiDocuments(
# Call webcrawl service - service handles all AI intention analysis and processing
result = await self.services.web.performWebResearch(
prompt=prompt,
documents=None,
options=options,
outputFormat="json"
urls=parameters.get("list(url)", []),
country=parameters.get("country"),
language=parameters.get("language"),
researchDepth=parameters.get("researchDepth", "general"),
operationId=operationId
)
# Complete progress tracking
self.services.workflow.progressLogFinish(operationId, True)
# Create meaningful filename
meaningfulName = self._generateMeaningfulFileName(
base_name="web_research",
@ -447,157 +223,10 @@ class MethodAi(MethodBase):
except Exception as e:
logger.error(f"Error in web research: {str(e)}")
return ActionResult.isFailure(error=str(e))
@action
async def webQuestions(self, parameters: Dict[str, Any]) -> ActionResult:
    """
    GENERAL:
    - Purpose: Answer questions using web research and AI analysis.
    - Input requirements: question (required); optional context, maxResults, timeRange, country, language.
    - Output format: JSON with question answer and supporting sources.

    Parameters:
    - question (str, required): Question to be answered using web research.
    - context (str, optional): Additional context for the question.
    - maxResults (int, optional): Maximum search results. Default: 5.
    - timeRange (str, optional): d | w | m | y for time filtering.
    - country (str, optional): Country name for localized results.
    - language (str, optional): Language code (e.g., de, en, fr).

    Returns:
        ActionResult: success carrying one JSON ActionDocument, or a
        failure holding the error message.
    """
    try:
        import json

        if not parameters.get("question"):
            return ActionResult.isFailure(error="Question is required")

        # Structured request for the web-capable model; optional fields
        # that the caller omitted are forwarded as None / "" unchanged.
        payload = {
            "question": parameters.get("question"),
            "context": parameters.get("context", ""),
            "maxResults": parameters.get("maxResults", 5),
            "timeRange": parameters.get("timeRange"),
            "country": parameters.get("country"),
            "language": parameters.get("language"),
            "instructions": "Answer the question using web research and return a JSON response with 'answer', 'sources' array containing objects with 'title', 'url', 'content', and 'relevance' fields."
        }

        callOptions = AiCallOptions(
            operationType=OperationTypeEnum.WEB_QUESTIONS,
            resultFormat="json"
        )

        # Single unified AI entry point; the prompt is the JSON payload.
        aiOutput = await self.services.ai.callAiDocuments(
            prompt=json.dumps(payload, indent=2),
            documents=None,
            options=callOptions,
            outputFormat="json"
        )

        from modules.datamodels.datamodelChat import ActionDocument
        resultDocument = ActionDocument(
            documentName=self._generateMeaningfulFileName(
                base_name="web_questions",
                extension="json",
                action_name="questions"
            ),
            documentData=aiOutput,
            mimeType="application/json"
        )
        return ActionResult.isSuccess(documents=[resultDocument])
    except Exception as e:
        logger.error(f"Error in web questions: {str(e)}")
        return ActionResult.isFailure(error=str(e))
@action
async def webNews(self, parameters: Dict[str, Any]) -> ActionResult:
    """
    GENERAL:
    - Purpose: Search and analyze news articles on specific topics.
    - Input requirements: newsPrompt (required); optional maxResults, timeRange, country, language.
    - Output format: JSON with news articles, summaries, and analysis.

    Parameters:
    - newsPrompt (str, required): Natural language prompt describing what news to search for.
    - maxResults (int, optional): Maximum news articles. Default: 5.
    - timeRange (str, optional): d | w | m | y for time filtering. Default: w.
    - country (str, optional): Country name for localized news.
    - language (str, optional): Language code (e.g., de, en, fr).

    Returns:
        ActionResult: success carrying one JSON ActionDocument, or a
        failure holding the error message.
    """
    try:
        newsPrompt = parameters.get("newsPrompt")
        if not newsPrompt:
            return ActionResult.isFailure(error="News prompt is required")

        # Optional parameters; news searches default to the last week.
        maxResults = parameters.get("maxResults", 5)
        timeRange = parameters.get("timeRange", "w")
        country = parameters.get("country")
        language = parameters.get("language")

        # NOTE(review): OperationTypeEnum.WEB_NEWS must still be declared on
        # the enum; the refactored enum only lists WEB_SEARCH / WEB_CRAWL —
        # confirm before relying on this action.
        options = AiCallOptions(
            operationType=OperationTypeEnum.WEB_NEWS,
            resultFormat="json"
        )

        # Unified prompt JSON consumed by the web-capable model.
        promptData = {
            "newsPrompt": newsPrompt,
            "maxResults": maxResults,
            "timeRange": timeRange,
            "country": country,
            "language": language,
            "instructions": "Find and analyze recent news articles and return a JSON response with 'articles' array containing objects with 'title', 'url', 'content', 'date', 'source', and 'summary' fields."
        }
        import json
        prompt = json.dumps(promptData, indent=2)

        # Call AI service through the unified document path.
        result = await self.services.ai.callAiDocuments(
            prompt=prompt,
            documents=None,
            options=options,
            outputFormat="json"
        )

        meaningfulName = self._generateMeaningfulFileName(
            base_name="web_news",
            extension="json",
            action_name="news"
        )
        from modules.datamodels.datamodelChat import ActionDocument
        actionDocument = ActionDocument(
            documentName=meaningfulName,
            documentData=result,
            mimeType="application/json"
        )
        return ActionResult.isSuccess(documents=[actionDocument])
    except Exception as e:
        logger.error(f"Error in web news: {str(e)}")
        # BUG FIX: the previous handler invoked
        # self.services.workflow.progressLogFinish(operationId, False) here,
        # but `operationId` is never defined in this method. The resulting
        # NameError was swallowed by a bare `except: pass`, so the cleanup
        # was a silent no-op; the dead call has been removed.
        return ActionResult.isFailure(error=str(e))
@@ -631,17 +260,16 @@ class MethodAi(MethodBase):
resultFormat="base64"
)
# Create unified prompt JSON for image generation
promptData = {
"prompt": prompt,
"size": size,
"quality": quality,
"style": style,
"instructions": "Generate an image based on the prompt and return the base64 encoded image data."
}
# Create structured prompt using Pydantic model
promptModel = AiCallPromptImage(
prompt=prompt,
size=size,
quality=quality,
style=style
)
import json
promptJson = json.dumps(promptData, indent=2)
# Convert to JSON string for prompt
promptJson = promptModel.model_dump_json(exclude_none=True, indent=2)
# Call AI service through unified path
result = await self.services.ai.callAiDocuments(

View file

@@ -91,26 +91,18 @@ class AIModelsTester:
print(f"TESTING MODEL: {modelName}")
print(f"{'='*60}")
# Choose test prompt based on model type - Web models get JSON formatted prompts
# Use same prompt for all web models
import json
if "tavily" in modelName.lower():
# Tavily models get web search prompt in JSON format (from methodAi.py)
if "tavily" in modelName.lower() or "perplexity" in modelName.lower() or "llama" in modelName.lower() or "sonar" in modelName.lower() or "mistral" in modelName.lower():
# All web models use the same JSON formatted prompt
# Country format: Use full name for Tavily (Switzerland), Perplexity converts ISO codes to names
testPrompt = json.dumps({
"searchPrompt": "Search for recent news about artificial intelligence developments in 2024. Return the top 3 results as JSON with fields: title, url, snippet.",
"maxResults": 3,
"timeRange": "y",
"country": "United States",
"instructions": "Search the web and return a JSON response with a 'results' array containing objects with 'title', 'url', and optionally 'content' fields. Focus on finding relevant URLs for the search prompt."
}, indent=2)
elif "perplexity" in modelName.lower() or "llama" in modelName.lower() or "sonar" in modelName.lower() or "mistral" in modelName.lower():
# Perplexity models get web research prompt in JSON format (from methodAi.py)
testPrompt = json.dumps({
"researchPrompt": "Research the latest trends in renewable energy technology. Provide a comprehensive overview with key developments, companies involved, and future prospects. Return as JSON.",
"prompt": "Research, what ValueOn company in switzerland does and who works there? Return as JSON.",
"maxResults": 5,
"timeRange": "y",
"country": "United States",
"instructions": "Conduct comprehensive web research and return a JSON response with 'results' array containing objects with 'title', 'url', 'content', and 'analysis' fields. Provide detailed analysis and insights."
"country": "CH", # ISO-2 code, Perplexity will convert to "Switzerland"
"format": "json"
}, indent=2)
else:
# Fallback for other models
@@ -444,9 +436,7 @@ Is Valid JSON: {result.get('isValidJson', False)}
# "dall-e-3", # Skipped - image generation, test later
"sonar", # Perplexity web model
"sonar-pro", # Perplexity web model
"tavily-search", # Tavily web model
"tavily-extract", # Tavily web model
"tavily-search-extract", # Tavily web model
"tavily-search", # Tavily web model (unified research)
# "internal-extractor", # Skipped - internal model, test later
# "internal-generator", # Skipped - internal model, test later
# "internal-renderer" # Skipped - internal model, test later