From 2489719c629f3ec9196b0079a08167a22346e37b Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Sun, 26 Oct 2025 18:17:17 +0100
Subject: [PATCH] ai models ready for web and txt
---
modules/aicore/aicorePluginPerplexity.py | 126 ++++--
modules/aicore/aicorePluginTavily.py | 233 +++++-----
modules/datamodels/datamodelAi.py | 1 -
modules/datamodels/datamodelTools.py | 5 +-
modules/services/serviceWeb/mainServiceWeb.py | 17 +-
modules/workflows/methods/methodAi.py | 2 +-
test_ai_models.py | 425 ++++++++++++++----
7 files changed, 577 insertions(+), 232 deletions(-)
diff --git a/modules/aicore/aicorePluginPerplexity.py b/modules/aicore/aicorePluginPerplexity.py
index 23bffc48..51b585b9 100644
--- a/modules/aicore/aicorePluginPerplexity.py
+++ b/modules/aicore/aicorePluginPerplexity.py
@@ -57,7 +57,7 @@ class AiPerplexity(BaseConnectorAi):
connectorType="perplexity",
apiUrl="https://api.perplexity.ai/chat/completions",
temperature=0.2,
- maxTokens=4000,
+ maxTokens=24000, # Increased for detailed web crawl responses (Perplexity supports up to 25k)
contextLength=32000,
costPer1kTokensInput=0.005,
costPer1kTokensOutput=0.005,
@@ -80,12 +80,12 @@ class AiPerplexity(BaseConnectorAi):
connectorType="perplexity",
apiUrl="https://api.perplexity.ai/chat/completions",
temperature=0.2,
- maxTokens=4000,
+ maxTokens=24000, # Increased for detailed web crawl responses (Perplexity supports up to 25k)
contextLength=32000,
costPer1kTokensInput=0.01,
costPer1kTokensOutput=0.01,
speedRating=6, # Slower due to AI analysis
- qualityRating=10, # Best AI analysis quality
+ qualityRating=9, # High AI analysis quality (lowered from 10)
# capabilities removed (not used in business logic)
functionCall=self._routeWebOperation,
priority=PriorityEnum.QUALITY,
@@ -217,6 +217,42 @@ class AiPerplexity(BaseConnectorAi):
# Fallback to basic call
return await self.callAiBasic(modelCall)
+ def _getDepthInstructions(self, maxDepth: int) -> str:
+ """
+ Map maxDepth (numeric) to instructional text for LLM.
+
+ Args:
+ maxDepth: 1 (fast/overview), 2 (general/standard), 3 (deep/comprehensive)
+
+ Returns:
+ Instructional text for the LLM
+ """
+ depthMap = {
+ 1: "Basic overview - extract main content from the main page only",
+ 2: "Standard crawl - extract content from main page and linked pages (2 levels deep)",
+ 3: "Deep crawl - comprehensively extract content from main page and all accessible linked pages (3+ levels deep)"
+ }
+ return depthMap.get(maxDepth, depthMap[2])
+
+ def _getWidthInstructions(self, maxWidth: int) -> str:
+ """
+ Map maxWidth (numeric) to instructional text for LLM.
+
+ Args:
+ maxWidth: Number of pages to crawl at each level (default: 10)
+
+ Returns:
+ Instructional text for the LLM
+ """
+ if maxWidth <= 5:
+ return f"Focused crawl - limit to {maxWidth} most relevant pages per level"
+ elif maxWidth <= 15:
+ return f"Standard breadth - crawl up to {maxWidth} pages per level"
+ elif maxWidth <= 30:
+ return f"Wide crawl - crawl up to {maxWidth} pages per level, prioritize quality"
+ else:
+ return f"Extensive crawl - crawl up to {maxWidth} pages per level, comprehensive coverage"
+
async def webSearch(self, modelCall: AiModelCall) -> AiModelResponse:
"""
WEB_SEARCH operation - returns list of URLs based on search query.
@@ -253,8 +289,6 @@ class AiPerplexity(BaseConnectorAi):
Return a JSON array of {webSearchPrompt.maxNumberPages} most relevant URLs.
{'' if not countryName else f'Focus on results from {countryName}.'}
-{'' if not webSearchPrompt.timeRange else f'Limit to results from the last {webSearchPrompt.timeRange}'}
-{'' if not webSearchPrompt.language else f'Return results in {webSearchPrompt.language} language'}
Return ONLY a JSON array of URLs, no additional text:
[
@@ -293,6 +327,15 @@ Return ONLY a JSON array of URLs, no additional text:
"""
WEB_CRAWL operation - crawls ONE URL and returns content.
+ Perplexity API Parameters Used:
+ - messages: The prompt containing URL and instruction
+ - max_tokens: Maximum response length
+ - max_results: Number of search results (1-20, default: 10)
+ - temperature: Response randomness (not web search specific)
+
+ Pagination: Perplexity does NOT return paginated responses.
+ A single response contains all results within max_tokens limit.
+
Args:
modelCall: AiModelCall with AiCallPromptWebCrawl as prompt
@@ -316,55 +359,68 @@ Return ONLY a JSON array of URLs, no additional text:
webCrawlPrompt = AiCallPromptWebCrawl(**promptData)
# Build crawl request for Perplexity - ONE URL
- crawlPrompt = f"""Crawl and extract content from this URL based on the instruction:
-
-INSTRUCTION: '{webCrawlPrompt.instruction}'
-
-URL to crawl (maxDepth={webCrawlPrompt.maxDepth}):
-{webCrawlPrompt.url}
-
-Extract and return the relevant content based on the instruction.
-Return as JSON object with this structure:
-{{
- "url": "{webCrawlPrompt.url}",
- "title": "Page title",
- "content": "Extracted content relevant to the instruction"
-}}
-
-Return ONLY valid JSON, no additional text."""
+ # Match playground prompt style: just URL + question
+ # This allows Perplexity to return detailed multi-source results
+ crawlPrompt = f"{webCrawlPrompt.url}: {webCrawlPrompt.instruction}"
+
+ # Build payload with optional Perplexity parameters
+ # Note: max_tokens_per_page may not be supported by chat/completions endpoint
+ # The playground Python SDK might use a different internal API
+ maxResults = min(webCrawlPrompt.maxWidth or 10, 20) # Max 20 results
payload = {
"model": model.name,
"messages": [{"role": "user", "content": crawlPrompt}],
"temperature": temperature,
- "max_tokens": maxTokens
+ "max_tokens": maxTokens, # Use model's configured maxTokens (24000)
+ "max_results": maxResults,
+ "return_citations": True # Request citations explicitly
}
+ logger.info(f"Perplexity crawl payload: model={model.name}, prompt_length={len(crawlPrompt)}, max_tokens={maxTokens}, max_results={maxResults}")
+
response = await self.httpClient.post(model.apiUrl, json=payload)
if response.status_code != 200:
raise HTTPException(status_code=500, detail=f"Perplexity Web Crawl API error: {response.text}")
apiResponse = response.json()
+
+ # Extract the main content
content = apiResponse["choices"][0]["message"]["content"]
- # Parse JSON content and ensure it's a single object
- import json
- try:
- parsedContent = json.loads(content)
- # Ensure it's a single object, not an array
- if isinstance(parsedContent, list):
- parsedContent = parsedContent[0] if parsedContent else {}
- except:
- # If not JSON, create structured response
- parsedContent = {"url": webCrawlPrompt.url, "title": "", "content": content}
+ # Check for citations or search results in the response
+ citations = apiResponse.get("citations", [])
+ searchResults = apiResponse.get("search_results", [])
- # Return as JSON string
+ # Log what we found
+ if citations:
+ logger.info(f"Found {len(citations)} citations in response")
+ if searchResults:
+ logger.info(f"Found {len(searchResults)} search results in response")
+ logger.debug(f"API response keys: {list(apiResponse.keys())}")
+
+ # Build comprehensive response with citations if available
+ import json
+ responseData = {
+ "content": content,
+ "citations": citations if citations else [],
+ "search_results": searchResults if searchResults else []
+ }
+
+ # Return comprehensive response
return AiModelResponse(
- content=json.dumps(parsedContent, indent=2),
+ content=json.dumps(responseData, indent=2) if (citations or searchResults) else content,
success=True,
modelId=model.name,
- metadata={"response_id": apiResponse.get("id", ""), "operation": "WEB_CRAWL", "url": webCrawlPrompt.url}
+ metadata={
+ "response_id": apiResponse.get("id", ""),
+ "operation": "WEB_CRAWL",
+ "url": webCrawlPrompt.url,
+ "actualPromptSent": crawlPrompt,
+ "has_citations": len(citations) > 0,
+ "has_search_results": len(searchResults) > 0
+ }
)
except Exception as e:
diff --git a/modules/aicore/aicorePluginTavily.py b/modules/aicore/aicorePluginTavily.py
index 9320bba7..fb454f1f 100644
--- a/modules/aicore/aicorePluginTavily.py
+++ b/modules/aicore/aicorePluginTavily.py
@@ -27,7 +27,8 @@ class WebCrawlResult:
content: str
title: Optional[str] = None
-class ConnectorWeb(BaseConnectorAi):
+
+class AiTavily(BaseConnectorAi):
"""Tavily web search connector."""
def __init__(self):
@@ -42,7 +43,36 @@ class ConnectorWeb(BaseConnectorAi):
self.webSearchMaxResults: int = 20
# Initialize client if API key is available
self._initializeClient()
-
+
+
+ def getModels(self) -> List[AiModel]:
+ """Get all available Tavily models."""
+ return [
+ AiModel(
+ name="tavily-search",
+ displayName="Tavily Search & Research",
+ connectorType="tavily",
+ apiUrl="https://api.tavily.com",
+ temperature=0.0, # Web search doesn't use temperature
+ maxTokens=0, # Web search doesn't use tokens
+ contextLength=0,
+ costPer1kTokensInput=0.0,
+ costPer1kTokensOutput=0.0,
+ speedRating=8, # Good speed for search and extract
+ qualityRating=9, # Excellent quality for web research
+ # capabilities removed (not used in business logic)
+ functionCall=self._routeWebOperation,
+ priority=PriorityEnum.BALANCED,
+ processingMode=ProcessingModeEnum.BASIC,
+ operationTypes=createOperationTypeRatings(
+ (OperationTypeEnum.WEB_SEARCH, 9),
+ (OperationTypeEnum.WEB_CRAWL, 10)
+ ),
+ version="tavily-search",
+ calculatePriceUsd=lambda processingTime, bytesSent, bytesReceived: 0.008 # Simple flat rate
+ )
+ ]
+
def _initializeClient(self):
"""Initialize the Tavily client if API key is available."""
try:
@@ -206,34 +236,6 @@ class ConnectorWeb(BaseConnectorAi):
return filteredResults
- def getModels(self) -> List[AiModel]:
- """Get all available Tavily models."""
- return [
- AiModel(
- name="tavily-search",
- displayName="Tavily Search & Research",
- connectorType="tavily",
- apiUrl="https://api.tavily.com",
- temperature=0.0, # Web search doesn't use temperature
- maxTokens=0, # Web search doesn't use tokens
- contextLength=0,
- costPer1kTokensInput=0.0,
- costPer1kTokensOutput=0.0,
- speedRating=8, # Good speed for search and extract
- qualityRating=9, # Excellent quality for web research
- # capabilities removed (not used in business logic)
- functionCall=self._routeWebOperation,
- priority=PriorityEnum.BALANCED,
- processingMode=ProcessingModeEnum.BASIC,
- operationTypes=createOperationTypeRatings(
- (OperationTypeEnum.WEB_SEARCH, 9),
- (OperationTypeEnum.WEB_CRAWL, 8)
- ),
- version="tavily-search",
- calculatePriceUsd=lambda processingTime, bytesSent, bytesReceived: 0.008 # Simple flat rate
- )
- ]
-
@classmethod
async def create(cls):
apiKey = APP_CONFIG.get("Connector_AiTavily_API_SECRET")
@@ -273,10 +275,9 @@ class ConnectorWeb(BaseConnectorAi):
topic: str | None = None,
includeDomains: list[str] | None = None,
excludeDomains: list[str] | None = None,
- language: str | None = None,
country: str | None = None,
- includeAnswer: bool | None = None,
- includeRawContent: bool | None = None,
+ includeAnswer: str | None = None,
+ includeRawContent: str | None = None,
) -> list[WebSearchResult]:
"""Calls the Tavily API to perform a web search."""
# Make sure maxResults is within the allowed range (use cached values)
@@ -298,8 +299,6 @@ class ConnectorWeb(BaseConnectorAi):
kwargs["include_domains"] = includeDomains
if excludeDomains is not None:
kwargs["exclude_domains"] = excludeDomains
- if language is not None:
- kwargs["language"] = language
if country is not None:
kwargs["country"] = country
if includeAnswer is not None:
@@ -307,7 +306,8 @@ class ConnectorWeb(BaseConnectorAi):
if includeRawContent is not None:
kwargs["include_raw_content"] = includeRawContent
- logger.debug(f"Tavily.search kwargs: {kwargs}")
+ # Log the final API call parameters for comparison
+ logger.info(f"Tavily API call parameters: {kwargs}")
# Ensure client is initialized
if self.client is None:
@@ -316,7 +316,11 @@ class ConnectorWeb(BaseConnectorAi):
raise ValueError("Tavily client not initialized. Please check API key configuration.")
response = await self.client.search(**kwargs)
-
+
+ # Return all results without score filtering
+ # Tavily's scoring is already applied by the API
+ logger.info(f"Tavily returned {len(response.get('results', []))} results")
+
return [
WebSearchResult(
title=result["title"],
@@ -328,69 +332,77 @@ class ConnectorWeb(BaseConnectorAi):
async def _crawl(
self,
- urls: list,
- extractDepth: str | None = None,
- format: str | None = None,
+ url: str,
+ instructions: str | None = None,
+ limit: int = 20,
+ maxDepth: int = 2,
+ maxBreadth: int = 40,
) -> list[WebCrawlResult]:
- """Calls the Tavily API to extract text content from URLs with retry logic."""
+ """Calls the Tavily API to crawl ONE URL with link following and retry logic."""
maxRetries = self.crawlMaxRetries
retryDelay = self.crawlRetryDelay
timeout = self.crawlTimeout
- logger.debug(f"Starting crawl of {len(urls)} URLs: {urls}")
- logger.debug(f"Crawl settings: extractDepth={extractDepth}, format={format}, timeout={timeout}s")
+ logger.debug(f"Starting crawl of URL: {url}")
+ logger.debug(f"Crawl settings: instructions={instructions}, limit={limit}, maxDepth={maxDepth}, maxBreadth={maxBreadth}, timeout={timeout}s")
for attempt in range(maxRetries + 1):
try:
logger.debug(f"Crawl attempt {attempt + 1}/{maxRetries + 1}")
- # Use asyncio.wait_for for timeout
- # Build kwargs for extract
- kwargsExtract: dict = {"urls": urls}
- kwargsExtract["extract_depth"] = extractDepth or "advanced"
- kwargsExtract["format"] = format or "markdown" # Use markdown to get HTML structure
-
- logger.debug(f"Sending request to Tavily with kwargs: {kwargsExtract}")
-
# Ensure client is initialized
if self.client is None:
self._initializeClient()
if self.client is None:
raise ValueError("Tavily client not initialized. Please check API key configuration.")
+ logger.debug(f"Crawling URL: {url}")
+
+ # Build kwargs for crawl
+ kwargsCrawl: dict = {"url": url}
+ if instructions:
+ kwargsCrawl["instructions"] = instructions
+ if limit:
+ kwargsCrawl["limit"] = limit
+ if maxDepth:
+ kwargsCrawl["max_depth"] = maxDepth
+ if maxBreadth:
+ kwargsCrawl["max_breadth"] = maxBreadth
+
+ logger.debug(f"Sending request to Tavily with kwargs: {kwargsCrawl}")
+
response = await asyncio.wait_for(
- self.client.extract(**kwargsExtract),
+ self.client.crawl(**kwargsCrawl),
timeout=timeout
)
-
- logger.debug(f"Tavily response received: {list(response.keys())}")
- # Debug: Log what Tavily actually returns
- if "results" in response and response["results"]:
- logger.debug(f"Tavily returned {len(response['results'])} results")
- logger.debug(f"First result keys: {list(response['results'][0].keys())}")
- logger.debug(f"First result has raw_content: {'raw_content' in response['results'][0]}")
-
- # Log each result
- for i, result in enumerate(response["results"]):
- logger.debug(f"Result {i+1}: URL={result.get('url', 'N/A')}, content_length={len(result.get('raw_content', result.get('content', '')))}")
+ logger.debug(f"Tavily response received: {type(response)}")
+
+ # Parse response - could be dict with results or list
+ if isinstance(response, dict) and "results" in response:
+ pageResults = response["results"]
+ elif isinstance(response, list):
+ pageResults = response
else:
- logger.warning(f"Tavily returned no results in response: {response}")
+ logger.warning(f"Unexpected response format: {type(response)}")
+ pageResults = []
- results = [
- WebCrawlResult(
- url=result["url"],
- content=result.get("raw_content", result.get("content", "")), # Try raw_content first, fallback to content
- title=result.get("title", "") # Extract title if available
- )
- for result in response["results"]
- ]
+ logger.debug(f"Got {len(pageResults)} pages from crawl")
- logger.debug(f"Crawl successful: extracted {len(results)} results")
+ # Convert to WebCrawlResult format
+ results = []
+ for result in pageResults:
+ results.append(WebCrawlResult(
+ url=result.get("url", url),
+ content=result.get("raw_content", result.get("content", "")),
+ title=result.get("title", "")
+ ))
+
+ logger.debug(f"Crawl successful: extracted {len(results)} pages from URL")
return results
except asyncio.TimeoutError:
- logger.warning(f"Crawl attempt {attempt + 1} timed out after {timeout} seconds for URLs: {urls}")
+ logger.warning(f"Crawl attempt {attempt + 1} timed out after {timeout} seconds for URL: {url}")
if attempt < maxRetries:
logger.info(f"Retrying in {retryDelay} seconds...")
await asyncio.sleep(retryDelay)
@@ -398,21 +410,20 @@ class ConnectorWeb(BaseConnectorAi):
raise Exception(f"Crawl failed after {maxRetries + 1} attempts due to timeout")
except Exception as e:
- logger.warning(f"Crawl attempt {attempt + 1} failed for URLs {urls}: {str(e)}")
+ logger.warning(f"Crawl attempt {attempt + 1} failed for URL {url}: {str(e)}")
logger.debug(f"Full error details: {type(e).__name__}: {str(e)}")
# Check if it's a validation error and log more details
if "validation" in str(e).lower():
logger.debug(f"URL validation failed. Checking URL format:")
- for i, url in enumerate(urls):
- logger.debug(f" URL {i+1}: '{url}' (length: {len(url)})")
- # Check for common URL issues
- if ' ' in url:
- logger.debug(f" WARNING: URL contains spaces!")
- if not url.startswith(('http://', 'https://')):
- logger.debug(f" WARNING: URL doesn't start with http/https!")
- if len(url) > 2000:
- logger.debug(f" WARNING: URL is very long ({len(url)} chars)")
+ logger.debug(f" URL: '{url}' (length: {len(url)})")
+ # Check for common URL issues
+ if ' ' in url:
+ logger.debug(f" WARNING: URL contains spaces!")
+ if not url.startswith(('http://', 'https://')):
+ logger.debug(f" WARNING: URL doesn't start with http/https!")
+ if len(url) > 2000:
+ logger.debug(f" WARNING: URL is very long ({len(url)} chars)")
if attempt < maxRetries:
logger.info(f"Retrying in {retryDelay} seconds...")
@@ -468,15 +479,15 @@ class ConnectorWeb(BaseConnectorAi):
if countryName:
countryName = self._convertIsoCodeToCountryName(countryName)
- # Perform search
+ # Perform search - use exact parameters from prompt
+ # NOTE: timeRange parameter causes generic results, so we don't use it
searchResults = await self._search(
query=webSearchPrompt.instruction,
maxResults=webSearchPrompt.maxNumberPages,
- timeRange=webSearchPrompt.timeRange,
+ timeRange=None, # Not used - causes generic results
country=countryName,
- language=webSearchPrompt.language,
- includeAnswer=False,
- includeRawContent=False
+ includeAnswer="basic",
+ includeRawContent="text"
)
# Extract URLs from results
@@ -500,13 +511,13 @@ class ConnectorWeb(BaseConnectorAi):
async def webCrawl(self, modelCall: AiModelCall) -> "AiModelResponse":
"""
- WEB_CRAWL operation - crawls one URL using Tavily.
+ WEB_CRAWL operation - crawls one URL using Tavily with link following.
Args:
modelCall: AiModelCall with AiCallPromptWebCrawl as prompt
Returns:
- AiModelResponse with crawl results as JSON
+ AiModelResponse with crawl results as JSON (may include multiple pages)
"""
try:
# Extract parameters
@@ -517,37 +528,49 @@ class ConnectorWeb(BaseConnectorAi):
# Create Pydantic model
webCrawlPrompt = AiCallPromptWebCrawl(**promptData)
- # Perform crawl for ONE URL
- # Note: _crawl expects a list, so we wrap the single URL in a list
+ # Perform crawl for ONE URL with link following
+ # Use maxWidth as limit, maxDepth as maxDepth, and calculate maxBreadth
crawlResults = await self._crawl(
- urls=[webCrawlPrompt.url],
- extractDepth="advanced" if webCrawlPrompt.maxDepth > 2 else "basic",
- format="markdown"
+ url=webCrawlPrompt.url,
+ instructions=webCrawlPrompt.instruction,
+ limit=webCrawlPrompt.maxWidth or 20, # maxWidth controls number of pages
+ maxDepth=webCrawlPrompt.maxDepth or 2,
+ maxBreadth=webCrawlPrompt.maxWidth or 40 # Mirrors limit when maxWidth is set; fallback defaults differ (40 here vs 20 for limit)
)
- # Format result for single URL - consistent with Perplexity format
+ # If we got multiple pages from the crawl, we need to format them differently
+ # Return the first result for backwards compatibility, but include total page count
if crawlResults and len(crawlResults) > 0:
- firstResult = crawlResults[0]
+ # Get all pages content
+ allContent = ""
+ for i, result in enumerate(crawlResults, 1):
+ pageHeader = f"\n{'='*60}\nPAGE {i}: {result.url}\n{'='*60}\n"
+ if result.title:
+ allContent += f"{pageHeader}Title: {result.title}\n\n"
+ allContent += f"{result.content}\n"
+
resultData = {
- "url": firstResult.url,
- "title": firstResult.title if firstResult.title else "Content",
- "content": firstResult.content
+ "url": webCrawlPrompt.url,
+ "title": crawlResults[0].title if crawlResults[0].title else "Content",
+ "content": allContent,
+ "pagesCrawled": len(crawlResults),
+ "pageUrls": [result.url for result in crawlResults]
}
else:
- resultData = {"url": webCrawlPrompt.url, "title": "", "content": "", "error": "No content extracted"}
+ resultData = {"url": webCrawlPrompt.url, "title": "", "content": "", "error": "No content extracted", "pagesCrawled": 0}
- # Return as JSON - same format as Perplexity
+ # Return as JSON - same format as Perplexity but with multiple pages content
import json
return AiModelResponse(
content=json.dumps(resultData, indent=2),
success=True,
- metadata={"operation": "WEB_CRAWL", "url": webCrawlPrompt.url}
+ metadata={"operation": "WEB_CRAWL", "url": webCrawlPrompt.url, "pagesCrawled": len(crawlResults) if crawlResults else 0}
)
except Exception as e:
logger.error(f"Error in Tavily web crawl: {str(e)}")
import json
- errorResult = {"error": str(e), "url": ""}
+ errorResult = {"error": str(e), "url": webCrawlPrompt.url if 'webCrawlPrompt' in locals() else ""}
return AiModelResponse(
content=json.dumps(errorResult, indent=2),
success=False,
diff --git a/modules/datamodels/datamodelAi.py b/modules/datamodels/datamodelAi.py
index b7e883d1..f73cbd08 100644
--- a/modules/datamodels/datamodelAi.py
+++ b/modules/datamodels/datamodelAi.py
@@ -200,7 +200,6 @@ class AiCallPromptWebSearch(BaseModel):
instruction: str = Field(description="Search instruction/query for finding relevant URLs")
country: Optional[str] = Field(default=None, description="Two-digit country code (lowercase, e.g., ch, us, de, fr)")
maxNumberPages: Optional[int] = Field(default=10, description="Maximum number of pages to search (default: 10)")
- timeRange: Optional[str] = Field(default=None, description="Time range filter (d, w, m, y)")
language: Optional[str] = Field(default=None, description="Language code (lowercase, e.g., de, en, fr)")
researchDepth: Optional[str] = Field(default="general", description="Research depth: fast (maxDepth=1), general (maxDepth=2), deep (maxDepth=3)")
diff --git a/modules/datamodels/datamodelTools.py b/modules/datamodels/datamodelTools.py
index 39ba8bda..45227903 100644
--- a/modules/datamodels/datamodelTools.py
+++ b/modules/datamodels/datamodelTools.py
@@ -186,12 +186,13 @@ class CountryCodes:
Get Tavily-compatible country name from ISO-2 code.
Args:
- isoCode: ISO-2 country code (e.g., "CH", "US")
+ isoCode: ISO-2 country code (e.g., "CH", "ch", "US", "us")
Returns:
Country name in lowercase as required by Tavily (e.g., "switzerland", "united states")
"""
- isoCodeUpper = isoCode.upper()
+ # Convert to uppercase for lookup
+ isoCodeUpper = isoCode.upper() if isoCode else ""
mapping = cls._COUNTRY_MAP.get(isoCodeUpper)
return mapping[0] if mapping else isoCode
diff --git a/modules/services/serviceWeb/mainServiceWeb.py b/modules/services/serviceWeb/mainServiceWeb.py
index fc08aa7c..3e43da4a 100644
--- a/modules/services/serviceWeb/mainServiceWeb.py
+++ b/modules/services/serviceWeb/mainServiceWeb.py
@@ -11,7 +11,7 @@ from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum, AiC
logger = logging.getLogger(__name__)
-class WebcrawlService:
+class WebService:
"""Service for web search and crawling operations."""
def __init__(self, services):
@@ -56,7 +56,6 @@ class WebcrawlService:
extractedUrls = analysisResult.get("urls", [])
needsSearch = analysisResult.get("needsSearch", True) # Default to True
maxNumberPages = analysisResult.get("maxNumberPages", 10)
- timeRange = analysisResult.get("timeRange")
countryCode = analysisResult.get("country", country)
languageCode = analysisResult.get("language", language)
finalResearchDepth = analysisResult.get("researchDepth", researchDepth)
@@ -77,7 +76,6 @@ class WebcrawlService:
searchUrls = await self._performWebSearch(
instruction=instruction,
maxNumberPages=maxNumberPages - len(allUrls),
- timeRange=timeRange,
country=countryCode,
language=languageCode
)
@@ -153,10 +151,9 @@ Extract and provide a JSON response with:
2. urls: List of URLs found in the prompt text
3. needsSearch: true if web search is needed to identify url's to crawl, false if only crawling of provided URLs is wanted
4. maxNumberPages: Recommended number of URLs to crawl (based on research scope, typical: 2-20)
-5. timeRange: Time range if mentioned (d, w, m, y, or null)
-6. country: Country code if specified (2-digit lowercase, e.g., ch, us, de)
-7. language: Language code if specified (lowercase, e.g., de, en, fr)
-8. researchDepth: Research depth based on instruction complexity - "fast" (quick overview, maxDepth=1), "general" (standard research, maxDepth=2), or "deep" (comprehensive research, maxDepth=3)
+5. country: Country code if identified in the prompt (2-digit lowercase, e.g., ch, us, de)
+6. language: Language identified from the prompt (lowercase, e.g., de, en, fr)
+7. researchDepth: Research depth based on instruction complexity - "fast" (quick overview, maxDepth=1), "general" (standard research, maxDepth=2), or "deep" (comprehensive research, maxDepth=3)
Return ONLY valid JSON, no additional text:
{{
@@ -164,7 +161,6 @@ Return ONLY valid JSON, no additional text:
"urls": ["url1", "url2"],
"needsSearch": true,
"maxNumberPages": 10,
- "timeRange": null,
"country": "ch",
"language": "de",
"researchDepth": "general"
@@ -188,7 +184,6 @@ Return ONLY valid JSON, no additional text:
"urls": [],
"needsSearch": True,
"maxNumberPages": 10,
- "timeRange": None,
"country": country,
"language": language,
"researchDepth": researchDepth
@@ -198,7 +193,6 @@ Return ONLY valid JSON, no additional text:
self,
instruction: str,
maxNumberPages: int,
- timeRange: Optional[str],
country: Optional[str],
language: Optional[str]
) -> List[str]:
@@ -209,7 +203,6 @@ Return ONLY valid JSON, no additional text:
instruction=instruction,
country=country,
maxNumberPages=maxNumberPages,
- timeRange=timeRange,
language=language
)
searchPrompt = searchPromptModel.model_dump_json(exclude_none=True, indent=2)
@@ -269,7 +262,7 @@ Return ONLY valid JSON, no additional text:
instruction=instruction,
url=url, # Single URL
maxDepth=maxDepth,
- maxWidth=10
+ maxWidth=50
)
crawlPrompt = crawlPromptModel.model_dump_json(exclude_none=True, indent=2)
diff --git a/modules/workflows/methods/methodAi.py b/modules/workflows/methods/methodAi.py
index 708ee91b..178b6264 100644
--- a/modules/workflows/methods/methodAi.py
+++ b/modules/workflows/methods/methodAi.py
@@ -170,7 +170,7 @@ class MethodAi(MethodBase):
- Output format: JSON with research results including URLs and content.
Parameters:
- - prompt (str, required): Natural language research instruction, including time range if relevant.
+ - prompt (str, required): Natural language research instruction.
- list(url) (list, optional): Specific URLs to crawl, if needed.
- country (str, optional): Two-digit country code (lowercase, e.g., ch, us, de).
- language (str, optional): Language code (lowercase, e.g., de, en, fr).
diff --git a/test_ai_models.py b/test_ai_models.py
index 2906afd1..37772ee3 100644
--- a/test_ai_models.py
+++ b/test_ai_models.py
@@ -1,6 +1,9 @@
#!/usr/bin/env python3
"""
-AI Models Test - Tests all available AI models individually
+AI Models Test - Tests WEB_CRAWL functionality on all models that support it
+
+This script tests all models that have WEB_CRAWL capability, validates that
+they can crawl specific URLs and return content, and analyzes the quality of results.
"""
import asyncio
@@ -53,9 +56,18 @@ class AIModelsTester:
async def initialize(self):
"""Initialize the AI service."""
- # Set logging level to INFO to reduce noise
+ # Set logging level to DEBUG for detailed output
import logging
- logging.getLogger().setLevel(logging.INFO)
+ logging.getLogger().setLevel(logging.DEBUG)
+
+ # Initialize the model registry with all connectors
+ from modules.aicore.aicoreModelRegistry import modelRegistry
+ from modules.aicore.aicorePluginTavily import AiTavily
+ from modules.aicore.aicorePluginPerplexity import AiPerplexity
+
+ # Register web connectors that support WEB_CRAWL
+ modelRegistry.registerConnector(AiTavily())
+ modelRegistry.registerConnector(AiPerplexity())
# The AI service needs to be recreated with proper initialization
from modules.services.serviceAi.mainServiceAi import AiService
@@ -86,27 +98,53 @@ class AIModelsTester:
print(f"📁 Results will be saved to: {self.modelTestDir}")
async def testModel(self, modelName: str) -> Dict[str, Any]:
- """Test a specific AI model with a simple prompt."""
+ """Test a specific AI model with WEB_CRAWL operation."""
print(f"\n{'='*60}")
print(f"TESTING MODEL: {modelName}")
+ print(f"OPERATION TYPE: WEB_CRAWL")
print(f"{'='*60}")
- # Use same prompt for all web models
- import json
+ # CRAWL CONFIGURATION
+ # Deep and Broad Web Crawl Example:
+ # - maxDepth: 3 (deep) - follows links up to 3 levels from starting page
+ # - Level 1: Starting page
+ # - Level 2: Pages linked from starting page
+ # - Level 3: Pages linked from Level 2 pages
+ # - maxWidth: 50 (broad) - crawls up to 50 pages at each depth level
+ # With up to 50 pages crawled at each of 3 levels, this yields roughly 150 pages total — TODO confirm against Tavily's limit/max_breadth semantics
+ #
+ # Common configurations:
+ # - Fast/Overview: maxDepth=1, maxWidth=5 (shallow, focused)
+ # - General/Standard: maxDepth=2, maxWidth=10 (balanced)
+ # - Deep and Broad: maxDepth=3, maxWidth=50 (comprehensive)
- if "tavily" in modelName.lower() or "perplexity" in modelName.lower() or "llama" in modelName.lower() or "sonar" in modelName.lower() or "mistral" in modelName.lower():
- # All web models use the same JSON formatted prompt
- # Country format: Use full name for Tavily (Switzerland), Perplexity converts ISO codes to names
- testPrompt = json.dumps({
- "prompt": "Research, what ValueOn company in switzerland does and who works there? Return as JSON.",
- "maxResults": 5,
- "timeRange": "y",
- "country": "CH", # ISO-2 code, Perplexity will convert to "Switzerland"
- "format": "json"
- }, indent=2)
- else:
- # Fallback for other models
- testPrompt = "Generate a comprehensive analysis of the current state of artificial intelligence. Return as JSON."
+ CRAWL_DEPTH = 3 # Deep crawl: follows links 3 levels deep
+ CRAWL_WIDTH = 50 # Broad crawl: up to 50 pages per level
+
+ print(f"Crawl Configuration:")
+ print(f" - Depth: {CRAWL_DEPTH} levels (deep)")
+ print(f" - Width: {CRAWL_WIDTH} pages per level (broad)")
+ print(f" - Theoretical max: {CRAWL_WIDTH ** min(CRAWL_DEPTH, 3)} pages")
+
+ # Use WEB_CRAWL specific prompt format
+ from modules.datamodels.datamodelAi import AiCallPromptWebCrawl
+
+ # Test with simple prompt like playground example
+ simplePrompt = f"https://www.valueon.ch: Who works in this company?"
+
+ # But keep structured format for now to match our API
+ testPrompt = json.dumps({
+ "instruction": "Who works in this company?",
+ "url": "https://www.valueon.ch",
+ "maxDepth": CRAWL_DEPTH,
+ "maxWidth": CRAWL_WIDTH
+ }, indent=2)
+
+ print(f"Simple prompt (playground style): {simplePrompt}")
+
+ # For Tavily models, test direct API call for better link following
+ if "tavily" in modelName.lower():
+ return await self._testTavilyDirect(modelName, CRAWL_DEPTH, CRAWL_WIDTH)
print(f"Test prompt: {testPrompt}")
print(f"Prompt length: {len(testPrompt)} characters")
@@ -114,17 +152,11 @@ class AIModelsTester:
startTime = asyncio.get_event_loop().time()
try:
- # Create options to force this specific model
- if "internal" in modelName.lower():
- options = AiCallOptions(
- operationType=OperationTypeEnum.DATA_EXTRACT,
- preferredModel=modelName
- )
- else:
- options = AiCallOptions(
- operationType=OperationTypeEnum.DATA_GENERATE,
- preferredModel=modelName
- )
+ # Create options for WEB_CRAWL operation
+ options = AiCallOptions(
+ operationType=OperationTypeEnum.WEB_CRAWL,
+ preferredModel=modelName
+ )
# Call the AI service DIRECTLY through the model's functionCall
# This tests the actual model, not the document generation pipeline
@@ -140,29 +172,14 @@ class AIModelsTester:
import base64
import os
- # Prepare messages and options based on model type
- if "vision" in modelName.lower():
- # For vision models, skip for now since they require special handling
- print(f"⚠️ Skipping vision model {modelName} - requires special image handling")
- return {
- "modelName": modelName,
- "status": "SKIPPED",
- "processingTime": 0.0,
- "responseLength": 0,
- "responseType": "skipped",
- "hasContent": False,
- "error": "Vision model requires special image handling",
- "fullResponse": "Skipped - vision model requires special image handling"
- }
- else:
- # For other models, use normal functionCall
- messages = [{"role": "user", "content": testPrompt}]
- modelCall = AiModelCall(
- messages=messages,
- model=model,
- options=options
- )
- response = await model.functionCall(modelCall)
+ # For WEB_CRAWL models, use normal functionCall with structured prompt
+ messages = [{"role": "user", "content": testPrompt}]
+ modelCall = AiModelCall(
+ messages=messages,
+ model=model,
+ options=options
+ )
+ response = await model.functionCall(modelCall)
endTime = asyncio.get_event_loop().time()
processingTime = endTime - startTime
@@ -185,6 +202,10 @@ class AIModelsTester:
"bytesReceived": len(response.content.encode('utf-8')) if response.content else 0
}
+ # Extract actual prompt sent if available in metadata
+ if hasattr(response, 'metadata') and response.metadata:
+ result["actualPromptSent"] = response.metadata.get("actualPromptSent", "N/A")
+
# Try to parse content as JSON
if response.content:
try:
@@ -289,9 +310,16 @@ class AIModelsTester:
print(f"📄 Response length: {len(str(response))} characters")
print(f"📄 Response preview: {result['responsePreview']}")
- # Save text response for all models
- if result.get("status") == "SUCCESS":
- self._saveTextResponse(modelName, result)
+ # Add prompt to result for logging
+ result["testPrompt"] = testPrompt
+ result["crawlConfig"] = {
+ "depth": CRAWL_DEPTH,
+ "width": CRAWL_WIDTH
+ }
+
+ # For WEB_CRAWL, also validate that content was extracted
+ if result.get("status") == "SUCCESS" and result.get("fullResponse"):
+ self._validateCrawlResponse(modelName, result)
except Exception as e:
endTime = asyncio.get_event_loop().time()
@@ -304,13 +332,22 @@ class AIModelsTester:
"responseLength": 0,
"responseType": "exception",
"hasContent": False,
- "error": str(e)
+ "error": str(e),
+ "testPrompt": testPrompt,
+ "crawlConfig": {
+ "depth": CRAWL_DEPTH,
+ "width": CRAWL_WIDTH
+ }
}
print(f"💥 EXCEPTION - {str(e)}")
self.testResults.append(result)
+ # Save text response even for exceptions to log the prompt
+ if result.get("status") in ["SUCCESS", "EXCEPTION", "ERROR"]:
+ self._saveTextResponse(modelName, result)
+
# Save individual model result immediately
self._saveIndividualModelResult(modelName, result)
@@ -378,6 +415,19 @@ class AIModelsTester:
if not content:
content = result.get("responsePreview", "No content available")
+ # If there's an error, include it in the content
+ if result.get("error"):
+ content = f"ERROR: {result.get('error')}\n\n{content}"
+
+ # Get prompt and config for logging
+ config = result.get("crawlConfig", {})
+ crawlDepth = config.get("depth", "N/A")
+ crawlWidth = config.get("width", "N/A")
+
+ # Get both the original JSON prompt and the actual prompt sent
+ originalPrompt = result.get("testPrompt", "N/A")
+ actualPromptSent = result.get("actualPromptSent", "N/A")
+
# Add metadata header
metadata = f"""Model: {modelName}
Test Time: {timestamp}
@@ -385,6 +435,23 @@ Status: {result.get('status', 'Unknown')}
Processing Time: {result.get('processingTime', 0):.2f}s
Response Length: {result.get('responseLength', 0)} characters
Is Valid JSON: {result.get('isValidJson', False)}
+Test Method: {result.get('testMethod', 'standard')}
+Pages Crawled: {result.get('pagesCrawled', 'N/A')}
+Crawled URL: {result.get('crawledUrl', 'N/A')}
+Has URL: {result.get('hasUrl', 'N/A')}
+Has Title: {result.get('hasTitle', 'N/A')}
+Has Content: {result.get('hasContent', 'N/A')}
+Content Length: {result.get('contentLength', 'N/A')} characters
+
+--- CRAWL CONFIGURATION ---
+Depth: {crawlDepth}
+Width: {crawlWidth}
+
+--- ORIGINAL JSON PROMPT (input) ---
+{originalPrompt}
+
+--- ACTUAL PROMPT SENT TO API (EXACT) ---
+{actualPromptSent}
--- RESPONSE CONTENT ---
{content}
@@ -400,6 +467,174 @@ Is Valid JSON: {result.get('isValidJson', False)}
print(f"❌ Error saving text response: {str(e)}")
result["textSaveError"] = str(e)
+ def _validateCrawlResponse(self, modelName: str, result: Dict[str, Any]):
+ """Validate that the WEB_CRAWL response contains crawled content."""
+ try:
+ content = result.get("fullResponse", "")
+
+ # Try to parse as JSON
+ crawledData = {}
+ try:
+ parsed = json.loads(content)
+ if isinstance(parsed, dict):
+ crawledData = parsed
+ except:
+ pass
+
+ # Check for expected fields: url, title, content
+ hasUrl = bool(crawledData.get("url"))
+ hasTitle = bool(crawledData.get("title"))
+ hasContent = bool(crawledData.get("content"))
+ contentLength = len(crawledData.get("content", ""))
+
+ result["hasUrl"] = hasUrl
+ result["hasTitle"] = hasTitle
+ result["hasContent"] = hasContent
+ result["contentLength"] = contentLength
+ result["crawledUrl"] = crawledData.get("url", "")
+
+ if hasUrl and hasContent:
+ print(f"✅ Successfully crawled content from URL: {crawledData.get('url', 'unknown')}")
+ print(f" Content length: {contentLength} characters")
+ print(f" Title: {crawledData.get('title', 'N/A')}")
+ else:
+ print(f"⚠️ Incomplete crawl response - URL: {hasUrl}, Content: {hasContent}")
+
+ except Exception as e:
+ print(f"❌ Error validating crawl response: {str(e)}")
+ result["crawlValidationError"] = str(e)
+
+ async def _testTavilyDirect(self, modelName: str, crawlDepth: int = 3, crawlWidth: int = 50) -> Dict[str, Any]:
+ """Test Tavily API directly using the crawl() method with better link following."""
+ print(f"\n{'='*60}")
+ print(f"TESTING TAVILY DIRECT API (crawl method)")
+ print(f"{'='*60}")
+
+ startTime = asyncio.get_event_loop().time()
+
+ try:
+ from tavily import AsyncTavilyClient
+ from modules.shared.configuration import APP_CONFIG
+
+ apiKey = APP_CONFIG.get("Connector_AiTavily_API_SECRET")
+ if not apiKey:
+ raise Exception("Tavily API key not found")
+
+ client = AsyncTavilyClient(api_key=apiKey)
+
+ # Map our configuration to Tavily parameters
+ # maxWidth -> limit (pages per level)
+ # maxDepth -> max_depth (link following depth)
+ # max_breadth = maxWidth (breadth of crawl at each level)
+ tavilyLimit = crawlWidth
+ tavilyMaxDepth = crawlDepth
+ tavilyMaxBreadth = crawlWidth
+
+ print(f"Calling Tavily API with crawl() method...")
+ print(f"URL: https://www.valueon.ch")
+ print(f"Instructions: Who works in this company?")
+ print(f"Limit: {tavilyLimit} pages per level")
+ print(f"Max depth: {tavilyMaxDepth} (follows links {tavilyMaxDepth} levels deep)")
+ print(f"Max breadth: {tavilyMaxBreadth} (up to {tavilyMaxBreadth} pages at each level)")
+ print(f"Deep and Broad Crawl Configuration Active")
+
+ response = await client.crawl(
+ url="https://www.valueon.ch",
+ instructions="Who works in this company?",
+ limit=tavilyLimit,
+ max_depth=tavilyMaxDepth,
+ max_breadth=tavilyMaxBreadth
+ )
+
+ endTime = asyncio.get_event_loop().time()
+ processingTime = endTime - startTime
+
+ # Analyze response
+ contentLength = 0
+ pagesCrawled = 0
+ fullContent = ""
+
+ if isinstance(response, dict):
+ # Check if it has results
+ if "results" in response:
+ results = response["results"]
+ pagesCrawled = len(results)
+ content_parts = []
+ for result in results:
+ url = result.get("url", "")
+ title = result.get("title", "")
+ content = result.get("raw_content", result.get("content", ""))
+ content_parts.append(f"URL: {url}\nTitle: {title}\nContent: {content}\n{'='*60}\n")
+ contentLength += len(content)
+
+ fullContent = "\n".join(content_parts)
+ else:
+ fullContent = json.dumps(response, indent=2)
+ contentLength = len(fullContent)
+ elif isinstance(response, list):
+ pagesCrawled = len(response)
+ content_parts = []
+ for item in response:
+ if isinstance(item, dict):
+ url = item.get("url", "")
+ title = item.get("title", "")
+ content = item.get("raw_content", item.get("content", ""))
+ content_parts.append(f"URL: {url}\nTitle: {title}\nContent: {content}\n{'='*60}\n")
+ contentLength += len(content)
+
+ fullContent = "\n".join(content_parts)
+ else:
+ fullContent = str(response)
+ contentLength = len(fullContent)
+
+ result = {
+ "modelName": modelName,
+ "status": "SUCCESS",
+ "processingTime": round(processingTime, 2),
+ "responseLength": contentLength,
+ "responseType": "TavilyDirectAPI",
+ "hasContent": True,
+ "error": None,
+ "modelUsed": modelName,
+ "priceUsd": 0.0,
+ "bytesSent": 0,
+ "bytesReceived": contentLength,
+ "isValidJson": True,
+ "fullResponse": fullContent,
+ "pagesCrawled": pagesCrawled,
+ "testMethod": "direct_api_crawl"
+ }
+
+ print(f"✅ SUCCESS - Processing time: {processingTime:.2f}s")
+ print(f"📄 Pages crawled: {pagesCrawled}")
+ print(f"📄 Total content length: {contentLength} characters")
+
+ # Save the response
+ self._saveTextResponse(modelName, result)
+ self._validateCrawlResponse(modelName, result)
+ self._saveIndividualModelResult(modelName, result)
+
+ self.testResults.append(result)
+ return result
+
+ except Exception as e:
+ endTime = asyncio.get_event_loop().time()
+ processingTime = endTime - startTime
+
+ result = {
+ "modelName": modelName,
+ "status": "EXCEPTION",
+ "processingTime": round(processingTime, 2),
+ "responseLength": 0,
+ "responseType": "exception",
+ "hasContent": False,
+ "error": str(e)
+ }
+
+ print(f"💥 EXCEPTION - {str(e)}")
+ self.testResults.append(result)
+ return result
+
def _saveIndividualModelResult(self, modelName: str, result: Dict[str, Any]):
"""Save individual model test result to file."""
try:
@@ -425,22 +660,30 @@ Is Valid JSON: {result.get('isValidJson', False)}
print(f"❌ Error saving individual result: {str(e)}")
def getAllAvailableModels(self) -> List[str]:
- """Get all available model names."""
- # Hardcoded list of known models - same approach as test_ai_behavior.py
- return [
- # "claude-3-5-sonnet-20241022", # Skipped - text model, test later
- # "claude-3-5-sonnet-20241022-vision", # Skipped - requires image input
- # "gpt-4o", # Skipped - text model, test later
- # "gpt-3.5-turbo", # Skipped - text model, test later
- # "gpt-4o-vision", # Skipped - requires image input
- # "dall-e-3", # Skipped - image generation, test later
- "sonar", # Perplexity web model
- "sonar-pro", # Perplexity web model
- "tavily-search", # Tavily web model (unified research)
- # "internal-extractor", # Skipped - internal model, test later
- # "internal-generator", # Skipped - internal model, test later
- # "internal-renderer" # Skipped - internal model, test later
- ]
+ """Get all available model names that support WEB_CRAWL."""
+ from modules.aicore.aicoreModelRegistry import modelRegistry
+ from modules.datamodels.datamodelAi import OperationTypeEnum
+
+ # Get all models from registry
+ allModels = modelRegistry.getAvailableModels()
+
+ # Filter models that support WEB_CRAWL
+ webCrawlModels = []
+ for model in allModels:
+ if model.operationTypes and any(
+ ot.operationType == OperationTypeEnum.WEB_CRAWL
+ for ot in model.operationTypes
+ ): # Include both Tavily and Perplexity models
+ webCrawlModels.append(model.name)
+
+ # Filter to only "sonar" model for testing
+ webCrawlModels = [m for m in webCrawlModels if m == "sonar"]
+
+ print(f"Found {len(webCrawlModels)} models that support WEB_CRAWL (filtered to sonar):")
+ for modelName in webCrawlModels:
+ print(f" - {modelName}")
+
+ return webCrawlModels
def saveTestResults(self):
"""Save detailed test results to file."""
@@ -508,6 +751,15 @@ Is Valid JSON: {result.get('isValidJson', False)}
if result.get("isValidJson") is not None:
print(f" Valid JSON: {'Yes' if result['isValidJson'] else 'No'}")
+ if result.get("crawledUrl"):
+ print(f" Crawled URL: {result['crawledUrl']}")
+
+ if result.get("contentLength") is not None:
+ print(f" Content length: {result['contentLength']} characters")
+
+ if result.get("pagesCrawled") is not None:
+ print(f" Pages crawled: {result['pagesCrawled']}")
+
if result["error"]:
print(f" Error: {result['error']}")
@@ -525,12 +777,32 @@ Is Valid JSON: {result.get('isValidJson', False)}
print(f"{'='*80}")
print(f"🚀 Fastest model: {fastest['modelName']} ({fastest['processingTime']}s)")
print(f"🐌 Slowest model: {slowest['modelName']} ({slowest['processingTime']}s)")
+
+ # Find models with most content
+ modelsWithContent = [r for r in successfulResults if r.get("contentLength", 0) > 0]
+ if modelsWithContent:
+ mostContent = max(modelsWithContent, key=lambda x: x.get("contentLength", 0))
+ totalContent = sum(r.get("contentLength", 0) for r in modelsWithContent)
+ avgContent = totalContent / len(modelsWithContent)
+ print(f"📄 Model with most content: {mostContent['modelName']} ({mostContent.get('contentLength', 0)} chars)")
+ print(f"📊 Average content per model: {avgContent:.0f} characters")
+ print(f"📊 Total content crawled across all models: {totalContent} characters")
+
+ # Find models with most pages crawled (for Tavily direct API)
+ modelsWithPages = [r for r in successfulResults if r.get("pagesCrawled", 0) > 0]
+ if modelsWithPages:
+ mostPages = max(modelsWithPages, key=lambda x: x.get("pagesCrawled", 0))
+ totalPages = sum(r.get("pagesCrawled", 0) for r in modelsWithPages)
+ avgPages = totalPages / len(modelsWithPages)
+ print(f"🔍 Model with most pages crawled: {mostPages['modelName']} ({mostPages.get('pagesCrawled', 0)} pages)")
+ print(f"📊 Average pages per model: {avgPages:.1f} pages")
+ print(f"📊 Total pages crawled across all models: {totalPages} pages")
async def main():
- """Run AI models testing."""
+ """Run AI models testing for WEB_CRAWL operation."""
tester = AIModelsTester()
- print("Starting AI Models Testing...")
+ print("Starting AI Models Testing for WEB_CRAWL...")
print("Initializing AI service...")
await tester.initialize()
@@ -542,8 +814,9 @@ async def main():
print(f" {i}. {model}")
print(f"\n{'='*80}")
- print("STARTING INDIVIDUAL MODEL TESTS")
+ print("STARTING WEB_CRAWL TESTS")
print(f"{'='*80}")
+ print("Testing each model's ability to crawl URLs and return content...")
print("Press Enter after each model test to continue to the next one...")
# Test each model individually