ai models ready for web and txt

This commit is contained in:
ValueOn AG 2025-10-26 18:17:17 +01:00
parent 72e0687826
commit 2489719c62
7 changed files with 577 additions and 232 deletions

View file

@@ -57,7 +57,7 @@ class AiPerplexity(BaseConnectorAi):
connectorType="perplexity",
apiUrl="https://api.perplexity.ai/chat/completions",
temperature=0.2,
- maxTokens=4000,
+ maxTokens=24000, # Increased for detailed web crawl responses (Perplexity supports up to 25k)
contextLength=32000,
costPer1kTokensInput=0.005,
costPer1kTokensOutput=0.005,
@@ -80,12 +80,12 @@ class AiPerplexity(BaseConnectorAi):
connectorType="perplexity",
apiUrl="https://api.perplexity.ai/chat/completions",
temperature=0.2,
- maxTokens=4000,
+ maxTokens=24000, # Increased for detailed web crawl responses (Perplexity supports up to 25k)
contextLength=32000,
costPer1kTokensInput=0.01,
costPer1kTokensOutput=0.01,
speedRating=6, # Slower due to AI analysis
- qualityRating=10, # Best AI analysis quality
+ qualityRating=9, # Best AI analysis quality
# capabilities removed (not used in business logic)
functionCall=self._routeWebOperation,
priority=PriorityEnum.QUALITY,
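# Illustrative sketch (not part of the commit): a minimal direct call against the configured
# endpoint, showing where temperature and the raised maxTokens end up in the request.
# The Bearer-token header and the OpenAI-style message schema are assumptions based on
# Perplexity's public chat/completions API; verify against the current API docs.
import httpx

async def perplexityExample(apiKey: str) -> str:
    payload = {
        "model": "sonar",
        "messages": [{"role": "user", "content": "https://www.valueon.ch: Who works in this company?"}],
        "temperature": 0.2,
        "max_tokens": 24000,  # matches the raised maxTokens above
    }
    async with httpx.AsyncClient(timeout=120) as client:
        response = await client.post(
            "https://api.perplexity.ai/chat/completions",
            headers={"Authorization": f"Bearer {apiKey}"},
            json=payload,
        )
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"]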
@@ -217,6 +217,42 @@ class AiPerplexity(BaseConnectorAi):
# Fallback to basic call
return await self.callAiBasic(modelCall)
def _getDepthInstructions(self, maxDepth: int) -> str:
"""
Map maxDepth (numeric) to instructional text for LLM.
Args:
maxDepth: 1 (fast/overview), 2 (general/standard), 3 (deep/comprehensive)
Returns:
Instructional text for the LLM
"""
depthMap = {
1: "Basic overview - extract main content from the main page only",
2: "Standard crawl - extract content from main page and linked pages (2 levels deep)",
3: "Deep crawl - comprehensively extract content from main page and all accessible linked pages (3+ levels deep)"
}
return depthMap.get(maxDepth, depthMap[2])
def _getWidthInstructions(self, maxWidth: int) -> str:
"""
Map maxWidth (numeric) to instructional text for LLM.
Args:
maxWidth: Number of pages to crawl at each level (default: 10)
Returns:
Instructional text for the LLM
"""
if maxWidth <= 5:
return f"Focused crawl - limit to {maxWidth} most relevant pages per level"
elif maxWidth <= 15:
return f"Standard breadth - crawl up to {maxWidth} pages per level"
elif maxWidth <= 30:
return f"Wide crawl - crawl up to {maxWidth} pages per level, prioritize quality"
else:
return f"Extensive crawl - crawl up to {maxWidth} pages per level, comprehensive coverage"
async def webSearch(self, modelCall: AiModelCall) -> AiModelResponse:
"""
WEB_SEARCH operation - returns list of URLs based on search query.
@@ -253,8 +289,6 @@ class AiPerplexity(BaseConnectorAi):
Return a JSON array of {webSearchPrompt.maxNumberPages} most relevant URLs.
{'' if not countryName else f'Focus on results from {countryName}.'}
- {'' if not webSearchPrompt.timeRange else f'Limit to results from the last {webSearchPrompt.timeRange}'}
- {'' if not webSearchPrompt.language else f'Return results in {webSearchPrompt.language} language'}
Return ONLY a JSON array of URLs, no additional text:
[
@@ -293,6 +327,15 @@ Return ONLY a JSON array of URLs, no additional text:
"""
WEB_CRAWL operation - crawls ONE URL and returns content.
+ Perplexity API Parameters Used:
+ - messages: The prompt containing URL and instruction
+ - max_tokens: Maximum response length
+ - max_results: Number of search results (1-20, default: 10)
+ - temperature: Response randomness (not web search specific)
+ Pagination: Perplexity does NOT return paginated responses.
+ A single response contains all results within max_tokens limit.
Args:
modelCall: AiModelCall with AiCallPromptWebCrawl as prompt
@@ -316,55 +359,68 @@ Return ONLY a JSON array of URLs, no additional text:
webCrawlPrompt = AiCallPromptWebCrawl(**promptData)
- # Build crawl request for Perplexity - ONE URL
- crawlPrompt = f"""Crawl and extract content from this URL based on the instruction:
- INSTRUCTION: '{webCrawlPrompt.instruction}'
- URL to crawl (maxDepth={webCrawlPrompt.maxDepth}):
- {webCrawlPrompt.url}
- Extract and return the relevant content based on the instruction.
- Return as JSON object with this structure:
- {{
-     "url": "{webCrawlPrompt.url}",
-     "title": "Page title",
-     "content": "Extracted content relevant to the instruction"
- }}
- Return ONLY valid JSON, no additional text."""
+ # Match playground prompt style: just URL + question
+ # This allows Perplexity to return detailed multi-source results
+ crawlPrompt = f"{webCrawlPrompt.url}: {webCrawlPrompt.instruction}"
+ # Build payload with optional Perplexity parameters
+ # Note: max_tokens_per_page may not be supported by chat/completions endpoint
+ # The playground Python SDK might use a different internal API
+ maxResults = min(webCrawlPrompt.maxWidth or 10, 20) # Max 20 results
payload = {
    "model": model.name,
    "messages": [{"role": "user", "content": crawlPrompt}],
    "temperature": temperature,
-   "max_tokens": maxTokens
+   "max_tokens": maxTokens, # Use model's configured maxTokens (24000)
+   "max_results": maxResults,
+   "return_citations": True # Request citations explicitly
}
+ logger.info(f"Perplexity crawl payload: model={model.name}, prompt_length={len(crawlPrompt)}, max_tokens={maxTokens}, max_results={maxResults}")
response = await self.httpClient.post(model.apiUrl, json=payload)
if response.status_code != 200:
    raise HTTPException(status_code=500, detail=f"Perplexity Web Crawl API error: {response.text}")
apiResponse = response.json()
+ # Extract the main content
content = apiResponse["choices"][0]["message"]["content"]
- # Parse JSON content and ensure it's a single object
- import json
- try:
-     parsedContent = json.loads(content)
-     # Ensure it's a single object, not an array
-     if isinstance(parsedContent, list):
-         parsedContent = parsedContent[0] if parsedContent else {}
- except:
-     # If not JSON, create structured response
-     parsedContent = {"url": webCrawlPrompt.url, "title": "", "content": content}
- # Return as JSON string
+ # Check for citations or search results in the response
+ citations = apiResponse.get("citations", [])
+ searchResults = apiResponse.get("search_results", [])
+ # Log what we found
+ if citations:
+     logger.info(f"Found {len(citations)} citations in response")
+ if searchResults:
+     logger.info(f"Found {len(searchResults)} search results in response")
+ logger.debug(f"API response keys: {list(apiResponse.keys())}")
+ # Build comprehensive response with citations if available
+ import json
+ responseData = {
+     "content": content,
+     "citations": citations if citations else [],
+     "search_results": searchResults if searchResults else []
+ }
+ # Return comprehensive response
return AiModelResponse(
-   content=json.dumps(parsedContent, indent=2),
+   content=json.dumps(responseData, indent=2) if (citations or searchResults) else content,
    success=True,
    modelId=model.name,
-   metadata={"response_id": apiResponse.get("id", ""), "operation": "WEB_CRAWL", "url": webCrawlPrompt.url}
+   metadata={
+       "response_id": apiResponse.get("id", ""),
+       "operation": "WEB_CRAWL",
+       "url": webCrawlPrompt.url,
+       "actualPromptSent": crawlPrompt,
+       "has_citations": len(citations) > 0,
+       "has_search_results": len(searchResults) > 0
+   }
)
except Exception as e:

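# Illustrative sketch (not from the commit): how a caller might unpack the WEB_CRAWL
# response built above. When citations or search_results are present, the content field
# is a JSON object with "content", "citations" and "search_results" keys; otherwise it
# is the raw text returned by Perplexity. The helper name is an assumption for illustration.
import json

def parseCrawlResponse(responseContent: str) -> dict:
    # Try the structured form first, fall back to plain text
    try:
        data = json.loads(responseContent)
        if isinstance(data, dict) and "content" in data:
            return {
                "text": data.get("content", ""),
                "citations": data.get("citations", []),
                "searchResults": data.get("search_results", []),
            }
    except json.JSONDecodeError:
        pass
    return {"text": responseContent, "citations": [], "searchResults": []}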
View file

@@ -27,7 +27,8 @@ class WebCrawlResult:
content: str
title: Optional[str] = None
- class ConnectorWeb(BaseConnectorAi):
+ class AiTavily(BaseConnectorAi):
"""Tavily web search connector."""
def __init__(self):
@@ -42,7 +43,36 @@ class ConnectorWeb(BaseConnectorAi):
self.webSearchMaxResults: int = 20
# Initialize client if API key is available
self._initializeClient()
def getModels(self) -> List[AiModel]:
"""Get all available Tavily models."""
return [
AiModel(
name="tavily-search",
displayName="Tavily Search & Research",
connectorType="tavily",
apiUrl="https://api.tavily.com",
temperature=0.0, # Web search doesn't use temperature
maxTokens=0, # Web search doesn't use tokens
contextLength=0,
costPer1kTokensInput=0.0,
costPer1kTokensOutput=0.0,
speedRating=8, # Good speed for search and extract
qualityRating=9, # Excellent quality for web research
# capabilities removed (not used in business logic)
functionCall=self._routeWebOperation,
priority=PriorityEnum.BALANCED,
processingMode=ProcessingModeEnum.BASIC,
operationTypes=createOperationTypeRatings(
(OperationTypeEnum.WEB_SEARCH, 9),
(OperationTypeEnum.WEB_CRAWL, 10)
),
version="tavily-search",
calculatePriceUsd=lambda processingTime, bytesSent, bytesReceived: 0.008 # Simple flat rate
)
]
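# Illustrative sketch (not part of the commit): how a caller might consume the model
# definition above. The flat-rate pricing lambda ignores its arguments, and selection
# logic can rank connectors by their WEB_CRAWL rating. The attribute names on the rating
# objects (operationType, rating) and the helper below are assumptions for illustration.
def describeModel(model) -> str:
    price = model.calculatePriceUsd(processingTime=1.2, bytesSent=512, bytesReceived=20480)  # always 0.008
    crawlRating = max(
        (r.rating for r in (model.operationTypes or []) if r.operationType == OperationTypeEnum.WEB_CRAWL),
        default=0,
    )
    return f"{model.displayName}: WEB_CRAWL rating {crawlRating}, flat price {price} USD per call"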
def _initializeClient(self): def _initializeClient(self):
"""Initialize the Tavily client if API key is available.""" """Initialize the Tavily client if API key is available."""
try: try:
@@ -206,34 +236,6 @@ class ConnectorWeb(BaseConnectorAi):
return filteredResults
def getModels(self) -> List[AiModel]:
"""Get all available Tavily models."""
return [
AiModel(
name="tavily-search",
displayName="Tavily Search & Research",
connectorType="tavily",
apiUrl="https://api.tavily.com",
temperature=0.0, # Web search doesn't use temperature
maxTokens=0, # Web search doesn't use tokens
contextLength=0,
costPer1kTokensInput=0.0,
costPer1kTokensOutput=0.0,
speedRating=8, # Good speed for search and extract
qualityRating=9, # Excellent quality for web research
# capabilities removed (not used in business logic)
functionCall=self._routeWebOperation,
priority=PriorityEnum.BALANCED,
processingMode=ProcessingModeEnum.BASIC,
operationTypes=createOperationTypeRatings(
(OperationTypeEnum.WEB_SEARCH, 9),
(OperationTypeEnum.WEB_CRAWL, 8)
),
version="tavily-search",
calculatePriceUsd=lambda processingTime, bytesSent, bytesReceived: 0.008 # Simple flat rate
)
]
@classmethod @classmethod
async def create(cls): async def create(cls):
apiKey = APP_CONFIG.get("Connector_AiTavily_API_SECRET") apiKey = APP_CONFIG.get("Connector_AiTavily_API_SECRET")
@@ -273,10 +275,9 @@ class ConnectorWeb(BaseConnectorAi):
topic: str | None = None,
includeDomains: list[str] | None = None,
excludeDomains: list[str] | None = None,
- language: str | None = None,
country: str | None = None,
- includeAnswer: bool | None = None,
- includeRawContent: bool | None = None,
+ includeAnswer: str | None = None,
+ includeRawContent: str | None = None,
) -> list[WebSearchResult]:
"""Calls the Tavily API to perform a web search."""
# Make sure maxResults is within the allowed range (use cached values)
@@ -298,8 +299,6 @@ class ConnectorWeb(BaseConnectorAi):
kwargs["include_domains"] = includeDomains
if excludeDomains is not None:
    kwargs["exclude_domains"] = excludeDomains
- if language is not None:
-     kwargs["language"] = language
if country is not None:
    kwargs["country"] = country
if includeAnswer is not None:
@@ -307,7 +306,8 @@ class ConnectorWeb(BaseConnectorAi):
if includeRawContent is not None:
    kwargs["include_raw_content"] = includeRawContent
- logger.debug(f"Tavily.search kwargs: {kwargs}")
+ # Log the final API call parameters for comparison
+ logger.info(f"Tavily API call parameters: {kwargs}")
# Ensure client is initialized
if self.client is None:
@@ -316,7 +316,11 @@ class ConnectorWeb(BaseConnectorAi):
raise ValueError("Tavily client not initialized. Please check API key configuration.")
response = await self.client.search(**kwargs)
+ # Return all results without score filtering
+ # Tavily's scoring is already applied by the API
+ logger.info(f"Tavily returned {len(response.get('results', []))} results")
return [
    WebSearchResult(
        title=result["title"],
@@ -328,69 +332,77 @@ class ConnectorWeb(BaseConnectorAi):
async def _crawl(
    self,
-   urls: list,
-   extractDepth: str | None = None,
-   format: str | None = None,
+   url: str,
+   instructions: str | None = None,
+   limit: int = 20,
+   maxDepth: int = 2,
+   maxBreadth: int = 40,
) -> list[WebCrawlResult]:
-   """Calls the Tavily API to extract text content from URLs with retry logic."""
+   """Calls the Tavily API to crawl ONE URL with link following and retry logic."""
    maxRetries = self.crawlMaxRetries
    retryDelay = self.crawlRetryDelay
    timeout = self.crawlTimeout
-   logger.debug(f"Starting crawl of {len(urls)} URLs: {urls}")
-   logger.debug(f"Crawl settings: extractDepth={extractDepth}, format={format}, timeout={timeout}s")
+   logger.debug(f"Starting crawl of URL: {url}")
+   logger.debug(f"Crawl settings: instructions={instructions}, limit={limit}, maxDepth={maxDepth}, maxBreadth={maxBreadth}, timeout={timeout}s")
    for attempt in range(maxRetries + 1):
        try:
            logger.debug(f"Crawl attempt {attempt + 1}/{maxRetries + 1}")
-           # Use asyncio.wait_for for timeout
-           # Build kwargs for extract
-           kwargsExtract: dict = {"urls": urls}
-           kwargsExtract["extract_depth"] = extractDepth or "advanced"
-           kwargsExtract["format"] = format or "markdown" # Use markdown to get HTML structure
-           logger.debug(f"Sending request to Tavily with kwargs: {kwargsExtract}")
            # Ensure client is initialized
            if self.client is None:
                self._initializeClient()
                if self.client is None:
                    raise ValueError("Tavily client not initialized. Please check API key configuration.")
+           logger.debug(f"Crawling URL: {url}")
+           # Build kwargs for crawl
+           kwargsCrawl: dict = {"url": url}
+           if instructions:
+               kwargsCrawl["instructions"] = instructions
+           if limit:
+               kwargsCrawl["limit"] = limit
+           if maxDepth:
+               kwargsCrawl["max_depth"] = maxDepth
+           if maxBreadth:
+               kwargsCrawl["max_breadth"] = maxBreadth
+           logger.debug(f"Sending request to Tavily with kwargs: {kwargsCrawl}")
            response = await asyncio.wait_for(
-               self.client.extract(**kwargsExtract),
+               self.client.crawl(**kwargsCrawl),
                timeout=timeout
            )
-           logger.debug(f"Tavily response received: {list(response.keys())}")
-           # Debug: Log what Tavily actually returns
-           if "results" in response and response["results"]:
-               logger.debug(f"Tavily returned {len(response['results'])} results")
-               logger.debug(f"First result keys: {list(response['results'][0].keys())}")
-               logger.debug(f"First result has raw_content: {'raw_content' in response['results'][0]}")
-               # Log each result
-               for i, result in enumerate(response["results"]):
-                   logger.debug(f"Result {i+1}: URL={result.get('url', 'N/A')}, content_length={len(result.get('raw_content', result.get('content', '')))}")
-           else:
-               logger.warning(f"Tavily returned no results in response: {response}")
-           results = [
-               WebCrawlResult(
-                   url=result["url"],
-                   content=result.get("raw_content", result.get("content", "")), # Try raw_content first, fallback to content
-                   title=result.get("title", "") # Extract title if available
-               )
-               for result in response["results"]
-           ]
-           logger.debug(f"Crawl successful: extracted {len(results)} results")
+           logger.debug(f"Tavily response received: {type(response)}")
+           # Parse response - could be dict with results or list
+           if isinstance(response, dict) and "results" in response:
+               pageResults = response["results"]
+           elif isinstance(response, list):
+               pageResults = response
+           else:
+               logger.warning(f"Unexpected response format: {type(response)}")
+               pageResults = []
+           logger.debug(f"Got {len(pageResults)} pages from crawl")
+           # Convert to WebCrawlResult format
+           results = []
+           for result in pageResults:
+               results.append(WebCrawlResult(
+                   url=result.get("url", url),
+                   content=result.get("raw_content", result.get("content", "")),
+                   title=result.get("title", "")
+               ))
+           logger.debug(f"Crawl successful: extracted {len(results)} pages from URL")
            return results
        except asyncio.TimeoutError:
-           logger.warning(f"Crawl attempt {attempt + 1} timed out after {timeout} seconds for URLs: {urls}")
+           logger.warning(f"Crawl attempt {attempt + 1} timed out after {timeout} seconds for URL: {url}")
            if attempt < maxRetries:
                logger.info(f"Retrying in {retryDelay} seconds...")
                await asyncio.sleep(retryDelay)
@@ -398,21 +410,20 @@ class ConnectorWeb(BaseConnectorAi):
raise Exception(f"Crawl failed after {maxRetries + 1} attempts due to timeout")
except Exception as e:
-   logger.warning(f"Crawl attempt {attempt + 1} failed for URLs {urls}: {str(e)}")
+   logger.warning(f"Crawl attempt {attempt + 1} failed for URL {url}: {str(e)}")
    logger.debug(f"Full error details: {type(e).__name__}: {str(e)}")
    # Check if it's a validation error and log more details
    if "validation" in str(e).lower():
        logger.debug(f"URL validation failed. Checking URL format:")
-       for i, url in enumerate(urls):
-           logger.debug(f"  URL {i+1}: '{url}' (length: {len(url)})")
-           # Check for common URL issues
-           if ' ' in url:
-               logger.debug(f"  WARNING: URL contains spaces!")
-           if not url.startswith(('http://', 'https://')):
-               logger.debug(f"  WARNING: URL doesn't start with http/https!")
-           if len(url) > 2000:
-               logger.debug(f"  WARNING: URL is very long ({len(url)} chars)")
+       logger.debug(f"  URL: '{url}' (length: {len(url)})")
+       # Check for common URL issues
+       if ' ' in url:
+           logger.debug(f"  WARNING: URL contains spaces!")
+       if not url.startswith(('http://', 'https://')):
+           logger.debug(f"  WARNING: URL doesn't start with http/https!")
+       if len(url) > 2000:
+           logger.debug(f"  WARNING: URL is very long ({len(url)} chars)")
    if attempt < maxRetries:
        logger.info(f"Retrying in {retryDelay} seconds...")
@@ -468,15 +479,15 @@ class ConnectorWeb(BaseConnectorAi):
if countryName:
    countryName = self._convertIsoCodeToCountryName(countryName)
- # Perform search
+ # Perform search - use exact parameters from prompt
+ # NOTE: timeRange parameter causes generic results, so we don't use it
searchResults = await self._search(
    query=webSearchPrompt.instruction,
    maxResults=webSearchPrompt.maxNumberPages,
-   timeRange=webSearchPrompt.timeRange,
+   timeRange=None, # Not used - causes generic results
    country=countryName,
-   language=webSearchPrompt.language,
-   includeAnswer=False,
-   includeRawContent=False
+   includeAnswer="basic",
+   includeRawContent="text"
)
# Extract URLs from results
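# Illustrative sketch (not from the commit): roughly what the underlying Tavily SDK call
# looks like once _search() has assembled its kwargs. Parameter names mirror the kwargs
# built above; treat the exact SDK signature as an assumption and check the tavily-python
# documentation for your installed version.
from tavily import AsyncTavilyClient

async def searchExample(apiKey: str) -> list[str]:
    client = AsyncTavilyClient(api_key=apiKey)
    response = await client.search(
        query="ValueOn AG Switzerland",   # webSearchPrompt.instruction
        max_results=10,                   # webSearchPrompt.maxNumberPages
        country="switzerland",            # converted from the ISO-2 code
        include_answer="basic",
        include_raw_content="text",
    )
    # The connector keeps only the URLs of the returned results
    return [result["url"] for result in response.get("results", [])]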
@@ -500,13 +511,13 @@ class ConnectorWeb(BaseConnectorAi):
async def webCrawl(self, modelCall: AiModelCall) -> "AiModelResponse":
"""
- WEB_CRAWL operation - crawls one URL using Tavily.
+ WEB_CRAWL operation - crawls one URL using Tavily with link following.
Args:
    modelCall: AiModelCall with AiCallPromptWebCrawl as prompt
Returns:
-   AiModelResponse with crawl results as JSON
+   AiModelResponse with crawl results as JSON (may include multiple pages)
"""
try:
    # Extract parameters
@@ -517,37 +528,49 @@ class ConnectorWeb(BaseConnectorAi):
# Create Pydantic model
webCrawlPrompt = AiCallPromptWebCrawl(**promptData)
- # Perform crawl for ONE URL
- # Note: _crawl expects a list, so we wrap the single URL in a list
+ # Perform crawl for ONE URL with link following
+ # Use maxWidth as limit, maxDepth as maxDepth, and calculate maxBreadth
crawlResults = await self._crawl(
-   urls=[webCrawlPrompt.url],
-   extractDepth="advanced" if webCrawlPrompt.maxDepth > 2 else "basic",
-   format="markdown"
+   url=webCrawlPrompt.url,
+   instructions=webCrawlPrompt.instruction,
+   limit=webCrawlPrompt.maxWidth or 20, # maxWidth controls number of pages
+   maxDepth=webCrawlPrompt.maxDepth or 2,
+   maxBreadth=webCrawlPrompt.maxWidth or 40 # Use same as limit for breadth
)
- # Format result for single URL - consistent with Perplexity format
+ # If we got multiple pages from the crawl, we need to format them differently
+ # Return the first result for backwards compatibility, but include total page count
if crawlResults and len(crawlResults) > 0:
-   firstResult = crawlResults[0]
+   # Get all pages content
+   allContent = ""
+   for i, result in enumerate(crawlResults, 1):
+       pageHeader = f"\n{'='*60}\nPAGE {i}: {result.url}\n{'='*60}\n"
+       if result.title:
+           allContent += f"{pageHeader}Title: {result.title}\n\n"
+       allContent += f"{result.content}\n"
    resultData = {
-       "url": firstResult.url,
-       "title": firstResult.title if firstResult.title else "Content",
-       "content": firstResult.content
+       "url": webCrawlPrompt.url,
+       "title": crawlResults[0].title if crawlResults[0].title else "Content",
+       "content": allContent,
+       "pagesCrawled": len(crawlResults),
+       "pageUrls": [result.url for result in crawlResults]
    }
else:
-   resultData = {"url": webCrawlPrompt.url, "title": "", "content": "", "error": "No content extracted"}
+   resultData = {"url": webCrawlPrompt.url, "title": "", "content": "", "error": "No content extracted", "pagesCrawled": 0}
- # Return as JSON - same format as Perplexity
+ # Return as JSON - same format as Perplexity but with multiple pages content
import json
return AiModelResponse(
    content=json.dumps(resultData, indent=2),
    success=True,
-   metadata={"operation": "WEB_CRAWL", "url": webCrawlPrompt.url}
+   metadata={"operation": "WEB_CRAWL", "url": webCrawlPrompt.url, "pagesCrawled": len(crawlResults) if crawlResults else 0}
)
except Exception as e:
    logger.error(f"Error in Tavily web crawl: {str(e)}")
    import json
-   errorResult = {"error": str(e), "url": ""}
+   errorResult = {"error": str(e), "url": webCrawlPrompt.url if 'webCrawlPrompt' in locals() else ""}
    return AiModelResponse(
        content=json.dumps(errorResult, indent=2),
        success=False,

View file

@@ -200,7 +200,6 @@ class AiCallPromptWebSearch(BaseModel):
instruction: str = Field(description="Search instruction/query for finding relevant URLs")
country: Optional[str] = Field(default=None, description="Two-digit country code (lowercase, e.g., ch, us, de, fr)")
maxNumberPages: Optional[int] = Field(default=10, description="Maximum number of pages to search (default: 10)")
- timeRange: Optional[str] = Field(default=None, description="Time range filter (d, w, m, y)")
language: Optional[str] = Field(default=None, description="Language code (lowercase, e.g., de, en, fr)")
researchDepth: Optional[str] = Field(default="general", description="Research depth: fast (maxDepth=1), general (maxDepth=2), deep (maxDepth=3)")

View file

@@ -186,12 +186,13 @@ class CountryCodes:
Get Tavily-compatible country name from ISO-2 code.
Args:
-   isoCode: ISO-2 country code (e.g., "CH", "US")
+   isoCode: ISO-2 country code (e.g., "CH", "ch", "US", "us")
Returns:
    Country name in lowercase as required by Tavily (e.g., "switzerland", "united states")
"""
- isoCodeUpper = isoCode.upper()
+ # Convert to uppercase for lookup
+ isoCodeUpper = isoCode.upper() if isoCode else ""
mapping = cls._COUNTRY_MAP.get(isoCodeUpper)
return mapping[0] if mapping else isoCode
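# Illustrative sketch (not from the commit): a standalone version of the lookup above,
# showing why the uppercase conversion makes the method case-insensitive. The map
# entries here are assumptions; the real _COUNTRY_MAP lives in the CountryCodes class.
_COUNTRY_MAP_EXAMPLE = {
    "CH": ("switzerland",),
    "US": ("united states",),
}

def getTavilyCountryNameExample(isoCode: str) -> str:
    isoCodeUpper = isoCode.upper() if isoCode else ""
    mapping = _COUNTRY_MAP_EXAMPLE.get(isoCodeUpper)
    # Fall back to the raw input when the code is unknown
    return mapping[0] if mapping else isoCode

# getTavilyCountryNameExample("ch") == getTavilyCountryNameExample("CH") == "switzerland"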

View file

@@ -11,7 +11,7 @@ from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum, AiC
logger = logging.getLogger(__name__)
- class WebcrawlService:
+ class WebService:
"""Service for web search and crawling operations."""
def __init__(self, services):
@@ -56,7 +56,6 @@ class WebcrawlService:
extractedUrls = analysisResult.get("urls", [])
needsSearch = analysisResult.get("needsSearch", True) # Default to True
maxNumberPages = analysisResult.get("maxNumberPages", 10)
- timeRange = analysisResult.get("timeRange")
countryCode = analysisResult.get("country", country)
languageCode = analysisResult.get("language", language)
finalResearchDepth = analysisResult.get("researchDepth", researchDepth)
@@ -77,7 +76,6 @@ class WebcrawlService:
searchUrls = await self._performWebSearch(
    instruction=instruction,
    maxNumberPages=maxNumberPages - len(allUrls),
-   timeRange=timeRange,
    country=countryCode,
    language=languageCode
)
@@ -153,10 +151,9 @@ Extract and provide a JSON response with:
2. urls: List of URLs found in the prompt text
3. needsSearch: true if web search is needed to identify URLs to crawl, false if only crawling of provided URLs is wanted
4. maxNumberPages: Recommended number of URLs to crawl (based on research scope, typical: 2-20)
- 5. timeRange: Time range if mentioned (d, w, m, y, or null)
- 6. country: Country code if specified (2-digit lowercase, e.g., ch, us, de)
- 7. language: Language code if specified (lowercase, e.g., de, en, fr)
- 8. researchDepth: Research depth based on instruction complexity - "fast" (quick overview, maxDepth=1), "general" (standard research, maxDepth=2), or "deep" (comprehensive research, maxDepth=3)
+ 5. country: Country code if identified in the prompt (2-digit lowercase, e.g., ch, us, de)
+ 6. language: Language identified from the prompt (lowercase, e.g., de, en, fr)
+ 7. researchDepth: Research depth based on instruction complexity - "fast" (quick overview, maxDepth=1), "general" (standard research, maxDepth=2), or "deep" (comprehensive research, maxDepth=3)
Return ONLY valid JSON, no additional text:
{{
@@ -164,7 +161,6 @@ Return ONLY valid JSON, no additional text:
"urls": ["url1", "url2"],
"needsSearch": true,
"maxNumberPages": 10,
- "timeRange": null,
"country": "ch",
"language": "de",
"researchDepth": "general"
@@ -188,7 +184,6 @@ Return ONLY valid JSON, no additional text:
"urls": [],
"needsSearch": True,
"maxNumberPages": 10,
- "timeRange": None,
"country": country,
"language": language,
"researchDepth": researchDepth
@@ -198,7 +193,6 @@ Return ONLY valid JSON, no additional text:
self,
instruction: str,
maxNumberPages: int,
- timeRange: Optional[str],
country: Optional[str],
language: Optional[str]
) -> List[str]:
@@ -209,7 +203,6 @@ Return ONLY valid JSON, no additional text:
instruction=instruction,
country=country,
maxNumberPages=maxNumberPages,
- timeRange=timeRange,
language=language
)
searchPrompt = searchPromptModel.model_dump_json(exclude_none=True, indent=2)
@@ -269,7 +262,7 @@ Return ONLY valid JSON, no additional text:
instruction=instruction,
url=url, # Single URL
maxDepth=maxDepth,
- maxWidth=10
+ maxWidth=50
)
crawlPrompt = crawlPromptModel.model_dump_json(exclude_none=True, indent=2)
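# Illustrative sketch (not from the commit): what the serialized crawl prompt roughly
# looks like for one URL with the new default maxWidth=50. The URL and instruction
# values are placeholders for illustration.
crawlPromptModelExample = AiCallPromptWebCrawl(
    instruction="Summarize the company's services",
    url="https://example.com",
    maxDepth=2,
    maxWidth=50,
)
crawlPromptExample = crawlPromptModelExample.model_dump_json(exclude_none=True, indent=2)
# crawlPromptExample is a JSON string such as:
# {
#   "instruction": "Summarize the company's services",
#   "url": "https://example.com",
#   "maxDepth": 2,
#   "maxWidth": 50
# }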

View file

@@ -170,7 +170,7 @@ class MethodAi(MethodBase):
- Output format: JSON with research results including URLs and content.
Parameters:
- - prompt (str, required): Natural language research instruction, including time range if relevant.
+ - prompt (str, required): Natural language research instruction.
- list(url) (list, optional): Specific URLs to crawl, if needed.
- country (str, optional): Two-digit country code (lowercase, e.g., ch, us, de).
- language (str, optional): Language code (lowercase, e.g., de, en, fr).
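# Illustrative sketch (not part of the commit): an example argument set for the research
# method described above. The dict shape and key names are assumptions for illustration;
# the real call goes through MethodAi and the service layer.
researchArgs = {
    "prompt": "Research what ValueOn in Switzerland does and who works there. Return as JSON.",
    "list(url)": ["https://www.valueon.ch"],  # optional: specific URLs to crawl
    "country": "ch",
    "language": "de",
}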

View file

@@ -1,6 +1,9 @@
#!/usr/bin/env python3
"""
- AI Models Test - Tests all available AI models individually
+ AI Models Test - Tests WEB_CRAWL functionality on all models that support it
+ This script tests all models that have WEB_CRAWL capability, validates that
+ they can crawl specific URLs and return content, and analyzes the quality of results.
"""
import asyncio
@@ -53,9 +56,18 @@ class AIModelsTester:
async def initialize(self):
    """Initialize the AI service."""
-   # Set logging level to INFO to reduce noise
+   # Set logging level to DEBUG for detailed output
    import logging
-   logging.getLogger().setLevel(logging.INFO)
+   logging.getLogger().setLevel(logging.DEBUG)
+   # Initialize the model registry with all connectors
+   from modules.aicore.aicoreModelRegistry import modelRegistry
+   from modules.aicore.aicorePluginTavily import AiTavily
+   from modules.aicore.aicorePluginPerplexity import AiPerplexity
+   # Register web connectors that support WEB_CRAWL
+   modelRegistry.registerConnector(AiTavily())
+   modelRegistry.registerConnector(AiPerplexity())
    # The AI service needs to be recreated with proper initialization
    from modules.services.serviceAi.mainServiceAi import AiService
@@ -86,27 +98,53 @@ class AIModelsTester:
print(f"📁 Results will be saved to: {self.modelTestDir}")
async def testModel(self, modelName: str) -> Dict[str, Any]:
-   """Test a specific AI model with a simple prompt."""
+   """Test a specific AI model with WEB_CRAWL operation."""
    print(f"\n{'='*60}")
    print(f"TESTING MODEL: {modelName}")
+   print(f"OPERATION TYPE: WEB_CRAWL")
    print(f"{'='*60}")
-   # Use same prompt for all web models
-   import json
-   if "tavily" in modelName.lower() or "perplexity" in modelName.lower() or "llama" in modelName.lower() or "sonar" in modelName.lower() or "mistral" in modelName.lower():
-       # All web models use the same JSON formatted prompt
-       # Country format: Use full name for Tavily (Switzerland), Perplexity converts ISO codes to names
-       testPrompt = json.dumps({
-           "prompt": "Research, what ValueOn company in switzerland does and who works there? Return as JSON.",
-           "maxResults": 5,
-           "timeRange": "y",
-           "country": "CH", # ISO-2 code, Perplexity will convert to "Switzerland"
-           "format": "json"
-       }, indent=2)
-   else:
-       # Fallback for other models
-       testPrompt = "Generate a comprehensive analysis of the current state of artificial intelligence. Return as JSON."
+   # CRAWL CONFIGURATION
+   # Deep and Broad Web Crawl Example:
+   # - maxDepth: 3 (deep) - follows links up to 3 levels from starting page
+   #   - Level 1: Starting page
+   #   - Level 2: Pages linked from starting page
+   #   - Level 3: Pages linked from Level 2 pages
+   # - maxWidth: 50 (broad) - crawls up to 50 pages at each depth level
+   # This results in potential maximum of ~1,250 pages (if 50 links exist at each level)
+   #
+   # Common configurations:
+   # - Fast/Overview: maxDepth=1, maxWidth=5 (shallow, focused)
+   # - General/Standard: maxDepth=2, maxWidth=10 (balanced)
+   # - Deep and Broad: maxDepth=3, maxWidth=50 (comprehensive)
+   CRAWL_DEPTH = 3 # Deep crawl: follows links 3 levels deep
+   CRAWL_WIDTH = 50 # Broad crawl: up to 50 pages per level
+   print(f"Crawl Configuration:")
+   print(f"  - Depth: {CRAWL_DEPTH} levels (deep)")
+   print(f"  - Width: {CRAWL_WIDTH} pages per level (broad)")
+   print(f"  - Theoretical max: {CRAWL_WIDTH ** min(CRAWL_DEPTH, 3)} pages")
+   # Use WEB_CRAWL specific prompt format
+   from modules.datamodels.datamodelAi import AiCallPromptWebCrawl
+   # Test with simple prompt like playground example
+   simplePrompt = f"https://www.valueon.ch: Who works in this company?"
+   # But keep structured format for now to match our API
+   testPrompt = json.dumps({
+       "instruction": "Who works in this company?",
+       "url": "https://www.valueon.ch",
+       "maxDepth": CRAWL_DEPTH,
+       "maxWidth": CRAWL_WIDTH
+   }, indent=2)
+   print(f"Simple prompt (playground style): {simplePrompt}")
+   # For Tavily models, test direct API call for better link following
+   if "tavily" in modelName.lower():
+       return await self._testTavilyDirect(modelName, CRAWL_DEPTH, CRAWL_WIDTH)
print(f"Test prompt: {testPrompt}")
print(f"Prompt length: {len(testPrompt)} characters") print(f"Prompt length: {len(testPrompt)} characters")
@@ -114,17 +152,11 @@ class AIModelsTester:
startTime = asyncio.get_event_loop().time()
try:
-   # Create options to force this specific model
-   if "internal" in modelName.lower():
-       options = AiCallOptions(
-           operationType=OperationTypeEnum.DATA_EXTRACT,
-           preferredModel=modelName
-       )
-   else:
-       options = AiCallOptions(
-           operationType=OperationTypeEnum.DATA_GENERATE,
-           preferredModel=modelName
-       )
+   # Create options for WEB_CRAWL operation
+   options = AiCallOptions(
+       operationType=OperationTypeEnum.WEB_CRAWL,
+       preferredModel=modelName
+   )
    # Call the AI service DIRECTLY through the model's functionCall
    # This tests the actual model, not the document generation pipeline
@@ -140,29 +172,14 @@ class AIModelsTester:
import base64 import base64
import os import os
# Prepare messages and options based on model type # For WEB_CRAWL models, use normal functionCall with structured prompt
if "vision" in modelName.lower(): messages = [{"role": "user", "content": testPrompt}]
# For vision models, skip for now since they require special handling modelCall = AiModelCall(
print(f"⚠️ Skipping vision model {modelName} - requires special image handling") messages=messages,
return { model=model,
"modelName": modelName, options=options
"status": "SKIPPED", )
"processingTime": 0.0, response = await model.functionCall(modelCall)
"responseLength": 0,
"responseType": "skipped",
"hasContent": False,
"error": "Vision model requires special image handling",
"fullResponse": "Skipped - vision model requires special image handling"
}
else:
# For other models, use normal functionCall
messages = [{"role": "user", "content": testPrompt}]
modelCall = AiModelCall(
messages=messages,
model=model,
options=options
)
response = await model.functionCall(modelCall)
endTime = asyncio.get_event_loop().time() endTime = asyncio.get_event_loop().time()
processingTime = endTime - startTime processingTime = endTime - startTime
@@ -185,6 +202,10 @@ class AIModelsTester:
"bytesReceived": len(response.content.encode('utf-8')) if response.content else 0 "bytesReceived": len(response.content.encode('utf-8')) if response.content else 0
} }
# Extract actual prompt sent if available in metadata
if hasattr(response, 'metadata') and response.metadata:
result["actualPromptSent"] = response.metadata.get("actualPromptSent", "N/A")
# Try to parse content as JSON # Try to parse content as JSON
if response.content: if response.content:
try: try:
@@ -289,9 +310,16 @@ class AIModelsTester:
print(f"📄 Response length: {len(str(response))} characters") print(f"📄 Response length: {len(str(response))} characters")
print(f"📄 Response preview: {result['responsePreview']}") print(f"📄 Response preview: {result['responsePreview']}")
# Save text response for all models # Add prompt to result for logging
if result.get("status") == "SUCCESS": result["testPrompt"] = testPrompt
self._saveTextResponse(modelName, result) result["crawlConfig"] = {
"depth": CRAWL_DEPTH,
"width": CRAWL_WIDTH
}
# For WEB_CRAWL, also validate that content was extracted
if result.get("status") == "SUCCESS" and result.get("fullResponse"):
self._validateCrawlResponse(modelName, result)
except Exception as e: except Exception as e:
endTime = asyncio.get_event_loop().time() endTime = asyncio.get_event_loop().time()
@@ -304,13 +332,22 @@ class AIModelsTester:
"responseLength": 0, "responseLength": 0,
"responseType": "exception", "responseType": "exception",
"hasContent": False, "hasContent": False,
"error": str(e) "error": str(e),
"testPrompt": testPrompt,
"crawlConfig": {
"depth": CRAWL_DEPTH,
"width": CRAWL_WIDTH
}
} }
print(f"💥 EXCEPTION - {str(e)}") print(f"💥 EXCEPTION - {str(e)}")
self.testResults.append(result) self.testResults.append(result)
# Save text response even for exceptions to log the prompt
if result.get("status") in ["SUCCESS", "EXCEPTION", "ERROR"]:
self._saveTextResponse(modelName, result)
# Save individual model result immediately # Save individual model result immediately
self._saveIndividualModelResult(modelName, result) self._saveIndividualModelResult(modelName, result)
@@ -378,6 +415,19 @@ class AIModelsTester:
if not content: if not content:
content = result.get("responsePreview", "No content available") content = result.get("responsePreview", "No content available")
# If there's an error, include it in the content
if result.get("error"):
content = f"ERROR: {result.get('error')}\n\n{content}"
# Get prompt and config for logging
config = result.get("crawlConfig", {})
crawlDepth = config.get("depth", "N/A")
crawlWidth = config.get("width", "N/A")
# Get both the original JSON prompt and the actual prompt sent
originalPrompt = result.get("testPrompt", "N/A")
actualPromptSent = result.get("actualPromptSent", "N/A")
# Add metadata header # Add metadata header
metadata = f"""Model: {modelName} metadata = f"""Model: {modelName}
Test Time: {timestamp} Test Time: {timestamp}
@@ -385,6 +435,23 @@ Status: {result.get('status', 'Unknown')}
Processing Time: {result.get('processingTime', 0):.2f}s Processing Time: {result.get('processingTime', 0):.2f}s
Response Length: {result.get('responseLength', 0)} characters Response Length: {result.get('responseLength', 0)} characters
Is Valid JSON: {result.get('isValidJson', False)} Is Valid JSON: {result.get('isValidJson', False)}
Test Method: {result.get('testMethod', 'standard')}
Pages Crawled: {result.get('pagesCrawled', 'N/A')}
Crawled URL: {result.get('crawledUrl', 'N/A')}
Has URL: {result.get('hasUrl', 'N/A')}
Has Title: {result.get('hasTitle', 'N/A')}
Has Content: {result.get('hasContent', 'N/A')}
Content Length: {result.get('contentLength', 'N/A')} characters
--- CRAWL CONFIGURATION ---
Depth: {crawlDepth}
Width: {crawlWidth}
--- ORIGINAL JSON PROMPT (input) ---
{originalPrompt}
--- ACTUAL PROMPT SENT TO API (EXACT) ---
{actualPromptSent}
--- RESPONSE CONTENT --- --- RESPONSE CONTENT ---
{content} {content}
@@ -400,6 +467,174 @@ Is Valid JSON: {result.get('isValidJson', False)}
print(f"❌ Error saving text response: {str(e)}") print(f"❌ Error saving text response: {str(e)}")
result["textSaveError"] = str(e) result["textSaveError"] = str(e)
def _validateCrawlResponse(self, modelName: str, result: Dict[str, Any]):
"""Validate that the WEB_CRAWL response contains crawled content."""
try:
content = result.get("fullResponse", "")
# Try to parse as JSON
crawledData = {}
try:
parsed = json.loads(content)
if isinstance(parsed, dict):
crawledData = parsed
except:
pass
# Check for expected fields: url, title, content
hasUrl = bool(crawledData.get("url"))
hasTitle = bool(crawledData.get("title"))
hasContent = bool(crawledData.get("content"))
contentLength = len(crawledData.get("content", ""))
result["hasUrl"] = hasUrl
result["hasTitle"] = hasTitle
result["hasContent"] = hasContent
result["contentLength"] = contentLength
result["crawledUrl"] = crawledData.get("url", "")
if hasUrl and hasContent:
print(f"✅ Successfully crawled content from URL: {crawledData.get('url', 'unknown')}")
print(f" Content length: {contentLength} characters")
print(f" Title: {crawledData.get('title', 'N/A')}")
else:
print(f"⚠️ Incomplete crawl response - URL: {hasUrl}, Content: {hasContent}")
except Exception as e:
print(f"❌ Error validating crawl response: {str(e)}")
result["crawlValidationError"] = str(e)
async def _testTavilyDirect(self, modelName: str, crawlDepth: int = 3, crawlWidth: int = 50) -> Dict[str, Any]:
"""Test Tavily API directly using the crawl() method with better link following."""
print(f"\n{'='*60}")
print(f"TESTING TAVILY DIRECT API (crawl method)")
print(f"{'='*60}")
startTime = asyncio.get_event_loop().time()
try:
from tavily import AsyncTavilyClient
from modules.shared.configuration import APP_CONFIG
apiKey = APP_CONFIG.get("Connector_AiTavily_API_SECRET")
if not apiKey:
raise Exception("Tavily API key not found")
client = AsyncTavilyClient(api_key=apiKey)
# Map our configuration to Tavily parameters
# maxWidth -> limit (pages per level)
# maxDepth -> max_depth (link following depth)
# max_breadth = maxWidth (breadth of crawl at each level)
tavilyLimit = crawlWidth
tavilyMaxDepth = crawlDepth
tavilyMaxBreadth = crawlWidth
print(f"Calling Tavily API with crawl() method...")
print(f"URL: https://www.valueon.ch")
print(f"Instructions: Who works in this company?")
print(f"Limit: {tavilyLimit} pages per level")
print(f"Max depth: {tavilyMaxDepth} (follows links {tavilyMaxDepth} levels deep)")
print(f"Max breadth: {tavilyMaxBreadth} (up to {tavilyMaxBreadth} pages at each level)")
print(f"Deep and Broad Crawl Configuration Active")
response = await client.crawl(
url="https://www.valueon.ch",
instructions="Who works in this company?",
limit=tavilyLimit,
max_depth=tavilyMaxDepth,
max_breadth=tavilyMaxBreadth
)
endTime = asyncio.get_event_loop().time()
processingTime = endTime - startTime
# Analyze response
contentLength = 0
pagesCrawled = 0
fullContent = ""
if isinstance(response, dict):
# Check if it has results
if "results" in response:
results = response["results"]
pagesCrawled = len(results)
content_parts = []
for result in results:
url = result.get("url", "")
title = result.get("title", "")
content = result.get("raw_content", result.get("content", ""))
content_parts.append(f"URL: {url}\nTitle: {title}\nContent: {content}\n{'='*60}\n")
contentLength += len(content)
fullContent = "\n".join(content_parts)
else:
fullContent = json.dumps(response, indent=2)
contentLength = len(fullContent)
elif isinstance(response, list):
pagesCrawled = len(response)
content_parts = []
for item in response:
if isinstance(item, dict):
url = item.get("url", "")
title = item.get("title", "")
content = item.get("raw_content", item.get("content", ""))
content_parts.append(f"URL: {url}\nTitle: {title}\nContent: {content}\n{'='*60}\n")
contentLength += len(content)
fullContent = "\n".join(content_parts)
else:
fullContent = str(response)
contentLength = len(fullContent)
result = {
"modelName": modelName,
"status": "SUCCESS",
"processingTime": round(processingTime, 2),
"responseLength": contentLength,
"responseType": "TavilyDirectAPI",
"hasContent": True,
"error": None,
"modelUsed": modelName,
"priceUsd": 0.0,
"bytesSent": 0,
"bytesReceived": contentLength,
"isValidJson": True,
"fullResponse": fullContent,
"pagesCrawled": pagesCrawled,
"testMethod": "direct_api_crawl"
}
print(f"✅ SUCCESS - Processing time: {processingTime:.2f}s")
print(f"📄 Pages crawled: {pagesCrawled}")
print(f"📄 Total content length: {contentLength} characters")
# Save the response
self._saveTextResponse(modelName, result)
self._validateCrawlResponse(modelName, result)
self._saveIndividualModelResult(modelName, result)
self.testResults.append(result)
return result
except Exception as e:
endTime = asyncio.get_event_loop().time()
processingTime = endTime - startTime
result = {
"modelName": modelName,
"status": "EXCEPTION",
"processingTime": round(processingTime, 2),
"responseLength": 0,
"responseType": "exception",
"hasContent": False,
"error": str(e)
}
print(f"💥 EXCEPTION - {str(e)}")
self.testResults.append(result)
return result
def _saveIndividualModelResult(self, modelName: str, result: Dict[str, Any]): def _saveIndividualModelResult(self, modelName: str, result: Dict[str, Any]):
"""Save individual model test result to file.""" """Save individual model test result to file."""
try: try:
@@ -425,22 +660,30 @@ Is Valid JSON: {result.get('isValidJson', False)}
print(f"❌ Error saving individual result: {str(e)}")
def getAllAvailableModels(self) -> List[str]:
-   """Get all available model names."""
-   # Hardcoded list of known models - same approach as test_ai_behavior.py
-   return [
-       # "claude-3-5-sonnet-20241022", # Skipped - text model, test later
-       # "claude-3-5-sonnet-20241022-vision", # Skipped - requires image input
-       # "gpt-4o", # Skipped - text model, test later
-       # "gpt-3.5-turbo", # Skipped - text model, test later
-       # "gpt-4o-vision", # Skipped - requires image input
-       # "dall-e-3", # Skipped - image generation, test later
-       "sonar", # Perplexity web model
-       "sonar-pro", # Perplexity web model
-       "tavily-search", # Tavily web model (unified research)
-       # "internal-extractor", # Skipped - internal model, test later
-       # "internal-generator", # Skipped - internal model, test later
-       # "internal-renderer" # Skipped - internal model, test later
-   ]
+   """Get all available model names that support WEB_CRAWL."""
+   from modules.aicore.aicoreModelRegistry import modelRegistry
+   from modules.datamodels.datamodelAi import OperationTypeEnum
+   # Get all models from registry
+   allModels = modelRegistry.getAvailableModels()
+   # Filter models that support WEB_CRAWL
+   webCrawlModels = []
+   for model in allModels:
+       if model.operationTypes and any(
+           ot.operationType == OperationTypeEnum.WEB_CRAWL
+           for ot in model.operationTypes
+       ): # Include both Tavily and Perplexity models
+           webCrawlModels.append(model.name)
+   # Filter to only "sonar" model for testing
+   webCrawlModels = [m for m in webCrawlModels if m == "sonar"]
+   print(f"Found {len(webCrawlModels)} models that support WEB_CRAWL (filtered to sonar):")
+   for modelName in webCrawlModels:
+       print(f"  - {modelName}")
+   return webCrawlModels
def saveTestResults(self): def saveTestResults(self):
"""Save detailed test results to file.""" """Save detailed test results to file."""
@@ -508,6 +751,15 @@ Is Valid JSON: {result.get('isValidJson', False)}
if result.get("isValidJson") is not None: if result.get("isValidJson") is not None:
print(f" Valid JSON: {'Yes' if result['isValidJson'] else 'No'}") print(f" Valid JSON: {'Yes' if result['isValidJson'] else 'No'}")
if result.get("crawledUrl"):
print(f" Crawled URL: {result['crawledUrl']}")
if result.get("contentLength") is not None:
print(f" Content length: {result['contentLength']} characters")
if result.get("pagesCrawled") is not None:
print(f" Pages crawled: {result['pagesCrawled']}")
if result["error"]: if result["error"]:
print(f" Error: {result['error']}") print(f" Error: {result['error']}")
@@ -525,12 +777,32 @@ Is Valid JSON: {result.get('isValidJson', False)}
print(f"{'='*80}") print(f"{'='*80}")
print(f"🚀 Fastest model: {fastest['modelName']} ({fastest['processingTime']}s)") print(f"🚀 Fastest model: {fastest['modelName']} ({fastest['processingTime']}s)")
print(f"🐌 Slowest model: {slowest['modelName']} ({slowest['processingTime']}s)") print(f"🐌 Slowest model: {slowest['modelName']} ({slowest['processingTime']}s)")
# Find models with most content
modelsWithContent = [r for r in successfulResults if r.get("contentLength", 0) > 0]
if modelsWithContent:
mostContent = max(modelsWithContent, key=lambda x: x.get("contentLength", 0))
totalContent = sum(r.get("contentLength", 0) for r in modelsWithContent)
avgContent = totalContent / len(modelsWithContent)
print(f"📄 Model with most content: {mostContent['modelName']} ({mostContent.get('contentLength', 0)} chars)")
print(f"📊 Average content per model: {avgContent:.0f} characters")
print(f"📊 Total content crawled across all models: {totalContent} characters")
# Find models with most pages crawled (for Tavily direct API)
modelsWithPages = [r for r in successfulResults if r.get("pagesCrawled", 0) > 0]
if modelsWithPages:
mostPages = max(modelsWithPages, key=lambda x: x.get("pagesCrawled", 0))
totalPages = sum(r.get("pagesCrawled", 0) for r in modelsWithPages)
avgPages = totalPages / len(modelsWithPages)
print(f"🔍 Model with most pages crawled: {mostPages['modelName']} ({mostPages.get('pagesCrawled', 0)} pages)")
print(f"📊 Average pages per model: {avgPages:.1f} pages")
print(f"📊 Total pages crawled across all models: {totalPages} pages")
async def main():
-   """Run AI models testing."""
+   """Run AI models testing for WEB_CRAWL operation."""
    tester = AIModelsTester()
-   print("Starting AI Models Testing...")
+   print("Starting AI Models Testing for WEB_CRAWL...")
    print("Initializing AI service...")
    await tester.initialize()
@@ -542,8 +814,9 @@ async def main():
    print(f"  {i}. {model}")
    print(f"\n{'='*80}")
-   print("STARTING INDIVIDUAL MODEL TESTS")
+   print("STARTING WEB_CRAWL TESTS")
    print(f"{'='*80}")
+   print("Testing each model's ability to crawl URLs and return content...")
    print("Press Enter after each model test to continue to the next one...")
# Test each model individually # Test each model individually