ai models ready for web and txt

ValueOn AG 2025-10-26 18:17:17 +01:00
parent 72e0687826
commit 2489719c62
7 changed files with 577 additions and 232 deletions

View file

@ -57,7 +57,7 @@ class AiPerplexity(BaseConnectorAi):
connectorType="perplexity",
apiUrl="https://api.perplexity.ai/chat/completions",
temperature=0.2,
maxTokens=4000,
maxTokens=24000, # Increased for detailed web crawl responses (Perplexity supports up to 25k)
contextLength=32000,
costPer1kTokensInput=0.005,
costPer1kTokensOutput=0.005,
@ -80,12 +80,12 @@ class AiPerplexity(BaseConnectorAi):
connectorType="perplexity",
apiUrl="https://api.perplexity.ai/chat/completions",
temperature=0.2,
maxTokens=4000,
maxTokens=24000, # Increased for detailed web crawl responses (Perplexity supports up to 25k)
contextLength=32000,
costPer1kTokensInput=0.01,
costPer1kTokensOutput=0.01,
speedRating=6, # Slower due to AI analysis
qualityRating=10, # Best AI analysis quality
qualityRating=9, # Very high AI analysis quality
# capabilities removed (not used in business logic)
functionCall=self._routeWebOperation,
priority=PriorityEnum.QUALITY,
@ -217,6 +217,42 @@ class AiPerplexity(BaseConnectorAi):
# Fallback to basic call
return await self.callAiBasic(modelCall)
def _getDepthInstructions(self, maxDepth: int) -> str:
"""
Map maxDepth (numeric) to instructional text for the LLM.
Args:
maxDepth: 1 (fast/overview), 2 (general/standard), 3 (deep/comprehensive)
Returns:
Instructional text for the LLM
"""
depthMap = {
1: "Basic overview - extract main content from the main page only",
2: "Standard crawl - extract content from main page and linked pages (2 levels deep)",
3: "Deep crawl - comprehensively extract content from main page and all accessible linked pages (3+ levels deep)"
}
return depthMap.get(maxDepth, depthMap[2])
def _getWidthInstructions(self, maxWidth: int) -> str:
"""
Map maxWidth (numeric) to instructional text for the LLM.
Args:
maxWidth: Number of pages to crawl at each level (default: 10)
Returns:
Instructional text for the LLM
"""
if maxWidth <= 5:
return f"Focused crawl - limit to {maxWidth} most relevant pages per level"
elif maxWidth <= 15:
return f"Standard breadth - crawl up to {maxWidth} pages per level"
elif maxWidth <= 30:
return f"Wide crawl - crawl up to {maxWidth} pages per level, prioritize quality"
else:
return f"Extensive crawl - crawl up to {maxWidth} pages per level, comprehensive coverage"
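For orientation, here is a minimal sketch of how the two helpers above could be combined into a single crawl prompt; the composing function and its name are illustrative and not part of this commit:

# Hypothetical composition of the depth/width helpers into one prompt string.
def buildCrawlInstruction(connector: "AiPerplexity", url: str, instruction: str,
                          maxDepth: int = 2, maxWidth: int = 10) -> str:
    depthText = connector._getDepthInstructions(maxDepth)  # e.g. "Standard crawl - ..."
    widthText = connector._getWidthInstructions(maxWidth)  # e.g. "Standard breadth - ..."
    return f"{url}: {instruction}\nDepth: {depthText}\nWidth: {widthText}"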
async def webSearch(self, modelCall: AiModelCall) -> AiModelResponse:
"""
WEB_SEARCH operation - returns list of URLs based on search query.
@ -253,8 +289,6 @@ class AiPerplexity(BaseConnectorAi):
Return a JSON array of {webSearchPrompt.maxNumberPages} most relevant URLs.
{'' if not countryName else f'Focus on results from {countryName}.'}
{'' if not webSearchPrompt.timeRange else f'Limit to results from the last {webSearchPrompt.timeRange}'}
{'' if not webSearchPrompt.language else f'Return results in {webSearchPrompt.language} language'}
Return ONLY a JSON array of URLs, no additional text:
[
@ -293,6 +327,15 @@ Return ONLY a JSON array of URLs, no additional text:
"""
WEB_CRAWL operation - crawls ONE URL and returns content.
Perplexity API Parameters Used:
- messages: The prompt containing URL and instruction
- max_tokens: Maximum response length
- max_results: Number of search results (1-20, default: 10)
- temperature: Response randomness (not web search specific)
Pagination: Perplexity does NOT return paginated responses.
A single response contains all results within the max_tokens limit.
Args:
modelCall: AiModelCall with AiCallPromptWebCrawl as prompt
@ -316,55 +359,68 @@ Return ONLY a JSON array of URLs, no additional text:
webCrawlPrompt = AiCallPromptWebCrawl(**promptData)
# Build crawl request for Perplexity - ONE URL
crawlPrompt = f"""Crawl and extract content from this URL based on the instruction:
INSTRUCTION: '{webCrawlPrompt.instruction}'
URL to crawl (maxDepth={webCrawlPrompt.maxDepth}):
{webCrawlPrompt.url}
Extract and return the relevant content based on the instruction.
Return as JSON object with this structure:
{{
"url": "{webCrawlPrompt.url}",
"title": "Page title",
"content": "Extracted content relevant to the instruction"
}}
Return ONLY valid JSON, no additional text."""
# Match playground prompt style: just URL + question
# This allows Perplexity to return detailed multi-source results
crawlPrompt = f"{webCrawlPrompt.url}: {webCrawlPrompt.instruction}"
# Build payload with optional Perplexity parameters
# Note: max_tokens_per_page may not be supported by chat/completions endpoint
# The playground Python SDK might use a different internal API
maxResults = min(webCrawlPrompt.maxWidth or 10, 20) # Max 20 results
payload = {
"model": model.name,
"messages": [{"role": "user", "content": crawlPrompt}],
"temperature": temperature,
"max_tokens": maxTokens
"max_tokens": maxTokens, # Use model's configured maxTokens (24000)
"max_results": maxResults,
"return_citations": True # Request citations explicitly
}
logger.info(f"Perplexity crawl payload: model={model.name}, prompt_length={len(crawlPrompt)}, max_tokens={maxTokens}, max_results={maxResults}")
response = await self.httpClient.post(model.apiUrl, json=payload)
if response.status_code != 200:
raise HTTPException(status_code=500, detail=f"Perplexity Web Crawl API error: {response.text}")
apiResponse = response.json()
# Extract the main content
content = apiResponse["choices"][0]["message"]["content"]
# Parse JSON content and ensure it's a single object
import json
try:
parsedContent = json.loads(content)
# Ensure it's a single object, not an array
if isinstance(parsedContent, list):
parsedContent = parsedContent[0] if parsedContent else {}
except:
# If not JSON, create structured response
parsedContent = {"url": webCrawlPrompt.url, "title": "", "content": content}
# Check for citations or search results in the response
citations = apiResponse.get("citations", [])
searchResults = apiResponse.get("search_results", [])
# Return as JSON string
# Log what we found
if citations:
logger.info(f"Found {len(citations)} citations in response")
if searchResults:
logger.info(f"Found {len(searchResults)} search results in response")
logger.debug(f"API response keys: {list(apiResponse.keys())}")
# Build comprehensive response with citations if available
import json
responseData = {
"content": content,
"citations": citations if citations else [],
"search_results": searchResults if searchResults else []
}
# Return comprehensive response
return AiModelResponse(
content=json.dumps(parsedContent, indent=2),
content=json.dumps(responseData, indent=2) if (citations or searchResults) else content,
success=True,
modelId=model.name,
metadata={"response_id": apiResponse.get("id", ""), "operation": "WEB_CRAWL", "url": webCrawlPrompt.url}
metadata={
"response_id": apiResponse.get("id", ""),
"operation": "WEB_CRAWL",
"url": webCrawlPrompt.url,
"actualPromptSent": crawlPrompt,
"has_citations": len(citations) > 0,
"has_search_results": len(searchResults) > 0
}
)
except Exception as e:

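Because the crawl content is now either raw text or a JSON envelope (content plus citations and search_results), callers have to handle both shapes. A minimal unwrapping sketch, with an illustrative helper name:

import json

def unwrapCrawlContent(responseContent: str) -> dict:
    """Return a dict with content, citations and search_results regardless of shape."""
    try:
        data = json.loads(responseContent)
        if isinstance(data, dict) and "content" in data:
            return {
                "content": data.get("content", ""),
                "citations": data.get("citations", []),
                "search_results": data.get("search_results", []),
            }
    except json.JSONDecodeError:
        pass  # plain-text content, no citations were returned
    return {"content": responseContent, "citations": [], "search_results": []}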
View file

@ -27,7 +27,8 @@ class WebCrawlResult:
content: str
title: Optional[str] = None
class ConnectorWeb(BaseConnectorAi):
class AiTavily(BaseConnectorAi):
"""Tavily web search connector."""
def __init__(self):
@ -42,7 +43,36 @@ class ConnectorWeb(BaseConnectorAi):
self.webSearchMaxResults: int = 20
# Initialize client if API key is available
self._initializeClient()
def getModels(self) -> List[AiModel]:
"""Get all available Tavily models."""
return [
AiModel(
name="tavily-search",
displayName="Tavily Search & Research",
connectorType="tavily",
apiUrl="https://api.tavily.com",
temperature=0.0, # Web search doesn't use temperature
maxTokens=0, # Web search doesn't use tokens
contextLength=0,
costPer1kTokensInput=0.0,
costPer1kTokensOutput=0.0,
speedRating=8, # Good speed for search and extract
qualityRating=9, # Excellent quality for web research
# capabilities removed (not used in business logic)
functionCall=self._routeWebOperation,
priority=PriorityEnum.BALANCED,
processingMode=ProcessingModeEnum.BASIC,
operationTypes=createOperationTypeRatings(
(OperationTypeEnum.WEB_SEARCH, 9),
(OperationTypeEnum.WEB_CRAWL, 10)
),
version="tavily-search",
calculatePriceUsd=lambda processingTime, bytesSent, bytesReceived: 0.008 # Simple flat rate
)
]
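The pricing lambda above is a flat rate that ignores its inputs; written out as a plain function it is simply:

# Equivalent standalone form of the flat-rate pricing lambda used above.
def calculatePriceUsd(processingTime: float, bytesSent: int, bytesReceived: int) -> float:
    return 0.008  # flat rate per call, independent of time or payload size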
def _initializeClient(self):
"""Initialize the Tavily client if API key is available."""
try:
@ -206,34 +236,6 @@ class ConnectorWeb(BaseConnectorAi):
return filteredResults
def getModels(self) -> List[AiModel]:
"""Get all available Tavily models."""
return [
AiModel(
name="tavily-search",
displayName="Tavily Search & Research",
connectorType="tavily",
apiUrl="https://api.tavily.com",
temperature=0.0, # Web search doesn't use temperature
maxTokens=0, # Web search doesn't use tokens
contextLength=0,
costPer1kTokensInput=0.0,
costPer1kTokensOutput=0.0,
speedRating=8, # Good speed for search and extract
qualityRating=9, # Excellent quality for web research
# capabilities removed (not used in business logic)
functionCall=self._routeWebOperation,
priority=PriorityEnum.BALANCED,
processingMode=ProcessingModeEnum.BASIC,
operationTypes=createOperationTypeRatings(
(OperationTypeEnum.WEB_SEARCH, 9),
(OperationTypeEnum.WEB_CRAWL, 8)
),
version="tavily-search",
calculatePriceUsd=lambda processingTime, bytesSent, bytesReceived: 0.008 # Simple flat rate
)
]
@classmethod
async def create(cls):
apiKey = APP_CONFIG.get("Connector_AiTavily_API_SECRET")
@ -273,10 +275,9 @@ class ConnectorWeb(BaseConnectorAi):
topic: str | None = None,
includeDomains: list[str] | None = None,
excludeDomains: list[str] | None = None,
language: str | None = None,
country: str | None = None,
includeAnswer: bool | None = None,
includeRawContent: bool | None = None,
includeAnswer: str | None = None,
includeRawContent: str | None = None,
) -> list[WebSearchResult]:
"""Calls the Tavily API to perform a web search."""
# Make sure maxResults is within the allowed range (use cached values)
@ -298,8 +299,6 @@ class ConnectorWeb(BaseConnectorAi):
kwargs["include_domains"] = includeDomains
if excludeDomains is not None:
kwargs["exclude_domains"] = excludeDomains
if language is not None:
kwargs["language"] = language
if country is not None:
kwargs["country"] = country
if includeAnswer is not None:
@ -307,7 +306,8 @@ class ConnectorWeb(BaseConnectorAi):
if includeRawContent is not None:
kwargs["include_raw_content"] = includeRawContent
logger.debug(f"Tavily.search kwargs: {kwargs}")
# Log the final API call parameters for comparison
logger.info(f"Tavily API call parameters: {kwargs}")
# Ensure client is initialized
if self.client is None:
@ -316,7 +316,11 @@ class ConnectorWeb(BaseConnectorAi):
raise ValueError("Tavily client not initialized. Please check API key configuration.")
response = await self.client.search(**kwargs)
# Return all results without score filtering
# Tavily's scoring is already applied by the API
logger.info(f"Tavily returned {len(response.get('results', []))} results")
return [
WebSearchResult(
title=result["title"],
@ -328,69 +332,77 @@ class ConnectorWeb(BaseConnectorAi):
async def _crawl(
self,
urls: list,
extractDepth: str | None = None,
format: str | None = None,
url: str,
instructions: str | None = None,
limit: int = 20,
maxDepth: int = 2,
maxBreadth: int = 40,
) -> list[WebCrawlResult]:
"""Calls the Tavily API to extract text content from URLs with retry logic."""
"""Calls the Tavily API to crawl ONE URL with link following and retry logic."""
maxRetries = self.crawlMaxRetries
retryDelay = self.crawlRetryDelay
timeout = self.crawlTimeout
logger.debug(f"Starting crawl of {len(urls)} URLs: {urls}")
logger.debug(f"Crawl settings: extractDepth={extractDepth}, format={format}, timeout={timeout}s")
logger.debug(f"Starting crawl of URL: {url}")
logger.debug(f"Crawl settings: instructions={instructions}, limit={limit}, maxDepth={maxDepth}, maxBreadth={maxBreadth}, timeout={timeout}s")
for attempt in range(maxRetries + 1):
try:
logger.debug(f"Crawl attempt {attempt + 1}/{maxRetries + 1}")
# Use asyncio.wait_for for timeout
# Build kwargs for extract
kwargsExtract: dict = {"urls": urls}
kwargsExtract["extract_depth"] = extractDepth or "advanced"
kwargsExtract["format"] = format or "markdown" # Use markdown to get HTML structure
logger.debug(f"Sending request to Tavily with kwargs: {kwargsExtract}")
# Ensure client is initialized
if self.client is None:
self._initializeClient()
if self.client is None:
raise ValueError("Tavily client not initialized. Please check API key configuration.")
logger.debug(f"Crawling URL: {url}")
# Build kwargs for crawl
kwargsCrawl: dict = {"url": url}
if instructions:
kwargsCrawl["instructions"] = instructions
if limit:
kwargsCrawl["limit"] = limit
if maxDepth:
kwargsCrawl["max_depth"] = maxDepth
if maxBreadth:
kwargsCrawl["max_breadth"] = maxBreadth
logger.debug(f"Sending request to Tavily with kwargs: {kwargsCrawl}")
response = await asyncio.wait_for(
self.client.extract(**kwargsExtract),
self.client.crawl(**kwargsCrawl),
timeout=timeout
)
logger.debug(f"Tavily response received: {list(response.keys())}")
# Debug: Log what Tavily actually returns
if "results" in response and response["results"]:
logger.debug(f"Tavily returned {len(response['results'])} results")
logger.debug(f"First result keys: {list(response['results'][0].keys())}")
logger.debug(f"First result has raw_content: {'raw_content' in response['results'][0]}")
# Log each result
for i, result in enumerate(response["results"]):
logger.debug(f"Result {i+1}: URL={result.get('url', 'N/A')}, content_length={len(result.get('raw_content', result.get('content', '')))}")
logger.debug(f"Tavily response received: {type(response)}")
# Parse response - could be dict with results or list
if isinstance(response, dict) and "results" in response:
pageResults = response["results"]
elif isinstance(response, list):
pageResults = response
else:
logger.warning(f"Tavily returned no results in response: {response}")
logger.warning(f"Unexpected response format: {type(response)}")
pageResults = []
results = [
WebCrawlResult(
url=result["url"],
content=result.get("raw_content", result.get("content", "")), # Try raw_content first, fallback to content
title=result.get("title", "") # Extract title if available
)
for result in response["results"]
]
logger.debug(f"Got {len(pageResults)} pages from crawl")
logger.debug(f"Crawl successful: extracted {len(results)} results")
# Convert to WebCrawlResult format
results = []
for result in pageResults:
results.append(WebCrawlResult(
url=result.get("url", url),
content=result.get("raw_content", result.get("content", "")),
title=result.get("title", "")
))
logger.debug(f"Crawl successful: extracted {len(results)} pages from URL")
return results
except asyncio.TimeoutError:
logger.warning(f"Crawl attempt {attempt + 1} timed out after {timeout} seconds for URLs: {urls}")
logger.warning(f"Crawl attempt {attempt + 1} timed out after {timeout} seconds for URL: {url}")
if attempt < maxRetries:
logger.info(f"Retrying in {retryDelay} seconds...")
await asyncio.sleep(retryDelay)
@ -398,21 +410,20 @@ class ConnectorWeb(BaseConnectorAi):
raise Exception(f"Crawl failed after {maxRetries + 1} attempts due to timeout")
except Exception as e:
logger.warning(f"Crawl attempt {attempt + 1} failed for URLs {urls}: {str(e)}")
logger.warning(f"Crawl attempt {attempt + 1} failed for URL {url}: {str(e)}")
logger.debug(f"Full error details: {type(e).__name__}: {str(e)}")
# Check if it's a validation error and log more details
if "validation" in str(e).lower():
logger.debug(f"URL validation failed. Checking URL format:")
for i, url in enumerate(urls):
logger.debug(f" URL {i+1}: '{url}' (length: {len(url)})")
# Check for common URL issues
if ' ' in url:
logger.debug(f" WARNING: URL contains spaces!")
if not url.startswith(('http://', 'https://')):
logger.debug(f" WARNING: URL doesn't start with http/https!")
if len(url) > 2000:
logger.debug(f" WARNING: URL is very long ({len(url)} chars)")
logger.debug(f" URL: '{url}' (length: {len(url)})")
# Check for common URL issues
if ' ' in url:
logger.debug(f" WARNING: URL contains spaces!")
if not url.startswith(('http://', 'https://')):
logger.debug(f" WARNING: URL doesn't start with http/https!")
if len(url) > 2000:
logger.debug(f" WARNING: URL is very long ({len(url)} chars)")
if attempt < maxRetries:
logger.info(f"Retrying in {retryDelay} seconds...")
@ -468,15 +479,15 @@ class ConnectorWeb(BaseConnectorAi):
if countryName:
countryName = self._convertIsoCodeToCountryName(countryName)
# Perform search
# Perform search - use exact parameters from prompt
# NOTE: timeRange parameter causes generic results, so we don't use it
searchResults = await self._search(
query=webSearchPrompt.instruction,
maxResults=webSearchPrompt.maxNumberPages,
timeRange=webSearchPrompt.timeRange,
timeRange=None, # Not used - causes generic results
country=countryName,
language=webSearchPrompt.language,
includeAnswer=False,
includeRawContent=False
includeAnswer="basic",
includeRawContent="text"
)
# Extract URLs from results
@ -500,13 +511,13 @@ class ConnectorWeb(BaseConnectorAi):
async def webCrawl(self, modelCall: AiModelCall) -> "AiModelResponse":
"""
WEB_CRAWL operation - crawls one URL using Tavily.
WEB_CRAWL operation - crawls one URL using Tavily with link following.
Args:
modelCall: AiModelCall with AiCallPromptWebCrawl as prompt
Returns:
AiModelResponse with crawl results as JSON
AiModelResponse with crawl results as JSON (may include multiple pages)
"""
try:
# Extract parameters
@ -517,37 +528,49 @@ class ConnectorWeb(BaseConnectorAi):
# Create Pydantic model
webCrawlPrompt = AiCallPromptWebCrawl(**promptData)
# Perform crawl for ONE URL
# Note: _crawl expects a list, so we wrap the single URL in a list
# Perform crawl for ONE URL with link following
# Use maxWidth as limit, maxDepth as maxDepth, and calculate maxBreadth
crawlResults = await self._crawl(
urls=[webCrawlPrompt.url],
extractDepth="advanced" if webCrawlPrompt.maxDepth > 2 else "basic",
format="markdown"
url=webCrawlPrompt.url,
instructions=webCrawlPrompt.instruction,
limit=webCrawlPrompt.maxWidth or 20, # maxWidth controls number of pages
maxDepth=webCrawlPrompt.maxDepth or 2,
maxBreadth=webCrawlPrompt.maxWidth or 40 # Use same as limit for breadth
)
# Format result for single URL - consistent with Perplexity format
# If the crawl returned multiple pages, merge their content into one response;
# keep the top-level url/title for backwards compatibility and report the page count
if crawlResults and len(crawlResults) > 0:
firstResult = crawlResults[0]
# Get all pages content
allContent = ""
for i, result in enumerate(crawlResults, 1):
pageHeader = f"\n{'='*60}\nPAGE {i}: {result.url}\n{'='*60}\n"
if result.title:
allContent += f"{pageHeader}Title: {result.title}\n\n"
allContent += f"{result.content}\n"
resultData = {
"url": firstResult.url,
"title": firstResult.title if firstResult.title else "Content",
"content": firstResult.content
"url": webCrawlPrompt.url,
"title": crawlResults[0].title if crawlResults[0].title else "Content",
"content": allContent,
"pagesCrawled": len(crawlResults),
"pageUrls": [result.url for result in crawlResults]
}
else:
resultData = {"url": webCrawlPrompt.url, "title": "", "content": "", "error": "No content extracted"}
resultData = {"url": webCrawlPrompt.url, "title": "", "content": "", "error": "No content extracted", "pagesCrawled": 0}
# Return as JSON - same format as Perplexity
# Return as JSON - same format as Perplexity but with multiple pages content
import json
return AiModelResponse(
content=json.dumps(resultData, indent=2),
success=True,
metadata={"operation": "WEB_CRAWL", "url": webCrawlPrompt.url}
metadata={"operation": "WEB_CRAWL", "url": webCrawlPrompt.url, "pagesCrawled": len(crawlResults) if crawlResults else 0}
)
except Exception as e:
logger.error(f"Error in Tavily web crawl: {str(e)}")
import json
errorResult = {"error": str(e), "url": ""}
errorResult = {"error": str(e), "url": webCrawlPrompt.url if 'webCrawlPrompt' in locals() else ""}
return AiModelResponse(
content=json.dumps(errorResult, indent=2),
success=False,

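For reference, the JSON shape this webCrawl now emits when pages were found; the keys match the resultData built above, the values are illustrative:

{
    "url": "https://www.valueon.ch",
    "title": "Content",
    "content": "PAGE 1: https://www.valueon.ch\nTitle: ...\n\n...",
    "pagesCrawled": 3,
    "pageUrls": ["https://www.valueon.ch", "https://www.valueon.ch/team", "https://www.valueon.ch/about"]
}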
View file

@ -200,7 +200,6 @@ class AiCallPromptWebSearch(BaseModel):
instruction: str = Field(description="Search instruction/query for finding relevant URLs")
country: Optional[str] = Field(default=None, description="Two-digit country code (lowercase, e.g., ch, us, de, fr)")
maxNumberPages: Optional[int] = Field(default=10, description="Maximum number of pages to search (default: 10)")
timeRange: Optional[str] = Field(default=None, description="Time range filter (d, w, m, y)")
language: Optional[str] = Field(default=None, description="Language code (lowercase, e.g., de, en, fr)")
researchDepth: Optional[str] = Field(default="general", description="Research depth: fast (maxDepth=1), general (maxDepth=2), deep (maxDepth=3)")
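With timeRange removed from the model, a web-search prompt is built from the remaining fields only; a minimal construction sketch with illustrative values:

searchPrompt = AiCallPromptWebSearch(
    instruction="Find recent articles about Swiss fintech companies",
    country="ch",
    maxNumberPages=10,
    language="de",
    researchDepth="general",
)
print(searchPrompt.model_dump_json(exclude_none=True, indent=2))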

View file

@ -186,12 +186,13 @@ class CountryCodes:
Get Tavily-compatible country name from ISO-2 code.
Args:
isoCode: ISO-2 country code (e.g., "CH", "US")
isoCode: ISO-2 country code (e.g., "CH", "ch", "US", "us")
Returns:
Country name in lowercase as required by Tavily (e.g., "switzerland", "united states")
"""
isoCodeUpper = isoCode.upper()
# Convert to uppercase for lookup
isoCodeUpper = isoCode.upper() if isoCode else ""
mapping = cls._COUNTRY_MAP.get(isoCodeUpper)
return mapping[0] if mapping else isoCode
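The lookup is now case-insensitive and safe for empty input. A small behavioural sketch; the classmethod name getCountryName is a placeholder, since the real method name sits outside this hunk:

# Placeholder method name - the actual classmethod name is not shown in this hunk.
assert CountryCodes.getCountryName("CH") == "switzerland"
assert CountryCodes.getCountryName("ch") == "switzerland"  # lowercase input now handled
assert CountryCodes.getCountryName("xx") == "xx"           # unknown codes fall through unchanged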

View file

@ -11,7 +11,7 @@ from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum, AiC
logger = logging.getLogger(__name__)
class WebcrawlService:
class WebService:
"""Service for web search and crawling operations."""
def __init__(self, services):
@ -56,7 +56,6 @@ class WebcrawlService:
extractedUrls = analysisResult.get("urls", [])
needsSearch = analysisResult.get("needsSearch", True) # Default to True
maxNumberPages = analysisResult.get("maxNumberPages", 10)
timeRange = analysisResult.get("timeRange")
countryCode = analysisResult.get("country", country)
languageCode = analysisResult.get("language", language)
finalResearchDepth = analysisResult.get("researchDepth", researchDepth)
@ -77,7 +76,6 @@ class WebcrawlService:
searchUrls = await self._performWebSearch(
instruction=instruction,
maxNumberPages=maxNumberPages - len(allUrls),
timeRange=timeRange,
country=countryCode,
language=languageCode
)
@ -153,10 +151,9 @@ Extract and provide a JSON response with:
2. urls: List of URLs found in the prompt text
3. needsSearch: true if web search is needed to identify url's to crawl, false if only crawling of provided URLs is wanted
4. maxNumberPages: Recommended number of URLs to crawl (based on research scope, typical: 2-20)
5. timeRange: Time range if mentioned (d, w, m, y, or null)
6. country: Country code if specified (2-digit lowercase, e.g., ch, us, de)
7. language: Language code if specified (lowercase, e.g., de, en, fr)
8. researchDepth: Research depth based on instruction complexity - "fast" (quick overview, maxDepth=1), "general" (standard research, maxDepth=2), or "deep" (comprehensive research, maxDepth=3)
5. country: Country code if identified in the prompt (2-digit lowercase, e.g., ch, us, de)
6. language: Language identified from the prompt (lowercase, e.g., de, en, fr)
7. researchDepth: Research depth based on instruction complexity - "fast" (quick overview, maxDepth=1), "general" (standard research, maxDepth=2), or "deep" (comprehensive research, maxDepth=3)
Return ONLY valid JSON, no additional text:
{{
@ -164,7 +161,6 @@ Return ONLY valid JSON, no additional text:
"urls": ["url1", "url2"],
"needsSearch": true,
"maxNumberPages": 10,
"timeRange": null,
"country": "ch",
"language": "de",
"researchDepth": "general"
@ -188,7 +184,6 @@ Return ONLY valid JSON, no additional text:
"urls": [],
"needsSearch": True,
"maxNumberPages": 10,
"timeRange": None,
"country": country,
"language": language,
"researchDepth": researchDepth
@ -198,7 +193,6 @@ Return ONLY valid JSON, no additional text:
self,
instruction: str,
maxNumberPages: int,
timeRange: Optional[str],
country: Optional[str],
language: Optional[str]
) -> List[str]:
@ -209,7 +203,6 @@ Return ONLY valid JSON, no additional text:
instruction=instruction,
country=country,
maxNumberPages=maxNumberPages,
timeRange=timeRange,
language=language
)
searchPrompt = searchPromptModel.model_dump_json(exclude_none=True, indent=2)
@ -269,7 +262,7 @@ Return ONLY valid JSON, no additional text:
instruction=instruction,
url=url, # Single URL
maxDepth=maxDepth,
maxWidth=10
maxWidth=50
)
crawlPrompt = crawlPromptModel.model_dump_json(exclude_none=True, indent=2)
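The serialized crawl prompt passed into the model call then looks roughly like this (values illustrative, maxWidth fixed at 50 as above):

{
    "instruction": "Who works in this company?",
    "url": "https://www.valueon.ch",
    "maxDepth": 2,
    "maxWidth": 50
}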

View file

@ -170,7 +170,7 @@ class MethodAi(MethodBase):
- Output format: JSON with research results including URLs and content.
Parameters:
- prompt (str, required): Natural language research instruction, including time range if relevant.
- prompt (str, required): Natural language research instruction.
- list(url) (list, optional): Specific URLs to crawl, if needed.
- country (str, optional): Two-digit country code (lowercase, e.g., ch, us, de).
- language (str, optional): Language code (lowercase, e.g., de, en, fr).

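A purely illustrative parameter set for this method, using the documented fields; the exact call envelope is outside this diff, and the "list(url)" key follows the docstring's own naming:

exampleParameters = {
    "prompt": "Research what ValueOn in Switzerland does and who works there",
    "list(url)": ["https://www.valueon.ch"],
    "country": "ch",
    "language": "de",
}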
View file

@ -1,6 +1,9 @@
#!/usr/bin/env python3
"""
AI Models Test - Tests all available AI models individually
AI Models Test - Tests WEB_CRAWL functionality on all models that support it
This script tests all models that have WEB_CRAWL capability, validates that
they can crawl specific URLs and return content, and analyzes the quality of results.
"""
import asyncio
@ -53,9 +56,18 @@ class AIModelsTester:
async def initialize(self):
"""Initialize the AI service."""
# Set logging level to INFO to reduce noise
# Set logging level to DEBUG for detailed output
import logging
logging.getLogger().setLevel(logging.INFO)
logging.getLogger().setLevel(logging.DEBUG)
# Initialize the model registry with all connectors
from modules.aicore.aicoreModelRegistry import modelRegistry
from modules.aicore.aicorePluginTavily import AiTavily
from modules.aicore.aicorePluginPerplexity import AiPerplexity
# Register web connectors that support WEB_CRAWL
modelRegistry.registerConnector(AiTavily())
modelRegistry.registerConnector(AiPerplexity())
# The AI service needs to be recreated with proper initialization
from modules.services.serviceAi.mainServiceAi import AiService
@ -86,27 +98,53 @@ class AIModelsTester:
print(f"📁 Results will be saved to: {self.modelTestDir}")
async def testModel(self, modelName: str) -> Dict[str, Any]:
"""Test a specific AI model with a simple prompt."""
"""Test a specific AI model with WEB_CRAWL operation."""
print(f"\n{'='*60}")
print(f"TESTING MODEL: {modelName}")
print(f"OPERATION TYPE: WEB_CRAWL")
print(f"{'='*60}")
# Use same prompt for all web models
import json
# CRAWL CONFIGURATION
# Deep and Broad Web Crawl Example:
# - maxDepth: 3 (deep) - follows links up to 3 levels from starting page
# - Level 1: Starting page
# - Level 2: Pages linked from starting page
# - Level 3: Pages linked from Level 2 pages
# - maxWidth: 50 (broad) - crawls up to 50 pages at each depth level
# With up to 50 pages per level over 3 levels, this visits at most roughly 150 pages in total
#
# Common configurations:
# - Fast/Overview: maxDepth=1, maxWidth=5 (shallow, focused)
# - General/Standard: maxDepth=2, maxWidth=10 (balanced)
# - Deep and Broad: maxDepth=3, maxWidth=50 (comprehensive)
if "tavily" in modelName.lower() or "perplexity" in modelName.lower() or "llama" in modelName.lower() or "sonar" in modelName.lower() or "mistral" in modelName.lower():
# All web models use the same JSON formatted prompt
# Country format: Use full name for Tavily (Switzerland), Perplexity converts ISO codes to names
testPrompt = json.dumps({
"prompt": "Research, what ValueOn company in switzerland does and who works there? Return as JSON.",
"maxResults": 5,
"timeRange": "y",
"country": "CH", # ISO-2 code, Perplexity will convert to "Switzerland"
"format": "json"
}, indent=2)
else:
# Fallback for other models
testPrompt = "Generate a comprehensive analysis of the current state of artificial intelligence. Return as JSON."
CRAWL_DEPTH = 3 # Deep crawl: follows links 3 levels deep
CRAWL_WIDTH = 50 # Broad crawl: up to 50 pages per level
print(f"Crawl Configuration:")
print(f" - Depth: {CRAWL_DEPTH} levels (deep)")
print(f" - Width: {CRAWL_WIDTH} pages per level (broad)")
print(f" - Theoretical max: ~{CRAWL_WIDTH * CRAWL_DEPTH} pages ({CRAWL_WIDTH} per level x {CRAWL_DEPTH} levels)")
# Use WEB_CRAWL specific prompt format
from modules.datamodels.datamodelAi import AiCallPromptWebCrawl
# Test with simple prompt like playground example
simplePrompt = "https://www.valueon.ch: Who works in this company?"
# But keep structured format for now to match our API
testPrompt = json.dumps({
"instruction": "Who works in this company?",
"url": "https://www.valueon.ch",
"maxDepth": CRAWL_DEPTH,
"maxWidth": CRAWL_WIDTH
}, indent=2)
print(f"Simple prompt (playground style): {simplePrompt}")
# For Tavily models, test direct API call for better link following
if "tavily" in modelName.lower():
return await self._testTavilyDirect(modelName, CRAWL_DEPTH, CRAWL_WIDTH)
print(f"Test prompt: {testPrompt}")
print(f"Prompt length: {len(testPrompt)} characters")
@ -114,17 +152,11 @@ class AIModelsTester:
startTime = asyncio.get_event_loop().time()
try:
# Create options to force this specific model
if "internal" in modelName.lower():
options = AiCallOptions(
operationType=OperationTypeEnum.DATA_EXTRACT,
preferredModel=modelName
)
else:
options = AiCallOptions(
operationType=OperationTypeEnum.DATA_GENERATE,
preferredModel=modelName
)
# Create options for WEB_CRAWL operation
options = AiCallOptions(
operationType=OperationTypeEnum.WEB_CRAWL,
preferredModel=modelName
)
# Call the AI service DIRECTLY through the model's functionCall
# This tests the actual model, not the document generation pipeline
@ -140,29 +172,14 @@ class AIModelsTester:
import base64
import os
# Prepare messages and options based on model type
if "vision" in modelName.lower():
# For vision models, skip for now since they require special handling
print(f"⚠️ Skipping vision model {modelName} - requires special image handling")
return {
"modelName": modelName,
"status": "SKIPPED",
"processingTime": 0.0,
"responseLength": 0,
"responseType": "skipped",
"hasContent": False,
"error": "Vision model requires special image handling",
"fullResponse": "Skipped - vision model requires special image handling"
}
else:
# For other models, use normal functionCall
messages = [{"role": "user", "content": testPrompt}]
modelCall = AiModelCall(
messages=messages,
model=model,
options=options
)
response = await model.functionCall(modelCall)
# For WEB_CRAWL models, use normal functionCall with structured prompt
messages = [{"role": "user", "content": testPrompt}]
modelCall = AiModelCall(
messages=messages,
model=model,
options=options
)
response = await model.functionCall(modelCall)
endTime = asyncio.get_event_loop().time()
processingTime = endTime - startTime
@ -185,6 +202,10 @@ class AIModelsTester:
"bytesReceived": len(response.content.encode('utf-8')) if response.content else 0
}
# Extract actual prompt sent if available in metadata
if hasattr(response, 'metadata') and response.metadata:
result["actualPromptSent"] = response.metadata.get("actualPromptSent", "N/A")
# Try to parse content as JSON
if response.content:
try:
@ -289,9 +310,16 @@ class AIModelsTester:
print(f"📄 Response length: {len(str(response))} characters")
print(f"📄 Response preview: {result['responsePreview']}")
# Save text response for all models
if result.get("status") == "SUCCESS":
self._saveTextResponse(modelName, result)
# Add prompt to result for logging
result["testPrompt"] = testPrompt
result["crawlConfig"] = {
"depth": CRAWL_DEPTH,
"width": CRAWL_WIDTH
}
# For WEB_CRAWL, also validate that content was extracted
if result.get("status") == "SUCCESS" and result.get("fullResponse"):
self._validateCrawlResponse(modelName, result)
except Exception as e:
endTime = asyncio.get_event_loop().time()
@ -304,13 +332,22 @@ class AIModelsTester:
"responseLength": 0,
"responseType": "exception",
"hasContent": False,
"error": str(e)
"error": str(e),
"testPrompt": testPrompt,
"crawlConfig": {
"depth": CRAWL_DEPTH,
"width": CRAWL_WIDTH
}
}
print(f"💥 EXCEPTION - {str(e)}")
self.testResults.append(result)
# Save text response even for exceptions to log the prompt
if result.get("status") in ["SUCCESS", "EXCEPTION", "ERROR"]:
self._saveTextResponse(modelName, result)
# Save individual model result immediately
self._saveIndividualModelResult(modelName, result)
@ -378,6 +415,19 @@ class AIModelsTester:
if not content:
content = result.get("responsePreview", "No content available")
# If there's an error, include it in the content
if result.get("error"):
content = f"ERROR: {result.get('error')}\n\n{content}"
# Get prompt and config for logging
config = result.get("crawlConfig", {})
crawlDepth = config.get("depth", "N/A")
crawlWidth = config.get("width", "N/A")
# Get both the original JSON prompt and the actual prompt sent
originalPrompt = result.get("testPrompt", "N/A")
actualPromptSent = result.get("actualPromptSent", "N/A")
# Add metadata header
metadata = f"""Model: {modelName}
Test Time: {timestamp}
@ -385,6 +435,23 @@ Status: {result.get('status', 'Unknown')}
Processing Time: {result.get('processingTime', 0):.2f}s
Response Length: {result.get('responseLength', 0)} characters
Is Valid JSON: {result.get('isValidJson', False)}
Test Method: {result.get('testMethod', 'standard')}
Pages Crawled: {result.get('pagesCrawled', 'N/A')}
Crawled URL: {result.get('crawledUrl', 'N/A')}
Has URL: {result.get('hasUrl', 'N/A')}
Has Title: {result.get('hasTitle', 'N/A')}
Has Content: {result.get('hasContent', 'N/A')}
Content Length: {result.get('contentLength', 'N/A')} characters
--- CRAWL CONFIGURATION ---
Depth: {crawlDepth}
Width: {crawlWidth}
--- ORIGINAL JSON PROMPT (input) ---
{originalPrompt}
--- ACTUAL PROMPT SENT TO API (EXACT) ---
{actualPromptSent}
--- RESPONSE CONTENT ---
{content}
@ -400,6 +467,174 @@ Is Valid JSON: {result.get('isValidJson', False)}
print(f"❌ Error saving text response: {str(e)}")
result["textSaveError"] = str(e)
def _validateCrawlResponse(self, modelName: str, result: Dict[str, Any]):
"""Validate that the WEB_CRAWL response contains crawled content."""
try:
content = result.get("fullResponse", "")
# Try to parse as JSON
crawledData = {}
try:
parsed = json.loads(content)
if isinstance(parsed, dict):
crawledData = parsed
except:
pass
# Check for expected fields: url, title, content
hasUrl = bool(crawledData.get("url"))
hasTitle = bool(crawledData.get("title"))
hasContent = bool(crawledData.get("content"))
contentLength = len(crawledData.get("content", ""))
result["hasUrl"] = hasUrl
result["hasTitle"] = hasTitle
result["hasContent"] = hasContent
result["contentLength"] = contentLength
result["crawledUrl"] = crawledData.get("url", "")
if hasUrl and hasContent:
print(f"✅ Successfully crawled content from URL: {crawledData.get('url', 'unknown')}")
print(f" Content length: {contentLength} characters")
print(f" Title: {crawledData.get('title', 'N/A')}")
else:
print(f"⚠️ Incomplete crawl response - URL: {hasUrl}, Content: {hasContent}")
except Exception as e:
print(f"❌ Error validating crawl response: {str(e)}")
result["crawlValidationError"] = str(e)
async def _testTavilyDirect(self, modelName: str, crawlDepth: int = 3, crawlWidth: int = 50) -> Dict[str, Any]:
"""Test Tavily API directly using the crawl() method with better link following."""
print(f"\n{'='*60}")
print(f"TESTING TAVILY DIRECT API (crawl method)")
print(f"{'='*60}")
startTime = asyncio.get_event_loop().time()
try:
from tavily import AsyncTavilyClient
from modules.shared.configuration import APP_CONFIG
apiKey = APP_CONFIG.get("Connector_AiTavily_API_SECRET")
if not apiKey:
raise Exception("Tavily API key not found")
client = AsyncTavilyClient(api_key=apiKey)
# Map our configuration to Tavily parameters
# maxWidth -> limit (pages per level)
# maxDepth -> max_depth (link following depth)
# max_breadth = maxWidth (breadth of crawl at each level)
tavilyLimit = crawlWidth
tavilyMaxDepth = crawlDepth
tavilyMaxBreadth = crawlWidth
print(f"Calling Tavily API with crawl() method...")
print(f"URL: https://www.valueon.ch")
print(f"Instructions: Who works in this company?")
print(f"Limit: {tavilyLimit} pages per level")
print(f"Max depth: {tavilyMaxDepth} (follows links {tavilyMaxDepth} levels deep)")
print(f"Max breadth: {tavilyMaxBreadth} (up to {tavilyMaxBreadth} pages at each level)")
print(f"Deep and Broad Crawl Configuration Active")
response = await client.crawl(
url="https://www.valueon.ch",
instructions="Who works in this company?",
limit=tavilyLimit,
max_depth=tavilyMaxDepth,
max_breadth=tavilyMaxBreadth
)
endTime = asyncio.get_event_loop().time()
processingTime = endTime - startTime
# Analyze response
contentLength = 0
pagesCrawled = 0
fullContent = ""
if isinstance(response, dict):
# Check if it has results
if "results" in response:
results = response["results"]
pagesCrawled = len(results)
content_parts = []
for result in results:
url = result.get("url", "")
title = result.get("title", "")
content = result.get("raw_content", result.get("content", ""))
content_parts.append(f"URL: {url}\nTitle: {title}\nContent: {content}\n{'='*60}\n")
contentLength += len(content)
fullContent = "\n".join(content_parts)
else:
fullContent = json.dumps(response, indent=2)
contentLength = len(fullContent)
elif isinstance(response, list):
pagesCrawled = len(response)
content_parts = []
for item in response:
if isinstance(item, dict):
url = item.get("url", "")
title = item.get("title", "")
content = item.get("raw_content", item.get("content", ""))
content_parts.append(f"URL: {url}\nTitle: {title}\nContent: {content}\n{'='*60}\n")
contentLength += len(content)
fullContent = "\n".join(content_parts)
else:
fullContent = str(response)
contentLength = len(fullContent)
result = {
"modelName": modelName,
"status": "SUCCESS",
"processingTime": round(processingTime, 2),
"responseLength": contentLength,
"responseType": "TavilyDirectAPI",
"hasContent": True,
"error": None,
"modelUsed": modelName,
"priceUsd": 0.0,
"bytesSent": 0,
"bytesReceived": contentLength,
"isValidJson": True,
"fullResponse": fullContent,
"pagesCrawled": pagesCrawled,
"testMethod": "direct_api_crawl"
}
print(f"✅ SUCCESS - Processing time: {processingTime:.2f}s")
print(f"📄 Pages crawled: {pagesCrawled}")
print(f"📄 Total content length: {contentLength} characters")
# Save the response
self._saveTextResponse(modelName, result)
self._validateCrawlResponse(modelName, result)
self._saveIndividualModelResult(modelName, result)
self.testResults.append(result)
return result
except Exception as e:
endTime = asyncio.get_event_loop().time()
processingTime = endTime - startTime
result = {
"modelName": modelName,
"status": "EXCEPTION",
"processingTime": round(processingTime, 2),
"responseLength": 0,
"responseType": "exception",
"hasContent": False,
"error": str(e)
}
print(f"💥 EXCEPTION - {str(e)}")
self.testResults.append(result)
return result
def _saveIndividualModelResult(self, modelName: str, result: Dict[str, Any]):
"""Save individual model test result to file."""
try:
@ -425,22 +660,30 @@ Is Valid JSON: {result.get('isValidJson', False)}
print(f"❌ Error saving individual result: {str(e)}")
def getAllAvailableModels(self) -> List[str]:
"""Get all available model names."""
# Hardcoded list of known models - same approach as test_ai_behavior.py
return [
# "claude-3-5-sonnet-20241022", # Skipped - text model, test later
# "claude-3-5-sonnet-20241022-vision", # Skipped - requires image input
# "gpt-4o", # Skipped - text model, test later
# "gpt-3.5-turbo", # Skipped - text model, test later
# "gpt-4o-vision", # Skipped - requires image input
# "dall-e-3", # Skipped - image generation, test later
"sonar", # Perplexity web model
"sonar-pro", # Perplexity web model
"tavily-search", # Tavily web model (unified research)
# "internal-extractor", # Skipped - internal model, test later
# "internal-generator", # Skipped - internal model, test later
# "internal-renderer" # Skipped - internal model, test later
]
"""Get all available model names that support WEB_CRAWL."""
from modules.aicore.aicoreModelRegistry import modelRegistry
from modules.datamodels.datamodelAi import OperationTypeEnum
# Get all models from registry
allModels = modelRegistry.getAvailableModels()
# Filter models that support WEB_CRAWL
webCrawlModels = []
for model in allModels:
if model.operationTypes and any(
ot.operationType == OperationTypeEnum.WEB_CRAWL
for ot in model.operationTypes
): # Include both Tavily and Perplexity models
webCrawlModels.append(model.name)
# Filter to only "sonar" model for testing
webCrawlModels = [m for m in webCrawlModels if m == "sonar"]
print(f"Found {len(webCrawlModels)} models that support WEB_CRAWL (filtered to sonar):")
for modelName in webCrawlModels:
print(f" - {modelName}")
return webCrawlModels
def saveTestResults(self):
"""Save detailed test results to file."""
@ -508,6 +751,15 @@ Is Valid JSON: {result.get('isValidJson', False)}
if result.get("isValidJson") is not None:
print(f" Valid JSON: {'Yes' if result['isValidJson'] else 'No'}")
if result.get("crawledUrl"):
print(f" Crawled URL: {result['crawledUrl']}")
if result.get("contentLength") is not None:
print(f" Content length: {result['contentLength']} characters")
if result.get("pagesCrawled") is not None:
print(f" Pages crawled: {result['pagesCrawled']}")
if result["error"]:
print(f" Error: {result['error']}")
@ -525,12 +777,32 @@ Is Valid JSON: {result.get('isValidJson', False)}
print(f"{'='*80}")
print(f"🚀 Fastest model: {fastest['modelName']} ({fastest['processingTime']}s)")
print(f"🐌 Slowest model: {slowest['modelName']} ({slowest['processingTime']}s)")
# Find models with most content
modelsWithContent = [r for r in successfulResults if r.get("contentLength", 0) > 0]
if modelsWithContent:
mostContent = max(modelsWithContent, key=lambda x: x.get("contentLength", 0))
totalContent = sum(r.get("contentLength", 0) for r in modelsWithContent)
avgContent = totalContent / len(modelsWithContent)
print(f"📄 Model with most content: {mostContent['modelName']} ({mostContent.get('contentLength', 0)} chars)")
print(f"📊 Average content per model: {avgContent:.0f} characters")
print(f"📊 Total content crawled across all models: {totalContent} characters")
# Find models with most pages crawled (for Tavily direct API)
modelsWithPages = [r for r in successfulResults if r.get("pagesCrawled", 0) > 0]
if modelsWithPages:
mostPages = max(modelsWithPages, key=lambda x: x.get("pagesCrawled", 0))
totalPages = sum(r.get("pagesCrawled", 0) for r in modelsWithPages)
avgPages = totalPages / len(modelsWithPages)
print(f"🔍 Model with most pages crawled: {mostPages['modelName']} ({mostPages.get('pagesCrawled', 0)} pages)")
print(f"📊 Average pages per model: {avgPages:.1f} pages")
print(f"📊 Total pages crawled across all models: {totalPages} pages")
async def main():
"""Run AI models testing."""
"""Run AI models testing for WEB_CRAWL operation."""
tester = AIModelsTester()
print("Starting AI Models Testing...")
print("Starting AI Models Testing for WEB_CRAWL...")
print("Initializing AI service...")
await tester.initialize()
@ -542,8 +814,9 @@ async def main():
print(f" {i}. {model}")
print(f"\n{'='*80}")
print("STARTING INDIVIDUAL MODEL TESTS")
print("STARTING WEB_CRAWL TESTS")
print(f"{'='*80}")
print("Testing each model's ability to crawl URLs and return content...")
print("Press Enter after each model test to continue to the next one...")
# Test each model individually