ai models ready for web and txt

commit 2489719c62 (parent 72e0687826)
7 changed files with 577 additions and 232 deletions

@@ -57,7 +57,7 @@ class AiPerplexity(BaseConnectorAi):
                 connectorType="perplexity",
                 apiUrl="https://api.perplexity.ai/chat/completions",
                 temperature=0.2,
-                maxTokens=4000,
+                maxTokens=24000,  # Increased for detailed web crawl responses (Perplexity supports up to 25k)
                 contextLength=32000,
                 costPer1kTokensInput=0.005,
                 costPer1kTokensOutput=0.005,
@@ -80,12 +80,12 @@ class AiPerplexity(BaseConnectorAi):
                 connectorType="perplexity",
                 apiUrl="https://api.perplexity.ai/chat/completions",
                 temperature=0.2,
-                maxTokens=4000,
+                maxTokens=24000,  # Increased for detailed web crawl responses (Perplexity supports up to 25k)
                 contextLength=32000,
                 costPer1kTokensInput=0.01,
                 costPer1kTokensOutput=0.01,
                 speedRating=6,  # Slower due to AI analysis
-                qualityRating=10,  # Best AI analysis quality
+                qualityRating=9,  # Best AI analysis quality
                 # capabilities removed (not used in business logic)
                 functionCall=self._routeWebOperation,
                 priority=PriorityEnum.QUALITY,
@@ -217,6 +217,42 @@ class AiPerplexity(BaseConnectorAi):
             # Fallback to basic call
             return await self.callAiBasic(modelCall)

+    def _getDepthInstructions(self, maxDepth: int) -> str:
+        """
+        Map maxDepth (numeric) to instructional text for LLM.
+
+        Args:
+            maxDepth: 1 (fast/overview), 2 (general/standard), 3 (deep/comprehensive)
+
+        Returns:
+            Instructional text for the LLM
+        """
+        depthMap = {
+            1: "Basic overview - extract main content from the main page only",
+            2: "Standard crawl - extract content from main page and linked pages (2 levels deep)",
+            3: "Deep crawl - comprehensively extract content from main page and all accessible linked pages (3+ levels deep)"
+        }
+        return depthMap.get(maxDepth, depthMap[2])
+
+    def _getWidthInstructions(self, maxWidth: int) -> str:
+        """
+        Map maxWidth (numeric) to instructional text for LLM.
+
+        Args:
+            maxWidth: Number of pages to crawl at each level (default: 10)
+
+        Returns:
+            Instructional text for the LLM
+        """
+        if maxWidth <= 5:
+            return f"Focused crawl - limit to {maxWidth} most relevant pages per level"
+        elif maxWidth <= 15:
+            return f"Standard breadth - crawl up to {maxWidth} pages per level"
+        elif maxWidth <= 30:
+            return f"Wide crawl - crawl up to {maxWidth} pages per level, prioritize quality"
+        else:
+            return f"Extensive crawl - crawl up to {maxWidth} pages per level, comprehensive coverage"
+
     async def webSearch(self, modelCall: AiModelCall) -> AiModelResponse:
         """
         WEB_SEARCH operation - returns list of URLs based on search query.
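Note: the two helpers added above are not wired into the crawl prompt anywhere in this diff. A minimal sketch of how they could be composed, purely as an assumption (the method name and the prompt layout below are hypothetical):

```python
# Hypothetical composition of the new helpers into a crawl prompt; this wiring is an
# assumption - the commit only adds the helpers themselves.
def buildCrawlInstruction(self, url: str, instruction: str, maxDepth: int, maxWidth: int) -> str:
    depthText = self._getDepthInstructions(maxDepth)  # e.g. "Standard crawl - ..."
    widthText = self._getWidthInstructions(maxWidth)  # e.g. "Standard breadth - ..."
    return f"{url}: {instruction}\n{depthText}\n{widthText}"
```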
@@ -253,8 +289,6 @@ class AiPerplexity(BaseConnectorAi):

 Return a JSON array of {webSearchPrompt.maxNumberPages} most relevant URLs.
 {'' if not countryName else f'Focus on results from {countryName}.'}
-{'' if not webSearchPrompt.timeRange else f'Limit to results from the last {webSearchPrompt.timeRange}'}
-{'' if not webSearchPrompt.language else f'Return results in {webSearchPrompt.language} language'}

 Return ONLY a JSON array of URLs, no additional text:
 [
@@ -293,6 +327,15 @@ Return ONLY a JSON array of URLs, no additional text:
         """
         WEB_CRAWL operation - crawls ONE URL and returns content.

+        Perplexity API Parameters Used:
+        - messages: The prompt containing URL and instruction
+        - max_tokens: Maximum response length
+        - max_results: Number of search results (1-20, default: 10)
+        - temperature: Response randomness (not web search specific)
+
+        Pagination: Perplexity does NOT return paginated responses.
+        A single response contains all results within max_tokens limit.
+
         Args:
             modelCall: AiModelCall with AiCallPromptWebCrawl as prompt

@@ -316,55 +359,68 @@ Return ONLY a JSON array of URLs, no additional text:
             webCrawlPrompt = AiCallPromptWebCrawl(**promptData)

             # Build crawl request for Perplexity - ONE URL
-            crawlPrompt = f"""Crawl and extract content from this URL based on the instruction:
-
-INSTRUCTION: '{webCrawlPrompt.instruction}'
-
-URL to crawl (maxDepth={webCrawlPrompt.maxDepth}):
-{webCrawlPrompt.url}
-
-Extract and return the relevant content based on the instruction.
-Return as JSON object with this structure:
-{{
-    "url": "{webCrawlPrompt.url}",
-    "title": "Page title",
-    "content": "Extracted content relevant to the instruction"
-}}
-
-Return ONLY valid JSON, no additional text."""
+            # Match playground prompt style: just URL + question
+            # This allows Perplexity to return detailed multi-source results
+            crawlPrompt = f"{webCrawlPrompt.url}: {webCrawlPrompt.instruction}"
+
+            # Build payload with optional Perplexity parameters
+            # Note: max_tokens_per_page may not be supported by chat/completions endpoint
+            # The playground Python SDK might use a different internal API
+            maxResults = min(webCrawlPrompt.maxWidth or 10, 20)  # Max 20 results

             payload = {
                 "model": model.name,
                 "messages": [{"role": "user", "content": crawlPrompt}],
                 "temperature": temperature,
-                "max_tokens": maxTokens
+                "max_tokens": maxTokens,  # Use model's configured maxTokens (24000)
+                "max_results": maxResults,
+                "return_citations": True  # Request citations explicitly
             }

+            logger.info(f"Perplexity crawl payload: model={model.name}, prompt_length={len(crawlPrompt)}, max_tokens={maxTokens}, max_results={maxResults}")
+
             response = await self.httpClient.post(model.apiUrl, json=payload)

             if response.status_code != 200:
                 raise HTTPException(status_code=500, detail=f"Perplexity Web Crawl API error: {response.text}")

             apiResponse = response.json()

+            # Extract the main content
             content = apiResponse["choices"][0]["message"]["content"]

-            # Parse JSON content and ensure it's a single object
-            import json
-            try:
-                parsedContent = json.loads(content)
-                # Ensure it's a single object, not an array
-                if isinstance(parsedContent, list):
-                    parsedContent = parsedContent[0] if parsedContent else {}
-            except:
-                # If not JSON, create structured response
-                parsedContent = {"url": webCrawlPrompt.url, "title": "", "content": content}
-
-            # Return as JSON string
+            # Check for citations or search results in the response
+            citations = apiResponse.get("citations", [])
+            searchResults = apiResponse.get("search_results", [])
+
+            # Log what we found
+            if citations:
+                logger.info(f"Found {len(citations)} citations in response")
+            if searchResults:
+                logger.info(f"Found {len(searchResults)} search results in response")
+            logger.debug(f"API response keys: {list(apiResponse.keys())}")
+
+            # Build comprehensive response with citations if available
+            import json
+            responseData = {
+                "content": content,
+                "citations": citations if citations else [],
+                "search_results": searchResults if searchResults else []
+            }
+
+            # Return comprehensive response
             return AiModelResponse(
-                content=json.dumps(parsedContent, indent=2),
+                content=json.dumps(responseData, indent=2) if (citations or searchResults) else content,
                 success=True,
                 modelId=model.name,
-                metadata={"response_id": apiResponse.get("id", ""), "operation": "WEB_CRAWL", "url": webCrawlPrompt.url}
+                metadata={
+                    "response_id": apiResponse.get("id", ""),
+                    "operation": "WEB_CRAWL",
+                    "url": webCrawlPrompt.url,
+                    "actualPromptSent": crawlPrompt,
+                    "has_citations": len(citations) > 0,
+                    "has_search_results": len(searchResults) > 0
+                }
             )

         except Exception as e:
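For reference, a minimal standalone sketch of the new "playground style" crawl request, assuming an httpx client, a PERPLEXITY_API_KEY env var, and an assumed model name (not taken from this diff). Whether the chat/completions endpoint actually honors "max_results" and "return_citations" is left open by the commit's own comments, so treat those two fields as best-effort hints:

```python
# Minimal sketch of the crawl-style chat/completions call built in this hunk.
import asyncio
import os

import httpx


async def crawlOnce(url: str, instruction: str) -> dict:
    payload = {
        "model": "sonar-pro",  # assumed model name, not taken from this diff
        "messages": [{"role": "user", "content": f"{url}: {instruction}"}],
        "temperature": 0.2,
        "max_tokens": 24000,
        "max_results": 10,        # may be ignored by the endpoint
        "return_citations": True,  # may be ignored by the endpoint
    }
    headers = {"Authorization": f"Bearer {os.environ['PERPLEXITY_API_KEY']}"}
    async with httpx.AsyncClient(timeout=120) as client:
        response = await client.post("https://api.perplexity.ai/chat/completions",
                                     json=payload, headers=headers)
        response.raise_for_status()
        return response.json()


# asyncio.run(crawlOnce("https://www.valueon.ch", "Who works in this company?"))
```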
@@ -27,7 +27,8 @@ class WebCrawlResult:
     content: str
     title: Optional[str] = None

-class ConnectorWeb(BaseConnectorAi):
+
+class AiTavily(BaseConnectorAi):
     """Tavily web search connector."""

     def __init__(self):
@@ -42,7 +43,36 @@ class ConnectorWeb(BaseConnectorAi):
         self.webSearchMaxResults: int = 20
         # Initialize client if API key is available
         self._initializeClient()

+    def getModels(self) -> List[AiModel]:
+        """Get all available Tavily models."""
+        return [
+            AiModel(
+                name="tavily-search",
+                displayName="Tavily Search & Research",
+                connectorType="tavily",
+                apiUrl="https://api.tavily.com",
+                temperature=0.0,  # Web search doesn't use temperature
+                maxTokens=0,  # Web search doesn't use tokens
+                contextLength=0,
+                costPer1kTokensInput=0.0,
+                costPer1kTokensOutput=0.0,
+                speedRating=8,  # Good speed for search and extract
+                qualityRating=9,  # Excellent quality for web research
+                # capabilities removed (not used in business logic)
+                functionCall=self._routeWebOperation,
+                priority=PriorityEnum.BALANCED,
+                processingMode=ProcessingModeEnum.BASIC,
+                operationTypes=createOperationTypeRatings(
+                    (OperationTypeEnum.WEB_SEARCH, 9),
+                    (OperationTypeEnum.WEB_CRAWL, 10)
+                ),
+                version="tavily-search",
+                calculatePriceUsd=lambda processingTime, bytesSent, bytesReceived: 0.008  # Simple flat rate
+            )
+        ]
+
     def _initializeClient(self):
         """Initialize the Tavily client if API key is available."""
         try:
@@ -206,34 +236,6 @@ class ConnectorWeb(BaseConnectorAi):

         return filteredResults

-    def getModels(self) -> List[AiModel]:
-        """Get all available Tavily models."""
-        return [
-            AiModel(
-                name="tavily-search",
-                displayName="Tavily Search & Research",
-                connectorType="tavily",
-                apiUrl="https://api.tavily.com",
-                temperature=0.0,  # Web search doesn't use temperature
-                maxTokens=0,  # Web search doesn't use tokens
-                contextLength=0,
-                costPer1kTokensInput=0.0,
-                costPer1kTokensOutput=0.0,
-                speedRating=8,  # Good speed for search and extract
-                qualityRating=9,  # Excellent quality for web research
-                # capabilities removed (not used in business logic)
-                functionCall=self._routeWebOperation,
-                priority=PriorityEnum.BALANCED,
-                processingMode=ProcessingModeEnum.BASIC,
-                operationTypes=createOperationTypeRatings(
-                    (OperationTypeEnum.WEB_SEARCH, 9),
-                    (OperationTypeEnum.WEB_CRAWL, 8)
-                ),
-                version="tavily-search",
-                calculatePriceUsd=lambda processingTime, bytesSent, bytesReceived: 0.008  # Simple flat rate
-            )
-        ]
-
     @classmethod
     async def create(cls):
         apiKey = APP_CONFIG.get("Connector_AiTavily_API_SECRET")
@@ -273,10 +275,9 @@ class ConnectorWeb(BaseConnectorAi):
         topic: str | None = None,
         includeDomains: list[str] | None = None,
         excludeDomains: list[str] | None = None,
-        language: str | None = None,
         country: str | None = None,
-        includeAnswer: bool | None = None,
-        includeRawContent: bool | None = None,
+        includeAnswer: str | None = None,
+        includeRawContent: str | None = None,
     ) -> list[WebSearchResult]:
         """Calls the Tavily API to perform a web search."""
         # Make sure maxResults is within the allowed range (use cached values)
@@ -298,8 +299,6 @@ class ConnectorWeb(BaseConnectorAi):
             kwargs["include_domains"] = includeDomains
         if excludeDomains is not None:
             kwargs["exclude_domains"] = excludeDomains
-        if language is not None:
-            kwargs["language"] = language
         if country is not None:
             kwargs["country"] = country
         if includeAnswer is not None:
@@ -307,7 +306,8 @@ class ConnectorWeb(BaseConnectorAi):
         if includeRawContent is not None:
             kwargs["include_raw_content"] = includeRawContent

-        logger.debug(f"Tavily.search kwargs: {kwargs}")
+        # Log the final API call parameters for comparison
+        logger.info(f"Tavily API call parameters: {kwargs}")

         # Ensure client is initialized
         if self.client is None:
@@ -316,7 +316,11 @@ class ConnectorWeb(BaseConnectorAi):
                 raise ValueError("Tavily client not initialized. Please check API key configuration.")

         response = await self.client.search(**kwargs)

+        # Return all results without score filtering
+        # Tavily's scoring is already applied by the API
+        logger.info(f"Tavily returned {len(response.get('results', []))} results")
+
         return [
             WebSearchResult(
                 title=result["title"],
@@ -328,69 +332,77 @@ class ConnectorWeb(BaseConnectorAi):

     async def _crawl(
         self,
-        urls: list,
-        extractDepth: str | None = None,
-        format: str | None = None,
+        url: str,
+        instructions: str | None = None,
+        limit: int = 20,
+        maxDepth: int = 2,
+        maxBreadth: int = 40,
     ) -> list[WebCrawlResult]:
-        """Calls the Tavily API to extract text content from URLs with retry logic."""
+        """Calls the Tavily API to crawl ONE URL with link following and retry logic."""
         maxRetries = self.crawlMaxRetries
         retryDelay = self.crawlRetryDelay
         timeout = self.crawlTimeout

-        logger.debug(f"Starting crawl of {len(urls)} URLs: {urls}")
-        logger.debug(f"Crawl settings: extractDepth={extractDepth}, format={format}, timeout={timeout}s")
+        logger.debug(f"Starting crawl of URL: {url}")
+        logger.debug(f"Crawl settings: instructions={instructions}, limit={limit}, maxDepth={maxDepth}, maxBreadth={maxBreadth}, timeout={timeout}s")

         for attempt in range(maxRetries + 1):
             try:
                 logger.debug(f"Crawl attempt {attempt + 1}/{maxRetries + 1}")

-                # Use asyncio.wait_for for timeout
-                # Build kwargs for extract
-                kwargsExtract: dict = {"urls": urls}
-                kwargsExtract["extract_depth"] = extractDepth or "advanced"
-                kwargsExtract["format"] = format or "markdown"  # Use markdown to get HTML structure
-
-                logger.debug(f"Sending request to Tavily with kwargs: {kwargsExtract}")
-
                 # Ensure client is initialized
                 if self.client is None:
                     self._initializeClient()
                     if self.client is None:
                         raise ValueError("Tavily client not initialized. Please check API key configuration.")

+                logger.debug(f"Crawling URL: {url}")
+
+                # Build kwargs for crawl
+                kwargsCrawl: dict = {"url": url}
+                if instructions:
+                    kwargsCrawl["instructions"] = instructions
+                if limit:
+                    kwargsCrawl["limit"] = limit
+                if maxDepth:
+                    kwargsCrawl["max_depth"] = maxDepth
+                if maxBreadth:
+                    kwargsCrawl["max_breadth"] = maxBreadth
+
+                logger.debug(f"Sending request to Tavily with kwargs: {kwargsCrawl}")
+
                 response = await asyncio.wait_for(
-                    self.client.extract(**kwargsExtract),
+                    self.client.crawl(**kwargsCrawl),
                     timeout=timeout
                 )

-                logger.debug(f"Tavily response received: {list(response.keys())}")
-
-                # Debug: Log what Tavily actually returns
-                if "results" in response and response["results"]:
-                    logger.debug(f"Tavily returned {len(response['results'])} results")
-                    logger.debug(f"First result keys: {list(response['results'][0].keys())}")
-                    logger.debug(f"First result has raw_content: {'raw_content' in response['results'][0]}")
-
-                    # Log each result
-                    for i, result in enumerate(response["results"]):
-                        logger.debug(f"Result {i+1}: URL={result.get('url', 'N/A')}, content_length={len(result.get('raw_content', result.get('content', '')))}")
+                logger.debug(f"Tavily response received: {type(response)}")
+
+                # Parse response - could be dict with results or list
+                if isinstance(response, dict) and "results" in response:
+                    pageResults = response["results"]
+                elif isinstance(response, list):
+                    pageResults = response
                 else:
-                    logger.warning(f"Tavily returned no results in response: {response}")
+                    logger.warning(f"Unexpected response format: {type(response)}")
+                    pageResults = []

-                results = [
-                    WebCrawlResult(
-                        url=result["url"],
-                        content=result.get("raw_content", result.get("content", "")),  # Try raw_content first, fallback to content
-                        title=result.get("title", "")  # Extract title if available
-                    )
-                    for result in response["results"]
-                ]
-
-                logger.debug(f"Crawl successful: extracted {len(results)} results")
+                logger.debug(f"Got {len(pageResults)} pages from crawl")
+
+                # Convert to WebCrawlResult format
+                results = []
+                for result in pageResults:
+                    results.append(WebCrawlResult(
+                        url=result.get("url", url),
+                        content=result.get("raw_content", result.get("content", "")),
+                        title=result.get("title", "")
+                    ))
+
+                logger.debug(f"Crawl successful: extracted {len(results)} pages from URL")
                 return results

             except asyncio.TimeoutError:
-                logger.warning(f"Crawl attempt {attempt + 1} timed out after {timeout} seconds for URLs: {urls}")
+                logger.warning(f"Crawl attempt {attempt + 1} timed out after {timeout} seconds for URL: {url}")
                 if attempt < maxRetries:
                     logger.info(f"Retrying in {retryDelay} seconds...")
                     await asyncio.sleep(retryDelay)
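A minimal standalone sketch of the crawl call this method now issues, assuming tavily-python's AsyncTavilyClient.crawl() accepts the keyword arguments used in this commit and a TAVILY_API_KEY env var; the timeout value below is illustrative:

```python
# Sketch of the switch from extract() to crawl() with link following.
import asyncio
import os

from tavily import AsyncTavilyClient


async def crawlSite(url: str, instructions: str) -> list[dict]:
    client = AsyncTavilyClient(api_key=os.environ["TAVILY_API_KEY"])
    response = await asyncio.wait_for(
        client.crawl(
            url=url,
            instructions=instructions,
            limit=20,        # overall page budget for the crawl
            max_depth=2,     # how many link levels to follow from the start URL
            max_breadth=40,  # links followed per page/level
        ),
        timeout=120,
    )
    # The connector accepts either a dict with "results" or a bare list.
    return response.get("results", []) if isinstance(response, dict) else response


# asyncio.run(crawlSite("https://www.valueon.ch", "Who works in this company?"))
```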
@@ -398,21 +410,20 @@ class ConnectorWeb(BaseConnectorAi):
                     raise Exception(f"Crawl failed after {maxRetries + 1} attempts due to timeout")

             except Exception as e:
-                logger.warning(f"Crawl attempt {attempt + 1} failed for URLs {urls}: {str(e)}")
+                logger.warning(f"Crawl attempt {attempt + 1} failed for URL {url}: {str(e)}")
                 logger.debug(f"Full error details: {type(e).__name__}: {str(e)}")

                 # Check if it's a validation error and log more details
                 if "validation" in str(e).lower():
                     logger.debug(f"URL validation failed. Checking URL format:")
-                    for i, url in enumerate(urls):
-                        logger.debug(f"  URL {i+1}: '{url}' (length: {len(url)})")
-                        # Check for common URL issues
-                        if ' ' in url:
-                            logger.debug(f"  WARNING: URL contains spaces!")
-                        if not url.startswith(('http://', 'https://')):
-                            logger.debug(f"  WARNING: URL doesn't start with http/https!")
-                        if len(url) > 2000:
-                            logger.debug(f"  WARNING: URL is very long ({len(url)} chars)")
+                    logger.debug(f"  URL: '{url}' (length: {len(url)})")
+                    # Check for common URL issues
+                    if ' ' in url:
+                        logger.debug(f"  WARNING: URL contains spaces!")
+                    if not url.startswith(('http://', 'https://')):
+                        logger.debug(f"  WARNING: URL doesn't start with http/https!")
+                    if len(url) > 2000:
+                        logger.debug(f"  WARNING: URL is very long ({len(url)} chars)")

                 if attempt < maxRetries:
                     logger.info(f"Retrying in {retryDelay} seconds...")
@@ -468,15 +479,15 @@ class ConnectorWeb(BaseConnectorAi):
         if countryName:
             countryName = self._convertIsoCodeToCountryName(countryName)

-        # Perform search
+        # Perform search - use exact parameters from prompt
+        # NOTE: timeRange parameter causes generic results, so we don't use it
         searchResults = await self._search(
             query=webSearchPrompt.instruction,
             maxResults=webSearchPrompt.maxNumberPages,
-            timeRange=webSearchPrompt.timeRange,
+            timeRange=None,  # Not used - causes generic results
             country=countryName,
-            language=webSearchPrompt.language,
-            includeAnswer=False,
-            includeRawContent=False
+            includeAnswer="basic",
+            includeRawContent="text"
         )

         # Extract URLs from results
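A rough standalone equivalent of the search call this connector now makes, assuming tavily-python's AsyncTavilyClient and a TAVILY_API_KEY env var; the exact set of keyword arguments accepted by client.search() should be checked against the installed SDK version:

```python
# Sketch of the reworked Tavily search call (no language/timeRange, string-valued
# include_answer/include_raw_content, country as a full lowercase name).
import asyncio
import os

from tavily import AsyncTavilyClient


async def searchOnce(query: str) -> list[dict]:
    client = AsyncTavilyClient(api_key=os.environ["TAVILY_API_KEY"])
    response = await client.search(
        query=query,
        max_results=10,
        country="switzerland",       # as produced by _convertIsoCodeToCountryName("ch")
        include_answer="basic",      # new in this commit (was False)
        include_raw_content="text",  # new in this commit (was False)
    )
    return response.get("results", [])


# asyncio.run(searchOnce("ValueOn company Switzerland"))
```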
@@ -500,13 +511,13 @@ class ConnectorWeb(BaseConnectorAi):

     async def webCrawl(self, modelCall: AiModelCall) -> "AiModelResponse":
         """
-        WEB_CRAWL operation - crawls one URL using Tavily.
+        WEB_CRAWL operation - crawls one URL using Tavily with link following.

         Args:
             modelCall: AiModelCall with AiCallPromptWebCrawl as prompt

         Returns:
-            AiModelResponse with crawl results as JSON
+            AiModelResponse with crawl results as JSON (may include multiple pages)
         """
         try:
             # Extract parameters
@@ -517,37 +528,49 @@ class ConnectorWeb(BaseConnectorAi):
             # Create Pydantic model
             webCrawlPrompt = AiCallPromptWebCrawl(**promptData)

-            # Perform crawl for ONE URL
-            # Note: _crawl expects a list, so we wrap the single URL in a list
+            # Perform crawl for ONE URL with link following
+            # Use maxWidth as limit, maxDepth as maxDepth, and calculate maxBreadth
             crawlResults = await self._crawl(
-                urls=[webCrawlPrompt.url],
-                extractDepth="advanced" if webCrawlPrompt.maxDepth > 2 else "basic",
-                format="markdown"
+                url=webCrawlPrompt.url,
+                instructions=webCrawlPrompt.instruction,
+                limit=webCrawlPrompt.maxWidth or 20,  # maxWidth controls number of pages
+                maxDepth=webCrawlPrompt.maxDepth or 2,
+                maxBreadth=webCrawlPrompt.maxWidth or 40  # Use same as limit for breadth
             )

-            # Format result for single URL - consistent with Perplexity format
+            # If we got multiple pages from the crawl, we need to format them differently
+            # Return the first result for backwards compatibility, but include total page count
             if crawlResults and len(crawlResults) > 0:
-                firstResult = crawlResults[0]
+                # Get all pages content
+                allContent = ""
+                for i, result in enumerate(crawlResults, 1):
+                    pageHeader = f"\n{'='*60}\nPAGE {i}: {result.url}\n{'='*60}\n"
+                    if result.title:
+                        allContent += f"{pageHeader}Title: {result.title}\n\n"
+                    allContent += f"{result.content}\n"
+
                 resultData = {
-                    "url": firstResult.url,
-                    "title": firstResult.title if firstResult.title else "Content",
-                    "content": firstResult.content
+                    "url": webCrawlPrompt.url,
+                    "title": crawlResults[0].title if crawlResults[0].title else "Content",
+                    "content": allContent,
+                    "pagesCrawled": len(crawlResults),
+                    "pageUrls": [result.url for result in crawlResults]
                 }
             else:
-                resultData = {"url": webCrawlPrompt.url, "title": "", "content": "", "error": "No content extracted"}
+                resultData = {"url": webCrawlPrompt.url, "title": "", "content": "", "error": "No content extracted", "pagesCrawled": 0}

-            # Return as JSON - same format as Perplexity
+            # Return as JSON - same format as Perplexity but with multiple pages content
             import json
             return AiModelResponse(
                 content=json.dumps(resultData, indent=2),
                 success=True,
-                metadata={"operation": "WEB_CRAWL", "url": webCrawlPrompt.url}
+                metadata={"operation": "WEB_CRAWL", "url": webCrawlPrompt.url, "pagesCrawled": len(crawlResults) if crawlResults else 0}
             )

         except Exception as e:
             logger.error(f"Error in Tavily web crawl: {str(e)}")
             import json
-            errorResult = {"error": str(e), "url": ""}
+            errorResult = {"error": str(e), "url": webCrawlPrompt.url if 'webCrawlPrompt' in locals() else ""}
             return AiModelResponse(
                 content=json.dumps(errorResult, indent=2),
                 success=False,
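The connector now returns one JSON object covering every crawled page. An illustrative shape for a hypothetical two-page crawl (values are made up; only the keys mirror what this commit builds):

```python
# Illustrative resultData payload; content is truncated for brevity.
exampleResultData = {
    "url": "https://www.valueon.ch",
    "title": "ValueOn - Home",
    "content": "\n" + "=" * 60 + "\nPAGE 1: https://www.valueon.ch\n" + "=" * 60 + "\n...",
    "pagesCrawled": 2,
    "pageUrls": ["https://www.valueon.ch", "https://www.valueon.ch/team"],
}
```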
@@ -200,7 +200,6 @@ class AiCallPromptWebSearch(BaseModel):
     instruction: str = Field(description="Search instruction/query for finding relevant URLs")
     country: Optional[str] = Field(default=None, description="Two-digit country code (lowercase, e.g., ch, us, de, fr)")
     maxNumberPages: Optional[int] = Field(default=10, description="Maximum number of pages to search (default: 10)")
-    timeRange: Optional[str] = Field(default=None, description="Time range filter (d, w, m, y)")
     language: Optional[str] = Field(default=None, description="Language code (lowercase, e.g., de, en, fr)")
     researchDepth: Optional[str] = Field(default="general", description="Research depth: fast (maxDepth=1), general (maxDepth=2), deep (maxDepth=3)")

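The crawl-side prompt model, AiCallPromptWebCrawl, is referenced throughout this commit but its definition is not part of the diff. A minimal sketch consistent with how its fields are used here (url, instruction, maxDepth, maxWidth); field types, defaults, and descriptions are assumptions:

```python
# Hypothetical sketch of AiCallPromptWebCrawl, inferred from usage in this commit;
# not the actual definition.
from typing import Optional

from pydantic import BaseModel, Field


class AiCallPromptWebCrawl(BaseModel):
    instruction: str = Field(description="What to extract from the crawled pages")
    url: str = Field(description="Single URL to crawl")
    maxDepth: Optional[int] = Field(default=2, description="Link-following depth: 1 (fast), 2 (general), 3 (deep)")
    maxWidth: Optional[int] = Field(default=10, description="Pages to crawl per level")
```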
@@ -186,12 +186,13 @@ class CountryCodes:
         Get Tavily-compatible country name from ISO-2 code.

         Args:
-            isoCode: ISO-2 country code (e.g., "CH", "US")
+            isoCode: ISO-2 country code (e.g., "CH", "ch", "US", "us")

         Returns:
             Country name in lowercase as required by Tavily (e.g., "switzerland", "united states")
         """
-        isoCodeUpper = isoCode.upper()
+        # Convert to uppercase for lookup
+        isoCodeUpper = isoCode.upper() if isoCode else ""
         mapping = cls._COUNTRY_MAP.get(isoCodeUpper)
         return mapping[0] if mapping else isoCode

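A standalone illustration of the case-insensitive lookup with fallback that this method performs; the real class keeps its full mapping in CountryCodes._COUNTRY_MAP, so the map below is a truncated stand-in:

```python
# Truncated example map; the real table lives in CountryCodes._COUNTRY_MAP.
COUNTRY_MAP = {"CH": ("switzerland",), "US": ("united states",)}


def getCountryName(isoCode: str) -> str:
    isoCodeUpper = isoCode.upper() if isoCode else ""
    mapping = COUNTRY_MAP.get(isoCodeUpper)
    return mapping[0] if mapping else isoCode


assert getCountryName("ch") == "switzerland"
assert getCountryName("xx") == "xx"  # unknown codes fall through unchanged
```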
@@ -11,7 +11,7 @@ from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum, AiC
 logger = logging.getLogger(__name__)


-class WebcrawlService:
+class WebService:
     """Service for web search and crawling operations."""

     def __init__(self, services):
@@ -56,7 +56,6 @@ class WebcrawlService:
         extractedUrls = analysisResult.get("urls", [])
         needsSearch = analysisResult.get("needsSearch", True)  # Default to True
         maxNumberPages = analysisResult.get("maxNumberPages", 10)
-        timeRange = analysisResult.get("timeRange")
         countryCode = analysisResult.get("country", country)
         languageCode = analysisResult.get("language", language)
         finalResearchDepth = analysisResult.get("researchDepth", researchDepth)
@@ -77,7 +76,6 @@ class WebcrawlService:
             searchUrls = await self._performWebSearch(
                 instruction=instruction,
                 maxNumberPages=maxNumberPages - len(allUrls),
-                timeRange=timeRange,
                 country=countryCode,
                 language=languageCode
             )
@@ -153,10 +151,9 @@ Extract and provide a JSON response with:
 2. urls: List of URLs found in the prompt text
 3. needsSearch: true if web search is needed to identify url's to crawl, false if only crawling of provided URLs is wanted
 4. maxNumberPages: Recommended number of URLs to crawl (based on research scope, typical: 2-20)
-5. timeRange: Time range if mentioned (d, w, m, y, or null)
-6. country: Country code if specified (2-digit lowercase, e.g., ch, us, de)
-7. language: Language code if specified (lowercase, e.g., de, en, fr)
-8. researchDepth: Research depth based on instruction complexity - "fast" (quick overview, maxDepth=1), "general" (standard research, maxDepth=2), or "deep" (comprehensive research, maxDepth=3)
+5. country: Country code if identified in the prompt (2-digit lowercase, e.g., ch, us, de)
+6. language: Language identified from the prompt (lowercase, e.g., de, en, fr)
+7. researchDepth: Research depth based on instruction complexity - "fast" (quick overview, maxDepth=1), "general" (standard research, maxDepth=2), or "deep" (comprehensive research, maxDepth=3)

 Return ONLY valid JSON, no additional text:
 {{
@@ -164,7 +161,6 @@ Return ONLY valid JSON, no additional text:
     "urls": ["url1", "url2"],
     "needsSearch": true,
     "maxNumberPages": 10,
-    "timeRange": null,
     "country": "ch",
     "language": "de",
     "researchDepth": "general"
@@ -188,7 +184,6 @@ Return ONLY valid JSON, no additional text:
                 "urls": [],
                 "needsSearch": True,
                 "maxNumberPages": 10,
-                "timeRange": None,
                 "country": country,
                 "language": language,
                 "researchDepth": researchDepth
@@ -198,7 +193,6 @@ Return ONLY valid JSON, no additional text:
         self,
         instruction: str,
         maxNumberPages: int,
-        timeRange: Optional[str],
         country: Optional[str],
         language: Optional[str]
     ) -> List[str]:
@@ -209,7 +203,6 @@ Return ONLY valid JSON, no additional text:
             instruction=instruction,
             country=country,
             maxNumberPages=maxNumberPages,
-            timeRange=timeRange,
             language=language
         )
         searchPrompt = searchPromptModel.model_dump_json(exclude_none=True, indent=2)
@@ -269,7 +262,7 @@ Return ONLY valid JSON, no additional text:
             instruction=instruction,
             url=url,  # Single URL
             maxDepth=maxDepth,
-            maxWidth=10
+            maxWidth=50
         )
         crawlPrompt = crawlPromptModel.model_dump_json(exclude_none=True, indent=2)

@@ -170,7 +170,7 @@ class MethodAi(MethodBase):
     - Output format: JSON with research results including URLs and content.

     Parameters:
-    - prompt (str, required): Natural language research instruction, including time range if relevant.
+    - prompt (str, required): Natural language research instruction.
     - list(url) (list, optional): Specific URLs to crawl, if needed.
     - country (str, optional): Two-digit country code (lowercase, e.g., ch, us, de).
     - language (str, optional): Language code (lowercase, e.g., de, en, fr).
@@ -1,6 +1,9 @@
 #!/usr/bin/env python3
 """
-AI Models Test - Tests all available AI models individually
+AI Models Test - Tests WEB_CRAWL functionality on all models that support it
+
+This script tests all models that have WEB_CRAWL capability, validates that
+they can crawl specific URLs and return content, and analyzes the quality of results.
 """

 import asyncio
@@ -53,9 +56,18 @@ class AIModelsTester:

     async def initialize(self):
         """Initialize the AI service."""
-        # Set logging level to INFO to reduce noise
+        # Set logging level to DEBUG for detailed output
         import logging
-        logging.getLogger().setLevel(logging.INFO)
+        logging.getLogger().setLevel(logging.DEBUG)

+        # Initialize the model registry with all connectors
+        from modules.aicore.aicoreModelRegistry import modelRegistry
+        from modules.aicore.aicorePluginTavily import AiTavily
+        from modules.aicore.aicorePluginPerplexity import AiPerplexity
+
+        # Register web connectors that support WEB_CRAWL
+        modelRegistry.registerConnector(AiTavily())
+        modelRegistry.registerConnector(AiPerplexity())
+
         # The AI service needs to be recreated with proper initialization
         from modules.services.serviceAi.mainServiceAi import AiService
@@ -86,27 +98,53 @@ class AIModelsTester:
         print(f"📁 Results will be saved to: {self.modelTestDir}")

     async def testModel(self, modelName: str) -> Dict[str, Any]:
-        """Test a specific AI model with a simple prompt."""
+        """Test a specific AI model with WEB_CRAWL operation."""
         print(f"\n{'='*60}")
         print(f"TESTING MODEL: {modelName}")
+        print(f"OPERATION TYPE: WEB_CRAWL")
         print(f"{'='*60}")

-        # Use same prompt for all web models
-        import json
-
-        if "tavily" in modelName.lower() or "perplexity" in modelName.lower() or "llama" in modelName.lower() or "sonar" in modelName.lower() or "mistral" in modelName.lower():
-            # All web models use the same JSON formatted prompt
-            # Country format: Use full name for Tavily (Switzerland), Perplexity converts ISO codes to names
-            testPrompt = json.dumps({
-                "prompt": "Research, what ValueOn company in switzerland does and who works there? Return as JSON.",
-                "maxResults": 5,
-                "timeRange": "y",
-                "country": "CH",  # ISO-2 code, Perplexity will convert to "Switzerland"
-                "format": "json"
-            }, indent=2)
-        else:
-            # Fallback for other models
-            testPrompt = "Generate a comprehensive analysis of the current state of artificial intelligence. Return as JSON."
+        # CRAWL CONFIGURATION
+        # Deep and Broad Web Crawl Example:
+        # - maxDepth: 3 (deep) - follows links up to 3 levels from starting page
+        #   - Level 1: Starting page
+        #   - Level 2: Pages linked from starting page
+        #   - Level 3: Pages linked from Level 2 pages
+        # - maxWidth: 50 (broad) - crawls up to 50 pages at each depth level
+        # This results in potential maximum of ~1,250 pages (if 50 links exist at each level)
+        #
+        # Common configurations:
+        # - Fast/Overview: maxDepth=1, maxWidth=5 (shallow, focused)
+        # - General/Standard: maxDepth=2, maxWidth=10 (balanced)
+        # - Deep and Broad: maxDepth=3, maxWidth=50 (comprehensive)
+
+        CRAWL_DEPTH = 3  # Deep crawl: follows links 3 levels deep
+        CRAWL_WIDTH = 50  # Broad crawl: up to 50 pages per level
+
+        print(f"Crawl Configuration:")
+        print(f"  - Depth: {CRAWL_DEPTH} levels (deep)")
+        print(f"  - Width: {CRAWL_WIDTH} pages per level (broad)")
+        print(f"  - Theoretical max: {CRAWL_WIDTH ** min(CRAWL_DEPTH, 3)} pages")
+
+        # Use WEB_CRAWL specific prompt format
+        from modules.datamodels.datamodelAi import AiCallPromptWebCrawl
+
+        # Test with simple prompt like playground example
+        simplePrompt = f"https://www.valueon.ch: Who works in this company?"
+
+        # But keep structured format for now to match our API
+        testPrompt = json.dumps({
+            "instruction": "Who works in this company?",
+            "url": "https://www.valueon.ch",
+            "maxDepth": CRAWL_DEPTH,
+            "maxWidth": CRAWL_WIDTH
+        }, indent=2)
+
+        print(f"Simple prompt (playground style): {simplePrompt}")
+
+        # For Tavily models, test direct API call for better link following
+        if "tavily" in modelName.lower():
+            return await self._testTavilyDirect(modelName, CRAWL_DEPTH, CRAWL_WIDTH)

         print(f"Test prompt: {testPrompt}")
         print(f"Prompt length: {len(testPrompt)} characters")
@@ -114,17 +152,11 @@ class AIModelsTester:
         startTime = asyncio.get_event_loop().time()

         try:
-            # Create options to force this specific model
-            if "internal" in modelName.lower():
-                options = AiCallOptions(
-                    operationType=OperationTypeEnum.DATA_EXTRACT,
-                    preferredModel=modelName
-                )
-            else:
-                options = AiCallOptions(
-                    operationType=OperationTypeEnum.DATA_GENERATE,
-                    preferredModel=modelName
-                )
+            # Create options for WEB_CRAWL operation
+            options = AiCallOptions(
+                operationType=OperationTypeEnum.WEB_CRAWL,
+                preferredModel=modelName
+            )

             # Call the AI service DIRECTLY through the model's functionCall
             # This tests the actual model, not the document generation pipeline
@@ -140,29 +172,14 @@ class AIModelsTester:
             import base64
             import os

-            # Prepare messages and options based on model type
-            if "vision" in modelName.lower():
-                # For vision models, skip for now since they require special handling
-                print(f"⚠️ Skipping vision model {modelName} - requires special image handling")
-                return {
-                    "modelName": modelName,
-                    "status": "SKIPPED",
-                    "processingTime": 0.0,
-                    "responseLength": 0,
-                    "responseType": "skipped",
-                    "hasContent": False,
-                    "error": "Vision model requires special image handling",
-                    "fullResponse": "Skipped - vision model requires special image handling"
-                }
-            else:
-                # For other models, use normal functionCall
-                messages = [{"role": "user", "content": testPrompt}]
-                modelCall = AiModelCall(
-                    messages=messages,
-                    model=model,
-                    options=options
-                )
-                response = await model.functionCall(modelCall)
+            # For WEB_CRAWL models, use normal functionCall with structured prompt
+            messages = [{"role": "user", "content": testPrompt}]
+            modelCall = AiModelCall(
+                messages=messages,
+                model=model,
+                options=options
+            )
+            response = await model.functionCall(modelCall)

             endTime = asyncio.get_event_loop().time()
             processingTime = endTime - startTime
@@ -185,6 +202,10 @@ class AIModelsTester:
                 "bytesReceived": len(response.content.encode('utf-8')) if response.content else 0
             }

+            # Extract actual prompt sent if available in metadata
+            if hasattr(response, 'metadata') and response.metadata:
+                result["actualPromptSent"] = response.metadata.get("actualPromptSent", "N/A")
+
             # Try to parse content as JSON
             if response.content:
                 try:
@@ -289,9 +310,16 @@ class AIModelsTester:
             print(f"📄 Response length: {len(str(response))} characters")
             print(f"📄 Response preview: {result['responsePreview']}")

-            # Save text response for all models
-            if result.get("status") == "SUCCESS":
-                self._saveTextResponse(modelName, result)
+            # Add prompt to result for logging
+            result["testPrompt"] = testPrompt
+            result["crawlConfig"] = {
+                "depth": CRAWL_DEPTH,
+                "width": CRAWL_WIDTH
+            }
+
+            # For WEB_CRAWL, also validate that content was extracted
+            if result.get("status") == "SUCCESS" and result.get("fullResponse"):
+                self._validateCrawlResponse(modelName, result)

         except Exception as e:
             endTime = asyncio.get_event_loop().time()
@@ -304,13 +332,22 @@ class AIModelsTester:
                 "responseLength": 0,
                 "responseType": "exception",
                 "hasContent": False,
-                "error": str(e)
+                "error": str(e),
+                "testPrompt": testPrompt,
+                "crawlConfig": {
+                    "depth": CRAWL_DEPTH,
+                    "width": CRAWL_WIDTH
+                }
             }

             print(f"💥 EXCEPTION - {str(e)}")

         self.testResults.append(result)

+        # Save text response even for exceptions to log the prompt
+        if result.get("status") in ["SUCCESS", "EXCEPTION", "ERROR"]:
+            self._saveTextResponse(modelName, result)
+
         # Save individual model result immediately
         self._saveIndividualModelResult(modelName, result)
@@ -378,6 +415,19 @@ class AIModelsTester:
             if not content:
                 content = result.get("responsePreview", "No content available")

+            # If there's an error, include it in the content
+            if result.get("error"):
+                content = f"ERROR: {result.get('error')}\n\n{content}"
+
+            # Get prompt and config for logging
+            config = result.get("crawlConfig", {})
+            crawlDepth = config.get("depth", "N/A")
+            crawlWidth = config.get("width", "N/A")
+
+            # Get both the original JSON prompt and the actual prompt sent
+            originalPrompt = result.get("testPrompt", "N/A")
+            actualPromptSent = result.get("actualPromptSent", "N/A")
+
             # Add metadata header
             metadata = f"""Model: {modelName}
 Test Time: {timestamp}
@@ -385,6 +435,23 @@ Status: {result.get('status', 'Unknown')}
 Processing Time: {result.get('processingTime', 0):.2f}s
 Response Length: {result.get('responseLength', 0)} characters
 Is Valid JSON: {result.get('isValidJson', False)}
+Test Method: {result.get('testMethod', 'standard')}
+Pages Crawled: {result.get('pagesCrawled', 'N/A')}
+Crawled URL: {result.get('crawledUrl', 'N/A')}
+Has URL: {result.get('hasUrl', 'N/A')}
+Has Title: {result.get('hasTitle', 'N/A')}
+Has Content: {result.get('hasContent', 'N/A')}
+Content Length: {result.get('contentLength', 'N/A')} characters
+
+--- CRAWL CONFIGURATION ---
+Depth: {crawlDepth}
+Width: {crawlWidth}
+
+--- ORIGINAL JSON PROMPT (input) ---
+{originalPrompt}
+
+--- ACTUAL PROMPT SENT TO API (EXACT) ---
+{actualPromptSent}
+
 --- RESPONSE CONTENT ---
 {content}

@ -400,6 +467,174 @@ Is Valid JSON: {result.get('isValidJson', False)}
|
||||||
print(f"❌ Error saving text response: {str(e)}")
|
print(f"❌ Error saving text response: {str(e)}")
|
||||||
result["textSaveError"] = str(e)
|
result["textSaveError"] = str(e)
|
||||||
|
|
||||||
|
def _validateCrawlResponse(self, modelName: str, result: Dict[str, Any]):
|
||||||
|
"""Validate that the WEB_CRAWL response contains crawled content."""
|
||||||
|
try:
|
||||||
|
content = result.get("fullResponse", "")
|
||||||
|
|
||||||
|
# Try to parse as JSON
|
||||||
|
crawledData = {}
|
||||||
|
try:
|
||||||
|
parsed = json.loads(content)
|
||||||
|
if isinstance(parsed, dict):
|
||||||
|
crawledData = parsed
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Check for expected fields: url, title, content
|
||||||
|
hasUrl = bool(crawledData.get("url"))
|
||||||
|
hasTitle = bool(crawledData.get("title"))
|
||||||
|
hasContent = bool(crawledData.get("content"))
|
||||||
|
contentLength = len(crawledData.get("content", ""))
|
||||||
|
|
||||||
|
result["hasUrl"] = hasUrl
|
||||||
|
result["hasTitle"] = hasTitle
|
||||||
|
result["hasContent"] = hasContent
|
||||||
|
result["contentLength"] = contentLength
|
||||||
|
result["crawledUrl"] = crawledData.get("url", "")
|
||||||
|
|
||||||
|
if hasUrl and hasContent:
|
||||||
|
print(f"✅ Successfully crawled content from URL: {crawledData.get('url', 'unknown')}")
|
||||||
|
print(f" Content length: {contentLength} characters")
|
||||||
|
print(f" Title: {crawledData.get('title', 'N/A')}")
|
||||||
|
else:
|
||||||
|
print(f"⚠️ Incomplete crawl response - URL: {hasUrl}, Content: {hasContent}")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"❌ Error validating crawl response: {str(e)}")
|
||||||
|
result["crawlValidationError"] = str(e)
|
||||||
|
|
||||||
|
async def _testTavilyDirect(self, modelName: str, crawlDepth: int = 3, crawlWidth: int = 50) -> Dict[str, Any]:
|
||||||
|
"""Test Tavily API directly using the crawl() method with better link following."""
|
||||||
|
print(f"\n{'='*60}")
|
||||||
|
print(f"TESTING TAVILY DIRECT API (crawl method)")
|
||||||
|
print(f"{'='*60}")
|
||||||
|
|
||||||
|
startTime = asyncio.get_event_loop().time()
|
||||||
|
|
||||||
|
try:
|
||||||
|
from tavily import AsyncTavilyClient
|
||||||
|
from modules.shared.configuration import APP_CONFIG
|
||||||
|
|
||||||
|
apiKey = APP_CONFIG.get("Connector_AiTavily_API_SECRET")
|
||||||
|
if not apiKey:
|
||||||
|
raise Exception("Tavily API key not found")
|
||||||
|
|
||||||
|
client = AsyncTavilyClient(api_key=apiKey)
|
||||||
|
|
||||||
|
# Map our configuration to Tavily parameters
|
||||||
|
# maxWidth -> limit (pages per level)
|
||||||
|
# maxDepth -> max_depth (link following depth)
|
||||||
|
# max_breadth = maxWidth (breadth of crawl at each level)
|
||||||
|
tavilyLimit = crawlWidth
|
||||||
|
tavilyMaxDepth = crawlDepth
|
||||||
|
tavilyMaxBreadth = crawlWidth
|
||||||
|
|
||||||
|
print(f"Calling Tavily API with crawl() method...")
|
||||||
|
print(f"URL: https://www.valueon.ch")
|
||||||
|
print(f"Instructions: Who works in this company?")
|
||||||
|
print(f"Limit: {tavilyLimit} pages per level")
|
||||||
|
print(f"Max depth: {tavilyMaxDepth} (follows links {tavilyMaxDepth} levels deep)")
|
||||||
|
print(f"Max breadth: {tavilyMaxBreadth} (up to {tavilyMaxBreadth} pages at each level)")
|
||||||
|
print(f"Deep and Broad Crawl Configuration Active")
|
||||||
|
|
||||||
|
response = await client.crawl(
|
||||||
|
url="https://www.valueon.ch",
|
||||||
|
instructions="Who works in this company?",
|
||||||
|
limit=tavilyLimit,
|
||||||
|
max_depth=tavilyMaxDepth,
|
||||||
|
max_breadth=tavilyMaxBreadth
|
||||||
|
)
|
||||||
|
|
||||||
|
endTime = asyncio.get_event_loop().time()
|
||||||
|
processingTime = endTime - startTime
|
||||||
|
|
||||||
|
            # Analyze response
            contentLength = 0
            pagesCrawled = 0
            fullContent = ""

            if isinstance(response, dict):
                # Check if it has results
                if "results" in response:
                    results = response["results"]
                    pagesCrawled = len(results)
                    content_parts = []
                    for result in results:
                        url = result.get("url", "")
                        title = result.get("title", "")
                        content = result.get("raw_content", result.get("content", ""))
                        content_parts.append(f"URL: {url}\nTitle: {title}\nContent: {content}\n{'='*60}\n")
                        contentLength += len(content)

                    fullContent = "\n".join(content_parts)
                else:
                    fullContent = json.dumps(response, indent=2)
                    contentLength = len(fullContent)
            elif isinstance(response, list):
                pagesCrawled = len(response)
                content_parts = []
                for item in response:
                    if isinstance(item, dict):
                        url = item.get("url", "")
                        title = item.get("title", "")
                        content = item.get("raw_content", item.get("content", ""))
                        content_parts.append(f"URL: {url}\nTitle: {title}\nContent: {content}\n{'='*60}\n")
                        contentLength += len(content)

                fullContent = "\n".join(content_parts)
            else:
                fullContent = str(response)
                contentLength = len(fullContent)
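As a reading aid, a sketch of the response shape the dict branch above handles. The keys (results, url, title, raw_content with a content fallback) are taken from the parsing code itself; the full schema of Tavily's crawl() response is not shown in this diff and may contain additional fields.

# Hypothetical crawl() response matching the dict-with-"results" branch above
exampleResponse = {
    "results": [
        {
            "url": "https://www.valueon.ch/team",
            "title": "Team",
            "raw_content": "…full page text…",  # the parser falls back to "content" if this is missing
        }
    ]
}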
            result = {
                "modelName": modelName,
                "status": "SUCCESS",
                "processingTime": round(processingTime, 2),
                "responseLength": contentLength,
                "responseType": "TavilyDirectAPI",
                "hasContent": True,
                "error": None,
                "modelUsed": modelName,
                "priceUsd": 0.0,
                "bytesSent": 0,
                "bytesReceived": contentLength,
                "isValidJson": True,
                "fullResponse": fullContent,
                "pagesCrawled": pagesCrawled,
                "testMethod": "direct_api_crawl"
            }

            print(f"✅ SUCCESS - Processing time: {processingTime:.2f}s")
            print(f"📄 Pages crawled: {pagesCrawled}")
            print(f"📄 Total content length: {contentLength} characters")

            # Save the response
            self._saveTextResponse(modelName, result)
            self._validateCrawlResponse(modelName, result)
            self._saveIndividualModelResult(modelName, result)

            self.testResults.append(result)
            return result

        except Exception as e:
            endTime = asyncio.get_event_loop().time()
            processingTime = endTime - startTime

            result = {
                "modelName": modelName,
                "status": "EXCEPTION",
                "processingTime": round(processingTime, 2),
                "responseLength": 0,
                "responseType": "exception",
                "hasContent": False,
                "error": str(e)
            }

            print(f"💥 EXCEPTION - {str(e)}")
            self.testResults.append(result)
            return result
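A hedged usage sketch, not part of the commit: invoking the direct Tavily test from an async context might look roughly like this, assuming AIModelsTester is importable and initialized the same way main() does below.

# Illustrative only - wraps the call in a coroutine so it can be run with asyncio.run()
async def runTavilyDirectCheck():
    tester = AIModelsTester()
    await tester.initialize()
    result = await tester._testTavilyDirect("tavily-search", crawlDepth=2, crawlWidth=10)
    print(result["status"], result.get("pagesCrawled", 0))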
    def _saveIndividualModelResult(self, modelName: str, result: Dict[str, Any]):
        """Save individual model test result to file."""
        try:

@@ -425,22 +660,30 @@ Is Valid JSON: {result.get('isValidJson', False)}

            print(f"❌ Error saving individual result: {str(e)}")
    def getAllAvailableModels(self) -> List[str]:
-        """Get all available model names."""
-        # Hardcoded list of known models - same approach as test_ai_behavior.py
-        return [
-            # "claude-3-5-sonnet-20241022", # Skipped - text model, test later
-            # "claude-3-5-sonnet-20241022-vision", # Skipped - requires image input
-            # "gpt-4o", # Skipped - text model, test later
-            # "gpt-3.5-turbo", # Skipped - text model, test later
-            # "gpt-4o-vision", # Skipped - requires image input
-            # "dall-e-3", # Skipped - image generation, test later
-            "sonar", # Perplexity web model
-            "sonar-pro", # Perplexity web model
-            "tavily-search", # Tavily web model (unified research)
-            # "internal-extractor", # Skipped - internal model, test later
-            # "internal-generator", # Skipped - internal model, test later
-            # "internal-renderer" # Skipped - internal model, test later
-        ]
+        """Get all available model names that support WEB_CRAWL."""
+        from modules.aicore.aicoreModelRegistry import modelRegistry
+        from modules.datamodels.datamodelAi import OperationTypeEnum
+
+        # Get all models from registry
+        allModels = modelRegistry.getAvailableModels()
+
+        # Filter models that support WEB_CRAWL
+        webCrawlModels = []
+        for model in allModels:
+            if model.operationTypes and any(
+                ot.operationType == OperationTypeEnum.WEB_CRAWL
+                for ot in model.operationTypes
+            ):  # Include both Tavily and Perplexity models
+                webCrawlModels.append(model.name)
+
+        # Filter to only "sonar" model for testing
+        webCrawlModels = [m for m in webCrawlModels if m == "sonar"]
+
+        print(f"Found {len(webCrawlModels)} models that support WEB_CRAWL (filtered to sonar):")
+        for modelName in webCrawlModels:
+            print(f"  - {modelName}")
+
+        return webCrawlModels
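A minimal sketch of the registry entry shape the WEB_CRAWL filter above relies on. The attribute names (name, operationTypes, operationType) come from the loop in the new code; the stand-in classes below are hypothetical, since the real registry and datamodel definitions are not part of this diff.

# Hypothetical stand-ins for the registry datamodel, illustration only
from dataclasses import dataclass, field
from typing import List

@dataclass
class OperationTypeEntryStub:
    operationType: str  # the real code compares against OperationTypeEnum.WEB_CRAWL

@dataclass
class ModelEntryStub:
    name: str
    operationTypes: List[OperationTypeEntryStub] = field(default_factory=list)

# A model like this would pass the filter above (and survive the "sonar"-only restriction)
sonarStub = ModelEntryStub(name="sonar", operationTypes=[OperationTypeEntryStub("WEB_CRAWL")])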
    def saveTestResults(self):
        """Save detailed test results to file."""

@@ -508,6 +751,15 @@ Is Valid JSON: {result.get('isValidJson', False)}

            if result.get("isValidJson") is not None:
                print(f"   Valid JSON: {'Yes' if result['isValidJson'] else 'No'}")
+
+            if result.get("crawledUrl"):
+                print(f"   Crawled URL: {result['crawledUrl']}")
+
+            if result.get("contentLength") is not None:
+                print(f"   Content length: {result['contentLength']} characters")
+
+            if result.get("pagesCrawled") is not None:
+                print(f"   Pages crawled: {result['pagesCrawled']}")

            if result["error"]:
                print(f"   Error: {result['error']}")
@@ -525,12 +777,32 @@ Is Valid JSON: {result.get('isValidJson', False)}

            print(f"{'='*80}")
            print(f"🚀 Fastest model: {fastest['modelName']} ({fastest['processingTime']}s)")
            print(f"🐌 Slowest model: {slowest['modelName']} ({slowest['processingTime']}s)")
+
+            # Find models with most content
+            modelsWithContent = [r for r in successfulResults if r.get("contentLength", 0) > 0]
+            if modelsWithContent:
+                mostContent = max(modelsWithContent, key=lambda x: x.get("contentLength", 0))
+                totalContent = sum(r.get("contentLength", 0) for r in modelsWithContent)
+                avgContent = totalContent / len(modelsWithContent)
+                print(f"📄 Model with most content: {mostContent['modelName']} ({mostContent.get('contentLength', 0)} chars)")
+                print(f"📊 Average content per model: {avgContent:.0f} characters")
+                print(f"📊 Total content crawled across all models: {totalContent} characters")
+
+            # Find models with most pages crawled (for Tavily direct API)
+            modelsWithPages = [r for r in successfulResults if r.get("pagesCrawled", 0) > 0]
+            if modelsWithPages:
+                mostPages = max(modelsWithPages, key=lambda x: x.get("pagesCrawled", 0))
+                totalPages = sum(r.get("pagesCrawled", 0) for r in modelsWithPages)
+                avgPages = totalPages / len(modelsWithPages)
+                print(f"🔍 Model with most pages crawled: {mostPages['modelName']} ({mostPages.get('pagesCrawled', 0)} pages)")
+                print(f"📊 Average pages per model: {avgPages:.1f} pages")
+                print(f"📊 Total pages crawled across all models: {totalPages} pages")
async def main():
-    """Run AI models testing."""
+    """Run AI models testing for WEB_CRAWL operation."""
    tester = AIModelsTester()

-    print("Starting AI Models Testing...")
+    print("Starting AI Models Testing for WEB_CRAWL...")
    print("Initializing AI service...")
    await tester.initialize()

@@ -542,8 +814,9 @@ async def main():

        print(f"  {i}. {model}")

    print(f"\n{'='*80}")
-    print("STARTING INDIVIDUAL MODEL TESTS")
+    print("STARTING WEB_CRAWL TESTS")
    print(f"{'='*80}")
+    print("Testing each model's ability to crawl URLs and return content...")
    print("Press Enter after each model test to continue to the next one...")

    # Test each model individually
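The excerpt ends before the script's entry point; a conventional one for a test script like this would be the following, offered as an assumption rather than something visible in the diff.

# Hypothetical entry point - not shown in this hunk
if __name__ == "__main__":
    import asyncio  # likely already imported at module level in the real script
    asyncio.run(main())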