ai models ready for web and txt

parent 72e0687826
commit 2489719c62

7 changed files with 577 additions and 232 deletions
@@ -57,7 +57,7 @@ class AiPerplexity(BaseConnectorAi):
connectorType="perplexity",
apiUrl="https://api.perplexity.ai/chat/completions",
temperature=0.2,
maxTokens=4000,
maxTokens=24000, # Increased for detailed web crawl responses (Perplexity supports up to 25k)
contextLength=32000,
costPer1kTokensInput=0.005,
costPer1kTokensOutput=0.005,
@@ -80,12 +80,12 @@ class AiPerplexity(BaseConnectorAi):
connectorType="perplexity",
apiUrl="https://api.perplexity.ai/chat/completions",
temperature=0.2,
maxTokens=4000,
maxTokens=24000, # Increased for detailed web crawl responses (Perplexity supports up to 25k)
contextLength=32000,
costPer1kTokensInput=0.01,
costPer1kTokensOutput=0.01,
speedRating=6, # Slower due to AI analysis
qualityRating=10, # Best AI analysis quality
qualityRating=9, # Best AI analysis quality
# capabilities removed (not used in business logic)
functionCall=self._routeWebOperation,
priority=PriorityEnum.QUALITY,
@@ -217,6 +217,42 @@ class AiPerplexity(BaseConnectorAi):
# Fallback to basic call
return await self.callAiBasic(modelCall)

def _getDepthInstructions(self, maxDepth: int) -> str:
"""
Map maxDepth (numeric) to instructional text for LLM.

Args:
maxDepth: 1 (fast/overview), 2 (general/standard), 3 (deep/comprehensive)

Returns:
Instructional text for the LLM
"""
depthMap = {
1: "Basic overview - extract main content from the main page only",
2: "Standard crawl - extract content from main page and linked pages (2 levels deep)",
3: "Deep crawl - comprehensively extract content from main page and all accessible linked pages (3+ levels deep)"
}
return depthMap.get(maxDepth, depthMap[2])

def _getWidthInstructions(self, maxWidth: int) -> str:
"""
Map maxWidth (numeric) to instructional text for LLM.

Args:
maxWidth: Number of pages to crawl at each level (default: 10)

Returns:
Instructional text for the LLM
"""
if maxWidth <= 5:
return f"Focused crawl - limit to {maxWidth} most relevant pages per level"
elif maxWidth <= 15:
return f"Standard breadth - crawl up to {maxWidth} pages per level"
elif maxWidth <= 30:
return f"Wide crawl - crawl up to {maxWidth} pages per level, prioritize quality"
else:
return f"Extensive crawl - crawl up to {maxWidth} pages per level, comprehensive coverage"

async def webSearch(self, modelCall: AiModelCall) -> AiModelResponse:
"""
WEB_SEARCH operation - returns list of URLs based on search query.
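The two instruction helpers added above are defined in this hunk, but their call site is not shown here. A minimal sketch of how they could be folded into a crawl prompt (the wrapper name below is hypothetical, not from the commit):

def _buildCrawlGuidance(self, instruction: str, maxDepth: int, maxWidth: int) -> str:
    # Hypothetical helper: combine the user instruction with the depth/width
    # guidance strings produced by the two methods added above.
    depthText = self._getDepthInstructions(maxDepth)
    widthText = self._getWidthInstructions(maxWidth)
    return f"{instruction}\n{depthText}\n{widthText}"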
@@ -253,8 +289,6 @@ class AiPerplexity(BaseConnectorAi):
Return a JSON array of {webSearchPrompt.maxNumberPages} most relevant URLs.
{'' if not countryName else f'Focus on results from {countryName}.'}
{'' if not webSearchPrompt.timeRange else f'Limit to results from the last {webSearchPrompt.timeRange}'}
{'' if not webSearchPrompt.language else f'Return results in {webSearchPrompt.language} language'}

Return ONLY a JSON array of URLs, no additional text:
[
@@ -293,6 +327,15 @@ Return ONLY a JSON array of URLs, no additional text:
"""
WEB_CRAWL operation - crawls ONE URL and returns content.

Perplexity API Parameters Used:
- messages: The prompt containing URL and instruction
- max_tokens: Maximum response length
- max_results: Number of search results (1-20, default: 10)
- temperature: Response randomness (not web search specific)

Pagination: Perplexity does NOT return paginated responses.
A single response contains all results within max_tokens limit.

Args:
modelCall: AiModelCall with AiCallPromptWebCrawl as prompt
@@ -316,55 +359,68 @@ Return ONLY a JSON array of URLs, no additional text:
webCrawlPrompt = AiCallPromptWebCrawl(**promptData)

# Build crawl request for Perplexity - ONE URL
crawlPrompt = f"""Crawl and extract content from this URL based on the instruction:
# Match playground prompt style: just URL + question
# This allows Perplexity to return detailed multi-source results
crawlPrompt = f"{webCrawlPrompt.url}: {webCrawlPrompt.instruction}"

INSTRUCTION: '{webCrawlPrompt.instruction}'

URL to crawl (maxDepth={webCrawlPrompt.maxDepth}):
{webCrawlPrompt.url}

Extract and return the relevant content based on the instruction.
Return as JSON object with this structure:
{{
"url": "{webCrawlPrompt.url}",
"title": "Page title",
"content": "Extracted content relevant to the instruction"
}}

Return ONLY valid JSON, no additional text."""
# Build payload with optional Perplexity parameters
# Note: max_tokens_per_page may not be supported by chat/completions endpoint
# The playground Python SDK might use a different internal API
maxResults = min(webCrawlPrompt.maxWidth or 10, 20) # Max 20 results

payload = {
"model": model.name,
"messages": [{"role": "user", "content": crawlPrompt}],
"temperature": temperature,
"max_tokens": maxTokens
"max_tokens": maxTokens, # Use model's configured maxTokens (24000)
"max_results": maxResults,
"return_citations": True # Request citations explicitly
}

logger.info(f"Perplexity crawl payload: model={model.name}, prompt_length={len(crawlPrompt)}, max_tokens={maxTokens}, max_results={maxResults}")

response = await self.httpClient.post(model.apiUrl, json=payload)

if response.status_code != 200:
raise HTTPException(status_code=500, detail=f"Perplexity Web Crawl API error: {response.text}")

apiResponse = response.json()

# Extract the main content
content = apiResponse["choices"][0]["message"]["content"]

# Parse JSON content and ensure it's a single object
import json
try:
parsedContent = json.loads(content)
# Ensure it's a single object, not an array
if isinstance(parsedContent, list):
parsedContent = parsedContent[0] if parsedContent else {}
except:
# If not JSON, create structured response
parsedContent = {"url": webCrawlPrompt.url, "title": "", "content": content}
# Check for citations or search results in the response
citations = apiResponse.get("citations", [])
searchResults = apiResponse.get("search_results", [])

# Return as JSON string
# Log what we found
if citations:
logger.info(f"Found {len(citations)} citations in response")
if searchResults:
logger.info(f"Found {len(searchResults)} search results in response")
logger.debug(f"API response keys: {list(apiResponse.keys())}")

# Build comprehensive response with citations if available
import json
responseData = {
"content": content,
"citations": citations if citations else [],
"search_results": searchResults if searchResults else []
}

# Return comprehensive response
return AiModelResponse(
content=json.dumps(parsedContent, indent=2),
content=json.dumps(responseData, indent=2) if (citations or searchResults) else content,
success=True,
modelId=model.name,
metadata={"response_id": apiResponse.get("id", ""), "operation": "WEB_CRAWL", "url": webCrawlPrompt.url}
metadata={
"response_id": apiResponse.get("id", ""),
"operation": "WEB_CRAWL",
"url": webCrawlPrompt.url,
"actualPromptSent": crawlPrompt,
"has_citations": len(citations) > 0,
"has_search_results": len(searchResults) > 0
}
)

except Exception as e:
@@ -27,7 +27,8 @@ class WebCrawlResult:
content: str
title: Optional[str] = None

class ConnectorWeb(BaseConnectorAi):

class AiTavily(BaseConnectorAi):
"""Tavily web search connector."""

def __init__(self):
@@ -43,6 +44,35 @@ class ConnectorWeb(BaseConnectorAi):
# Initialize client if API key is available
self._initializeClient()


def getModels(self) -> List[AiModel]:
"""Get all available Tavily models."""
return [
AiModel(
name="tavily-search",
displayName="Tavily Search & Research",
connectorType="tavily",
apiUrl="https://api.tavily.com",
temperature=0.0, # Web search doesn't use temperature
maxTokens=0, # Web search doesn't use tokens
contextLength=0,
costPer1kTokensInput=0.0,
costPer1kTokensOutput=0.0,
speedRating=8, # Good speed for search and extract
qualityRating=9, # Excellent quality for web research
# capabilities removed (not used in business logic)
functionCall=self._routeWebOperation,
priority=PriorityEnum.BALANCED,
processingMode=ProcessingModeEnum.BASIC,
operationTypes=createOperationTypeRatings(
(OperationTypeEnum.WEB_SEARCH, 9),
(OperationTypeEnum.WEB_CRAWL, 10)
),
version="tavily-search",
calculatePriceUsd=lambda processingTime, bytesSent, bytesReceived: 0.008 # Simple flat rate
)
]

def _initializeClient(self):
"""Initialize the Tavily client if API key is available."""
try:
@@ -206,34 +236,6 @@ class ConnectorWeb(BaseConnectorAi):

return filteredResults

def getModels(self) -> List[AiModel]:
"""Get all available Tavily models."""
return [
AiModel(
name="tavily-search",
displayName="Tavily Search & Research",
connectorType="tavily",
apiUrl="https://api.tavily.com",
temperature=0.0, # Web search doesn't use temperature
maxTokens=0, # Web search doesn't use tokens
contextLength=0,
costPer1kTokensInput=0.0,
costPer1kTokensOutput=0.0,
speedRating=8, # Good speed for search and extract
qualityRating=9, # Excellent quality for web research
# capabilities removed (not used in business logic)
functionCall=self._routeWebOperation,
priority=PriorityEnum.BALANCED,
processingMode=ProcessingModeEnum.BASIC,
operationTypes=createOperationTypeRatings(
(OperationTypeEnum.WEB_SEARCH, 9),
(OperationTypeEnum.WEB_CRAWL, 8)
),
version="tavily-search",
calculatePriceUsd=lambda processingTime, bytesSent, bytesReceived: 0.008 # Simple flat rate
)
]

@classmethod
async def create(cls):
apiKey = APP_CONFIG.get("Connector_AiTavily_API_SECRET")
@@ -273,10 +275,9 @@ class ConnectorWeb(BaseConnectorAi):
topic: str | None = None,
includeDomains: list[str] | None = None,
excludeDomains: list[str] | None = None,
language: str | None = None,
country: str | None = None,
includeAnswer: bool | None = None,
includeRawContent: bool | None = None,
includeAnswer: str | None = None,
includeRawContent: str | None = None,
) -> list[WebSearchResult]:
"""Calls the Tavily API to perform a web search."""
# Make sure maxResults is within the allowed range (use cached values)
@@ -298,8 +299,6 @@ class ConnectorWeb(BaseConnectorAi):
kwargs["include_domains"] = includeDomains
if excludeDomains is not None:
kwargs["exclude_domains"] = excludeDomains
if language is not None:
kwargs["language"] = language
if country is not None:
kwargs["country"] = country
if includeAnswer is not None:
@@ -307,7 +306,8 @@ class ConnectorWeb(BaseConnectorAi):
if includeRawContent is not None:
kwargs["include_raw_content"] = includeRawContent

logger.debug(f"Tavily.search kwargs: {kwargs}")
# Log the final API call parameters for comparison
logger.info(f"Tavily API call parameters: {kwargs}")

# Ensure client is initialized
if self.client is None:
@@ -317,6 +317,10 @@ class ConnectorWeb(BaseConnectorAi):

response = await self.client.search(**kwargs)

# Return all results without score filtering
# Tavily's scoring is already applied by the API
logger.info(f"Tavily returned {len(response.get('results', []))} results")

return [
WebSearchResult(
title=result["title"],
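Taken together, the search hunks amount to a call roughly like the sketch below. It is illustrative only: the query, country and result handling are assumptions, and support for country, include_answer="basic" and include_raw_content="text" is taken from this diff rather than verified against the Tavily SDK.

kwargs = {
    "query": "Who works at ValueOn?",      # webSearchPrompt.instruction
    "max_results": 10,                     # clamped to the allowed range
    "country": "switzerland",              # full lowercase name converted from the ISO-2 code
    "include_answer": "basic",
    "include_raw_content": "text",
}
response = await self.client.search(**kwargs)  # AsyncTavilyClient
urls = [result["url"] for result in response.get("results", [])]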
@@ -328,69 +332,77 @@ class ConnectorWeb(BaseConnectorAi):

async def _crawl(
self,
urls: list,
extractDepth: str | None = None,
format: str | None = None,
url: str,
instructions: str | None = None,
limit: int = 20,
maxDepth: int = 2,
maxBreadth: int = 40,
) -> list[WebCrawlResult]:
"""Calls the Tavily API to extract text content from URLs with retry logic."""
"""Calls the Tavily API to crawl ONE URL with link following and retry logic."""
maxRetries = self.crawlMaxRetries
retryDelay = self.crawlRetryDelay
timeout = self.crawlTimeout

logger.debug(f"Starting crawl of {len(urls)} URLs: {urls}")
logger.debug(f"Crawl settings: extractDepth={extractDepth}, format={format}, timeout={timeout}s")
logger.debug(f"Starting crawl of URL: {url}")
logger.debug(f"Crawl settings: instructions={instructions}, limit={limit}, maxDepth={maxDepth}, maxBreadth={maxBreadth}, timeout={timeout}s")

for attempt in range(maxRetries + 1):
try:
logger.debug(f"Crawl attempt {attempt + 1}/{maxRetries + 1}")

# Use asyncio.wait_for for timeout
# Build kwargs for extract
kwargsExtract: dict = {"urls": urls}
kwargsExtract["extract_depth"] = extractDepth or "advanced"
kwargsExtract["format"] = format or "markdown" # Use markdown to get HTML structure

logger.debug(f"Sending request to Tavily with kwargs: {kwargsExtract}")

# Ensure client is initialized
if self.client is None:
self._initializeClient()
if self.client is None:
raise ValueError("Tavily client not initialized. Please check API key configuration.")

logger.debug(f"Crawling URL: {url}")

# Build kwargs for crawl
kwargsCrawl: dict = {"url": url}
if instructions:
kwargsCrawl["instructions"] = instructions
if limit:
kwargsCrawl["limit"] = limit
if maxDepth:
kwargsCrawl["max_depth"] = maxDepth
if maxBreadth:
kwargsCrawl["max_breadth"] = maxBreadth

logger.debug(f"Sending request to Tavily with kwargs: {kwargsCrawl}")

response = await asyncio.wait_for(
self.client.extract(**kwargsExtract),
self.client.crawl(**kwargsCrawl),
timeout=timeout
)

logger.debug(f"Tavily response received: {list(response.keys())}")
logger.debug(f"Tavily response received: {type(response)}")

# Debug: Log what Tavily actually returns
if "results" in response and response["results"]:
logger.debug(f"Tavily returned {len(response['results'])} results")
logger.debug(f"First result keys: {list(response['results'][0].keys())}")
logger.debug(f"First result has raw_content: {'raw_content' in response['results'][0]}")

# Log each result
for i, result in enumerate(response["results"]):
logger.debug(f"Result {i+1}: URL={result.get('url', 'N/A')}, content_length={len(result.get('raw_content', result.get('content', '')))}")
# Parse response - could be dict with results or list
if isinstance(response, dict) and "results" in response:
pageResults = response["results"]
elif isinstance(response, list):
pageResults = response
else:
logger.warning(f"Tavily returned no results in response: {response}")
logger.warning(f"Unexpected response format: {type(response)}")
pageResults = []

results = [
WebCrawlResult(
url=result["url"],
content=result.get("raw_content", result.get("content", "")), # Try raw_content first, fallback to content
title=result.get("title", "") # Extract title if available
)
for result in response["results"]
]
logger.debug(f"Got {len(pageResults)} pages from crawl")

logger.debug(f"Crawl successful: extracted {len(results)} results")
# Convert to WebCrawlResult format
results = []
for result in pageResults:
results.append(WebCrawlResult(
url=result.get("url", url),
content=result.get("raw_content", result.get("content", "")),
title=result.get("title", "")
))

logger.debug(f"Crawl successful: extracted {len(results)} pages from URL")
return results

except asyncio.TimeoutError:
logger.warning(f"Crawl attempt {attempt + 1} timed out after {timeout} seconds for URLs: {urls}")
logger.warning(f"Crawl attempt {attempt + 1} timed out after {timeout} seconds for URL: {url}")
if attempt < maxRetries:
logger.info(f"Retrying in {retryDelay} seconds...")
await asyncio.sleep(retryDelay)
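A minimal usage sketch of the rewritten _crawl, with illustrative values (the URL and instruction below are examples, not from the commit):

results = await self._crawl(
    url="https://www.example.com",            # single starting URL
    instructions="Who works in this company?",
    limit=20,                                  # pages to collect in total
    maxDepth=2,                                # how many link levels to follow
    maxBreadth=40,                             # pages considered per level
)
for page in results:                           # each item is a WebCrawlResult
    print(page.url, page.title, len(page.content))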
@@ -398,14 +410,13 @@ class ConnectorWeb(BaseConnectorAi):
raise Exception(f"Crawl failed after {maxRetries + 1} attempts due to timeout")

except Exception as e:
logger.warning(f"Crawl attempt {attempt + 1} failed for URLs {urls}: {str(e)}")
logger.warning(f"Crawl attempt {attempt + 1} failed for URL {url}: {str(e)}")
logger.debug(f"Full error details: {type(e).__name__}: {str(e)}")

# Check if it's a validation error and log more details
if "validation" in str(e).lower():
logger.debug(f"URL validation failed. Checking URL format:")
for i, url in enumerate(urls):
logger.debug(f" URL {i+1}: '{url}' (length: {len(url)})")
logger.debug(f" URL: '{url}' (length: {len(url)})")
# Check for common URL issues
if ' ' in url:
logger.debug(f" WARNING: URL contains spaces!")
@@ -468,15 +479,15 @@ class ConnectorWeb(BaseConnectorAi):
if countryName:
countryName = self._convertIsoCodeToCountryName(countryName)

# Perform search
# Perform search - use exact parameters from prompt
# NOTE: timeRange parameter causes generic results, so we don't use it
searchResults = await self._search(
query=webSearchPrompt.instruction,
maxResults=webSearchPrompt.maxNumberPages,
timeRange=webSearchPrompt.timeRange,
timeRange=None, # Not used - causes generic results
country=countryName,
language=webSearchPrompt.language,
includeAnswer=False,
includeRawContent=False
includeAnswer="basic",
includeRawContent="text"
)

# Extract URLs from results
@@ -500,13 +511,13 @@ class ConnectorWeb(BaseConnectorAi):

async def webCrawl(self, modelCall: AiModelCall) -> "AiModelResponse":
"""
WEB_CRAWL operation - crawls one URL using Tavily.
WEB_CRAWL operation - crawls one URL using Tavily with link following.

Args:
modelCall: AiModelCall with AiCallPromptWebCrawl as prompt

Returns:
AiModelResponse with crawl results as JSON
AiModelResponse with crawl results as JSON (may include multiple pages)
"""
try:
# Extract parameters
@@ -517,37 +528,49 @@ class ConnectorWeb(BaseConnectorAi):
# Create Pydantic model
webCrawlPrompt = AiCallPromptWebCrawl(**promptData)

# Perform crawl for ONE URL
# Note: _crawl expects a list, so we wrap the single URL in a list
# Perform crawl for ONE URL with link following
# Use maxWidth as limit, maxDepth as maxDepth, and calculate maxBreadth
crawlResults = await self._crawl(
urls=[webCrawlPrompt.url],
extractDepth="advanced" if webCrawlPrompt.maxDepth > 2 else "basic",
format="markdown"
url=webCrawlPrompt.url,
instructions=webCrawlPrompt.instruction,
limit=webCrawlPrompt.maxWidth or 20, # maxWidth controls number of pages
maxDepth=webCrawlPrompt.maxDepth or 2,
maxBreadth=webCrawlPrompt.maxWidth or 40 # Use same as limit for breadth
)

# Format result for single URL - consistent with Perplexity format
# If we got multiple pages from the crawl, we need to format them differently
# Return the first result for backwards compatibility, but include total page count
if crawlResults and len(crawlResults) > 0:
firstResult = crawlResults[0]
# Get all pages content
allContent = ""
for i, result in enumerate(crawlResults, 1):
pageHeader = f"\n{'='*60}\nPAGE {i}: {result.url}\n{'='*60}\n"
if result.title:
allContent += f"{pageHeader}Title: {result.title}\n\n"
allContent += f"{result.content}\n"

resultData = {
"url": firstResult.url,
"title": firstResult.title if firstResult.title else "Content",
"content": firstResult.content
"url": webCrawlPrompt.url,
"title": crawlResults[0].title if crawlResults[0].title else "Content",
"content": allContent,
"pagesCrawled": len(crawlResults),
"pageUrls": [result.url for result in crawlResults]
}
else:
resultData = {"url": webCrawlPrompt.url, "title": "", "content": "", "error": "No content extracted"}
resultData = {"url": webCrawlPrompt.url, "title": "", "content": "", "error": "No content extracted", "pagesCrawled": 0}

# Return as JSON - same format as Perplexity
# Return as JSON - same format as Perplexity but with multiple pages content
import json
return AiModelResponse(
content=json.dumps(resultData, indent=2),
success=True,
metadata={"operation": "WEB_CRAWL", "url": webCrawlPrompt.url}
metadata={"operation": "WEB_CRAWL", "url": webCrawlPrompt.url, "pagesCrawled": len(crawlResults) if crawlResults else 0}
)

except Exception as e:
logger.error(f"Error in Tavily web crawl: {str(e)}")
import json
errorResult = {"error": str(e), "url": ""}
errorResult = {"error": str(e), "url": webCrawlPrompt.url if 'webCrawlPrompt' in locals() else ""}
return AiModelResponse(
content=json.dumps(errorResult, indent=2),
success=False,
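With the multi-page formatting above, the JSON returned by Tavily's webCrawl is shaped roughly like this sketch (values illustrative, not from the commit):

resultData = {
    "url": "https://www.example.com",
    "title": "Example Company",
    "content": "...text of all crawled pages, separated by PAGE headers...",
    "pagesCrawled": 2,
    "pageUrls": ["https://www.example.com", "https://www.example.com/team"],
}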
@@ -200,7 +200,6 @@ class AiCallPromptWebSearch(BaseModel):
instruction: str = Field(description="Search instruction/query for finding relevant URLs")
country: Optional[str] = Field(default=None, description="Two-digit country code (lowercase, e.g., ch, us, de, fr)")
maxNumberPages: Optional[int] = Field(default=10, description="Maximum number of pages to search (default: 10)")
timeRange: Optional[str] = Field(default=None, description="Time range filter (d, w, m, y)")
language: Optional[str] = Field(default=None, description="Language code (lowercase, e.g., de, en, fr)")
researchDepth: Optional[str] = Field(default="general", description="Research depth: fast (maxDepth=1), general (maxDepth=2), deep (maxDepth=3)")
@@ -186,12 +186,13 @@ class CountryCodes:
Get Tavily-compatible country name from ISO-2 code.

Args:
isoCode: ISO-2 country code (e.g., "CH", "US")
isoCode: ISO-2 country code (e.g., "CH", "ch", "US", "us")

Returns:
Country name in lowercase as required by Tavily (e.g., "switzerland", "united states")
"""
isoCodeUpper = isoCode.upper()
# Convert to uppercase for lookup
isoCodeUpper = isoCode.upper() if isoCode else ""
mapping = cls._COUNTRY_MAP.get(isoCodeUpper)
return mapping[0] if mapping else isoCode
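The documented lookup behaviour, in short; the method name is not visible in this hunk, so the calls below are hypothetical:

# Hypothetical calls illustrating the docstring above
CountryCodes.getTavilyCountryName("ch")  # -> "switzerland"
CountryCodes.getTavilyCountryName("US")  # -> "united states"
CountryCodes.getTavilyCountryName("zz")  # -> "zz" (unknown codes fall through unchanged)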
@@ -11,7 +11,7 @@ from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum, AiC
logger = logging.getLogger(__name__)


class WebcrawlService:
class WebService:
"""Service for web search and crawling operations."""

def __init__(self, services):
@@ -56,7 +56,6 @@ class WebcrawlService:
extractedUrls = analysisResult.get("urls", [])
needsSearch = analysisResult.get("needsSearch", True) # Default to True
maxNumberPages = analysisResult.get("maxNumberPages", 10)
timeRange = analysisResult.get("timeRange")
countryCode = analysisResult.get("country", country)
languageCode = analysisResult.get("language", language)
finalResearchDepth = analysisResult.get("researchDepth", researchDepth)
@@ -77,7 +76,6 @@ class WebcrawlService:
searchUrls = await self._performWebSearch(
instruction=instruction,
maxNumberPages=maxNumberPages - len(allUrls),
timeRange=timeRange,
country=countryCode,
language=languageCode
)
@@ -153,10 +151,9 @@ Extract and provide a JSON response with:
2. urls: List of URLs found in the prompt text
3. needsSearch: true if web search is needed to identify url's to crawl, false if only crawling of provided URLs is wanted
4. maxNumberPages: Recommended number of URLs to crawl (based on research scope, typical: 2-20)
5. timeRange: Time range if mentioned (d, w, m, y, or null)
6. country: Country code if specified (2-digit lowercase, e.g., ch, us, de)
7. language: Language code if specified (lowercase, e.g., de, en, fr)
8. researchDepth: Research depth based on instruction complexity - "fast" (quick overview, maxDepth=1), "general" (standard research, maxDepth=2), or "deep" (comprehensive research, maxDepth=3)
5. country: Country code if identified in the prompt (2-digit lowercase, e.g., ch, us, de)
6. language: Language identified from the prompt (lowercase, e.g., de, en, fr)
7. researchDepth: Research depth based on instruction complexity - "fast" (quick overview, maxDepth=1), "general" (standard research, maxDepth=2), or "deep" (comprehensive research, maxDepth=3)

Return ONLY valid JSON, no additional text:
{{
@@ -164,7 +161,6 @@ Return ONLY valid JSON, no additional text:
"urls": ["url1", "url2"],
"needsSearch": true,
"maxNumberPages": 10,
"timeRange": null,
"country": "ch",
"language": "de",
"researchDepth": "general"
@@ -188,7 +184,6 @@ Return ONLY valid JSON, no additional text:
"urls": [],
"needsSearch": True,
"maxNumberPages": 10,
"timeRange": None,
"country": country,
"language": language,
"researchDepth": researchDepth
@@ -198,7 +193,6 @@ Return ONLY valid JSON, no additional text:
self,
instruction: str,
maxNumberPages: int,
timeRange: Optional[str],
country: Optional[str],
language: Optional[str]
) -> List[str]:
@@ -209,7 +203,6 @@ Return ONLY valid JSON, no additional text:
instruction=instruction,
country=country,
maxNumberPages=maxNumberPages,
timeRange=timeRange,
language=language
)
searchPrompt = searchPromptModel.model_dump_json(exclude_none=True, indent=2)
@@ -269,7 +262,7 @@ Return ONLY valid JSON, no additional text:
instruction=instruction,
url=url, # Single URL
maxDepth=maxDepth,
maxWidth=10
maxWidth=50
)
crawlPrompt = crawlPromptModel.model_dump_json(exclude_none=True, indent=2)
@@ -170,7 +170,7 @@ class MethodAi(MethodBase):
- Output format: JSON with research results including URLs and content.

Parameters:
- prompt (str, required): Natural language research instruction, including time range if relevant.
- prompt (str, required): Natural language research instruction.
- list(url) (list, optional): Specific URLs to crawl, if needed.
- country (str, optional): Two-digit country code (lowercase, e.g., ch, us, de).
- language (str, optional): Language code (lowercase, e.g., de, en, fr).
@@ -1,6 +1,9 @@
#!/usr/bin/env python3
"""
AI Models Test - Tests all available AI models individually
AI Models Test - Tests WEB_CRAWL functionality on all models that support it

This script tests all models that have WEB_CRAWL capability, validates that
they can crawl specific URLs and return content, and analyzes the quality of results.
"""

import asyncio
@@ -53,9 +56,18 @@ class AIModelsTester:

async def initialize(self):
"""Initialize the AI service."""
# Set logging level to INFO to reduce noise
# Set logging level to DEBUG for detailed output
import logging
logging.getLogger().setLevel(logging.INFO)
logging.getLogger().setLevel(logging.DEBUG)

# Initialize the model registry with all connectors
from modules.aicore.aicoreModelRegistry import modelRegistry
from modules.aicore.aicorePluginTavily import AiTavily
from modules.aicore.aicorePluginPerplexity import AiPerplexity

# Register web connectors that support WEB_CRAWL
modelRegistry.registerConnector(AiTavily())
modelRegistry.registerConnector(AiPerplexity())

# The AI service needs to be recreated with proper initialization
from modules.services.serviceAi.mainServiceAi import AiService
@@ -86,27 +98,53 @@ class AIModelsTester:
print(f"📁 Results will be saved to: {self.modelTestDir}")

async def testModel(self, modelName: str) -> Dict[str, Any]:
"""Test a specific AI model with a simple prompt."""
"""Test a specific AI model with WEB_CRAWL operation."""
print(f"\n{'='*60}")
print(f"TESTING MODEL: {modelName}")
print(f"OPERATION TYPE: WEB_CRAWL")
print(f"{'='*60}")

# Use same prompt for all web models
import json
# CRAWL CONFIGURATION
# Deep and Broad Web Crawl Example:
# - maxDepth: 3 (deep) - follows links up to 3 levels from starting page
# - Level 1: Starting page
# - Level 2: Pages linked from starting page
# - Level 3: Pages linked from Level 2 pages
# - maxWidth: 50 (broad) - crawls up to 50 pages at each depth level
# This results in potential maximum of ~1,250 pages (if 50 links exist at each level)
#
# Common configurations:
# - Fast/Overview: maxDepth=1, maxWidth=5 (shallow, focused)
# - General/Standard: maxDepth=2, maxWidth=10 (balanced)
# - Deep and Broad: maxDepth=3, maxWidth=50 (comprehensive)

if "tavily" in modelName.lower() or "perplexity" in modelName.lower() or "llama" in modelName.lower() or "sonar" in modelName.lower() or "mistral" in modelName.lower():
# All web models use the same JSON formatted prompt
# Country format: Use full name for Tavily (Switzerland), Perplexity converts ISO codes to names
CRAWL_DEPTH = 3 # Deep crawl: follows links 3 levels deep
CRAWL_WIDTH = 50 # Broad crawl: up to 50 pages per level

print(f"Crawl Configuration:")
print(f" - Depth: {CRAWL_DEPTH} levels (deep)")
print(f" - Width: {CRAWL_WIDTH} pages per level (broad)")
print(f" - Theoretical max: {CRAWL_WIDTH ** min(CRAWL_DEPTH, 3)} pages")

# Use WEB_CRAWL specific prompt format
from modules.datamodels.datamodelAi import AiCallPromptWebCrawl

# Test with simple prompt like playground example
simplePrompt = f"https://www.valueon.ch: Who works in this company?"

# But keep structured format for now to match our API
testPrompt = json.dumps({
"prompt": "Research, what ValueOn company in switzerland does and who works there? Return as JSON.",
"maxResults": 5,
"timeRange": "y",
"country": "CH", # ISO-2 code, Perplexity will convert to "Switzerland"
"format": "json"
"instruction": "Who works in this company?",
"url": "https://www.valueon.ch",
"maxDepth": CRAWL_DEPTH,
"maxWidth": CRAWL_WIDTH
}, indent=2)
else:
# Fallback for other models
testPrompt = "Generate a comprehensive analysis of the current state of artificial intelligence. Return as JSON."

print(f"Simple prompt (playground style): {simplePrompt}")

# For Tavily models, test direct API call for better link following
if "tavily" in modelName.lower():
return await self._testTavilyDirect(modelName, CRAWL_DEPTH, CRAWL_WIDTH)

print(f"Test prompt: {testPrompt}")
print(f"Prompt length: {len(testPrompt)} characters")
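The page-count figures quoted in the comments above depend on how the crawler interprets maxWidth. A quick back-of-the-envelope check under two readings (illustrative only, not from the commit):

CRAWL_DEPTH = 3
CRAWL_WIDTH = 50

# Reading 1: maxWidth caps the pages kept at each level below the start page
capped_total = 1 + CRAWL_WIDTH * (CRAWL_DEPTH - 1)                        # 1 + 50 + 50 = 101

# Reading 2: every page exposes maxWidth new links (uncapped fan-out upper bound)
fanout_total = sum(CRAWL_WIDTH ** level for level in range(CRAWL_DEPTH))  # 1 + 50 + 2500 = 2551

print(capped_total, fanout_total)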
@@ -114,15 +152,9 @@ class AIModelsTester:
startTime = asyncio.get_event_loop().time()

try:
# Create options to force this specific model
if "internal" in modelName.lower():
# Create options for WEB_CRAWL operation
options = AiCallOptions(
operationType=OperationTypeEnum.DATA_EXTRACT,
preferredModel=modelName
)
else:
options = AiCallOptions(
operationType=OperationTypeEnum.DATA_GENERATE,
operationType=OperationTypeEnum.WEB_CRAWL,
preferredModel=modelName
)

@@ -140,22 +172,7 @@ class AIModelsTester:
import base64
import os

# Prepare messages and options based on model type
if "vision" in modelName.lower():
# For vision models, skip for now since they require special handling
print(f"⚠️ Skipping vision model {modelName} - requires special image handling")
return {
"modelName": modelName,
"status": "SKIPPED",
"processingTime": 0.0,
"responseLength": 0,
"responseType": "skipped",
"hasContent": False,
"error": "Vision model requires special image handling",
"fullResponse": "Skipped - vision model requires special image handling"
}
else:
# For other models, use normal functionCall
# For WEB_CRAWL models, use normal functionCall with structured prompt
messages = [{"role": "user", "content": testPrompt}]
modelCall = AiModelCall(
messages=messages,
@@ -185,6 +202,10 @@ class AIModelsTester:
"bytesReceived": len(response.content.encode('utf-8')) if response.content else 0
}

# Extract actual prompt sent if available in metadata
if hasattr(response, 'metadata') and response.metadata:
result["actualPromptSent"] = response.metadata.get("actualPromptSent", "N/A")

# Try to parse content as JSON
if response.content:
try:
@@ -289,9 +310,16 @@ class AIModelsTester:
print(f"📄 Response length: {len(str(response))} characters")
print(f"📄 Response preview: {result['responsePreview']}")

# Save text response for all models
if result.get("status") == "SUCCESS":
self._saveTextResponse(modelName, result)
# Add prompt to result for logging
result["testPrompt"] = testPrompt
result["crawlConfig"] = {
"depth": CRAWL_DEPTH,
"width": CRAWL_WIDTH
}

# For WEB_CRAWL, also validate that content was extracted
if result.get("status") == "SUCCESS" and result.get("fullResponse"):
self._validateCrawlResponse(modelName, result)

except Exception as e:
endTime = asyncio.get_event_loop().time()
@@ -304,13 +332,22 @@ class AIModelsTester:
"responseLength": 0,
"responseType": "exception",
"hasContent": False,
"error": str(e)
"error": str(e),
"testPrompt": testPrompt,
"crawlConfig": {
"depth": CRAWL_DEPTH,
"width": CRAWL_WIDTH
}
}

print(f"💥 EXCEPTION - {str(e)}")

self.testResults.append(result)

# Save text response even for exceptions to log the prompt
if result.get("status") in ["SUCCESS", "EXCEPTION", "ERROR"]:
self._saveTextResponse(modelName, result)

# Save individual model result immediately
self._saveIndividualModelResult(modelName, result)
@@ -378,6 +415,19 @@ class AIModelsTester:
if not content:
content = result.get("responsePreview", "No content available")

# If there's an error, include it in the content
if result.get("error"):
content = f"ERROR: {result.get('error')}\n\n{content}"

# Get prompt and config for logging
config = result.get("crawlConfig", {})
crawlDepth = config.get("depth", "N/A")
crawlWidth = config.get("width", "N/A")

# Get both the original JSON prompt and the actual prompt sent
originalPrompt = result.get("testPrompt", "N/A")
actualPromptSent = result.get("actualPromptSent", "N/A")

# Add metadata header
metadata = f"""Model: {modelName}
Test Time: {timestamp}
@@ -385,6 +435,23 @@ Status: {result.get('status', 'Unknown')}
Processing Time: {result.get('processingTime', 0):.2f}s
Response Length: {result.get('responseLength', 0)} characters
Is Valid JSON: {result.get('isValidJson', False)}
Test Method: {result.get('testMethod', 'standard')}
Pages Crawled: {result.get('pagesCrawled', 'N/A')}
Crawled URL: {result.get('crawledUrl', 'N/A')}
Has URL: {result.get('hasUrl', 'N/A')}
Has Title: {result.get('hasTitle', 'N/A')}
Has Content: {result.get('hasContent', 'N/A')}
Content Length: {result.get('contentLength', 'N/A')} characters

--- CRAWL CONFIGURATION ---
Depth: {crawlDepth}
Width: {crawlWidth}

--- ORIGINAL JSON PROMPT (input) ---
{originalPrompt}

--- ACTUAL PROMPT SENT TO API (EXACT) ---
{actualPromptSent}

--- RESPONSE CONTENT ---
{content}
@@ -400,6 +467,174 @@ Is Valid JSON: {result.get('isValidJson', False)}
print(f"❌ Error saving text response: {str(e)}")
result["textSaveError"] = str(e)

def _validateCrawlResponse(self, modelName: str, result: Dict[str, Any]):
"""Validate that the WEB_CRAWL response contains crawled content."""
try:
content = result.get("fullResponse", "")

# Try to parse as JSON
crawledData = {}
try:
parsed = json.loads(content)
if isinstance(parsed, dict):
crawledData = parsed
except:
pass

# Check for expected fields: url, title, content
hasUrl = bool(crawledData.get("url"))
hasTitle = bool(crawledData.get("title"))
hasContent = bool(crawledData.get("content"))
contentLength = len(crawledData.get("content", ""))

result["hasUrl"] = hasUrl
result["hasTitle"] = hasTitle
result["hasContent"] = hasContent
result["contentLength"] = contentLength
result["crawledUrl"] = crawledData.get("url", "")

if hasUrl and hasContent:
print(f"✅ Successfully crawled content from URL: {crawledData.get('url', 'unknown')}")
print(f" Content length: {contentLength} characters")
print(f" Title: {crawledData.get('title', 'N/A')}")
else:
print(f"⚠️ Incomplete crawl response - URL: {hasUrl}, Content: {hasContent}")

except Exception as e:
print(f"❌ Error validating crawl response: {str(e)}")
result["crawlValidationError"] = str(e)

async def _testTavilyDirect(self, modelName: str, crawlDepth: int = 3, crawlWidth: int = 50) -> Dict[str, Any]:
"""Test Tavily API directly using the crawl() method with better link following."""
print(f"\n{'='*60}")
print(f"TESTING TAVILY DIRECT API (crawl method)")
print(f"{'='*60}")

startTime = asyncio.get_event_loop().time()

try:
from tavily import AsyncTavilyClient
from modules.shared.configuration import APP_CONFIG

apiKey = APP_CONFIG.get("Connector_AiTavily_API_SECRET")
if not apiKey:
raise Exception("Tavily API key not found")

client = AsyncTavilyClient(api_key=apiKey)

# Map our configuration to Tavily parameters
# maxWidth -> limit (pages per level)
# maxDepth -> max_depth (link following depth)
# max_breadth = maxWidth (breadth of crawl at each level)
tavilyLimit = crawlWidth
tavilyMaxDepth = crawlDepth
tavilyMaxBreadth = crawlWidth

print(f"Calling Tavily API with crawl() method...")
print(f"URL: https://www.valueon.ch")
print(f"Instructions: Who works in this company?")
print(f"Limit: {tavilyLimit} pages per level")
print(f"Max depth: {tavilyMaxDepth} (follows links {tavilyMaxDepth} levels deep)")
print(f"Max breadth: {tavilyMaxBreadth} (up to {tavilyMaxBreadth} pages at each level)")
print(f"Deep and Broad Crawl Configuration Active")

response = await client.crawl(
url="https://www.valueon.ch",
instructions="Who works in this company?",
limit=tavilyLimit,
max_depth=tavilyMaxDepth,
max_breadth=tavilyMaxBreadth
)

endTime = asyncio.get_event_loop().time()
processingTime = endTime - startTime

# Analyze response
contentLength = 0
pagesCrawled = 0
fullContent = ""

if isinstance(response, dict):
# Check if it has results
if "results" in response:
results = response["results"]
pagesCrawled = len(results)
content_parts = []
for result in results:
url = result.get("url", "")
title = result.get("title", "")
content = result.get("raw_content", result.get("content", ""))
content_parts.append(f"URL: {url}\nTitle: {title}\nContent: {content}\n{'='*60}\n")
contentLength += len(content)

fullContent = "\n".join(content_parts)
else:
fullContent = json.dumps(response, indent=2)
contentLength = len(fullContent)
elif isinstance(response, list):
pagesCrawled = len(response)
content_parts = []
for item in response:
if isinstance(item, dict):
url = item.get("url", "")
title = item.get("title", "")
content = item.get("raw_content", item.get("content", ""))
content_parts.append(f"URL: {url}\nTitle: {title}\nContent: {content}\n{'='*60}\n")
contentLength += len(content)

fullContent = "\n".join(content_parts)
else:
fullContent = str(response)
contentLength = len(fullContent)

result = {
"modelName": modelName,
"status": "SUCCESS",
"processingTime": round(processingTime, 2),
"responseLength": contentLength,
"responseType": "TavilyDirectAPI",
"hasContent": True,
"error": None,
"modelUsed": modelName,
"priceUsd": 0.0,
"bytesSent": 0,
"bytesReceived": contentLength,
"isValidJson": True,
"fullResponse": fullContent,
"pagesCrawled": pagesCrawled,
"testMethod": "direct_api_crawl"
}

print(f"✅ SUCCESS - Processing time: {processingTime:.2f}s")
print(f"📄 Pages crawled: {pagesCrawled}")
print(f"📄 Total content length: {contentLength} characters")

# Save the response
self._saveTextResponse(modelName, result)
self._validateCrawlResponse(modelName, result)
self._saveIndividualModelResult(modelName, result)

self.testResults.append(result)
return result

except Exception as e:
endTime = asyncio.get_event_loop().time()
processingTime = endTime - startTime

result = {
"modelName": modelName,
"status": "EXCEPTION",
"processingTime": round(processingTime, 2),
"responseLength": 0,
"responseType": "exception",
"hasContent": False,
"error": str(e)
}

print(f"💥 EXCEPTION - {str(e)}")
self.testResults.append(result)
return result

def _saveIndividualModelResult(self, modelName: str, result: Dict[str, Any]):
"""Save individual model test result to file."""
try:
@@ -425,22 +660,30 @@ Is Valid JSON: {result.get('isValidJson', False)}
print(f"❌ Error saving individual result: {str(e)}")

def getAllAvailableModels(self) -> List[str]:
"""Get all available model names."""
# Hardcoded list of known models - same approach as test_ai_behavior.py
return [
# "claude-3-5-sonnet-20241022", # Skipped - text model, test later
# "claude-3-5-sonnet-20241022-vision", # Skipped - requires image input
# "gpt-4o", # Skipped - text model, test later
# "gpt-3.5-turbo", # Skipped - text model, test later
# "gpt-4o-vision", # Skipped - requires image input
# "dall-e-3", # Skipped - image generation, test later
"sonar", # Perplexity web model
"sonar-pro", # Perplexity web model
"tavily-search", # Tavily web model (unified research)
# "internal-extractor", # Skipped - internal model, test later
# "internal-generator", # Skipped - internal model, test later
# "internal-renderer" # Skipped - internal model, test later
]
"""Get all available model names that support WEB_CRAWL."""
from modules.aicore.aicoreModelRegistry import modelRegistry
from modules.datamodels.datamodelAi import OperationTypeEnum

# Get all models from registry
allModels = modelRegistry.getAvailableModels()

# Filter models that support WEB_CRAWL
webCrawlModels = []
for model in allModels:
if model.operationTypes and any(
ot.operationType == OperationTypeEnum.WEB_CRAWL
for ot in model.operationTypes
): # Include both Tavily and Perplexity models
webCrawlModels.append(model.name)

# Filter to only "sonar" model for testing
webCrawlModels = [m for m in webCrawlModels if m == "sonar"]

print(f"Found {len(webCrawlModels)} models that support WEB_CRAWL (filtered to sonar):")
for modelName in webCrawlModels:
print(f" - {modelName}")

return webCrawlModels

def saveTestResults(self):
"""Save detailed test results to file."""
@@ -508,6 +751,15 @@ Is Valid JSON: {result.get('isValidJson', False)}
if result.get("isValidJson") is not None:
print(f" Valid JSON: {'Yes' if result['isValidJson'] else 'No'}")

if result.get("crawledUrl"):
print(f" Crawled URL: {result['crawledUrl']}")

if result.get("contentLength") is not None:
print(f" Content length: {result['contentLength']} characters")

if result.get("pagesCrawled") is not None:
print(f" Pages crawled: {result['pagesCrawled']}")

if result["error"]:
print(f" Error: {result['error']}")
@@ -526,11 +778,31 @@ Is Valid JSON: {result.get('isValidJson', False)}
print(f"🚀 Fastest model: {fastest['modelName']} ({fastest['processingTime']}s)")
print(f"🐌 Slowest model: {slowest['modelName']} ({slowest['processingTime']}s)")

# Find models with most content
modelsWithContent = [r for r in successfulResults if r.get("contentLength", 0) > 0]
if modelsWithContent:
mostContent = max(modelsWithContent, key=lambda x: x.get("contentLength", 0))
totalContent = sum(r.get("contentLength", 0) for r in modelsWithContent)
avgContent = totalContent / len(modelsWithContent)
print(f"📄 Model with most content: {mostContent['modelName']} ({mostContent.get('contentLength', 0)} chars)")
print(f"📊 Average content per model: {avgContent:.0f} characters")
print(f"📊 Total content crawled across all models: {totalContent} characters")

# Find models with most pages crawled (for Tavily direct API)
modelsWithPages = [r for r in successfulResults if r.get("pagesCrawled", 0) > 0]
if modelsWithPages:
mostPages = max(modelsWithPages, key=lambda x: x.get("pagesCrawled", 0))
totalPages = sum(r.get("pagesCrawled", 0) for r in modelsWithPages)
avgPages = totalPages / len(modelsWithPages)
print(f"🔍 Model with most pages crawled: {mostPages['modelName']} ({mostPages.get('pagesCrawled', 0)} pages)")
print(f"📊 Average pages per model: {avgPages:.1f} pages")
print(f"📊 Total pages crawled across all models: {totalPages} pages")

async def main():
"""Run AI models testing."""
"""Run AI models testing for WEB_CRAWL operation."""
tester = AIModelsTester()

print("Starting AI Models Testing...")
print("Starting AI Models Testing for WEB_CRAWL...")
print("Initializing AI service...")
await tester.initialize()
@@ -542,8 +814,9 @@ async def main():
print(f" {i}. {model}")

print(f"\n{'='*80}")
print("STARTING INDIVIDUAL MODEL TESTS")
print("STARTING WEB_CRAWL TESTS")
print(f"{'='*80}")
print("Testing each model's ability to crawl URLs and return content...")
print("Press Enter after each model test to continue to the next one...")

# Test each model individually