From 2489719c629f3ec9196b0079a08167a22346e37b Mon Sep 17 00:00:00 2001 From: ValueOn AG Date: Sun, 26 Oct 2025 18:17:17 +0100 Subject: [PATCH] ai models ready for web and txt --- modules/aicore/aicorePluginPerplexity.py | 126 ++++-- modules/aicore/aicorePluginTavily.py | 233 +++++----- modules/datamodels/datamodelAi.py | 1 - modules/datamodels/datamodelTools.py | 5 +- modules/services/serviceWeb/mainServiceWeb.py | 17 +- modules/workflows/methods/methodAi.py | 2 +- test_ai_models.py | 425 ++++++++++++++---- 7 files changed, 577 insertions(+), 232 deletions(-) diff --git a/modules/aicore/aicorePluginPerplexity.py b/modules/aicore/aicorePluginPerplexity.py index 23bffc48..51b585b9 100644 --- a/modules/aicore/aicorePluginPerplexity.py +++ b/modules/aicore/aicorePluginPerplexity.py @@ -57,7 +57,7 @@ class AiPerplexity(BaseConnectorAi): connectorType="perplexity", apiUrl="https://api.perplexity.ai/chat/completions", temperature=0.2, - maxTokens=4000, + maxTokens=24000, # Increased for detailed web crawl responses (Perplexity supports up to 25k) contextLength=32000, costPer1kTokensInput=0.005, costPer1kTokensOutput=0.005, @@ -80,12 +80,12 @@ class AiPerplexity(BaseConnectorAi): connectorType="perplexity", apiUrl="https://api.perplexity.ai/chat/completions", temperature=0.2, - maxTokens=4000, + maxTokens=24000, # Increased for detailed web crawl responses (Perplexity supports up to 25k) contextLength=32000, costPer1kTokensInput=0.01, costPer1kTokensOutput=0.01, speedRating=6, # Slower due to AI analysis - qualityRating=10, # Best AI analysis quality + qualityRating=9, # Best AI analysis quality # capabilities removed (not used in business logic) functionCall=self._routeWebOperation, priority=PriorityEnum.QUALITY, @@ -217,6 +217,42 @@ class AiPerplexity(BaseConnectorAi): # Fallback to basic call return await self.callAiBasic(modelCall) + def _getDepthInstructions(self, maxDepth: int) -> str: + """ + Map maxDepth (numeric) to instructional text for LLM. + + Args: + maxDepth: 1 (fast/overview), 2 (general/standard), 3 (deep/comprehensive) + + Returns: + Instructional text for the LLM + """ + depthMap = { + 1: "Basic overview - extract main content from the main page only", + 2: "Standard crawl - extract content from main page and linked pages (2 levels deep)", + 3: "Deep crawl - comprehensively extract content from main page and all accessible linked pages (3+ levels deep)" + } + return depthMap.get(maxDepth, depthMap[2]) + + def _getWidthInstructions(self, maxWidth: int) -> str: + """ + Map maxWidth (numeric) to instructional text for LLM. + + Args: + maxWidth: Number of pages to crawl at each level (default: 10) + + Returns: + Instructional text for the LLM + """ + if maxWidth <= 5: + return f"Focused crawl - limit to {maxWidth} most relevant pages per level" + elif maxWidth <= 15: + return f"Standard breadth - crawl up to {maxWidth} pages per level" + elif maxWidth <= 30: + return f"Wide crawl - crawl up to {maxWidth} pages per level, prioritize quality" + else: + return f"Extensive crawl - crawl up to {maxWidth} pages per level, comprehensive coverage" + async def webSearch(self, modelCall: AiModelCall) -> AiModelResponse: """ WEB_SEARCH operation - returns list of URLs based on search query. @@ -253,8 +289,6 @@ class AiPerplexity(BaseConnectorAi): Return a JSON array of {webSearchPrompt.maxNumberPages} most relevant URLs. {'' if not countryName else f'Focus on results from {countryName}.'} -{'' if not webSearchPrompt.timeRange else f'Limit to results from the last {webSearchPrompt.timeRange}'} -{'' if not webSearchPrompt.language else f'Return results in {webSearchPrompt.language} language'} Return ONLY a JSON array of URLs, no additional text: [ @@ -293,6 +327,15 @@ Return ONLY a JSON array of URLs, no additional text: """ WEB_CRAWL operation - crawls ONE URL and returns content. + Perplexity API Parameters Used: + - messages: The prompt containing URL and instruction + - max_tokens: Maximum response length + - max_results: Number of search results (1-20, default: 10) + - temperature: Response randomness (not web search specific) + + Pagination: Perplexity does NOT return paginated responses. + A single response contains all results within max_tokens limit. + Args: modelCall: AiModelCall with AiCallPromptWebCrawl as prompt @@ -316,55 +359,68 @@ Return ONLY a JSON array of URLs, no additional text: webCrawlPrompt = AiCallPromptWebCrawl(**promptData) # Build crawl request for Perplexity - ONE URL - crawlPrompt = f"""Crawl and extract content from this URL based on the instruction: - -INSTRUCTION: '{webCrawlPrompt.instruction}' - -URL to crawl (maxDepth={webCrawlPrompt.maxDepth}): -{webCrawlPrompt.url} - -Extract and return the relevant content based on the instruction. -Return as JSON object with this structure: -{{ - "url": "{webCrawlPrompt.url}", - "title": "Page title", - "content": "Extracted content relevant to the instruction" -}} - -Return ONLY valid JSON, no additional text.""" + # Match playground prompt style: just URL + question + # This allows Perplexity to return detailed multi-source results + crawlPrompt = f"{webCrawlPrompt.url}: {webCrawlPrompt.instruction}" + + # Build payload with optional Perplexity parameters + # Note: max_tokens_per_page may not be supported by chat/completions endpoint + # The playground Python SDK might use a different internal API + maxResults = min(webCrawlPrompt.maxWidth or 10, 20) # Max 20 results payload = { "model": model.name, "messages": [{"role": "user", "content": crawlPrompt}], "temperature": temperature, - "max_tokens": maxTokens + "max_tokens": maxTokens, # Use model's configured maxTokens (24000) + "max_results": maxResults, + "return_citations": True # Request citations explicitly } + logger.info(f"Perplexity crawl payload: model={model.name}, prompt_length={len(crawlPrompt)}, max_tokens={maxTokens}, max_results={maxResults}") + response = await self.httpClient.post(model.apiUrl, json=payload) if response.status_code != 200: raise HTTPException(status_code=500, detail=f"Perplexity Web Crawl API error: {response.text}") apiResponse = response.json() + + # Extract the main content content = apiResponse["choices"][0]["message"]["content"] - # Parse JSON content and ensure it's a single object - import json - try: - parsedContent = json.loads(content) - # Ensure it's a single object, not an array - if isinstance(parsedContent, list): - parsedContent = parsedContent[0] if parsedContent else {} - except: - # If not JSON, create structured response - parsedContent = {"url": webCrawlPrompt.url, "title": "", "content": content} + # Check for citations or search results in the response + citations = apiResponse.get("citations", []) + searchResults = apiResponse.get("search_results", []) - # Return as JSON string + # Log what we found + if citations: + logger.info(f"Found {len(citations)} citations in response") + if searchResults: + logger.info(f"Found {len(searchResults)} search results in response") + logger.debug(f"API response keys: {list(apiResponse.keys())}") + + # Build comprehensive response with citations if available + import json + responseData = { + "content": content, + "citations": citations if citations else [], + "search_results": searchResults if searchResults else [] + } + + # Return comprehensive response return AiModelResponse( - content=json.dumps(parsedContent, indent=2), + content=json.dumps(responseData, indent=2) if (citations or searchResults) else content, success=True, modelId=model.name, - metadata={"response_id": apiResponse.get("id", ""), "operation": "WEB_CRAWL", "url": webCrawlPrompt.url} + metadata={ + "response_id": apiResponse.get("id", ""), + "operation": "WEB_CRAWL", + "url": webCrawlPrompt.url, + "actualPromptSent": crawlPrompt, + "has_citations": len(citations) > 0, + "has_search_results": len(searchResults) > 0 + } ) except Exception as e: diff --git a/modules/aicore/aicorePluginTavily.py b/modules/aicore/aicorePluginTavily.py index 9320bba7..fb454f1f 100644 --- a/modules/aicore/aicorePluginTavily.py +++ b/modules/aicore/aicorePluginTavily.py @@ -27,7 +27,8 @@ class WebCrawlResult: content: str title: Optional[str] = None -class ConnectorWeb(BaseConnectorAi): + +class AiTavily(BaseConnectorAi): """Tavily web search connector.""" def __init__(self): @@ -42,7 +43,36 @@ class ConnectorWeb(BaseConnectorAi): self.webSearchMaxResults: int = 20 # Initialize client if API key is available self._initializeClient() - + + + def getModels(self) -> List[AiModel]: + """Get all available Tavily models.""" + return [ + AiModel( + name="tavily-search", + displayName="Tavily Search & Research", + connectorType="tavily", + apiUrl="https://api.tavily.com", + temperature=0.0, # Web search doesn't use temperature + maxTokens=0, # Web search doesn't use tokens + contextLength=0, + costPer1kTokensInput=0.0, + costPer1kTokensOutput=0.0, + speedRating=8, # Good speed for search and extract + qualityRating=9, # Excellent quality for web research + # capabilities removed (not used in business logic) + functionCall=self._routeWebOperation, + priority=PriorityEnum.BALANCED, + processingMode=ProcessingModeEnum.BASIC, + operationTypes=createOperationTypeRatings( + (OperationTypeEnum.WEB_SEARCH, 9), + (OperationTypeEnum.WEB_CRAWL, 10) + ), + version="tavily-search", + calculatePriceUsd=lambda processingTime, bytesSent, bytesReceived: 0.008 # Simple flat rate + ) + ] + def _initializeClient(self): """Initialize the Tavily client if API key is available.""" try: @@ -206,34 +236,6 @@ class ConnectorWeb(BaseConnectorAi): return filteredResults - def getModels(self) -> List[AiModel]: - """Get all available Tavily models.""" - return [ - AiModel( - name="tavily-search", - displayName="Tavily Search & Research", - connectorType="tavily", - apiUrl="https://api.tavily.com", - temperature=0.0, # Web search doesn't use temperature - maxTokens=0, # Web search doesn't use tokens - contextLength=0, - costPer1kTokensInput=0.0, - costPer1kTokensOutput=0.0, - speedRating=8, # Good speed for search and extract - qualityRating=9, # Excellent quality for web research - # capabilities removed (not used in business logic) - functionCall=self._routeWebOperation, - priority=PriorityEnum.BALANCED, - processingMode=ProcessingModeEnum.BASIC, - operationTypes=createOperationTypeRatings( - (OperationTypeEnum.WEB_SEARCH, 9), - (OperationTypeEnum.WEB_CRAWL, 8) - ), - version="tavily-search", - calculatePriceUsd=lambda processingTime, bytesSent, bytesReceived: 0.008 # Simple flat rate - ) - ] - @classmethod async def create(cls): apiKey = APP_CONFIG.get("Connector_AiTavily_API_SECRET") @@ -273,10 +275,9 @@ class ConnectorWeb(BaseConnectorAi): topic: str | None = None, includeDomains: list[str] | None = None, excludeDomains: list[str] | None = None, - language: str | None = None, country: str | None = None, - includeAnswer: bool | None = None, - includeRawContent: bool | None = None, + includeAnswer: str | None = None, + includeRawContent: str | None = None, ) -> list[WebSearchResult]: """Calls the Tavily API to perform a web search.""" # Make sure maxResults is within the allowed range (use cached values) @@ -298,8 +299,6 @@ class ConnectorWeb(BaseConnectorAi): kwargs["include_domains"] = includeDomains if excludeDomains is not None: kwargs["exclude_domains"] = excludeDomains - if language is not None: - kwargs["language"] = language if country is not None: kwargs["country"] = country if includeAnswer is not None: @@ -307,7 +306,8 @@ class ConnectorWeb(BaseConnectorAi): if includeRawContent is not None: kwargs["include_raw_content"] = includeRawContent - logger.debug(f"Tavily.search kwargs: {kwargs}") + # Log the final API call parameters for comparison + logger.info(f"Tavily API call parameters: {kwargs}") # Ensure client is initialized if self.client is None: @@ -316,7 +316,11 @@ class ConnectorWeb(BaseConnectorAi): raise ValueError("Tavily client not initialized. Please check API key configuration.") response = await self.client.search(**kwargs) - + + # Return all results without score filtering + # Tavily's scoring is already applied by the API + logger.info(f"Tavily returned {len(response.get('results', []))} results") + return [ WebSearchResult( title=result["title"], @@ -328,69 +332,77 @@ class ConnectorWeb(BaseConnectorAi): async def _crawl( self, - urls: list, - extractDepth: str | None = None, - format: str | None = None, + url: str, + instructions: str | None = None, + limit: int = 20, + maxDepth: int = 2, + maxBreadth: int = 40, ) -> list[WebCrawlResult]: - """Calls the Tavily API to extract text content from URLs with retry logic.""" + """Calls the Tavily API to crawl ONE URL with link following and retry logic.""" maxRetries = self.crawlMaxRetries retryDelay = self.crawlRetryDelay timeout = self.crawlTimeout - logger.debug(f"Starting crawl of {len(urls)} URLs: {urls}") - logger.debug(f"Crawl settings: extractDepth={extractDepth}, format={format}, timeout={timeout}s") + logger.debug(f"Starting crawl of URL: {url}") + logger.debug(f"Crawl settings: instructions={instructions}, limit={limit}, maxDepth={maxDepth}, maxBreadth={maxBreadth}, timeout={timeout}s") for attempt in range(maxRetries + 1): try: logger.debug(f"Crawl attempt {attempt + 1}/{maxRetries + 1}") - # Use asyncio.wait_for for timeout - # Build kwargs for extract - kwargsExtract: dict = {"urls": urls} - kwargsExtract["extract_depth"] = extractDepth or "advanced" - kwargsExtract["format"] = format or "markdown" # Use markdown to get HTML structure - - logger.debug(f"Sending request to Tavily with kwargs: {kwargsExtract}") - # Ensure client is initialized if self.client is None: self._initializeClient() if self.client is None: raise ValueError("Tavily client not initialized. Please check API key configuration.") + logger.debug(f"Crawling URL: {url}") + + # Build kwargs for crawl + kwargsCrawl: dict = {"url": url} + if instructions: + kwargsCrawl["instructions"] = instructions + if limit: + kwargsCrawl["limit"] = limit + if maxDepth: + kwargsCrawl["max_depth"] = maxDepth + if maxBreadth: + kwargsCrawl["max_breadth"] = maxBreadth + + logger.debug(f"Sending request to Tavily with kwargs: {kwargsCrawl}") + response = await asyncio.wait_for( - self.client.extract(**kwargsExtract), + self.client.crawl(**kwargsCrawl), timeout=timeout ) - - logger.debug(f"Tavily response received: {list(response.keys())}") - # Debug: Log what Tavily actually returns - if "results" in response and response["results"]: - logger.debug(f"Tavily returned {len(response['results'])} results") - logger.debug(f"First result keys: {list(response['results'][0].keys())}") - logger.debug(f"First result has raw_content: {'raw_content' in response['results'][0]}") - - # Log each result - for i, result in enumerate(response["results"]): - logger.debug(f"Result {i+1}: URL={result.get('url', 'N/A')}, content_length={len(result.get('raw_content', result.get('content', '')))}") + logger.debug(f"Tavily response received: {type(response)}") + + # Parse response - could be dict with results or list + if isinstance(response, dict) and "results" in response: + pageResults = response["results"] + elif isinstance(response, list): + pageResults = response else: - logger.warning(f"Tavily returned no results in response: {response}") + logger.warning(f"Unexpected response format: {type(response)}") + pageResults = [] - results = [ - WebCrawlResult( - url=result["url"], - content=result.get("raw_content", result.get("content", "")), # Try raw_content first, fallback to content - title=result.get("title", "") # Extract title if available - ) - for result in response["results"] - ] + logger.debug(f"Got {len(pageResults)} pages from crawl") - logger.debug(f"Crawl successful: extracted {len(results)} results") + # Convert to WebCrawlResult format + results = [] + for result in pageResults: + results.append(WebCrawlResult( + url=result.get("url", url), + content=result.get("raw_content", result.get("content", "")), + title=result.get("title", "") + )) + + logger.debug(f"Crawl successful: extracted {len(results)} pages from URL") return results except asyncio.TimeoutError: - logger.warning(f"Crawl attempt {attempt + 1} timed out after {timeout} seconds for URLs: {urls}") + logger.warning(f"Crawl attempt {attempt + 1} timed out after {timeout} seconds for URL: {url}") if attempt < maxRetries: logger.info(f"Retrying in {retryDelay} seconds...") await asyncio.sleep(retryDelay) @@ -398,21 +410,20 @@ class ConnectorWeb(BaseConnectorAi): raise Exception(f"Crawl failed after {maxRetries + 1} attempts due to timeout") except Exception as e: - logger.warning(f"Crawl attempt {attempt + 1} failed for URLs {urls}: {str(e)}") + logger.warning(f"Crawl attempt {attempt + 1} failed for URL {url}: {str(e)}") logger.debug(f"Full error details: {type(e).__name__}: {str(e)}") # Check if it's a validation error and log more details if "validation" in str(e).lower(): logger.debug(f"URL validation failed. Checking URL format:") - for i, url in enumerate(urls): - logger.debug(f" URL {i+1}: '{url}' (length: {len(url)})") - # Check for common URL issues - if ' ' in url: - logger.debug(f" WARNING: URL contains spaces!") - if not url.startswith(('http://', 'https://')): - logger.debug(f" WARNING: URL doesn't start with http/https!") - if len(url) > 2000: - logger.debug(f" WARNING: URL is very long ({len(url)} chars)") + logger.debug(f" URL: '{url}' (length: {len(url)})") + # Check for common URL issues + if ' ' in url: + logger.debug(f" WARNING: URL contains spaces!") + if not url.startswith(('http://', 'https://')): + logger.debug(f" WARNING: URL doesn't start with http/https!") + if len(url) > 2000: + logger.debug(f" WARNING: URL is very long ({len(url)} chars)") if attempt < maxRetries: logger.info(f"Retrying in {retryDelay} seconds...") @@ -468,15 +479,15 @@ class ConnectorWeb(BaseConnectorAi): if countryName: countryName = self._convertIsoCodeToCountryName(countryName) - # Perform search + # Perform search - use exact parameters from prompt + # NOTE: timeRange parameter causes generic results, so we don't use it searchResults = await self._search( query=webSearchPrompt.instruction, maxResults=webSearchPrompt.maxNumberPages, - timeRange=webSearchPrompt.timeRange, + timeRange=None, # Not used - causes generic results country=countryName, - language=webSearchPrompt.language, - includeAnswer=False, - includeRawContent=False + includeAnswer="basic", + includeRawContent="text" ) # Extract URLs from results @@ -500,13 +511,13 @@ class ConnectorWeb(BaseConnectorAi): async def webCrawl(self, modelCall: AiModelCall) -> "AiModelResponse": """ - WEB_CRAWL operation - crawls one URL using Tavily. + WEB_CRAWL operation - crawls one URL using Tavily with link following. Args: modelCall: AiModelCall with AiCallPromptWebCrawl as prompt Returns: - AiModelResponse with crawl results as JSON + AiModelResponse with crawl results as JSON (may include multiple pages) """ try: # Extract parameters @@ -517,37 +528,49 @@ class ConnectorWeb(BaseConnectorAi): # Create Pydantic model webCrawlPrompt = AiCallPromptWebCrawl(**promptData) - # Perform crawl for ONE URL - # Note: _crawl expects a list, so we wrap the single URL in a list + # Perform crawl for ONE URL with link following + # Use maxWidth as limit, maxDepth as maxDepth, and calculate maxBreadth crawlResults = await self._crawl( - urls=[webCrawlPrompt.url], - extractDepth="advanced" if webCrawlPrompt.maxDepth > 2 else "basic", - format="markdown" + url=webCrawlPrompt.url, + instructions=webCrawlPrompt.instruction, + limit=webCrawlPrompt.maxWidth or 20, # maxWidth controls number of pages + maxDepth=webCrawlPrompt.maxDepth or 2, + maxBreadth=webCrawlPrompt.maxWidth or 40 # Use same as limit for breadth ) - # Format result for single URL - consistent with Perplexity format + # If we got multiple pages from the crawl, we need to format them differently + # Return the first result for backwards compatibility, but include total page count if crawlResults and len(crawlResults) > 0: - firstResult = crawlResults[0] + # Get all pages content + allContent = "" + for i, result in enumerate(crawlResults, 1): + pageHeader = f"\n{'='*60}\nPAGE {i}: {result.url}\n{'='*60}\n" + if result.title: + allContent += f"{pageHeader}Title: {result.title}\n\n" + allContent += f"{result.content}\n" + resultData = { - "url": firstResult.url, - "title": firstResult.title if firstResult.title else "Content", - "content": firstResult.content + "url": webCrawlPrompt.url, + "title": crawlResults[0].title if crawlResults[0].title else "Content", + "content": allContent, + "pagesCrawled": len(crawlResults), + "pageUrls": [result.url for result in crawlResults] } else: - resultData = {"url": webCrawlPrompt.url, "title": "", "content": "", "error": "No content extracted"} + resultData = {"url": webCrawlPrompt.url, "title": "", "content": "", "error": "No content extracted", "pagesCrawled": 0} - # Return as JSON - same format as Perplexity + # Return as JSON - same format as Perplexity but with multiple pages content import json return AiModelResponse( content=json.dumps(resultData, indent=2), success=True, - metadata={"operation": "WEB_CRAWL", "url": webCrawlPrompt.url} + metadata={"operation": "WEB_CRAWL", "url": webCrawlPrompt.url, "pagesCrawled": len(crawlResults) if crawlResults else 0} ) except Exception as e: logger.error(f"Error in Tavily web crawl: {str(e)}") import json - errorResult = {"error": str(e), "url": ""} + errorResult = {"error": str(e), "url": webCrawlPrompt.url if 'webCrawlPrompt' in locals() else ""} return AiModelResponse( content=json.dumps(errorResult, indent=2), success=False, diff --git a/modules/datamodels/datamodelAi.py b/modules/datamodels/datamodelAi.py index b7e883d1..f73cbd08 100644 --- a/modules/datamodels/datamodelAi.py +++ b/modules/datamodels/datamodelAi.py @@ -200,7 +200,6 @@ class AiCallPromptWebSearch(BaseModel): instruction: str = Field(description="Search instruction/query for finding relevant URLs") country: Optional[str] = Field(default=None, description="Two-digit country code (lowercase, e.g., ch, us, de, fr)") maxNumberPages: Optional[int] = Field(default=10, description="Maximum number of pages to search (default: 10)") - timeRange: Optional[str] = Field(default=None, description="Time range filter (d, w, m, y)") language: Optional[str] = Field(default=None, description="Language code (lowercase, e.g., de, en, fr)") researchDepth: Optional[str] = Field(default="general", description="Research depth: fast (maxDepth=1), general (maxDepth=2), deep (maxDepth=3)") diff --git a/modules/datamodels/datamodelTools.py b/modules/datamodels/datamodelTools.py index 39ba8bda..45227903 100644 --- a/modules/datamodels/datamodelTools.py +++ b/modules/datamodels/datamodelTools.py @@ -186,12 +186,13 @@ class CountryCodes: Get Tavily-compatible country name from ISO-2 code. Args: - isoCode: ISO-2 country code (e.g., "CH", "US") + isoCode: ISO-2 country code (e.g., "CH", "ch", "US", "us") Returns: Country name in lowercase as required by Tavily (e.g., "switzerland", "united states") """ - isoCodeUpper = isoCode.upper() + # Convert to uppercase for lookup + isoCodeUpper = isoCode.upper() if isoCode else "" mapping = cls._COUNTRY_MAP.get(isoCodeUpper) return mapping[0] if mapping else isoCode diff --git a/modules/services/serviceWeb/mainServiceWeb.py b/modules/services/serviceWeb/mainServiceWeb.py index fc08aa7c..3e43da4a 100644 --- a/modules/services/serviceWeb/mainServiceWeb.py +++ b/modules/services/serviceWeb/mainServiceWeb.py @@ -11,7 +11,7 @@ from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum, AiC logger = logging.getLogger(__name__) -class WebcrawlService: +class WebService: """Service for web search and crawling operations.""" def __init__(self, services): @@ -56,7 +56,6 @@ class WebcrawlService: extractedUrls = analysisResult.get("urls", []) needsSearch = analysisResult.get("needsSearch", True) # Default to True maxNumberPages = analysisResult.get("maxNumberPages", 10) - timeRange = analysisResult.get("timeRange") countryCode = analysisResult.get("country", country) languageCode = analysisResult.get("language", language) finalResearchDepth = analysisResult.get("researchDepth", researchDepth) @@ -77,7 +76,6 @@ class WebcrawlService: searchUrls = await self._performWebSearch( instruction=instruction, maxNumberPages=maxNumberPages - len(allUrls), - timeRange=timeRange, country=countryCode, language=languageCode ) @@ -153,10 +151,9 @@ Extract and provide a JSON response with: 2. urls: List of URLs found in the prompt text 3. needsSearch: true if web search is needed to identify url's to crawl, false if only crawling of provided URLs is wanted 4. maxNumberPages: Recommended number of URLs to crawl (based on research scope, typical: 2-20) -5. timeRange: Time range if mentioned (d, w, m, y, or null) -6. country: Country code if specified (2-digit lowercase, e.g., ch, us, de) -7. language: Language code if specified (lowercase, e.g., de, en, fr) -8. researchDepth: Research depth based on instruction complexity - "fast" (quick overview, maxDepth=1), "general" (standard research, maxDepth=2), or "deep" (comprehensive research, maxDepth=3) +5. country: Country code if identified in the prompt (2-digit lowercase, e.g., ch, us, de) +6. language: Language identified from the prompt (lowercase, e.g., de, en, fr) +7. researchDepth: Research depth based on instruction complexity - "fast" (quick overview, maxDepth=1), "general" (standard research, maxDepth=2), or "deep" (comprehensive research, maxDepth=3) Return ONLY valid JSON, no additional text: {{ @@ -164,7 +161,6 @@ Return ONLY valid JSON, no additional text: "urls": ["url1", "url2"], "needsSearch": true, "maxNumberPages": 10, - "timeRange": null, "country": "ch", "language": "de", "researchDepth": "general" @@ -188,7 +184,6 @@ Return ONLY valid JSON, no additional text: "urls": [], "needsSearch": True, "maxNumberPages": 10, - "timeRange": None, "country": country, "language": language, "researchDepth": researchDepth @@ -198,7 +193,6 @@ Return ONLY valid JSON, no additional text: self, instruction: str, maxNumberPages: int, - timeRange: Optional[str], country: Optional[str], language: Optional[str] ) -> List[str]: @@ -209,7 +203,6 @@ Return ONLY valid JSON, no additional text: instruction=instruction, country=country, maxNumberPages=maxNumberPages, - timeRange=timeRange, language=language ) searchPrompt = searchPromptModel.model_dump_json(exclude_none=True, indent=2) @@ -269,7 +262,7 @@ Return ONLY valid JSON, no additional text: instruction=instruction, url=url, # Single URL maxDepth=maxDepth, - maxWidth=10 + maxWidth=50 ) crawlPrompt = crawlPromptModel.model_dump_json(exclude_none=True, indent=2) diff --git a/modules/workflows/methods/methodAi.py b/modules/workflows/methods/methodAi.py index 708ee91b..178b6264 100644 --- a/modules/workflows/methods/methodAi.py +++ b/modules/workflows/methods/methodAi.py @@ -170,7 +170,7 @@ class MethodAi(MethodBase): - Output format: JSON with research results including URLs and content. Parameters: - - prompt (str, required): Natural language research instruction, including time range if relevant. + - prompt (str, required): Natural language research instruction. - list(url) (list, optional): Specific URLs to crawl, if needed. - country (str, optional): Two-digit country code (lowercase, e.g., ch, us, de). - language (str, optional): Language code (lowercase, e.g., de, en, fr). diff --git a/test_ai_models.py b/test_ai_models.py index 2906afd1..37772ee3 100644 --- a/test_ai_models.py +++ b/test_ai_models.py @@ -1,6 +1,9 @@ #!/usr/bin/env python3 """ -AI Models Test - Tests all available AI models individually +AI Models Test - Tests WEB_CRAWL functionality on all models that support it + +This script tests all models that have WEB_CRAWL capability, validates that +they can crawl specific URLs and return content, and analyzes the quality of results. """ import asyncio @@ -53,9 +56,18 @@ class AIModelsTester: async def initialize(self): """Initialize the AI service.""" - # Set logging level to INFO to reduce noise + # Set logging level to DEBUG for detailed output import logging - logging.getLogger().setLevel(logging.INFO) + logging.getLogger().setLevel(logging.DEBUG) + + # Initialize the model registry with all connectors + from modules.aicore.aicoreModelRegistry import modelRegistry + from modules.aicore.aicorePluginTavily import AiTavily + from modules.aicore.aicorePluginPerplexity import AiPerplexity + + # Register web connectors that support WEB_CRAWL + modelRegistry.registerConnector(AiTavily()) + modelRegistry.registerConnector(AiPerplexity()) # The AI service needs to be recreated with proper initialization from modules.services.serviceAi.mainServiceAi import AiService @@ -86,27 +98,53 @@ class AIModelsTester: print(f"📁 Results will be saved to: {self.modelTestDir}") async def testModel(self, modelName: str) -> Dict[str, Any]: - """Test a specific AI model with a simple prompt.""" + """Test a specific AI model with WEB_CRAWL operation.""" print(f"\n{'='*60}") print(f"TESTING MODEL: {modelName}") + print(f"OPERATION TYPE: WEB_CRAWL") print(f"{'='*60}") - # Use same prompt for all web models - import json + # CRAWL CONFIGURATION + # Deep and Broad Web Crawl Example: + # - maxDepth: 3 (deep) - follows links up to 3 levels from starting page + # - Level 1: Starting page + # - Level 2: Pages linked from starting page + # - Level 3: Pages linked from Level 2 pages + # - maxWidth: 50 (broad) - crawls up to 50 pages at each depth level + # This results in potential maximum of ~1,250 pages (if 50 links exist at each level) + # + # Common configurations: + # - Fast/Overview: maxDepth=1, maxWidth=5 (shallow, focused) + # - General/Standard: maxDepth=2, maxWidth=10 (balanced) + # - Deep and Broad: maxDepth=3, maxWidth=50 (comprehensive) - if "tavily" in modelName.lower() or "perplexity" in modelName.lower() or "llama" in modelName.lower() or "sonar" in modelName.lower() or "mistral" in modelName.lower(): - # All web models use the same JSON formatted prompt - # Country format: Use full name for Tavily (Switzerland), Perplexity converts ISO codes to names - testPrompt = json.dumps({ - "prompt": "Research, what ValueOn company in switzerland does and who works there? Return as JSON.", - "maxResults": 5, - "timeRange": "y", - "country": "CH", # ISO-2 code, Perplexity will convert to "Switzerland" - "format": "json" - }, indent=2) - else: - # Fallback for other models - testPrompt = "Generate a comprehensive analysis of the current state of artificial intelligence. Return as JSON." + CRAWL_DEPTH = 3 # Deep crawl: follows links 3 levels deep + CRAWL_WIDTH = 50 # Broad crawl: up to 50 pages per level + + print(f"Crawl Configuration:") + print(f" - Depth: {CRAWL_DEPTH} levels (deep)") + print(f" - Width: {CRAWL_WIDTH} pages per level (broad)") + print(f" - Theoretical max: {CRAWL_WIDTH ** min(CRAWL_DEPTH, 3)} pages") + + # Use WEB_CRAWL specific prompt format + from modules.datamodels.datamodelAi import AiCallPromptWebCrawl + + # Test with simple prompt like playground example + simplePrompt = f"https://www.valueon.ch: Who works in this company?" + + # But keep structured format for now to match our API + testPrompt = json.dumps({ + "instruction": "Who works in this company?", + "url": "https://www.valueon.ch", + "maxDepth": CRAWL_DEPTH, + "maxWidth": CRAWL_WIDTH + }, indent=2) + + print(f"Simple prompt (playground style): {simplePrompt}") + + # For Tavily models, test direct API call for better link following + if "tavily" in modelName.lower(): + return await self._testTavilyDirect(modelName, CRAWL_DEPTH, CRAWL_WIDTH) print(f"Test prompt: {testPrompt}") print(f"Prompt length: {len(testPrompt)} characters") @@ -114,17 +152,11 @@ class AIModelsTester: startTime = asyncio.get_event_loop().time() try: - # Create options to force this specific model - if "internal" in modelName.lower(): - options = AiCallOptions( - operationType=OperationTypeEnum.DATA_EXTRACT, - preferredModel=modelName - ) - else: - options = AiCallOptions( - operationType=OperationTypeEnum.DATA_GENERATE, - preferredModel=modelName - ) + # Create options for WEB_CRAWL operation + options = AiCallOptions( + operationType=OperationTypeEnum.WEB_CRAWL, + preferredModel=modelName + ) # Call the AI service DIRECTLY through the model's functionCall # This tests the actual model, not the document generation pipeline @@ -140,29 +172,14 @@ class AIModelsTester: import base64 import os - # Prepare messages and options based on model type - if "vision" in modelName.lower(): - # For vision models, skip for now since they require special handling - print(f"⚠️ Skipping vision model {modelName} - requires special image handling") - return { - "modelName": modelName, - "status": "SKIPPED", - "processingTime": 0.0, - "responseLength": 0, - "responseType": "skipped", - "hasContent": False, - "error": "Vision model requires special image handling", - "fullResponse": "Skipped - vision model requires special image handling" - } - else: - # For other models, use normal functionCall - messages = [{"role": "user", "content": testPrompt}] - modelCall = AiModelCall( - messages=messages, - model=model, - options=options - ) - response = await model.functionCall(modelCall) + # For WEB_CRAWL models, use normal functionCall with structured prompt + messages = [{"role": "user", "content": testPrompt}] + modelCall = AiModelCall( + messages=messages, + model=model, + options=options + ) + response = await model.functionCall(modelCall) endTime = asyncio.get_event_loop().time() processingTime = endTime - startTime @@ -185,6 +202,10 @@ class AIModelsTester: "bytesReceived": len(response.content.encode('utf-8')) if response.content else 0 } + # Extract actual prompt sent if available in metadata + if hasattr(response, 'metadata') and response.metadata: + result["actualPromptSent"] = response.metadata.get("actualPromptSent", "N/A") + # Try to parse content as JSON if response.content: try: @@ -289,9 +310,16 @@ class AIModelsTester: print(f"📄 Response length: {len(str(response))} characters") print(f"📄 Response preview: {result['responsePreview']}") - # Save text response for all models - if result.get("status") == "SUCCESS": - self._saveTextResponse(modelName, result) + # Add prompt to result for logging + result["testPrompt"] = testPrompt + result["crawlConfig"] = { + "depth": CRAWL_DEPTH, + "width": CRAWL_WIDTH + } + + # For WEB_CRAWL, also validate that content was extracted + if result.get("status") == "SUCCESS" and result.get("fullResponse"): + self._validateCrawlResponse(modelName, result) except Exception as e: endTime = asyncio.get_event_loop().time() @@ -304,13 +332,22 @@ class AIModelsTester: "responseLength": 0, "responseType": "exception", "hasContent": False, - "error": str(e) + "error": str(e), + "testPrompt": testPrompt, + "crawlConfig": { + "depth": CRAWL_DEPTH, + "width": CRAWL_WIDTH + } } print(f"💥 EXCEPTION - {str(e)}") self.testResults.append(result) + # Save text response even for exceptions to log the prompt + if result.get("status") in ["SUCCESS", "EXCEPTION", "ERROR"]: + self._saveTextResponse(modelName, result) + # Save individual model result immediately self._saveIndividualModelResult(modelName, result) @@ -378,6 +415,19 @@ class AIModelsTester: if not content: content = result.get("responsePreview", "No content available") + # If there's an error, include it in the content + if result.get("error"): + content = f"ERROR: {result.get('error')}\n\n{content}" + + # Get prompt and config for logging + config = result.get("crawlConfig", {}) + crawlDepth = config.get("depth", "N/A") + crawlWidth = config.get("width", "N/A") + + # Get both the original JSON prompt and the actual prompt sent + originalPrompt = result.get("testPrompt", "N/A") + actualPromptSent = result.get("actualPromptSent", "N/A") + # Add metadata header metadata = f"""Model: {modelName} Test Time: {timestamp} @@ -385,6 +435,23 @@ Status: {result.get('status', 'Unknown')} Processing Time: {result.get('processingTime', 0):.2f}s Response Length: {result.get('responseLength', 0)} characters Is Valid JSON: {result.get('isValidJson', False)} +Test Method: {result.get('testMethod', 'standard')} +Pages Crawled: {result.get('pagesCrawled', 'N/A')} +Crawled URL: {result.get('crawledUrl', 'N/A')} +Has URL: {result.get('hasUrl', 'N/A')} +Has Title: {result.get('hasTitle', 'N/A')} +Has Content: {result.get('hasContent', 'N/A')} +Content Length: {result.get('contentLength', 'N/A')} characters + +--- CRAWL CONFIGURATION --- +Depth: {crawlDepth} +Width: {crawlWidth} + +--- ORIGINAL JSON PROMPT (input) --- +{originalPrompt} + +--- ACTUAL PROMPT SENT TO API (EXACT) --- +{actualPromptSent} --- RESPONSE CONTENT --- {content} @@ -400,6 +467,174 @@ Is Valid JSON: {result.get('isValidJson', False)} print(f"❌ Error saving text response: {str(e)}") result["textSaveError"] = str(e) + def _validateCrawlResponse(self, modelName: str, result: Dict[str, Any]): + """Validate that the WEB_CRAWL response contains crawled content.""" + try: + content = result.get("fullResponse", "") + + # Try to parse as JSON + crawledData = {} + try: + parsed = json.loads(content) + if isinstance(parsed, dict): + crawledData = parsed + except: + pass + + # Check for expected fields: url, title, content + hasUrl = bool(crawledData.get("url")) + hasTitle = bool(crawledData.get("title")) + hasContent = bool(crawledData.get("content")) + contentLength = len(crawledData.get("content", "")) + + result["hasUrl"] = hasUrl + result["hasTitle"] = hasTitle + result["hasContent"] = hasContent + result["contentLength"] = contentLength + result["crawledUrl"] = crawledData.get("url", "") + + if hasUrl and hasContent: + print(f"✅ Successfully crawled content from URL: {crawledData.get('url', 'unknown')}") + print(f" Content length: {contentLength} characters") + print(f" Title: {crawledData.get('title', 'N/A')}") + else: + print(f"⚠️ Incomplete crawl response - URL: {hasUrl}, Content: {hasContent}") + + except Exception as e: + print(f"❌ Error validating crawl response: {str(e)}") + result["crawlValidationError"] = str(e) + + async def _testTavilyDirect(self, modelName: str, crawlDepth: int = 3, crawlWidth: int = 50) -> Dict[str, Any]: + """Test Tavily API directly using the crawl() method with better link following.""" + print(f"\n{'='*60}") + print(f"TESTING TAVILY DIRECT API (crawl method)") + print(f"{'='*60}") + + startTime = asyncio.get_event_loop().time() + + try: + from tavily import AsyncTavilyClient + from modules.shared.configuration import APP_CONFIG + + apiKey = APP_CONFIG.get("Connector_AiTavily_API_SECRET") + if not apiKey: + raise Exception("Tavily API key not found") + + client = AsyncTavilyClient(api_key=apiKey) + + # Map our configuration to Tavily parameters + # maxWidth -> limit (pages per level) + # maxDepth -> max_depth (link following depth) + # max_breadth = maxWidth (breadth of crawl at each level) + tavilyLimit = crawlWidth + tavilyMaxDepth = crawlDepth + tavilyMaxBreadth = crawlWidth + + print(f"Calling Tavily API with crawl() method...") + print(f"URL: https://www.valueon.ch") + print(f"Instructions: Who works in this company?") + print(f"Limit: {tavilyLimit} pages per level") + print(f"Max depth: {tavilyMaxDepth} (follows links {tavilyMaxDepth} levels deep)") + print(f"Max breadth: {tavilyMaxBreadth} (up to {tavilyMaxBreadth} pages at each level)") + print(f"Deep and Broad Crawl Configuration Active") + + response = await client.crawl( + url="https://www.valueon.ch", + instructions="Who works in this company?", + limit=tavilyLimit, + max_depth=tavilyMaxDepth, + max_breadth=tavilyMaxBreadth + ) + + endTime = asyncio.get_event_loop().time() + processingTime = endTime - startTime + + # Analyze response + contentLength = 0 + pagesCrawled = 0 + fullContent = "" + + if isinstance(response, dict): + # Check if it has results + if "results" in response: + results = response["results"] + pagesCrawled = len(results) + content_parts = [] + for result in results: + url = result.get("url", "") + title = result.get("title", "") + content = result.get("raw_content", result.get("content", "")) + content_parts.append(f"URL: {url}\nTitle: {title}\nContent: {content}\n{'='*60}\n") + contentLength += len(content) + + fullContent = "\n".join(content_parts) + else: + fullContent = json.dumps(response, indent=2) + contentLength = len(fullContent) + elif isinstance(response, list): + pagesCrawled = len(response) + content_parts = [] + for item in response: + if isinstance(item, dict): + url = item.get("url", "") + title = item.get("title", "") + content = item.get("raw_content", item.get("content", "")) + content_parts.append(f"URL: {url}\nTitle: {title}\nContent: {content}\n{'='*60}\n") + contentLength += len(content) + + fullContent = "\n".join(content_parts) + else: + fullContent = str(response) + contentLength = len(fullContent) + + result = { + "modelName": modelName, + "status": "SUCCESS", + "processingTime": round(processingTime, 2), + "responseLength": contentLength, + "responseType": "TavilyDirectAPI", + "hasContent": True, + "error": None, + "modelUsed": modelName, + "priceUsd": 0.0, + "bytesSent": 0, + "bytesReceived": contentLength, + "isValidJson": True, + "fullResponse": fullContent, + "pagesCrawled": pagesCrawled, + "testMethod": "direct_api_crawl" + } + + print(f"✅ SUCCESS - Processing time: {processingTime:.2f}s") + print(f"📄 Pages crawled: {pagesCrawled}") + print(f"📄 Total content length: {contentLength} characters") + + # Save the response + self._saveTextResponse(modelName, result) + self._validateCrawlResponse(modelName, result) + self._saveIndividualModelResult(modelName, result) + + self.testResults.append(result) + return result + + except Exception as e: + endTime = asyncio.get_event_loop().time() + processingTime = endTime - startTime + + result = { + "modelName": modelName, + "status": "EXCEPTION", + "processingTime": round(processingTime, 2), + "responseLength": 0, + "responseType": "exception", + "hasContent": False, + "error": str(e) + } + + print(f"💥 EXCEPTION - {str(e)}") + self.testResults.append(result) + return result + def _saveIndividualModelResult(self, modelName: str, result: Dict[str, Any]): """Save individual model test result to file.""" try: @@ -425,22 +660,30 @@ Is Valid JSON: {result.get('isValidJson', False)} print(f"❌ Error saving individual result: {str(e)}") def getAllAvailableModels(self) -> List[str]: - """Get all available model names.""" - # Hardcoded list of known models - same approach as test_ai_behavior.py - return [ - # "claude-3-5-sonnet-20241022", # Skipped - text model, test later - # "claude-3-5-sonnet-20241022-vision", # Skipped - requires image input - # "gpt-4o", # Skipped - text model, test later - # "gpt-3.5-turbo", # Skipped - text model, test later - # "gpt-4o-vision", # Skipped - requires image input - # "dall-e-3", # Skipped - image generation, test later - "sonar", # Perplexity web model - "sonar-pro", # Perplexity web model - "tavily-search", # Tavily web model (unified research) - # "internal-extractor", # Skipped - internal model, test later - # "internal-generator", # Skipped - internal model, test later - # "internal-renderer" # Skipped - internal model, test later - ] + """Get all available model names that support WEB_CRAWL.""" + from modules.aicore.aicoreModelRegistry import modelRegistry + from modules.datamodels.datamodelAi import OperationTypeEnum + + # Get all models from registry + allModels = modelRegistry.getAvailableModels() + + # Filter models that support WEB_CRAWL + webCrawlModels = [] + for model in allModels: + if model.operationTypes and any( + ot.operationType == OperationTypeEnum.WEB_CRAWL + for ot in model.operationTypes + ): # Include both Tavily and Perplexity models + webCrawlModels.append(model.name) + + # Filter to only "sonar" model for testing + webCrawlModels = [m for m in webCrawlModels if m == "sonar"] + + print(f"Found {len(webCrawlModels)} models that support WEB_CRAWL (filtered to sonar):") + for modelName in webCrawlModels: + print(f" - {modelName}") + + return webCrawlModels def saveTestResults(self): """Save detailed test results to file.""" @@ -508,6 +751,15 @@ Is Valid JSON: {result.get('isValidJson', False)} if result.get("isValidJson") is not None: print(f" Valid JSON: {'Yes' if result['isValidJson'] else 'No'}") + if result.get("crawledUrl"): + print(f" Crawled URL: {result['crawledUrl']}") + + if result.get("contentLength") is not None: + print(f" Content length: {result['contentLength']} characters") + + if result.get("pagesCrawled") is not None: + print(f" Pages crawled: {result['pagesCrawled']}") + if result["error"]: print(f" Error: {result['error']}") @@ -525,12 +777,32 @@ Is Valid JSON: {result.get('isValidJson', False)} print(f"{'='*80}") print(f"🚀 Fastest model: {fastest['modelName']} ({fastest['processingTime']}s)") print(f"🐌 Slowest model: {slowest['modelName']} ({slowest['processingTime']}s)") + + # Find models with most content + modelsWithContent = [r for r in successfulResults if r.get("contentLength", 0) > 0] + if modelsWithContent: + mostContent = max(modelsWithContent, key=lambda x: x.get("contentLength", 0)) + totalContent = sum(r.get("contentLength", 0) for r in modelsWithContent) + avgContent = totalContent / len(modelsWithContent) + print(f"📄 Model with most content: {mostContent['modelName']} ({mostContent.get('contentLength', 0)} chars)") + print(f"📊 Average content per model: {avgContent:.0f} characters") + print(f"📊 Total content crawled across all models: {totalContent} characters") + + # Find models with most pages crawled (for Tavily direct API) + modelsWithPages = [r for r in successfulResults if r.get("pagesCrawled", 0) > 0] + if modelsWithPages: + mostPages = max(modelsWithPages, key=lambda x: x.get("pagesCrawled", 0)) + totalPages = sum(r.get("pagesCrawled", 0) for r in modelsWithPages) + avgPages = totalPages / len(modelsWithPages) + print(f"🔍 Model with most pages crawled: {mostPages['modelName']} ({mostPages.get('pagesCrawled', 0)} pages)") + print(f"📊 Average pages per model: {avgPages:.1f} pages") + print(f"📊 Total pages crawled across all models: {totalPages} pages") async def main(): - """Run AI models testing.""" + """Run AI models testing for WEB_CRAWL operation.""" tester = AIModelsTester() - print("Starting AI Models Testing...") + print("Starting AI Models Testing for WEB_CRAWL...") print("Initializing AI service...") await tester.initialize() @@ -542,8 +814,9 @@ async def main(): print(f" {i}. {model}") print(f"\n{'='*80}") - print("STARTING INDIVIDUAL MODEL TESTS") + print("STARTING WEB_CRAWL TESTS") print(f"{'='*80}") + print("Testing each model's ability to crawl URLs and return content...") print("Press Enter after each model test to continue to the next one...") # Test each model individually