ai models ready for web and txt

parent 72e0687826
commit 2489719c62

7 changed files with 577 additions and 232 deletions
@@ -57,7 +57,7 @@ class AiPerplexity(BaseConnectorAi):
connectorType="perplexity",
apiUrl="https://api.perplexity.ai/chat/completions",
temperature=0.2,
maxTokens=4000,
maxTokens=24000, # Increased for detailed web crawl responses (Perplexity supports up to 25k)
contextLength=32000,
costPer1kTokensInput=0.005,
costPer1kTokensOutput=0.005,
@@ -80,12 +80,12 @@ class AiPerplexity(BaseConnectorAi):
connectorType="perplexity",
apiUrl="https://api.perplexity.ai/chat/completions",
temperature=0.2,
maxTokens=4000,
maxTokens=24000, # Increased for detailed web crawl responses (Perplexity supports up to 25k)
contextLength=32000,
costPer1kTokensInput=0.01,
costPer1kTokensOutput=0.01,
speedRating=6, # Slower due to AI analysis
qualityRating=10, # Best AI analysis quality
qualityRating=9, # Best AI analysis quality
# capabilities removed (not used in business logic)
functionCall=self._routeWebOperation,
priority=PriorityEnum.QUALITY,
@@ -217,6 +217,42 @@ class AiPerplexity(BaseConnectorAi):
# Fallback to basic call
return await self.callAiBasic(modelCall)

def _getDepthInstructions(self, maxDepth: int) -> str:
"""
Map maxDepth (numeric) to instructional text for LLM.

Args:
maxDepth: 1 (fast/overview), 2 (general/standard), 3 (deep/comprehensive)

Returns:
Instructional text for the LLM
"""
depthMap = {
1: "Basic overview - extract main content from the main page only",
2: "Standard crawl - extract content from main page and linked pages (2 levels deep)",
3: "Deep crawl - comprehensively extract content from main page and all accessible linked pages (3+ levels deep)"
}
return depthMap.get(maxDepth, depthMap[2])

def _getWidthInstructions(self, maxWidth: int) -> str:
"""
Map maxWidth (numeric) to instructional text for LLM.

Args:
maxWidth: Number of pages to crawl at each level (default: 10)

Returns:
Instructional text for the LLM
"""
if maxWidth <= 5:
return f"Focused crawl - limit to {maxWidth} most relevant pages per level"
elif maxWidth <= 15:
return f"Standard breadth - crawl up to {maxWidth} pages per level"
elif maxWidth <= 30:
return f"Wide crawl - crawl up to {maxWidth} pages per level, prioritize quality"
else:
return f"Extensive crawl - crawl up to {maxWidth} pages per level, comprehensive coverage"

async def webSearch(self, modelCall: AiModelCall) -> AiModelResponse:
"""
WEB_SEARCH operation - returns list of URLs based on search query.
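The two instruction helpers added above are defined in this hunk, but their call site is not shown here. A minimal sketch of how they could be folded into a crawl prompt (the wrapper name below is hypothetical, not from the commit):

def _buildCrawlGuidance(self, instruction: str, maxDepth: int, maxWidth: int) -> str:
    # Hypothetical helper: combine the user instruction with the depth/width
    # guidance strings produced by the two methods added above.
    depthText = self._getDepthInstructions(maxDepth)
    widthText = self._getWidthInstructions(maxWidth)
    return f"{instruction}\n{depthText}\n{widthText}"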
@@ -253,8 +289,6 @@ class AiPerplexity(BaseConnectorAi):
Return a JSON array of {webSearchPrompt.maxNumberPages} most relevant URLs.
{'' if not countryName else f'Focus on results from {countryName}.'}
{'' if not webSearchPrompt.timeRange else f'Limit to results from the last {webSearchPrompt.timeRange}'}
{'' if not webSearchPrompt.language else f'Return results in {webSearchPrompt.language} language'}

Return ONLY a JSON array of URLs, no additional text:
[
@@ -293,6 +327,15 @@ Return ONLY a JSON array of URLs, no additional text:
"""
WEB_CRAWL operation - crawls ONE URL and returns content.

Perplexity API Parameters Used:
- messages: The prompt containing URL and instruction
- max_tokens: Maximum response length
- max_results: Number of search results (1-20, default: 10)
- temperature: Response randomness (not web search specific)

Pagination: Perplexity does NOT return paginated responses.
A single response contains all results within max_tokens limit.

Args:
modelCall: AiModelCall with AiCallPromptWebCrawl as prompt
@@ -316,55 +359,68 @@ Return ONLY a JSON array of URLs, no additional text:
webCrawlPrompt = AiCallPromptWebCrawl(**promptData)

# Build crawl request for Perplexity - ONE URL
crawlPrompt = f"""Crawl and extract content from this URL based on the instruction:
# Match playground prompt style: just URL + question
# This allows Perplexity to return detailed multi-source results
crawlPrompt = f"{webCrawlPrompt.url}: {webCrawlPrompt.instruction}"

INSTRUCTION: '{webCrawlPrompt.instruction}'

URL to crawl (maxDepth={webCrawlPrompt.maxDepth}):
{webCrawlPrompt.url}

Extract and return the relevant content based on the instruction.
Return as JSON object with this structure:
{{
"url": "{webCrawlPrompt.url}",
"title": "Page title",
"content": "Extracted content relevant to the instruction"
}}

Return ONLY valid JSON, no additional text."""
# Build payload with optional Perplexity parameters
# Note: max_tokens_per_page may not be supported by chat/completions endpoint
# The playground Python SDK might use a different internal API
maxResults = min(webCrawlPrompt.maxWidth or 10, 20) # Max 20 results

payload = {
"model": model.name,
"messages": [{"role": "user", "content": crawlPrompt}],
"temperature": temperature,
"max_tokens": maxTokens
"max_tokens": maxTokens, # Use model's configured maxTokens (24000)
"max_results": maxResults,
"return_citations": True # Request citations explicitly
}

logger.info(f"Perplexity crawl payload: model={model.name}, prompt_length={len(crawlPrompt)}, max_tokens={maxTokens}, max_results={maxResults}")

response = await self.httpClient.post(model.apiUrl, json=payload)

if response.status_code != 200:
raise HTTPException(status_code=500, detail=f"Perplexity Web Crawl API error: {response.text}")

apiResponse = response.json()

# Extract the main content
content = apiResponse["choices"][0]["message"]["content"]

# Parse JSON content and ensure it's a single object
import json
try:
parsedContent = json.loads(content)
# Ensure it's a single object, not an array
if isinstance(parsedContent, list):
parsedContent = parsedContent[0] if parsedContent else {}
except:
# If not JSON, create structured response
parsedContent = {"url": webCrawlPrompt.url, "title": "", "content": content}
# Check for citations or search results in the response
citations = apiResponse.get("citations", [])
searchResults = apiResponse.get("search_results", [])

# Return as JSON string
# Log what we found
if citations:
logger.info(f"Found {len(citations)} citations in response")
if searchResults:
logger.info(f"Found {len(searchResults)} search results in response")
logger.debug(f"API response keys: {list(apiResponse.keys())}")

# Build comprehensive response with citations if available
import json
responseData = {
"content": content,
"citations": citations if citations else [],
"search_results": searchResults if searchResults else []
}

# Return comprehensive response
return AiModelResponse(
content=json.dumps(parsedContent, indent=2),
content=json.dumps(responseData, indent=2) if (citations or searchResults) else content,
success=True,
modelId=model.name,
metadata={"response_id": apiResponse.get("id", ""), "operation": "WEB_CRAWL", "url": webCrawlPrompt.url}
metadata={
"response_id": apiResponse.get("id", ""),
"operation": "WEB_CRAWL",
"url": webCrawlPrompt.url,
"actualPromptSent": crawlPrompt,
"has_citations": len(citations) > 0,
"has_search_results": len(searchResults) > 0
}
)

except Exception as e:
@@ -27,7 +27,8 @@ class WebCrawlResult:
content: str
title: Optional[str] = None

class ConnectorWeb(BaseConnectorAi):

class AiTavily(BaseConnectorAi):
"""Tavily web search connector."""

def __init__(self):
@@ -43,6 +44,35 @@ class ConnectorWeb(BaseConnectorAi):
# Initialize client if API key is available
self._initializeClient()


def getModels(self) -> List[AiModel]:
"""Get all available Tavily models."""
return [
AiModel(
name="tavily-search",
displayName="Tavily Search & Research",
connectorType="tavily",
apiUrl="https://api.tavily.com",
temperature=0.0, # Web search doesn't use temperature
maxTokens=0, # Web search doesn't use tokens
contextLength=0,
costPer1kTokensInput=0.0,
costPer1kTokensOutput=0.0,
speedRating=8, # Good speed for search and extract
qualityRating=9, # Excellent quality for web research
# capabilities removed (not used in business logic)
functionCall=self._routeWebOperation,
priority=PriorityEnum.BALANCED,
processingMode=ProcessingModeEnum.BASIC,
operationTypes=createOperationTypeRatings(
(OperationTypeEnum.WEB_SEARCH, 9),
(OperationTypeEnum.WEB_CRAWL, 10)
),
version="tavily-search",
calculatePriceUsd=lambda processingTime, bytesSent, bytesReceived: 0.008 # Simple flat rate
)
]

def _initializeClient(self):
"""Initialize the Tavily client if API key is available."""
try:
@@ -206,34 +236,6 @@ class ConnectorWeb(BaseConnectorAi):

return filteredResults

def getModels(self) -> List[AiModel]:
"""Get all available Tavily models."""
return [
AiModel(
name="tavily-search",
displayName="Tavily Search & Research",
connectorType="tavily",
apiUrl="https://api.tavily.com",
temperature=0.0, # Web search doesn't use temperature
maxTokens=0, # Web search doesn't use tokens
contextLength=0,
costPer1kTokensInput=0.0,
costPer1kTokensOutput=0.0,
speedRating=8, # Good speed for search and extract
qualityRating=9, # Excellent quality for web research
# capabilities removed (not used in business logic)
functionCall=self._routeWebOperation,
priority=PriorityEnum.BALANCED,
processingMode=ProcessingModeEnum.BASIC,
operationTypes=createOperationTypeRatings(
(OperationTypeEnum.WEB_SEARCH, 9),
(OperationTypeEnum.WEB_CRAWL, 8)
),
version="tavily-search",
calculatePriceUsd=lambda processingTime, bytesSent, bytesReceived: 0.008 # Simple flat rate
)
]

@classmethod
async def create(cls):
apiKey = APP_CONFIG.get("Connector_AiTavily_API_SECRET")
@@ -273,10 +275,9 @@ class ConnectorWeb(BaseConnectorAi):
topic: str | None = None,
includeDomains: list[str] | None = None,
excludeDomains: list[str] | None = None,
language: str | None = None,
country: str | None = None,
includeAnswer: bool | None = None,
includeRawContent: bool | None = None,
includeAnswer: str | None = None,
includeRawContent: str | None = None,
) -> list[WebSearchResult]:
"""Calls the Tavily API to perform a web search."""
# Make sure maxResults is within the allowed range (use cached values)
@@ -298,8 +299,6 @@ class ConnectorWeb(BaseConnectorAi):
kwargs["include_domains"] = includeDomains
if excludeDomains is not None:
kwargs["exclude_domains"] = excludeDomains
if language is not None:
kwargs["language"] = language
if country is not None:
kwargs["country"] = country
if includeAnswer is not None:
@@ -307,7 +306,8 @@ class ConnectorWeb(BaseConnectorAi):
if includeRawContent is not None:
kwargs["include_raw_content"] = includeRawContent

logger.debug(f"Tavily.search kwargs: {kwargs}")
# Log the final API call parameters for comparison
logger.info(f"Tavily API call parameters: {kwargs}")

# Ensure client is initialized
if self.client is None:
@@ -317,6 +317,10 @@ class ConnectorWeb(BaseConnectorAi):

response = await self.client.search(**kwargs)

# Return all results without score filtering
# Tavily's scoring is already applied by the API
logger.info(f"Tavily returned {len(response.get('results', []))} results")

return [
WebSearchResult(
title=result["title"],
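Taken together, the search hunks amount to a call roughly like the sketch below. It is illustrative only: the query, country and result handling are assumptions, and support for country, include_answer="basic" and include_raw_content="text" is taken from this diff rather than verified against the Tavily SDK.

kwargs = {
    "query": "Who works at ValueOn?",      # webSearchPrompt.instruction
    "max_results": 10,                     # clamped to the allowed range
    "country": "switzerland",              # full lowercase name converted from the ISO-2 code
    "include_answer": "basic",
    "include_raw_content": "text",
}
response = await self.client.search(**kwargs)  # AsyncTavilyClient
urls = [result["url"] for result in response.get("results", [])]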
@@ -328,69 +332,77 @@ class ConnectorWeb(BaseConnectorAi):

async def _crawl(
self,
urls: list,
extractDepth: str | None = None,
format: str | None = None,
url: str,
instructions: str | None = None,
limit: int = 20,
maxDepth: int = 2,
maxBreadth: int = 40,
) -> list[WebCrawlResult]:
"""Calls the Tavily API to extract text content from URLs with retry logic."""
"""Calls the Tavily API to crawl ONE URL with link following and retry logic."""
maxRetries = self.crawlMaxRetries
retryDelay = self.crawlRetryDelay
timeout = self.crawlTimeout

logger.debug(f"Starting crawl of {len(urls)} URLs: {urls}")
logger.debug(f"Crawl settings: extractDepth={extractDepth}, format={format}, timeout={timeout}s")
logger.debug(f"Starting crawl of URL: {url}")
logger.debug(f"Crawl settings: instructions={instructions}, limit={limit}, maxDepth={maxDepth}, maxBreadth={maxBreadth}, timeout={timeout}s")

for attempt in range(maxRetries + 1):
try:
logger.debug(f"Crawl attempt {attempt + 1}/{maxRetries + 1}")

# Use asyncio.wait_for for timeout
# Build kwargs for extract
kwargsExtract: dict = {"urls": urls}
kwargsExtract["extract_depth"] = extractDepth or "advanced"
kwargsExtract["format"] = format or "markdown" # Use markdown to get HTML structure

logger.debug(f"Sending request to Tavily with kwargs: {kwargsExtract}")

# Ensure client is initialized
if self.client is None:
self._initializeClient()
if self.client is None:
raise ValueError("Tavily client not initialized. Please check API key configuration.")

logger.debug(f"Crawling URL: {url}")

# Build kwargs for crawl
kwargsCrawl: dict = {"url": url}
if instructions:
kwargsCrawl["instructions"] = instructions
if limit:
kwargsCrawl["limit"] = limit
if maxDepth:
kwargsCrawl["max_depth"] = maxDepth
if maxBreadth:
kwargsCrawl["max_breadth"] = maxBreadth

logger.debug(f"Sending request to Tavily with kwargs: {kwargsCrawl}")

response = await asyncio.wait_for(
self.client.extract(**kwargsExtract),
self.client.crawl(**kwargsCrawl),
timeout=timeout
)

logger.debug(f"Tavily response received: {list(response.keys())}")
logger.debug(f"Tavily response received: {type(response)}")

# Debug: Log what Tavily actually returns
if "results" in response and response["results"]:
logger.debug(f"Tavily returned {len(response['results'])} results")
logger.debug(f"First result keys: {list(response['results'][0].keys())}")
logger.debug(f"First result has raw_content: {'raw_content' in response['results'][0]}")

# Log each result
for i, result in enumerate(response["results"]):
logger.debug(f"Result {i+1}: URL={result.get('url', 'N/A')}, content_length={len(result.get('raw_content', result.get('content', '')))}")
# Parse response - could be dict with results or list
if isinstance(response, dict) and "results" in response:
pageResults = response["results"]
elif isinstance(response, list):
pageResults = response
else:
logger.warning(f"Tavily returned no results in response: {response}")
logger.warning(f"Unexpected response format: {type(response)}")
pageResults = []

results = [
WebCrawlResult(
url=result["url"],
content=result.get("raw_content", result.get("content", "")), # Try raw_content first, fallback to content
title=result.get("title", "") # Extract title if available
)
for result in response["results"]
]
logger.debug(f"Got {len(pageResults)} pages from crawl")

logger.debug(f"Crawl successful: extracted {len(results)} results")
# Convert to WebCrawlResult format
results = []
for result in pageResults:
results.append(WebCrawlResult(
url=result.get("url", url),
content=result.get("raw_content", result.get("content", "")),
title=result.get("title", "")
))

logger.debug(f"Crawl successful: extracted {len(results)} pages from URL")
return results

except asyncio.TimeoutError:
logger.warning(f"Crawl attempt {attempt + 1} timed out after {timeout} seconds for URLs: {urls}")
logger.warning(f"Crawl attempt {attempt + 1} timed out after {timeout} seconds for URL: {url}")
if attempt < maxRetries:
logger.info(f"Retrying in {retryDelay} seconds...")
await asyncio.sleep(retryDelay)
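A minimal usage sketch of the rewritten _crawl, with illustrative values (the URL and instruction below are examples, not from the commit):

results = await self._crawl(
    url="https://www.example.com",            # single starting URL
    instructions="Who works in this company?",
    limit=20,                                  # pages to collect in total
    maxDepth=2,                                # how many link levels to follow
    maxBreadth=40,                             # pages considered per level
)
for page in results:                           # each item is a WebCrawlResult
    print(page.url, page.title, len(page.content))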
@@ -398,14 +410,13 @@ class ConnectorWeb(BaseConnectorAi):
raise Exception(f"Crawl failed after {maxRetries + 1} attempts due to timeout")

except Exception as e:
logger.warning(f"Crawl attempt {attempt + 1} failed for URLs {urls}: {str(e)}")
logger.warning(f"Crawl attempt {attempt + 1} failed for URL {url}: {str(e)}")
logger.debug(f"Full error details: {type(e).__name__}: {str(e)}")

# Check if it's a validation error and log more details
if "validation" in str(e).lower():
logger.debug(f"URL validation failed. Checking URL format:")
for i, url in enumerate(urls):
logger.debug(f" URL {i+1}: '{url}' (length: {len(url)})")
logger.debug(f" URL: '{url}' (length: {len(url)})")
# Check for common URL issues
if ' ' in url:
logger.debug(f" WARNING: URL contains spaces!")
@@ -468,15 +479,15 @@ class ConnectorWeb(BaseConnectorAi):
if countryName:
countryName = self._convertIsoCodeToCountryName(countryName)

# Perform search
# Perform search - use exact parameters from prompt
# NOTE: timeRange parameter causes generic results, so we don't use it
searchResults = await self._search(
query=webSearchPrompt.instruction,
maxResults=webSearchPrompt.maxNumberPages,
timeRange=webSearchPrompt.timeRange,
timeRange=None, # Not used - causes generic results
country=countryName,
language=webSearchPrompt.language,
includeAnswer=False,
includeRawContent=False
includeAnswer="basic",
includeRawContent="text"
)

# Extract URLs from results
@@ -500,13 +511,13 @@ class ConnectorWeb(BaseConnectorAi):

async def webCrawl(self, modelCall: AiModelCall) -> "AiModelResponse":
"""
WEB_CRAWL operation - crawls one URL using Tavily.
WEB_CRAWL operation - crawls one URL using Tavily with link following.

Args:
modelCall: AiModelCall with AiCallPromptWebCrawl as prompt

Returns:
AiModelResponse with crawl results as JSON
AiModelResponse with crawl results as JSON (may include multiple pages)
"""
try:
# Extract parameters
@@ -517,37 +528,49 @@ class ConnectorWeb(BaseConnectorAi):
# Create Pydantic model
webCrawlPrompt = AiCallPromptWebCrawl(**promptData)

# Perform crawl for ONE URL
# Note: _crawl expects a list, so we wrap the single URL in a list
# Perform crawl for ONE URL with link following
# Use maxWidth as limit, maxDepth as maxDepth, and calculate maxBreadth
crawlResults = await self._crawl(
urls=[webCrawlPrompt.url],
extractDepth="advanced" if webCrawlPrompt.maxDepth > 2 else "basic",
format="markdown"
url=webCrawlPrompt.url,
instructions=webCrawlPrompt.instruction,
limit=webCrawlPrompt.maxWidth or 20, # maxWidth controls number of pages
maxDepth=webCrawlPrompt.maxDepth or 2,
maxBreadth=webCrawlPrompt.maxWidth or 40 # Use same as limit for breadth
)

# Format result for single URL - consistent with Perplexity format
# If we got multiple pages from the crawl, we need to format them differently
# Return the first result for backwards compatibility, but include total page count
if crawlResults and len(crawlResults) > 0:
firstResult = crawlResults[0]
# Get all pages content
allContent = ""
for i, result in enumerate(crawlResults, 1):
pageHeader = f"\n{'='*60}\nPAGE {i}: {result.url}\n{'='*60}\n"
if result.title:
allContent += f"{pageHeader}Title: {result.title}\n\n"
allContent += f"{result.content}\n"

resultData = {
"url": firstResult.url,
"title": firstResult.title if firstResult.title else "Content",
"content": firstResult.content
"url": webCrawlPrompt.url,
"title": crawlResults[0].title if crawlResults[0].title else "Content",
"content": allContent,
"pagesCrawled": len(crawlResults),
"pageUrls": [result.url for result in crawlResults]
}
else:
resultData = {"url": webCrawlPrompt.url, "title": "", "content": "", "error": "No content extracted"}
resultData = {"url": webCrawlPrompt.url, "title": "", "content": "", "error": "No content extracted", "pagesCrawled": 0}

# Return as JSON - same format as Perplexity
# Return as JSON - same format as Perplexity but with multiple pages content
import json
return AiModelResponse(
content=json.dumps(resultData, indent=2),
success=True,
metadata={"operation": "WEB_CRAWL", "url": webCrawlPrompt.url}
metadata={"operation": "WEB_CRAWL", "url": webCrawlPrompt.url, "pagesCrawled": len(crawlResults) if crawlResults else 0}
)

except Exception as e:
logger.error(f"Error in Tavily web crawl: {str(e)}")
import json
errorResult = {"error": str(e), "url": ""}
errorResult = {"error": str(e), "url": webCrawlPrompt.url if 'webCrawlPrompt' in locals() else ""}
return AiModelResponse(
content=json.dumps(errorResult, indent=2),
success=False,
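With the multi-page formatting above, the JSON returned by Tavily's webCrawl is shaped roughly like this sketch (values illustrative, not from the commit):

resultData = {
    "url": "https://www.example.com",
    "title": "Example Company",
    "content": "...text of all crawled pages, separated by PAGE headers...",
    "pagesCrawled": 2,
    "pageUrls": ["https://www.example.com", "https://www.example.com/team"],
}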
@@ -200,7 +200,6 @@ class AiCallPromptWebSearch(BaseModel):
instruction: str = Field(description="Search instruction/query for finding relevant URLs")
country: Optional[str] = Field(default=None, description="Two-digit country code (lowercase, e.g., ch, us, de, fr)")
maxNumberPages: Optional[int] = Field(default=10, description="Maximum number of pages to search (default: 10)")
timeRange: Optional[str] = Field(default=None, description="Time range filter (d, w, m, y)")
language: Optional[str] = Field(default=None, description="Language code (lowercase, e.g., de, en, fr)")
researchDepth: Optional[str] = Field(default="general", description="Research depth: fast (maxDepth=1), general (maxDepth=2), deep (maxDepth=3)")
@@ -186,12 +186,13 @@ class CountryCodes:
Get Tavily-compatible country name from ISO-2 code.

Args:
isoCode: ISO-2 country code (e.g., "CH", "US")
isoCode: ISO-2 country code (e.g., "CH", "ch", "US", "us")

Returns:
Country name in lowercase as required by Tavily (e.g., "switzerland", "united states")
"""
isoCodeUpper = isoCode.upper()
# Convert to uppercase for lookup
isoCodeUpper = isoCode.upper() if isoCode else ""
mapping = cls._COUNTRY_MAP.get(isoCodeUpper)
return mapping[0] if mapping else isoCode
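The documented lookup behaviour, in short; the method name is not visible in this hunk, so the calls below are hypothetical:

# Hypothetical calls illustrating the docstring above
CountryCodes.getTavilyCountryName("ch")  # -> "switzerland"
CountryCodes.getTavilyCountryName("US")  # -> "united states"
CountryCodes.getTavilyCountryName("zz")  # -> "zz" (unknown codes fall through unchanged)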
@@ -11,7 +11,7 @@ from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum, AiC
logger = logging.getLogger(__name__)


class WebcrawlService:
class WebService:
"""Service for web search and crawling operations."""

def __init__(self, services):
@@ -56,7 +56,6 @@ class WebcrawlService:
extractedUrls = analysisResult.get("urls", [])
needsSearch = analysisResult.get("needsSearch", True) # Default to True
maxNumberPages = analysisResult.get("maxNumberPages", 10)
timeRange = analysisResult.get("timeRange")
countryCode = analysisResult.get("country", country)
languageCode = analysisResult.get("language", language)
finalResearchDepth = analysisResult.get("researchDepth", researchDepth)
@@ -77,7 +76,6 @@ class WebcrawlService:
searchUrls = await self._performWebSearch(
instruction=instruction,
maxNumberPages=maxNumberPages - len(allUrls),
timeRange=timeRange,
country=countryCode,
language=languageCode
)
@@ -153,10 +151,9 @@ Extract and provide a JSON response with:
2. urls: List of URLs found in the prompt text
3. needsSearch: true if web search is needed to identify url's to crawl, false if only crawling of provided URLs is wanted
4. maxNumberPages: Recommended number of URLs to crawl (based on research scope, typical: 2-20)
5. timeRange: Time range if mentioned (d, w, m, y, or null)
6. country: Country code if specified (2-digit lowercase, e.g., ch, us, de)
7. language: Language code if specified (lowercase, e.g., de, en, fr)
8. researchDepth: Research depth based on instruction complexity - "fast" (quick overview, maxDepth=1), "general" (standard research, maxDepth=2), or "deep" (comprehensive research, maxDepth=3)
5. country: Country code if identified in the prompt (2-digit lowercase, e.g., ch, us, de)
6. language: Language identified from the prompt (lowercase, e.g., de, en, fr)
7. researchDepth: Research depth based on instruction complexity - "fast" (quick overview, maxDepth=1), "general" (standard research, maxDepth=2), or "deep" (comprehensive research, maxDepth=3)

Return ONLY valid JSON, no additional text:
{{
@@ -164,7 +161,6 @@ Return ONLY valid JSON, no additional text:
"urls": ["url1", "url2"],
"needsSearch": true,
"maxNumberPages": 10,
"timeRange": null,
"country": "ch",
"language": "de",
"researchDepth": "general"
@@ -188,7 +184,6 @@ Return ONLY valid JSON, no additional text:
"urls": [],
"needsSearch": True,
"maxNumberPages": 10,
"timeRange": None,
"country": country,
"language": language,
"researchDepth": researchDepth
@@ -198,7 +193,6 @@ Return ONLY valid JSON, no additional text:
self,
instruction: str,
maxNumberPages: int,
timeRange: Optional[str],
country: Optional[str],
language: Optional[str]
) -> List[str]:
@@ -209,7 +203,6 @@ Return ONLY valid JSON, no additional text:
instruction=instruction,
country=country,
maxNumberPages=maxNumberPages,
timeRange=timeRange,
language=language
)
searchPrompt = searchPromptModel.model_dump_json(exclude_none=True, indent=2)
@@ -269,7 +262,7 @@ Return ONLY valid JSON, no additional text:
instruction=instruction,
url=url, # Single URL
maxDepth=maxDepth,
maxWidth=10
maxWidth=50
)
crawlPrompt = crawlPromptModel.model_dump_json(exclude_none=True, indent=2)
@@ -170,7 +170,7 @@ class MethodAi(MethodBase):
- Output format: JSON with research results including URLs and content.

Parameters:
- prompt (str, required): Natural language research instruction, including time range if relevant.
- prompt (str, required): Natural language research instruction.
- list(url) (list, optional): Specific URLs to crawl, if needed.
- country (str, optional): Two-digit country code (lowercase, e.g., ch, us, de).
- language (str, optional): Language code (lowercase, e.g., de, en, fr).
@@ -1,6 +1,9 @@
#!/usr/bin/env python3
"""
AI Models Test - Tests all available AI models individually
AI Models Test - Tests WEB_CRAWL functionality on all models that support it

This script tests all models that have WEB_CRAWL capability, validates that
they can crawl specific URLs and return content, and analyzes the quality of results.
"""

import asyncio
@@ -53,9 +56,18 @@ class AIModelsTester:

async def initialize(self):
"""Initialize the AI service."""
# Set logging level to INFO to reduce noise
# Set logging level to DEBUG for detailed output
import logging
logging.getLogger().setLevel(logging.INFO)
logging.getLogger().setLevel(logging.DEBUG)

# Initialize the model registry with all connectors
from modules.aicore.aicoreModelRegistry import modelRegistry
from modules.aicore.aicorePluginTavily import AiTavily
from modules.aicore.aicorePluginPerplexity import AiPerplexity

# Register web connectors that support WEB_CRAWL
modelRegistry.registerConnector(AiTavily())
modelRegistry.registerConnector(AiPerplexity())

# The AI service needs to be recreated with proper initialization
from modules.services.serviceAi.mainServiceAi import AiService
@@ -86,27 +98,53 @@ class AIModelsTester:
print(f"📁 Results will be saved to: {self.modelTestDir}")

async def testModel(self, modelName: str) -> Dict[str, Any]:
"""Test a specific AI model with a simple prompt."""
"""Test a specific AI model with WEB_CRAWL operation."""
print(f"\n{'='*60}")
print(f"TESTING MODEL: {modelName}")
print(f"OPERATION TYPE: WEB_CRAWL")
print(f"{'='*60}")

# Use same prompt for all web models
import json
# CRAWL CONFIGURATION
# Deep and Broad Web Crawl Example:
# - maxDepth: 3 (deep) - follows links up to 3 levels from starting page
# - Level 1: Starting page
# - Level 2: Pages linked from starting page
# - Level 3: Pages linked from Level 2 pages
# - maxWidth: 50 (broad) - crawls up to 50 pages at each depth level
# This results in potential maximum of ~1,250 pages (if 50 links exist at each level)
#
# Common configurations:
# - Fast/Overview: maxDepth=1, maxWidth=5 (shallow, focused)
# - General/Standard: maxDepth=2, maxWidth=10 (balanced)
# - Deep and Broad: maxDepth=3, maxWidth=50 (comprehensive)

if "tavily" in modelName.lower() or "perplexity" in modelName.lower() or "llama" in modelName.lower() or "sonar" in modelName.lower() or "mistral" in modelName.lower():
# All web models use the same JSON formatted prompt
# Country format: Use full name for Tavily (Switzerland), Perplexity converts ISO codes to names
CRAWL_DEPTH = 3 # Deep crawl: follows links 3 levels deep
CRAWL_WIDTH = 50 # Broad crawl: up to 50 pages per level

print(f"Crawl Configuration:")
print(f" - Depth: {CRAWL_DEPTH} levels (deep)")
print(f" - Width: {CRAWL_WIDTH} pages per level (broad)")
print(f" - Theoretical max: {CRAWL_WIDTH ** min(CRAWL_DEPTH, 3)} pages")

# Use WEB_CRAWL specific prompt format
from modules.datamodels.datamodelAi import AiCallPromptWebCrawl

# Test with simple prompt like playground example
simplePrompt = f"https://www.valueon.ch: Who works in this company?"

# But keep structured format for now to match our API
testPrompt = json.dumps({
"prompt": "Research, what ValueOn company in switzerland does and who works there? Return as JSON.",
"maxResults": 5,
"timeRange": "y",
"country": "CH", # ISO-2 code, Perplexity will convert to "Switzerland"
"format": "json"
"instruction": "Who works in this company?",
"url": "https://www.valueon.ch",
"maxDepth": CRAWL_DEPTH,
"maxWidth": CRAWL_WIDTH
}, indent=2)
else:
# Fallback for other models
testPrompt = "Generate a comprehensive analysis of the current state of artificial intelligence. Return as JSON."

print(f"Simple prompt (playground style): {simplePrompt}")

# For Tavily models, test direct API call for better link following
if "tavily" in modelName.lower():
return await self._testTavilyDirect(modelName, CRAWL_DEPTH, CRAWL_WIDTH)

print(f"Test prompt: {testPrompt}")
print(f"Prompt length: {len(testPrompt)} characters")
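The page-count figures quoted in the comments above depend on how the crawler interprets maxWidth. A quick back-of-the-envelope check under two readings (illustrative only, not from the commit):

CRAWL_DEPTH = 3
CRAWL_WIDTH = 50

# Reading 1: maxWidth caps the pages kept at each level below the start page
capped_total = 1 + CRAWL_WIDTH * (CRAWL_DEPTH - 1)                        # 1 + 50 + 50 = 101

# Reading 2: every page exposes maxWidth new links (uncapped fan-out upper bound)
fanout_total = sum(CRAWL_WIDTH ** level for level in range(CRAWL_DEPTH))  # 1 + 50 + 2500 = 2551

print(capped_total, fanout_total)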
@@ -114,15 +152,9 @@ class AIModelsTester:
startTime = asyncio.get_event_loop().time()

try:
# Create options to force this specific model
if "internal" in modelName.lower():
# Create options for WEB_CRAWL operation
options = AiCallOptions(
operationType=OperationTypeEnum.DATA_EXTRACT,
preferredModel=modelName
)
else:
options = AiCallOptions(
operationType=OperationTypeEnum.DATA_GENERATE,
operationType=OperationTypeEnum.WEB_CRAWL,
preferredModel=modelName
)

@@ -140,22 +172,7 @@ class AIModelsTester:
import base64
import os

# Prepare messages and options based on model type
if "vision" in modelName.lower():
# For vision models, skip for now since they require special handling
print(f"⚠️ Skipping vision model {modelName} - requires special image handling")
return {
"modelName": modelName,
"status": "SKIPPED",
"processingTime": 0.0,
"responseLength": 0,
"responseType": "skipped",
"hasContent": False,
"error": "Vision model requires special image handling",
"fullResponse": "Skipped - vision model requires special image handling"
}
else:
# For other models, use normal functionCall
# For WEB_CRAWL models, use normal functionCall with structured prompt
messages = [{"role": "user", "content": testPrompt}]
modelCall = AiModelCall(
messages=messages,
@@ -185,6 +202,10 @@ class AIModelsTester:
"bytesReceived": len(response.content.encode('utf-8')) if response.content else 0
}

# Extract actual prompt sent if available in metadata
if hasattr(response, 'metadata') and response.metadata:
result["actualPromptSent"] = response.metadata.get("actualPromptSent", "N/A")

# Try to parse content as JSON
if response.content:
try:
@@ -289,9 +310,16 @@ class AIModelsTester:
print(f"📄 Response length: {len(str(response))} characters")
print(f"📄 Response preview: {result['responsePreview']}")

# Save text response for all models
if result.get("status") == "SUCCESS":
self._saveTextResponse(modelName, result)
# Add prompt to result for logging
result["testPrompt"] = testPrompt
result["crawlConfig"] = {
"depth": CRAWL_DEPTH,
"width": CRAWL_WIDTH
}

# For WEB_CRAWL, also validate that content was extracted
if result.get("status") == "SUCCESS" and result.get("fullResponse"):
self._validateCrawlResponse(modelName, result)

except Exception as e:
endTime = asyncio.get_event_loop().time()
@@ -304,13 +332,22 @@ class AIModelsTester:
"responseLength": 0,
"responseType": "exception",
"hasContent": False,
"error": str(e)
"error": str(e),
"testPrompt": testPrompt,
"crawlConfig": {
"depth": CRAWL_DEPTH,
"width": CRAWL_WIDTH
}
}

print(f"💥 EXCEPTION - {str(e)}")

self.testResults.append(result)

# Save text response even for exceptions to log the prompt
if result.get("status") in ["SUCCESS", "EXCEPTION", "ERROR"]:
self._saveTextResponse(modelName, result)

# Save individual model result immediately
self._saveIndividualModelResult(modelName, result)
@@ -378,6 +415,19 @@ class AIModelsTester:
if not content:
content = result.get("responsePreview", "No content available")

# If there's an error, include it in the content
if result.get("error"):
content = f"ERROR: {result.get('error')}\n\n{content}"

# Get prompt and config for logging
config = result.get("crawlConfig", {})
crawlDepth = config.get("depth", "N/A")
crawlWidth = config.get("width", "N/A")

# Get both the original JSON prompt and the actual prompt sent
originalPrompt = result.get("testPrompt", "N/A")
actualPromptSent = result.get("actualPromptSent", "N/A")

# Add metadata header
metadata = f"""Model: {modelName}
Test Time: {timestamp}
@@ -385,6 +435,23 @@ Status: {result.get('status', 'Unknown')}
Processing Time: {result.get('processingTime', 0):.2f}s
Response Length: {result.get('responseLength', 0)} characters
Is Valid JSON: {result.get('isValidJson', False)}
Test Method: {result.get('testMethod', 'standard')}
Pages Crawled: {result.get('pagesCrawled', 'N/A')}
Crawled URL: {result.get('crawledUrl', 'N/A')}
Has URL: {result.get('hasUrl', 'N/A')}
Has Title: {result.get('hasTitle', 'N/A')}
Has Content: {result.get('hasContent', 'N/A')}
Content Length: {result.get('contentLength', 'N/A')} characters

--- CRAWL CONFIGURATION ---
Depth: {crawlDepth}
Width: {crawlWidth}

--- ORIGINAL JSON PROMPT (input) ---
{originalPrompt}

--- ACTUAL PROMPT SENT TO API (EXACT) ---
{actualPromptSent}

--- RESPONSE CONTENT ---
{content}
@@ -400,6 +467,174 @@ Is Valid JSON: {result.get('isValidJson', False)}
print(f"❌ Error saving text response: {str(e)}")
result["textSaveError"] = str(e)

def _validateCrawlResponse(self, modelName: str, result: Dict[str, Any]):
"""Validate that the WEB_CRAWL response contains crawled content."""
try:
content = result.get("fullResponse", "")

# Try to parse as JSON
crawledData = {}
try:
parsed = json.loads(content)
if isinstance(parsed, dict):
crawledData = parsed
except:
pass

# Check for expected fields: url, title, content
hasUrl = bool(crawledData.get("url"))
hasTitle = bool(crawledData.get("title"))
hasContent = bool(crawledData.get("content"))
contentLength = len(crawledData.get("content", ""))

result["hasUrl"] = hasUrl
result["hasTitle"] = hasTitle
result["hasContent"] = hasContent
result["contentLength"] = contentLength
result["crawledUrl"] = crawledData.get("url", "")

if hasUrl and hasContent:
print(f"✅ Successfully crawled content from URL: {crawledData.get('url', 'unknown')}")
print(f" Content length: {contentLength} characters")
print(f" Title: {crawledData.get('title', 'N/A')}")
else:
print(f"⚠️ Incomplete crawl response - URL: {hasUrl}, Content: {hasContent}")

except Exception as e:
print(f"❌ Error validating crawl response: {str(e)}")
result["crawlValidationError"] = str(e)

async def _testTavilyDirect(self, modelName: str, crawlDepth: int = 3, crawlWidth: int = 50) -> Dict[str, Any]:
"""Test Tavily API directly using the crawl() method with better link following."""
print(f"\n{'='*60}")
print(f"TESTING TAVILY DIRECT API (crawl method)")
print(f"{'='*60}")

startTime = asyncio.get_event_loop().time()

try:
from tavily import AsyncTavilyClient
from modules.shared.configuration import APP_CONFIG

apiKey = APP_CONFIG.get("Connector_AiTavily_API_SECRET")
if not apiKey:
raise Exception("Tavily API key not found")

client = AsyncTavilyClient(api_key=apiKey)

# Map our configuration to Tavily parameters
# maxWidth -> limit (pages per level)
# maxDepth -> max_depth (link following depth)
# max_breadth = maxWidth (breadth of crawl at each level)
tavilyLimit = crawlWidth
tavilyMaxDepth = crawlDepth
tavilyMaxBreadth = crawlWidth

print(f"Calling Tavily API with crawl() method...")
print(f"URL: https://www.valueon.ch")
print(f"Instructions: Who works in this company?")
print(f"Limit: {tavilyLimit} pages per level")
print(f"Max depth: {tavilyMaxDepth} (follows links {tavilyMaxDepth} levels deep)")
print(f"Max breadth: {tavilyMaxBreadth} (up to {tavilyMaxBreadth} pages at each level)")
print(f"Deep and Broad Crawl Configuration Active")

response = await client.crawl(
url="https://www.valueon.ch",
instructions="Who works in this company?",
limit=tavilyLimit,
max_depth=tavilyMaxDepth,
max_breadth=tavilyMaxBreadth
)

endTime = asyncio.get_event_loop().time()
processingTime = endTime - startTime

# Analyze response
contentLength = 0
pagesCrawled = 0
fullContent = ""

if isinstance(response, dict):
# Check if it has results
if "results" in response:
results = response["results"]
pagesCrawled = len(results)
content_parts = []
for result in results:
url = result.get("url", "")
title = result.get("title", "")
content = result.get("raw_content", result.get("content", ""))
content_parts.append(f"URL: {url}\nTitle: {title}\nContent: {content}\n{'='*60}\n")
contentLength += len(content)

fullContent = "\n".join(content_parts)
else:
fullContent = json.dumps(response, indent=2)
contentLength = len(fullContent)
elif isinstance(response, list):
pagesCrawled = len(response)
content_parts = []
for item in response:
if isinstance(item, dict):
url = item.get("url", "")
title = item.get("title", "")
content = item.get("raw_content", item.get("content", ""))
content_parts.append(f"URL: {url}\nTitle: {title}\nContent: {content}\n{'='*60}\n")
contentLength += len(content)

fullContent = "\n".join(content_parts)
else:
fullContent = str(response)
contentLength = len(fullContent)

result = {
"modelName": modelName,
"status": "SUCCESS",
"processingTime": round(processingTime, 2),
"responseLength": contentLength,
"responseType": "TavilyDirectAPI",
"hasContent": True,
"error": None,
"modelUsed": modelName,
"priceUsd": 0.0,
"bytesSent": 0,
"bytesReceived": contentLength,
"isValidJson": True,
"fullResponse": fullContent,
"pagesCrawled": pagesCrawled,
"testMethod": "direct_api_crawl"
}

print(f"✅ SUCCESS - Processing time: {processingTime:.2f}s")
print(f"📄 Pages crawled: {pagesCrawled}")
print(f"📄 Total content length: {contentLength} characters")

# Save the response
self._saveTextResponse(modelName, result)
self._validateCrawlResponse(modelName, result)
self._saveIndividualModelResult(modelName, result)

self.testResults.append(result)
return result

except Exception as e:
endTime = asyncio.get_event_loop().time()
processingTime = endTime - startTime

result = {
"modelName": modelName,
"status": "EXCEPTION",
"processingTime": round(processingTime, 2),
"responseLength": 0,
"responseType": "exception",
"hasContent": False,
"error": str(e)
}

print(f"💥 EXCEPTION - {str(e)}")
self.testResults.append(result)
return result

def _saveIndividualModelResult(self, modelName: str, result: Dict[str, Any]):
"""Save individual model test result to file."""
try:
@@ -425,22 +660,30 @@ Is Valid JSON: {result.get('isValidJson', False)}
print(f"❌ Error saving individual result: {str(e)}")

def getAllAvailableModels(self) -> List[str]:
"""Get all available model names."""
# Hardcoded list of known models - same approach as test_ai_behavior.py
return [
# "claude-3-5-sonnet-20241022", # Skipped - text model, test later
# "claude-3-5-sonnet-20241022-vision", # Skipped - requires image input
# "gpt-4o", # Skipped - text model, test later
# "gpt-3.5-turbo", # Skipped - text model, test later
# "gpt-4o-vision", # Skipped - requires image input
# "dall-e-3", # Skipped - image generation, test later
"sonar", # Perplexity web model
"sonar-pro", # Perplexity web model
"tavily-search", # Tavily web model (unified research)
# "internal-extractor", # Skipped - internal model, test later
# "internal-generator", # Skipped - internal model, test later
# "internal-renderer" # Skipped - internal model, test later
]
"""Get all available model names that support WEB_CRAWL."""
from modules.aicore.aicoreModelRegistry import modelRegistry
from modules.datamodels.datamodelAi import OperationTypeEnum

# Get all models from registry
allModels = modelRegistry.getAvailableModels()

# Filter models that support WEB_CRAWL
webCrawlModels = []
for model in allModels:
if model.operationTypes and any(
ot.operationType == OperationTypeEnum.WEB_CRAWL
for ot in model.operationTypes
): # Include both Tavily and Perplexity models
webCrawlModels.append(model.name)

# Filter to only "sonar" model for testing
webCrawlModels = [m for m in webCrawlModels if m == "sonar"]

print(f"Found {len(webCrawlModels)} models that support WEB_CRAWL (filtered to sonar):")
for modelName in webCrawlModels:
print(f" - {modelName}")

return webCrawlModels

def saveTestResults(self):
"""Save detailed test results to file."""
@@ -508,6 +751,15 @@ Is Valid JSON: {result.get('isValidJson', False)}
if result.get("isValidJson") is not None:
print(f" Valid JSON: {'Yes' if result['isValidJson'] else 'No'}")

if result.get("crawledUrl"):
print(f" Crawled URL: {result['crawledUrl']}")

if result.get("contentLength") is not None:
print(f" Content length: {result['contentLength']} characters")

if result.get("pagesCrawled") is not None:
print(f" Pages crawled: {result['pagesCrawled']}")

if result["error"]:
print(f" Error: {result['error']}")
@@ -526,11 +778,31 @@ Is Valid JSON: {result.get('isValidJson', False)}
print(f"🚀 Fastest model: {fastest['modelName']} ({fastest['processingTime']}s)")
print(f"🐌 Slowest model: {slowest['modelName']} ({slowest['processingTime']}s)")

# Find models with most content
modelsWithContent = [r for r in successfulResults if r.get("contentLength", 0) > 0]
if modelsWithContent:
mostContent = max(modelsWithContent, key=lambda x: x.get("contentLength", 0))
totalContent = sum(r.get("contentLength", 0) for r in modelsWithContent)
avgContent = totalContent / len(modelsWithContent)
print(f"📄 Model with most content: {mostContent['modelName']} ({mostContent.get('contentLength', 0)} chars)")
print(f"📊 Average content per model: {avgContent:.0f} characters")
print(f"📊 Total content crawled across all models: {totalContent} characters")

# Find models with most pages crawled (for Tavily direct API)
modelsWithPages = [r for r in successfulResults if r.get("pagesCrawled", 0) > 0]
if modelsWithPages:
mostPages = max(modelsWithPages, key=lambda x: x.get("pagesCrawled", 0))
totalPages = sum(r.get("pagesCrawled", 0) for r in modelsWithPages)
avgPages = totalPages / len(modelsWithPages)
print(f"🔍 Model with most pages crawled: {mostPages['modelName']} ({mostPages.get('pagesCrawled', 0)} pages)")
print(f"📊 Average pages per model: {avgPages:.1f} pages")
print(f"📊 Total pages crawled across all models: {totalPages} pages")

async def main():
"""Run AI models testing."""
"""Run AI models testing for WEB_CRAWL operation."""
tester = AIModelsTester()

print("Starting AI Models Testing...")
print("Starting AI Models Testing for WEB_CRAWL...")
print("Initializing AI service...")
await tester.initialize()
@@ -542,8 +814,9 @@ async def main():
print(f" {i}. {model}")

print(f"\n{'='*80}")
print("STARTING INDIVIDUAL MODEL TESTS")
print("STARTING WEB_CRAWL TESTS")
print(f"{'='*80}")
print("Testing each model's ability to crawl URLs and return content...")
print("Press Enter after each model test to continue to the next one...")

# Test each model individually