Refactored web research to be generic and two-step: URL search, then crawl

This commit is contained in:
ValueOn AG 2025-10-26 14:02:44 +01:00
parent e8c3052176
commit 72e0687826
9 changed files with 1079 additions and 2169 deletions

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -21,11 +21,8 @@ class OperationTypeEnum(str, Enum):
IMAGE_GENERATE = "imageGenerate"
# Web Operations
WEB_SEARCH = "webSearch" # Returns list of URLs only
WEB_CRAWL = "webCrawl" # Returns content from given URLs
WEB_RESEARCH = "webResearch" # WEB_SEARCH + WEB_CRAWL combined (scrape function)
WEB_QUESTIONS = "webQuestions" # Question-answering web research
WEB_NEWS = "webNews" # News-specific web research
WEB_SEARCH = "webSearch" # Returns list of URLs only
WEB_CRAWL = "webCrawl" # Web crawl for a given URL
# Operation Type Rating - Helper class for capability ratings
@ -49,8 +46,8 @@ def createOperationTypeRatings(*ratings: Tuple[OperationTypeEnum, int]) -> List[
Usage:
operationTypes = createOperationTypeRatings(
(OperationTypeEnum.DATA_ANALYSE, 8),
(OperationTypeEnum.WEB_RESEARCH, 10),
(OperationTypeEnum.WEB_NEWS, 7)
(OperationTypeEnum.WEB_SEARCH, 10),
(OperationTypeEnum.WEB_CRAWL, 9)
)
"""
return [OperationTypeRating(operationType=ot, rating=rating) for ot, rating in ratings]
@ -195,3 +192,42 @@ class AiModelResponse(BaseModel):
class Config:
arbitraryTypesAllowed = True
# Structured prompt models for specialized operations
class AiCallPromptWebSearch(BaseModel):
    """Structured prompt format for WEB_SEARCH operation - returns list of URLs.

    Serialized (via model_dump_json) into the prompt sent to the web-search
    provider; field descriptions double as instructions for callers.
    """

    instruction: str = Field(description="Search instruction/query for finding relevant URLs")
    country: Optional[str] = Field(default=None, description="Two-digit country code (lowercase, e.g., ch, us, de, fr)")
    maxNumberPages: Optional[int] = Field(default=10, description="Maximum number of pages to search (default: 10)")
    timeRange: Optional[str] = Field(default=None, description="Time range filter (d, w, m, y)")
    language: Optional[str] = Field(default=None, description="Language code (lowercase, e.g., de, en, fr)")
    researchDepth: Optional[str] = Field(default="general", description="Research depth: fast (maxDepth=1), general (maxDepth=2), deep (maxDepth=3)")
    # NOTE(review): removed the empty `class Config: pass` — an inner Config with
    # no attributes has no effect in pydantic and was dead code.
class AiCallPromptWebCrawl(BaseModel):
    """Structured prompt format for WEB_CRAWL operation - crawls ONE specific URL and returns content.

    One instance per URL; callers loop over URLs and serialize this model as the
    crawl prompt.
    """

    instruction: str = Field(description="Instruction for what content to extract from URL")
    url: str = Field(description="Single URL to crawl")
    maxDepth: Optional[int] = Field(default=2, description="Maximum number of hops from starting page (default: 2)")
    maxWidth: Optional[int] = Field(default=10, description="Maximum pages to crawl per level (default: 10)")
    # NOTE(review): removed the empty `class Config: pass` — a Config without
    # attributes is a no-op in pydantic and was dead code.
class AiCallPromptImage(BaseModel):
    """Structured prompt format for image generation.

    Size/quality/style defaults mirror the common DALL-E-style options; the
    accepted values are listed in each field description.
    """

    prompt: str = Field(description="Text description of the image to generate")
    size: Optional[str] = Field(default="1024x1024", description="Image size (1024x1024, 1792x1024, 1024x1792)")
    quality: Optional[str] = Field(default="standard", description="Image quality (standard, hd)")
    style: Optional[str] = Field(default="vivid", description="Image style (vivid, natural)")
    # NOTE(review): removed the empty `class Config: pass` — it configured
    # nothing and was dead code.

View file

@ -0,0 +1,225 @@
"""
Utility data models and classes for common tools and mappings.
"""
class CountryCodes:
    """
    Centralized country code mapping for different services.

    Maps ISO-2 country codes to service-specific country names.
    Each service may have different requirements for country names:
    Tavily expects lowercase names, Perplexity expects title-case names.
    Unknown codes are passed through unchanged by the getters.
    """

    # Named tuple indices into _COUNTRY_MAP values, for readable lookups.
    _TAVILY_INDEX = 0
    _PERPLEXITY_INDEX = 1

    # Mapping: ISO-2 code -> (Tavily country name, Perplexity country name)
    _COUNTRY_MAP = {
        "AF": ("afghanistan", "Afghanistan"),
        "AL": ("albania", "Albania"),
        "DZ": ("algeria", "Algeria"),
        "AD": ("andorra", "Andorra"),
        "AO": ("angola", "Angola"),
        "AR": ("argentina", "Argentina"),
        "AM": ("armenia", "Armenia"),
        "AU": ("australia", "Australia"),
        "AT": ("austria", "Austria"),
        "AZ": ("azerbaijan", "Azerbaijan"),
        "BS": ("bahamas", "Bahamas"),
        "BH": ("bahrain", "Bahrain"),
        "BD": ("bangladesh", "Bangladesh"),
        "BB": ("barbados", "Barbados"),
        "BY": ("belarus", "Belarus"),
        "BE": ("belgium", "Belgium"),
        "BZ": ("belize", "Belize"),
        "BJ": ("benin", "Benin"),
        "BT": ("bhutan", "Bhutan"),
        "BO": ("bolivia", "Bolivia"),
        "BA": ("bosnia and herzegovina", "Bosnia and Herzegovina"),
        "BW": ("botswana", "Botswana"),
        "BR": ("brazil", "Brazil"),
        "BN": ("brunei", "Brunei"),
        "BG": ("bulgaria", "Bulgaria"),
        "BF": ("burkina faso", "Burkina Faso"),
        "BI": ("burundi", "Burundi"),
        "KH": ("cambodia", "Cambodia"),
        "CM": ("cameroon", "Cameroon"),
        "CA": ("canada", "Canada"),
        "CV": ("cape verde", "Cape Verde"),
        "CF": ("central african republic", "Central African Republic"),
        "TD": ("chad", "Chad"),
        "CL": ("chile", "Chile"),
        "CN": ("china", "China"),
        "CO": ("colombia", "Colombia"),
        "KM": ("comoros", "Comoros"),
        "CG": ("congo", "Congo"),
        "CR": ("costa rica", "Costa Rica"),
        "HR": ("croatia", "Croatia"),
        "CU": ("cuba", "Cuba"),
        "CY": ("cyprus", "Cyprus"),
        "CZ": ("czech republic", "Czech Republic"),
        "DK": ("denmark", "Denmark"),
        "DJ": ("djibouti", "Djibouti"),
        "DO": ("dominican republic", "Dominican Republic"),
        "EC": ("ecuador", "Ecuador"),
        "EG": ("egypt", "Egypt"),
        "SV": ("el salvador", "El Salvador"),
        "GQ": ("equatorial guinea", "Equatorial Guinea"),
        "ER": ("eritrea", "Eritrea"),
        "EE": ("estonia", "Estonia"),
        "ET": ("ethiopia", "Ethiopia"),
        "FJ": ("fiji", "Fiji"),
        "FI": ("finland", "Finland"),
        "FR": ("france", "France"),
        "GA": ("gabon", "Gabon"),
        "GM": ("gambia", "Gambia"),
        "GE": ("georgia", "Georgia"),
        "DE": ("germany", "Germany"),
        "GH": ("ghana", "Ghana"),
        "GR": ("greece", "Greece"),
        "GT": ("guatemala", "Guatemala"),
        "GN": ("guinea", "Guinea"),
        "HT": ("haiti", "Haiti"),
        "HN": ("honduras", "Honduras"),
        "HU": ("hungary", "Hungary"),
        "IS": ("iceland", "Iceland"),
        "IN": ("india", "India"),
        "ID": ("indonesia", "Indonesia"),
        "IR": ("iran", "Iran"),
        "IQ": ("iraq", "Iraq"),
        "IE": ("ireland", "Ireland"),
        "IL": ("israel", "Israel"),
        "IT": ("italy", "Italy"),
        "JM": ("jamaica", "Jamaica"),
        "JP": ("japan", "Japan"),
        "JO": ("jordan", "Jordan"),
        "KZ": ("kazakhstan", "Kazakhstan"),
        "KE": ("kenya", "Kenya"),
        "KW": ("kuwait", "Kuwait"),
        "KG": ("kyrgyzstan", "Kyrgyzstan"),
        "LV": ("latvia", "Latvia"),
        "LB": ("lebanon", "Lebanon"),
        "LS": ("lesotho", "Lesotho"),
        "LR": ("liberia", "Liberia"),
        "LY": ("libya", "Libya"),
        "LI": ("liechtenstein", "Liechtenstein"),
        "LT": ("lithuania", "Lithuania"),
        "LU": ("luxembourg", "Luxembourg"),
        "MG": ("madagascar", "Madagascar"),
        "MW": ("malawi", "Malawi"),
        "MY": ("malaysia", "Malaysia"),
        "MV": ("maldives", "Maldives"),
        "ML": ("mali", "Mali"),
        "MT": ("malta", "Malta"),
        "MR": ("mauritania", "Mauritania"),
        "MU": ("mauritius", "Mauritius"),
        "MX": ("mexico", "Mexico"),
        "MD": ("moldova", "Moldova"),
        "MC": ("monaco", "Monaco"),
        "MN": ("mongolia", "Mongolia"),
        "ME": ("montenegro", "Montenegro"),
        "MA": ("morocco", "Morocco"),
        "MZ": ("mozambique", "Mozambique"),
        "MM": ("myanmar", "Myanmar"),
        "NA": ("namibia", "Namibia"),
        "NP": ("nepal", "Nepal"),
        "NL": ("netherlands", "Netherlands"),
        "NZ": ("new zealand", "New Zealand"),
        "NI": ("nicaragua", "Nicaragua"),
        "NE": ("niger", "Niger"),
        "NG": ("nigeria", "Nigeria"),
        "KP": ("north korea", "North Korea"),
        "MK": ("north macedonia", "North Macedonia"),
        "NO": ("norway", "Norway"),
        "OM": ("oman", "Oman"),
        "PK": ("pakistan", "Pakistan"),
        "PA": ("panama", "Panama"),
        "PG": ("papua new guinea", "Papua New Guinea"),
        "PY": ("paraguay", "Paraguay"),
        "PE": ("peru", "Peru"),
        "PH": ("philippines", "Philippines"),
        "PL": ("poland", "Poland"),
        "PT": ("portugal", "Portugal"),
        "QA": ("qatar", "Qatar"),
        "RO": ("romania", "Romania"),
        "RU": ("russia", "Russia"),
        "RW": ("rwanda", "Rwanda"),
        "SA": ("saudi arabia", "Saudi Arabia"),
        "SN": ("senegal", "Senegal"),
        "RS": ("serbia", "Serbia"),
        "SG": ("singapore", "Singapore"),
        "SK": ("slovakia", "Slovakia"),
        "SI": ("slovenia", "Slovenia"),
        "SO": ("somalia", "Somalia"),
        "ZA": ("south africa", "South Africa"),
        "KR": ("south korea", "South Korea"),
        "SS": ("south sudan", "South Sudan"),
        "ES": ("spain", "Spain"),
        "LK": ("sri lanka", "Sri Lanka"),
        "SD": ("sudan", "Sudan"),
        "SE": ("sweden", "Sweden"),
        "CH": ("switzerland", "Switzerland"),
        "SY": ("syria", "Syria"),
        "TW": ("taiwan", "Taiwan"),
        "TJ": ("tajikistan", "Tajikistan"),
        "TZ": ("tanzania", "Tanzania"),
        "TH": ("thailand", "Thailand"),
        "TG": ("togo", "Togo"),
        "TT": ("trinidad and tobago", "Trinidad and Tobago"),
        "TN": ("tunisia", "Tunisia"),
        "TR": ("turkey", "Turkey"),
        "TM": ("turkmenistan", "Turkmenistan"),
        "UG": ("uganda", "Uganda"),
        "UA": ("ukraine", "Ukraine"),
        "AE": ("united arab emirates", "United Arab Emirates"),
        "GB": ("united kingdom", "United Kingdom"),
        "US": ("united states", "United States"),
        "UY": ("uruguay", "Uruguay"),
        "UZ": ("uzbekistan", "Uzbekistan"),
        "VE": ("venezuela", "Venezuela"),
        "VN": ("vietnam", "Vietnam"),
        "YE": ("yemen", "Yemen"),
        "ZM": ("zambia", "Zambia"),
        "ZW": ("zimbabwe", "Zimbabwe"),
    }

    @classmethod
    def _lookup(cls, isoCode: str, index: int) -> str:
        """
        Shared lookup for the per-service getters.

        Args:
            isoCode: ISO-2 country code (case-insensitive)
            index: Tuple index selecting the service-specific name

        Returns:
            The service-specific country name, or the original isoCode
            unchanged when the code is unknown (pass-through fallback).
        """
        mapping = cls._COUNTRY_MAP.get(isoCode.upper())
        return mapping[index] if mapping else isoCode

    @classmethod
    def getForTavily(cls, isoCode: str) -> str:
        """
        Get Tavily-compatible country name from ISO-2 code.

        Args:
            isoCode: ISO-2 country code (e.g., "CH", "US")

        Returns:
            Country name in lowercase as required by Tavily (e.g., "switzerland", "united states")
        """
        return cls._lookup(isoCode, cls._TAVILY_INDEX)

    @classmethod
    def getForPerplexity(cls, isoCode: str) -> str:
        """
        Get Perplexity-compatible country name from ISO-2 code.

        Args:
            isoCode: ISO-2 country code (e.g., "CH", "US")

        Returns:
            Full country name as required by Perplexity (e.g., "Switzerland", "United States")
        """
        return cls._lookup(isoCode, cls._PERPLEXITY_INDEX)

    @classmethod
    def isValid(cls, isoCode: str) -> bool:
        """
        Check if ISO-2 code is valid.

        Args:
            isoCode: ISO-2 country code to check

        Returns:
            True if valid, False otherwise
        """
        return isoCode.upper() in cls._COUNTRY_MAP

View file

@ -81,6 +81,9 @@ class Services:
from .serviceUtils.mainServiceUtils import UtilsService
self.utils = PublicService(UtilsService(self))
from .serviceWeb.mainServiceWeb import WebService
self.web = PublicService(WebService(self))
def getInterface(user: User, workflow: ChatWorkflow) -> Services:
    """Create a Services facade bound to the given user and chat workflow."""
    return Services(user, workflow)

View file

@ -3,7 +3,6 @@ from typing import Dict, Any, List, Optional, Union
from modules.datamodels.datamodelChat import PromptPlaceholder, ChatDocument
from modules.services.serviceExtraction.mainServiceExtraction import ExtractionService
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum
from modules.aicore.aicorePluginTavily import WebResearchRequest, WebResearchResult
from modules.interfaces.interfaceAiObjects import AiObjects
from modules.services.serviceAi.subCoreAi import SubCoreAi
from modules.services.serviceAi.subDocumentProcessing import SubDocumentProcessing

View file

@ -0,0 +1,314 @@
"""
Web crawl service for handling web research operations.
Manages the two-step process: WEB_SEARCH then WEB_CRAWL.
"""
import json
import logging
from typing import Dict, Any, List, Optional
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum, AiCallPromptWebSearch, AiCallPromptWebCrawl
logger = logging.getLogger(__name__)
class WebcrawlService:
    """Service for web search and crawling operations.

    Orchestrates the two-step web research flow: an AI intent analysis,
    an optional WEB_SEARCH to collect URLs, then a WEB_CRAWL per URL.
    """

    def __init__(self, services):
        """Initialize webcrawl service with service center access."""
        self.services = services

    async def performWebResearch(
        self,
        prompt: str,
        urls: List[str],
        country: Optional[str],
        language: Optional[str],
        researchDepth: str = "general",
        operationId: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Perform web research in two steps:
        1. Use AI to analyze prompt and extract parameters + URLs
        2. Call WEB_SEARCH to get URLs (if needed)
        3. Combine URLs and filter to maxNumberPages
        4. Call WEB_CRAWL for each URL
        5. Return consolidated result

        Args:
            prompt: Natural language research prompt
            urls: Optional list of URLs provided by user
            country: Optional country code
            language: Optional language code
            researchDepth: Depth hint (fast/general/deep); the AI analysis may override it
            operationId: Operation ID for progress tracking

        Returns:
            Consolidated research results as dictionary

        Raises:
            Re-raises any exception after logging it.
        """
        try:
            # Step 1: AI intention analysis - extract URLs and parameters from prompt
            self.services.workflow.progressLogUpdate(operationId, 0.1, "Analyzing research intent")
            analysisResult = await self._analyzeResearchIntent(prompt, urls, country, language, researchDepth)

            # Extract parameters from AI analysis. Use "or" fallbacks so that an
            # explicit JSON null (or empty string) from the model falls back to
            # the caller-provided value instead of propagating None — a null
            # researchDepth previously crashed on .lower() below.
            instruction = analysisResult.get("instruction") or prompt
            extractedUrls = analysisResult.get("urls") or []
            needsSearch = analysisResult.get("needsSearch", True)  # Default to True
            maxNumberPages = analysisResult.get("maxNumberPages") or 10
            timeRange = analysisResult.get("timeRange")
            countryCode = analysisResult.get("country") or country
            languageCode = analysisResult.get("language") or language
            finalResearchDepth = analysisResult.get("researchDepth") or researchDepth

            logger.info(f"AI Analysis: instruction='{instruction[:100]}...', urls={len(extractedUrls)}, needsSearch={needsSearch}, maxNumberPages={maxNumberPages}, researchDepth={finalResearchDepth}")

            # Combine URLs (from user + from prompt extraction)
            allUrls = []
            if urls:
                allUrls.extend(urls)
            if extractedUrls:
                allUrls.extend(extractedUrls)

            # Step 2: Search for URLs if needed (based on needsSearch flag).
            # The guard keeps maxNumberPages - len(allUrls) strictly positive.
            if needsSearch and (not allUrls or len(allUrls) < maxNumberPages):
                self.services.workflow.progressLogUpdate(operationId, 0.3, "Searching for URLs")
                searchUrls = await self._performWebSearch(
                    instruction=instruction,
                    maxNumberPages=maxNumberPages - len(allUrls),
                    timeRange=timeRange,
                    country=countryCode,
                    language=languageCode
                )
                # Add search URLs to the list
                allUrls.extend(searchUrls)
                self.services.workflow.progressLogUpdate(operationId, 0.5, f"Found {len(allUrls)} total URLs")

            # Step 3: Filter to maxNumberPages (simple cut, no intelligent filtering)
            if len(allUrls) > maxNumberPages:
                allUrls = allUrls[:maxNumberPages]
                logger.info(f"Limited URLs to {maxNumberPages}")

            if not allUrls:
                return {"error": "No URLs found to crawl"}

            # Step 4: Translate researchDepth to maxDepth (unknown values -> "general" depth 2)
            depthMap = {"fast": 1, "general": 2, "deep": 3}
            maxDepth = depthMap.get(str(finalResearchDepth).lower(), 2)

            # Step 5: Crawl all URLs
            self.services.workflow.progressLogUpdate(operationId, 0.6, f"Crawling {len(allUrls)} URLs")
            crawlResult = await self._performWebCrawl(
                instruction=instruction,
                urls=allUrls,
                maxDepth=maxDepth
            )

            self.services.workflow.progressLogUpdate(operationId, 0.9, "Consolidating results")

            # Return consolidated result
            return {
                "instruction": instruction,
                "urls_crawled": allUrls,
                "total_urls": len(allUrls),
                "results": crawlResult,
                "total_results": len(crawlResult) if isinstance(crawlResult, list) else 1
            }
        except Exception as e:
            logger.error(f"Error in web research: {str(e)}")
            raise

    async def _analyzeResearchIntent(
        self,
        prompt: str,
        urls: List[str],
        country: Optional[str],
        language: Optional[str],
        researchDepth: str = "general"
    ) -> Dict[str, Any]:
        """
        Use AI to analyze prompt and extract:
        - URLs from the prompt text
        - Research instruction
        - maxNumberPages, timeRange, country, language from context

        Returns a dict parsed from the model's JSON answer; on any failure a
        conservative fallback dict (search everything, defaults) is returned
        so research can proceed without the analysis.
        """
        # Build analysis prompt for AI
        analysisPrompt = f"""Analyze this web research request and extract structured information.

RESEARCH REQUEST:
{prompt}

USER PROVIDED:
- URLs: {json.dumps(urls) if urls else "None"}
- Country: {country or "Not specified"}
- Language: {language or "Not specified"}

Extract and provide a JSON response with:
1. instruction: The core research instruction (cleaned prompt without URLs)
2. urls: List of URLs found in the prompt text
3. needsSearch: true if web search is needed to identify url's to crawl, false if only crawling of provided URLs is wanted
4. maxNumberPages: Recommended number of URLs to crawl (based on research scope, typical: 2-20)
5. timeRange: Time range if mentioned (d, w, m, y, or null)
6. country: Country code if specified (2-digit lowercase, e.g., ch, us, de)
7. language: Language code if specified (lowercase, e.g., de, en, fr)
8. researchDepth: Research depth based on instruction complexity - "fast" (quick overview, maxDepth=1), "general" (standard research, maxDepth=2), or "deep" (comprehensive research, maxDepth=3)

Return ONLY valid JSON, no additional text:
{{
"instruction": "cleaned research instruction",
"urls": ["url1", "url2"],
"needsSearch": true,
"maxNumberPages": 10,
"timeRange": null,
"country": "ch",
"language": "de",
"researchDepth": "general"
}}"""
        try:
            # Call AI planning to analyze intent
            analysisJson = await self.services.ai.callAiPlanning(analysisPrompt)
            # Parse JSON response
            result = json.loads(analysisJson)
            logger.info(f"Intent analysis result: {result}")
            return result
        except Exception as e:
            logger.warning(f"Error in AI intent analysis: {str(e)}")
            # Fallback to basic extraction
            return {
                "instruction": prompt,
                "urls": [],
                "needsSearch": True,
                "maxNumberPages": 10,
                "timeRange": None,
                "country": country,
                "language": language,
                "researchDepth": researchDepth
            }

    async def _performWebSearch(
        self,
        instruction: str,
        maxNumberPages: int,
        timeRange: Optional[str],
        country: Optional[str],
        language: Optional[str]
    ) -> List[str]:
        """Perform web search to find URLs.

        Returns a (possibly empty) list of URL strings; errors are logged and
        swallowed so the research flow can continue with user-provided URLs.
        """
        try:
            # Build search prompt model
            searchPromptModel = AiCallPromptWebSearch(
                instruction=instruction,
                country=country,
                maxNumberPages=maxNumberPages,
                timeRange=timeRange,
                language=language
            )
            searchPrompt = searchPromptModel.model_dump_json(exclude_none=True, indent=2)

            # Call AI with WEB_SEARCH operation
            searchOptions = AiCallOptions(
                operationType=OperationTypeEnum.WEB_SEARCH,
                resultFormat="json"
            )
            searchResult = await self.services.ai.callAiDocuments(
                prompt=searchPrompt,
                documents=None,
                options=searchOptions,
                outputFormat="json"
            )

            # Parse and extract URLs
            if isinstance(searchResult, str):
                searchData = json.loads(searchResult)
            else:
                searchData = searchResult

            # Extract URLs from response - providers return either a flat
            # "urls" list, a "results" list of objects, or a bare list.
            urls = []
            if isinstance(searchData, dict):
                if "urls" in searchData:
                    urls = searchData["urls"]
                elif "results" in searchData:
                    urls = [r.get("url") for r in searchData["results"] if r.get("url")]
            elif isinstance(searchData, list):
                urls = [item.get("url") for item in searchData if item.get("url")]

            logger.info(f"Web search returned {len(urls)} URLs")
            return urls
        except Exception as e:
            logger.error(f"Error in web search: {str(e)}")
            return []

    async def _performWebCrawl(
        self,
        instruction: str,
        urls: List[str],
        maxDepth: int = 2
    ) -> List[Dict[str, Any]]:
        """Perform web crawl on list of URLs - calls plugin for each URL individually.

        Per-URL failures are recorded as {"url": ..., "error": ...} entries
        instead of aborting the whole crawl.
        """
        crawlResults = []

        # Loop over each URL and crawl one at a time
        for url in urls:
            try:
                logger.info(f"Crawling URL: {url}")

                # Build crawl prompt model for single URL
                crawlPromptModel = AiCallPromptWebCrawl(
                    instruction=instruction,
                    url=url,  # Single URL
                    maxDepth=maxDepth,
                    maxWidth=10
                )
                crawlPrompt = crawlPromptModel.model_dump_json(exclude_none=True, indent=2)

                # Call AI with WEB_CRAWL operation
                crawlOptions = AiCallOptions(
                    operationType=OperationTypeEnum.WEB_CRAWL,
                    resultFormat="json"
                )
                crawlResult = await self.services.ai.callAiDocuments(
                    prompt=crawlPrompt,
                    documents=None,
                    options=crawlOptions,
                    outputFormat="json"
                )

                # Parse crawl result; non-JSON text is kept as raw content.
                # (Narrowed from a bare `except:` which also swallowed
                # KeyboardInterrupt/SystemExit.)
                if isinstance(crawlResult, str):
                    try:
                        crawlData = json.loads(crawlResult)
                    except json.JSONDecodeError:
                        crawlData = {"url": url, "content": crawlResult}
                else:
                    crawlData = crawlResult

                # Ensure it's a list of results
                if isinstance(crawlData, list):
                    crawlResults.extend(crawlData)
                elif isinstance(crawlData, dict):
                    if "results" in crawlData:
                        crawlResults.extend(crawlData["results"])
                    else:
                        crawlResults.append(crawlData)
                else:
                    crawlResults.append({"url": url, "content": str(crawlData)})
            except Exception as e:
                logger.error(f"Error crawling URL {url}: {str(e)}")
                crawlResults.append({"url": url, "error": str(e)})

        return crawlResults

View file

@ -10,9 +10,7 @@ from datetime import datetime, UTC
from modules.workflows.methods.methodBase import MethodBase, action
from modules.datamodels.datamodelChat import ActionResult
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum
from modules.datamodels.datamodelChat import ChatDocument
from modules.aicore.aicorePluginTavily import WebResearchRequest
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum, AiCallPromptImage
logger = logging.getLogger(__name__)
@ -163,272 +161,50 @@ class MethodAi(MethodBase):
)
@action
async def webSearch(self, parameters: Dict[str, Any]) -> ActionResult:
"""
GENERAL:
- Purpose: Search the web and return a list of relevant URLs only.
- Input requirements: searchPrompt (required); optional maxResults, timeRange, country, language.
- Output format: JSON with search results and URLs.
Parameters:
- searchPrompt (str, required): Natural language search prompt describing what to search for.
- maxResults (int, optional): Maximum number of search results. Default: 5.
- timeRange (str, optional): d | w | m | y for time filtering.
- country (str, optional): Country name for localized results.
- language (str, optional): Language code (e.g., de, en, fr).
"""
try:
searchPrompt = parameters.get("searchPrompt")
if not searchPrompt:
return ActionResult.isFailure(error="Search prompt is required")
# Extract optional parameters
maxResults = parameters.get("maxResults", 5)
timeRange = parameters.get("timeRange")
country = parameters.get("country")
language = parameters.get("language")
# Build AI call options for web search
options = AiCallOptions(
operationType=OperationTypeEnum.WEB_SEARCH,
resultFormat="json"
)
# Create unified prompt JSON that both Tavily and Perplexity can understand
promptData = {
"searchPrompt": searchPrompt,
"maxResults": maxResults,
"timeRange": timeRange,
"country": country,
"language": language,
"instructions": "Search the web and return a JSON response with a 'results' array containing objects with 'title', 'url', and optionally 'content' fields. Focus on finding relevant URLs for the search prompt."
}
import json
prompt = json.dumps(promptData, indent=2)
# Call AI service through unified path
result = await self.services.ai.callAiDocuments(
prompt=prompt,
documents=None,
options=options,
outputFormat="json"
)
# Process result to ensure consistent format
processedResult = self._processWebSearchResult(result)
# Create meaningful filename
meaningfulName = self._generateMeaningfulFileName(
base_name="web_search",
extension="json",
action_name="search"
)
from modules.datamodels.datamodelChat import ActionDocument
actionDocument = ActionDocument(
documentName=meaningfulName,
documentData=processedResult,
mimeType="application/json"
)
return ActionResult.isSuccess(documents=[actionDocument])
except Exception as e:
logger.error(f"Error in web search: {str(e)}")
return ActionResult.isFailure(error=str(e))
def _processWebSearchResult(self, result: str) -> str:
"""
Process web search result to ensure consistent JSON format with URL list.
Both Tavily and Perplexity now return proper JSON format.
"""
try:
import json
data = json.loads(result)
# If it's already a proper search result format, return as-is
if isinstance(data, dict) and "results" in data:
return result
# If it's a different JSON format, try to extract URLs
if isinstance(data, dict):
# Look for URL patterns in the JSON
urls = self._extractUrlsFromJson(data)
if urls:
processedData = {
"query": data.get("query", "web search"),
"results": [{"title": f"Result {i+1}", "url": url} for i, url in enumerate(urls)],
"total_count": len(urls)
}
return json.dumps(processedData, indent=2)
# No URLs found, return original result in a structured format
processedData = {
"query": "web search",
"results": [],
"total_count": 0,
"raw_response": result
}
return json.dumps(processedData, indent=2)
except Exception as e:
logger.warning(f"Error processing web search result: {str(e)}")
# Return original result wrapped in error format
errorData = {
"query": "web search",
"results": [],
"total_count": 0,
"error": f"Failed to process result: {str(e)}",
"raw_response": result
}
return json.dumps(errorData, indent=2)
def _extractUrlsFromJson(self, data: Dict[str, Any]) -> List[str]:
"""Extract URLs from JSON data structure."""
urls = []
def _extractFromValue(value):
if isinstance(value, str):
# Check if it's a URL
if value.startswith(('http://', 'https://')):
urls.append(value)
elif isinstance(value, dict):
for v in value.values():
_extractFromValue(v)
elif isinstance(value, list):
for item in value:
_extractFromValue(item)
_extractFromValue(data)
return list(set(urls)) # Remove duplicates
@action
async def webCrawl(self, parameters: Dict[str, Any]) -> ActionResult:
"""
GENERAL:
- Purpose: Extract content from specific URLs.
- Input requirements: urls (required); optional extractDepth, format.
- Output format: JSON with extracted content from URLs.
Parameters:
- urls (list, required): List of URLs to crawl and extract content from.
- extractDepth (str, optional): basic | advanced. Default: advanced.
- format (str, optional): markdown | html | text. Default: markdown.
"""
try:
urls = parameters.get("urls")
if not urls or not isinstance(urls, list):
return ActionResult.isFailure(error="URLs list is required")
# Extract optional parameters
extractDepth = parameters.get("extractDepth", "advanced")
formatType = parameters.get("format", "markdown")
# Build AI call options for web crawling
options = AiCallOptions(
operationType=OperationTypeEnum.WEB_CRAWL,
resultFormat="json"
)
# Create unified prompt JSON for web crawling
promptData = {
"urls": urls,
"extractDepth": extractDepth,
"format": formatType,
"instructions": "Extract content from the provided URLs and return a JSON response with 'results' array containing objects with 'url', 'title', 'content', and 'extractedAt' fields."
}
import json
prompt = json.dumps(promptData, indent=2)
# Call AI service through unified path
result = await self.services.ai.callAiDocuments(
prompt=prompt,
documents=None,
options=options,
outputFormat="json"
)
# Create meaningful filename
meaningfulName = self._generateMeaningfulFileName(
base_name="web_crawl",
extension="json",
action_name="crawl"
)
from modules.datamodels.datamodelChat import ActionDocument
actionDocument = ActionDocument(
documentName=meaningfulName,
documentData=result,
mimeType="application/json"
)
return ActionResult.isSuccess(documents=[actionDocument])
except Exception as e:
logger.error(f"Error in web crawl: {str(e)}")
return ActionResult.isFailure(error=str(e))
@action
async def webResearch(self, parameters: Dict[str, Any]) -> ActionResult:
"""
GENERAL:
- Purpose: Comprehensive web research combining search and content extraction.
- Input requirements: researchPrompt (required); optional maxResults, urls, timeRange, country, language.
- Output format: JSON with research results, sources, and analysis.
- Purpose: Web research with two-step process: search for URLs, then crawl content.
- Input requirements: prompt (required); optional list(url), country, language, researchDepth.
- Output format: JSON with research results including URLs and content.
Parameters:
- researchPrompt (str, required): Natural language research prompt describing what to research.
- maxResults (int, optional): Maximum search results. Default: 5.
- urls (list, optional): Specific URLs to include in research.
- timeRange (str, optional): d | w | m | y for time filtering.
- country (str, optional): Country name for localized results.
- language (str, optional): Language code (e.g., de, en, fr).
- prompt (str, required): Natural language research instruction, including time range if relevant.
- list(url) (list, optional): Specific URLs to crawl, if needed.
- country (str, optional): Two-digit country code (lowercase, e.g., ch, us, de).
- language (str, optional): Language code (lowercase, e.g., de, en, fr).
- researchDepth (str, optional): Research depth - fast, general, or deep. Default: general.
"""
try:
researchPrompt = parameters.get("researchPrompt")
if not researchPrompt:
prompt = parameters.get("prompt")
if not prompt:
return ActionResult.isFailure(error="Research prompt is required")
# Extract optional parameters
maxResults = parameters.get("maxResults", 5)
urls = parameters.get("urls")
timeRange = parameters.get("timeRange")
country = parameters.get("country")
language = parameters.get("language")
# Init progress logger
operationId = f"web_research_{self.services.currentWorkflow.id}_{int(time.time())}"
# Build AI call options for web research
options = AiCallOptions(
operationType=OperationTypeEnum.WEB_RESEARCH,
resultFormat="json"
# Start progress tracking
self.services.workflow.progressLogStart(
operationId,
"Web Research",
"Searching and Crawling",
"Extracting URLs and Content"
)
# Create unified prompt JSON for web research
promptData = {
"researchPrompt": researchPrompt,
"maxResults": maxResults,
"urls": urls,
"timeRange": timeRange,
"country": country,
"language": language,
"instructions": "Conduct comprehensive web research and return a JSON response with 'results' array containing objects with 'title', 'url', 'content', and 'analysis' fields. Provide detailed analysis and insights."
}
import json
prompt = json.dumps(promptData, indent=2)
# Call AI service through unified path
result = await self.services.ai.callAiDocuments(
# Call webcrawl service - service handles all AI intention analysis and processing
result = await self.services.web.performWebResearch(
prompt=prompt,
documents=None,
options=options,
outputFormat="json"
urls=parameters.get("list(url)", []),
country=parameters.get("country"),
language=parameters.get("language"),
researchDepth=parameters.get("researchDepth", "general"),
operationId=operationId
)
# Complete progress tracking
self.services.workflow.progressLogFinish(operationId, True)
# Create meaningful filename
meaningfulName = self._generateMeaningfulFileName(
base_name="web_research",
@ -447,157 +223,10 @@ class MethodAi(MethodBase):
except Exception as e:
logger.error(f"Error in web research: {str(e)}")
return ActionResult.isFailure(error=str(e))
@action
async def webQuestions(self, parameters: Dict[str, Any]) -> ActionResult:
    """
    GENERAL:
    - Purpose: Answer questions using web research and AI analysis.
    - Input requirements: question (required); optional context, maxResults, timeRange, country, language.
    - Output format: JSON with question answer and supporting sources.

    Parameters:
    - question (str, required): Question to be answered using web research.
    - context (str, optional): Additional context for the question.
    - maxResults (int, optional): Maximum search results. Default: 5.
    - timeRange (str, optional): d | w | m | y for time filtering.
    - country (str, optional): Country name for localized results.
    - language (str, optional): Language code (e.g., de, en, fr).

    Returns:
        ActionResult: success carrying one JSON ActionDocument, or a
        failure holding the error message.
    """
    try:
        import json

        if not parameters.get("question"):
            return ActionResult.isFailure(error="Question is required")

        # Structured request for the web-capable model; optional fields
        # that the caller omitted are forwarded as None / "" unchanged.
        payload = {
            "question": parameters.get("question"),
            "context": parameters.get("context", ""),
            "maxResults": parameters.get("maxResults", 5),
            "timeRange": parameters.get("timeRange"),
            "country": parameters.get("country"),
            "language": parameters.get("language"),
            "instructions": "Answer the question using web research and return a JSON response with 'answer', 'sources' array containing objects with 'title', 'url', 'content', and 'relevance' fields."
        }

        callOptions = AiCallOptions(
            operationType=OperationTypeEnum.WEB_QUESTIONS,
            resultFormat="json"
        )

        # Single unified AI entry point; the prompt is the JSON payload.
        aiOutput = await self.services.ai.callAiDocuments(
            prompt=json.dumps(payload, indent=2),
            documents=None,
            options=callOptions,
            outputFormat="json"
        )

        from modules.datamodels.datamodelChat import ActionDocument
        resultDocument = ActionDocument(
            documentName=self._generateMeaningfulFileName(
                base_name="web_questions",
                extension="json",
                action_name="questions"
            ),
            documentData=aiOutput,
            mimeType="application/json"
        )
        return ActionResult.isSuccess(documents=[resultDocument])
    except Exception as e:
        logger.error(f"Error in web questions: {str(e)}")
        return ActionResult.isFailure(error=str(e))
@action
async def webNews(self, parameters: Dict[str, Any]) -> ActionResult:
    """
    GENERAL:
    - Purpose: Search and analyze news articles on specific topics.
    - Input requirements: newsPrompt (required); optional maxResults, timeRange, country, language.
    - Output format: JSON with news articles, summaries, and analysis.

    Parameters:
    - newsPrompt (str, required): Natural language prompt describing what news to search for.
    - maxResults (int, optional): Maximum news articles. Default: 5.
    - timeRange (str, optional): d | w | m | y for time filtering. Default: w.
    - country (str, optional): Country name for localized news.
    - language (str, optional): Language code (e.g., de, en, fr).

    Returns:
        ActionResult: success carrying one JSON ActionDocument, or a
        failure holding the error message.
    """
    try:
        newsPrompt = parameters.get("newsPrompt")
        if not newsPrompt:
            return ActionResult.isFailure(error="News prompt is required")

        # Optional parameters; news searches default to the last week.
        maxResults = parameters.get("maxResults", 5)
        timeRange = parameters.get("timeRange", "w")
        country = parameters.get("country")
        language = parameters.get("language")

        # NOTE(review): OperationTypeEnum.WEB_NEWS must still be declared on
        # the enum; the refactored enum only lists WEB_SEARCH / WEB_CRAWL —
        # confirm before relying on this action.
        options = AiCallOptions(
            operationType=OperationTypeEnum.WEB_NEWS,
            resultFormat="json"
        )

        # Unified prompt JSON consumed by the web-capable model.
        promptData = {
            "newsPrompt": newsPrompt,
            "maxResults": maxResults,
            "timeRange": timeRange,
            "country": country,
            "language": language,
            "instructions": "Find and analyze recent news articles and return a JSON response with 'articles' array containing objects with 'title', 'url', 'content', 'date', 'source', and 'summary' fields."
        }
        import json
        prompt = json.dumps(promptData, indent=2)

        # Call AI service through the unified document path.
        result = await self.services.ai.callAiDocuments(
            prompt=prompt,
            documents=None,
            options=options,
            outputFormat="json"
        )

        meaningfulName = self._generateMeaningfulFileName(
            base_name="web_news",
            extension="json",
            action_name="news"
        )
        from modules.datamodels.datamodelChat import ActionDocument
        actionDocument = ActionDocument(
            documentName=meaningfulName,
            documentData=result,
            mimeType="application/json"
        )
        return ActionResult.isSuccess(documents=[actionDocument])
    except Exception as e:
        logger.error(f"Error in web news: {str(e)}")
        # BUG FIX: the previous handler invoked
        # self.services.workflow.progressLogFinish(operationId, False) here,
        # but `operationId` is never defined in this method. The resulting
        # NameError was swallowed by a bare `except: pass`, so the cleanup
        # was a silent no-op; the dead call has been removed.
        return ActionResult.isFailure(error=str(e))
@@ -631,17 +260,16 @@ class MethodAi(MethodBase):
resultFormat="base64"
)
# Create unified prompt JSON for image generation
promptData = {
"prompt": prompt,
"size": size,
"quality": quality,
"style": style,
"instructions": "Generate an image based on the prompt and return the base64 encoded image data."
}
# Create structured prompt using Pydantic model
promptModel = AiCallPromptImage(
prompt=prompt,
size=size,
quality=quality,
style=style
)
import json
promptJson = json.dumps(promptData, indent=2)
# Convert to JSON string for prompt
promptJson = promptModel.model_dump_json(exclude_none=True, indent=2)
# Call AI service through unified path
result = await self.services.ai.callAiDocuments(

View file

@@ -91,26 +91,18 @@ class AIModelsTester:
print(f"TESTING MODEL: {modelName}")
print(f"{'='*60}")
# Choose test prompt based on model type - Web models get JSON formatted prompts
# Use same prompt for all web models
import json
if "tavily" in modelName.lower():
# Tavily models get web search prompt in JSON format (from methodAi.py)
if "tavily" in modelName.lower() or "perplexity" in modelName.lower() or "llama" in modelName.lower() or "sonar" in modelName.lower() or "mistral" in modelName.lower():
# All web models use the same JSON formatted prompt
# Country format: Use full name for Tavily (Switzerland), Perplexity converts ISO codes to names
testPrompt = json.dumps({
"searchPrompt": "Search for recent news about artificial intelligence developments in 2024. Return the top 3 results as JSON with fields: title, url, snippet.",
"maxResults": 3,
"timeRange": "y",
"country": "United States",
"instructions": "Search the web and return a JSON response with a 'results' array containing objects with 'title', 'url', and optionally 'content' fields. Focus on finding relevant URLs for the search prompt."
}, indent=2)
elif "perplexity" in modelName.lower() or "llama" in modelName.lower() or "sonar" in modelName.lower() or "mistral" in modelName.lower():
# Perplexity models get web research prompt in JSON format (from methodAi.py)
testPrompt = json.dumps({
"researchPrompt": "Research the latest trends in renewable energy technology. Provide a comprehensive overview with key developments, companies involved, and future prospects. Return as JSON.",
"prompt": "Research, what ValueOn company in switzerland does and who works there? Return as JSON.",
"maxResults": 5,
"timeRange": "y",
"country": "United States",
"instructions": "Conduct comprehensive web research and return a JSON response with 'results' array containing objects with 'title', 'url', 'content', and 'analysis' fields. Provide detailed analysis and insights."
"country": "CH", # ISO-2 code, Perplexity will convert to "Switzerland"
"format": "json"
}, indent=2)
else:
# Fallback for other models
@@ -444,9 +436,7 @@ Is Valid JSON: {result.get('isValidJson', False)}
# "dall-e-3", # Skipped - image generation, test later
"sonar", # Perplexity web model
"sonar-pro", # Perplexity web model
"tavily-search", # Tavily web model
"tavily-extract", # Tavily web model
"tavily-search-extract", # Tavily web model
"tavily-search", # Tavily web model (unified research)
# "internal-extractor", # Skipped - internal model, test later
# "internal-generator", # Skipped - internal model, test later
# "internal-renderer" # Skipped - internal model, test later