refactored web research to be generic and in 2 steps: url, crawl

2025-10-26 14:02:44 +01:00 · 2025-10-26 14:02:44 +01:00 · 72e0687826
commit 72e0687826
parent e8c3052176
9 changed files with 1079 additions and 2169 deletions
--- a/modules/aicore/aicorePluginPerplexity.py
+++ b/modules/aicore/aicorePluginPerplexity.py
--- a/modules/aicore/aicorePluginTavily.py
+++ b/modules/aicore/aicorePluginTavily.py
--- a/modules/datamodels/datamodelAi.py
+++ b/modules/datamodels/datamodelAi.py
@ -21,11 +21,8 @@ class OperationTypeEnum(str, Enum):
    IMAGE_GENERATE = "imageGenerate"
    # Web Operations
-    WEB_SEARCH = "webSearch"          # Returns list of URLs only
+    WEB_SEARCH = "webSearch"    # Returns list of URLs only
-    WEB_CRAWL = "webCrawl"            # Returns content from given URLs
+    WEB_CRAWL = "webCrawl"      # Web crawl for a given URL
    WEB_RESEARCH = "webResearch"      # WEB_SEARCH + WEB_CRAWL combined (scrape function)
    WEB_QUESTIONS = "webQuestions"    # Question-answering web research
    WEB_NEWS = "webNews"              # News-specific web research
 # Operation Type Rating - Helper class for capability ratings
@ -49,8 +46,8 @@ def createOperationTypeRatings(*ratings: Tuple[OperationTypeEnum, int]) -> List[
    Usage:
        operationTypes = createOperationTypeRatings(
            (OperationTypeEnum.DATA_ANALYSE, 8),
-            (OperationTypeEnum.WEB_RESEARCH, 10),
+            (OperationTypeEnum.WEB_SEARCH, 10),
-            (OperationTypeEnum.WEB_NEWS, 7)
+            (OperationTypeEnum.WEB_CRAWL, 9)
        )
    """
    return [OperationTypeRating(operationType=ot, rating=rating) for ot, rating in ratings]
@ -195,3 +192,42 @@ class AiModelResponse(BaseModel):
    class Config:
        arbitraryTypesAllowed = True
 # Structured prompt models for specialized operations
 class AiCallPromptWebSearch(BaseModel):
    """Structured prompt format for WEB_SEARCH operation - returns list of URLs."""
    # Free-text search query; the only field without a default.
    instruction: str = Field(description="Search instruction/query for finding relevant URLs")
    # NOTE(review): "Two-digit" presumably means a two-letter ISO code (the examples are letters) - confirm and reword.
    country: Optional[str] = Field(default=None, description="Two-digit country code (lowercase, e.g., ch, us, de, fr)")
    # Upper bound on the number of pages/URLs requested; defaults to 10.
    maxNumberPages: Optional[int] = Field(default=10, description="Maximum number of pages to search (default: 10)")
    # Single-letter recency filter. NOTE(review): presumably day/week/month/year - confirm against the search plugins.
    timeRange: Optional[str] = Field(default=None, description="Time range filter (d, w, m, y)")
    # Optional language restriction, expressed as a lowercase language code.
    language: Optional[str] = Field(default=None, description="Language code (lowercase, e.g., de, en, fr)")
    # Symbolic depth; the description ties each value to a crawl maxDepth (fast=1, general=2, deep=3).
    researchDepth: Optional[str] = Field(default="general", description="Research depth: fast (maxDepth=1), general (maxDepth=2), deep (maxDepth=3)")
    # Empty placeholder: no model configuration is overridden here.
    class Config:
        pass
 class AiCallPromptWebCrawl(BaseModel):
    """Structured prompt format for WEB_CRAWL operation - crawls ONE specific URL and returns content."""
    # What to extract from the crawled page(s); required.
    instruction: str = Field(description="Instruction for what content to extract from URL")
    # Exactly one start URL per prompt; callers that have several URLs build one prompt per URL.
    url: str = Field(description="Single URL to crawl")
    # Link-following limit, counted in hops from the start page; defaults to 2.
    maxDepth: Optional[int] = Field(default=2, description="Maximum number of hops from starting page (default: 2)")
    # Per-level page limit; defaults to 10.
    maxWidth: Optional[int] = Field(default=10, description="Maximum pages to crawl per level (default: 10)")
    # Empty placeholder: no model configuration is overridden here.
    class Config:
        pass
 class AiCallPromptImage(BaseModel):
    """Structured prompt format for image generation."""
    # Description of the desired image; the only field without a default.
    prompt: str = Field(description="Text description of the image to generate")
    # Output dimensions as "WIDTHxHEIGHT"; the description lists the three accepted values.
    size: Optional[str] = Field(default="1024x1024", description="Image size (1024x1024, 1792x1024, 1024x1792)")
    # Rendering quality tier; defaults to "standard".
    quality: Optional[str] = Field(default="standard", description="Image quality (standard, hd)")
    # Rendering style; defaults to "vivid".
    style: Optional[str] = Field(default="vivid", description="Image style (vivid, natural)")
    # Empty placeholder: no model configuration is overridden here.
    class Config:
        pass
--- a/modules/datamodels/datamodelTools.py
+++ b/modules/datamodels/datamodelTools.py
@ -0,0 +1,225 @@
 """
 Utility data models and classes for common tools and mappings.
 """
 class CountryCodes:
    """
    Centralized country code mapping for different services.

    Maps ISO-2 country codes to service-specific country names.
    Each service may have different requirements for country names.

    Lookups are case-insensitive and ignore surrounding whitespace. Codes that
    are not in the table (and non-string inputs such as None) are handed back
    unchanged by the getFor* methods so callers can pass optional values through.
    """
    # Mapping: ISO-2 code -> (Tavily country name, Perplexity country name)
    _COUNTRY_MAP = {
        "AF": ("afghanistan", "Afghanistan"),
        "AL": ("albania", "Albania"),
        "DZ": ("algeria", "Algeria"),
        "AD": ("andorra", "Andorra"),
        "AO": ("angola", "Angola"),
        "AR": ("argentina", "Argentina"),
        "AM": ("armenia", "Armenia"),
        "AU": ("australia", "Australia"),
        "AT": ("austria", "Austria"),
        "AZ": ("azerbaijan", "Azerbaijan"),
        "BS": ("bahamas", "Bahamas"),
        "BH": ("bahrain", "Bahrain"),
        "BD": ("bangladesh", "Bangladesh"),
        "BB": ("barbados", "Barbados"),
        "BY": ("belarus", "Belarus"),
        "BE": ("belgium", "Belgium"),
        "BZ": ("belize", "Belize"),
        "BJ": ("benin", "Benin"),
        "BT": ("bhutan", "Bhutan"),
        "BO": ("bolivia", "Bolivia"),
        "BA": ("bosnia and herzegovina", "Bosnia and Herzegovina"),
        "BW": ("botswana", "Botswana"),
        "BR": ("brazil", "Brazil"),
        "BN": ("brunei", "Brunei"),
        "BG": ("bulgaria", "Bulgaria"),
        "BF": ("burkina faso", "Burkina Faso"),
        "BI": ("burundi", "Burundi"),
        "KH": ("cambodia", "Cambodia"),
        "CM": ("cameroon", "Cameroon"),
        "CA": ("canada", "Canada"),
        "CV": ("cape verde", "Cape Verde"),
        "CF": ("central african republic", "Central African Republic"),
        "TD": ("chad", "Chad"),
        "CL": ("chile", "Chile"),
        "CN": ("china", "China"),
        "CO": ("colombia", "Colombia"),
        "KM": ("comoros", "Comoros"),
        "CG": ("congo", "Congo"),
        "CR": ("costa rica", "Costa Rica"),
        "HR": ("croatia", "Croatia"),
        "CU": ("cuba", "Cuba"),
        "CY": ("cyprus", "Cyprus"),
        "CZ": ("czech republic", "Czech Republic"),
        "DK": ("denmark", "Denmark"),
        "DJ": ("djibouti", "Djibouti"),
        "DO": ("dominican republic", "Dominican Republic"),
        "EC": ("ecuador", "Ecuador"),
        "EG": ("egypt", "Egypt"),
        "SV": ("el salvador", "El Salvador"),
        "GQ": ("equatorial guinea", "Equatorial Guinea"),
        "ER": ("eritrea", "Eritrea"),
        "EE": ("estonia", "Estonia"),
        "ET": ("ethiopia", "Ethiopia"),
        "FJ": ("fiji", "Fiji"),
        "FI": ("finland", "Finland"),
        "FR": ("france", "France"),
        "GA": ("gabon", "Gabon"),
        "GM": ("gambia", "Gambia"),
        "GE": ("georgia", "Georgia"),
        "DE": ("germany", "Germany"),
        "GH": ("ghana", "Ghana"),
        "GR": ("greece", "Greece"),
        "GT": ("guatemala", "Guatemala"),
        "GN": ("guinea", "Guinea"),
        "HT": ("haiti", "Haiti"),
        "HN": ("honduras", "Honduras"),
        "HU": ("hungary", "Hungary"),
        "IS": ("iceland", "Iceland"),
        "IN": ("india", "India"),
        "ID": ("indonesia", "Indonesia"),
        "IR": ("iran", "Iran"),
        "IQ": ("iraq", "Iraq"),
        "IE": ("ireland", "Ireland"),
        "IL": ("israel", "Israel"),
        "IT": ("italy", "Italy"),
        "JM": ("jamaica", "Jamaica"),
        "JP": ("japan", "Japan"),
        "JO": ("jordan", "Jordan"),
        "KZ": ("kazakhstan", "Kazakhstan"),
        "KE": ("kenya", "Kenya"),
        "KW": ("kuwait", "Kuwait"),
        "KG": ("kyrgyzstan", "Kyrgyzstan"),
        "LV": ("latvia", "Latvia"),
        "LB": ("lebanon", "Lebanon"),
        "LS": ("lesotho", "Lesotho"),
        "LR": ("liberia", "Liberia"),
        "LY": ("libya", "Libya"),
        "LI": ("liechtenstein", "Liechtenstein"),
        "LT": ("lithuania", "Lithuania"),
        "LU": ("luxembourg", "Luxembourg"),
        "MG": ("madagascar", "Madagascar"),
        "MW": ("malawi", "Malawi"),
        "MY": ("malaysia", "Malaysia"),
        "MV": ("maldives", "Maldives"),
        "ML": ("mali", "Mali"),
        "MT": ("malta", "Malta"),
        "MR": ("mauritania", "Mauritania"),
        "MU": ("mauritius", "Mauritius"),
        "MX": ("mexico", "Mexico"),
        "MD": ("moldova", "Moldova"),
        "MC": ("monaco", "Monaco"),
        "MN": ("mongolia", "Mongolia"),
        "ME": ("montenegro", "Montenegro"),
        "MA": ("morocco", "Morocco"),
        "MZ": ("mozambique", "Mozambique"),
        "MM": ("myanmar", "Myanmar"),
        "NA": ("namibia", "Namibia"),
        "NP": ("nepal", "Nepal"),
        "NL": ("netherlands", "Netherlands"),
        "NZ": ("new zealand", "New Zealand"),
        "NI": ("nicaragua", "Nicaragua"),
        "NE": ("niger", "Niger"),
        "NG": ("nigeria", "Nigeria"),
        "KP": ("north korea", "North Korea"),
        "MK": ("north macedonia", "North Macedonia"),
        "NO": ("norway", "Norway"),
        "OM": ("oman", "Oman"),
        "PK": ("pakistan", "Pakistan"),
        "PA": ("panama", "Panama"),
        "PG": ("papua new guinea", "Papua New Guinea"),
        "PY": ("paraguay", "Paraguay"),
        "PE": ("peru", "Peru"),
        "PH": ("philippines", "Philippines"),
        "PL": ("poland", "Poland"),
        "PT": ("portugal", "Portugal"),
        "QA": ("qatar", "Qatar"),
        "RO": ("romania", "Romania"),
        "RU": ("russia", "Russia"),
        "RW": ("rwanda", "Rwanda"),
        "SA": ("saudi arabia", "Saudi Arabia"),
        "SN": ("senegal", "Senegal"),
        "RS": ("serbia", "Serbia"),
        "SG": ("singapore", "Singapore"),
        "SK": ("slovakia", "Slovakia"),
        "SI": ("slovenia", "Slovenia"),
        "SO": ("somalia", "Somalia"),
        "ZA": ("south africa", "South Africa"),
        "KR": ("south korea", "South Korea"),
        "SS": ("south sudan", "South Sudan"),
        "ES": ("spain", "Spain"),
        "LK": ("sri lanka", "Sri Lanka"),
        "SD": ("sudan", "Sudan"),
        "SE": ("sweden", "Sweden"),
        "CH": ("switzerland", "Switzerland"),
        "SY": ("syria", "Syria"),
        "TW": ("taiwan", "Taiwan"),
        "TJ": ("tajikistan", "Tajikistan"),
        "TZ": ("tanzania", "Tanzania"),
        "TH": ("thailand", "Thailand"),
        "TG": ("togo", "Togo"),
        "TT": ("trinidad and tobago", "Trinidad and Tobago"),
        "TN": ("tunisia", "Tunisia"),
        "TR": ("turkey", "Turkey"),
        "TM": ("turkmenistan", "Turkmenistan"),
        "UG": ("uganda", "Uganda"),
        "UA": ("ukraine", "Ukraine"),
        "AE": ("united arab emirates", "United Arab Emirates"),
        "GB": ("united kingdom", "United Kingdom"),
        "US": ("united states", "United States"),
        "UY": ("uruguay", "Uruguay"),
        "UZ": ("uzbekistan", "Uzbekistan"),
        "VE": ("venezuela", "Venezuela"),
        "VN": ("vietnam", "Vietnam"),
        "YE": ("yemen", "Yemen"),
        "ZM": ("zambia", "Zambia"),
        "ZW": ("zimbabwe", "Zimbabwe"),
    }

    @classmethod
    def _lookup(cls, isoCode):
        """
        Return the (Tavily name, Perplexity name) pair for a code, or None.

        Non-string input (e.g. None from an optional field) is treated as
        "unknown" instead of raising, and the code is normalized by stripping
        whitespace and upper-casing before the table lookup.
        """
        if not isinstance(isoCode, str):
            return None
        return cls._COUNTRY_MAP.get(isoCode.strip().upper())

    @classmethod
    def getForTavily(cls, isoCode: str) -> str:
        """
        Get Tavily-compatible country name from ISO-2 code.
        Args:
            isoCode: ISO-2 country code (e.g., "CH", "US"); case-insensitive
        Returns:
            Country name in lowercase as required by Tavily (e.g., "switzerland", "united states").
            Unknown codes and non-string values are returned unchanged.
        """
        mapping = cls._lookup(isoCode)
        return mapping[0] if mapping else isoCode

    @classmethod
    def getForPerplexity(cls, isoCode: str) -> str:
        """
        Get Perplexity-compatible country name from ISO-2 code.
        Args:
            isoCode: ISO-2 country code (e.g., "CH", "US"); case-insensitive
        Returns:
            Full country name as required by Perplexity (e.g., "Switzerland", "United States").
            Unknown codes and non-string values are returned unchanged.
        """
        mapping = cls._lookup(isoCode)
        return mapping[1] if mapping else isoCode

    @classmethod
    def isValid(cls, isoCode: str) -> bool:
        """
        Check if ISO-2 code is valid.
        Args:
            isoCode: ISO-2 country code to check
        Returns:
            True if the code is present in the mapping table, False otherwise
            (including for None or other non-string input)
        """
        return cls._lookup(isoCode) is not None
--- a/modules/services/init.py
+++ b/modules/services/init.py
@ -81,6 +81,9 @@ class Services:
        from .serviceUtils.mainServiceUtils import UtilsService
        self.utils = PublicService(UtilsService(self))
        from .serviceWeb.mainServiceWeb import WebService
        self.web = PublicService(WebService(self))
 def getInterface(user: User, workflow: ChatWorkflow) -> Services:
    """Build and return a Services instance for the given user and workflow."""
    serviceHub = Services(user, workflow)
    return serviceHub
--- a/modules/services/serviceAi/mainServiceAi.py
+++ b/modules/services/serviceAi/mainServiceAi.py
@ -3,7 +3,6 @@ from typing import Dict, Any, List, Optional, Union
 from modules.datamodels.datamodelChat import PromptPlaceholder, ChatDocument
 from modules.services.serviceExtraction.mainServiceExtraction import ExtractionService
 from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum
 from modules.aicore.aicorePluginTavily import WebResearchRequest, WebResearchResult
 from modules.interfaces.interfaceAiObjects import AiObjects
 from modules.services.serviceAi.subCoreAi import SubCoreAi
 from modules.services.serviceAi.subDocumentProcessing import SubDocumentProcessing
--- a/modules/services/serviceWeb/mainServiceWeb.py
+++ b/modules/services/serviceWeb/mainServiceWeb.py
@ -0,0 +1,314 @@
 """
 Web crawl service for handling web research operations.
 Manages the two-step process: WEB_SEARCH then WEB_CRAWL.
 """
 import json
 import logging
 from typing import Dict, Any, List, Optional
 from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum, AiCallPromptWebSearch, AiCallPromptWebCrawl
 logger = logging.getLogger(__name__)


 class WebcrawlService:
    """
    Service for web search and crawling operations.

    Research runs as: AI intent analysis -> optional WEB_SEARCH to find URLs ->
    one WEB_CRAWL call per URL -> consolidated result dictionary.
    """

    # researchDepth keyword -> maxDepth value placed in the crawl prompt
    _DEPTH_MAP = {"fast": 1, "general": 2, "deep": 3}
    # Page budget used when the analysis does not yield a usable number
    _DEFAULT_MAX_PAGES = 10

    def __init__(self, services):
        """Initialize webcrawl service with service center access."""
        self.services = services

    @staticmethod
    def _toPositiveInt(value, default):
        """Coerce value to an int >= 1; return default when that is not possible."""
        if isinstance(value, bool):
            return default
        try:
            number = int(value)
        except (TypeError, ValueError, OverflowError):
            return default
        return number if number > 0 else default

    @staticmethod
    def _mergeUrls(*urlLists):
        """Concatenate URL lists, dropping non-strings, blanks and duplicates (first occurrence wins)."""
        merged = {}
        for urlList in urlLists:
            if isinstance(urlList, str):
                urlList = [urlList]
            if not isinstance(urlList, (list, tuple)):
                continue
            for url in urlList:
                if isinstance(url, str) and url.strip():
                    merged.setdefault(url.strip(), None)
        return list(merged)

    @staticmethod
    def _stripCodeFence(text):
        """Remove a surrounding Markdown code fence (``` or ```json) from an AI text response."""
        if not isinstance(text, str):
            return text
        cleaned = text.strip()
        if cleaned.startswith("```"):
            firstNewline = cleaned.find("\n")
            cleaned = cleaned[firstNewline + 1:] if firstNewline != -1 else cleaned[3:]
            if cleaned.rstrip().endswith("```"):
                cleaned = cleaned.rstrip()[:-3]
        return cleaned.strip()

    @staticmethod
    def _extractUrls(searchData):
        """Pull URL strings out of a search response shaped as {"urls": [...]}, {"results": [...]} or a list."""
        if isinstance(searchData, dict):
            entries = searchData["urls"] if "urls" in searchData else searchData.get("results")
        else:
            entries = searchData
        if not isinstance(entries, list):
            return []
        urls = []
        for entry in entries:
            # Entries may be plain URL strings or objects carrying a "url" key
            url = entry.get("url") if isinstance(entry, dict) else entry
            if isinstance(url, str) and url:
                urls.append(url)
        return urls

    async def performWebResearch(
        self,
        prompt: str,
        urls: List[str],
        country: Optional[str],
        language: Optional[str],
        researchDepth: str = "general",
        operationId: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Perform web research:
        1. Use AI to analyze prompt and extract parameters + URLs
        2. Call WEB_SEARCH to get URLs (if needed)
        3. Combine URLs (de-duplicated) and filter to maxNumberPages
        4. Call WEB_CRAWL for each URL
        5. Return consolidated result
        Args:
            prompt: Natural language research prompt
            urls: Optional list of URLs provided by user
            country: Optional country code
            language: Optional language code
            researchDepth: "fast", "general" or "deep"; used when the analysis gives no depth
            operationId: Operation ID for progress tracking
        Returns:
            Consolidated research results as dictionary, or {"error": ...} when
            no URL could be determined
        Raises:
            Exception: any unexpected error is logged and re-raised
        """
        try:
            # Step 1: AI intention analysis - extract URLs and parameters from prompt
            self.services.workflow.progressLogUpdate(operationId, 0.1, "Analyzing research intent")
            analysisResult = await self._analyzeResearchIntent(prompt, urls, country, language, researchDepth)

            # The analysis is model output: keys may be missing, null or wrongly typed,
            # so every value falls back to what the caller supplied.
            instruction = analysisResult.get("instruction")
            if not isinstance(instruction, str) or not instruction.strip():
                instruction = prompt
            extractedUrls = self._mergeUrls(analysisResult.get("urls"))
            needsSearch = analysisResult.get("needsSearch")
            if needsSearch is None:
                needsSearch = True
            maxNumberPages = self._toPositiveInt(analysisResult.get("maxNumberPages"), self._DEFAULT_MAX_PAGES)
            timeRange = analysisResult.get("timeRange")
            countryCode = analysisResult.get("country") or country
            languageCode = analysisResult.get("language") or language
            finalResearchDepth = analysisResult.get("researchDepth") or researchDepth
            logger.info(f"AI Analysis: instruction='{instruction[:100]}...', urls={len(extractedUrls)}, needsSearch={needsSearch}, maxNumberPages={maxNumberPages}, researchDepth={finalResearchDepth}")

            # Combine URLs (from user + from prompt extraction); the analysis tends to echo
            # user-provided URLs, so duplicates are removed to avoid crawling a page twice.
            allUrls = self._mergeUrls(urls, extractedUrls)

            # Step 2: Search for URLs if needed (based on needsSearch flag)
            if needsSearch and len(allUrls) < maxNumberPages:
                self.services.workflow.progressLogUpdate(operationId, 0.3, "Searching for URLs")
                searchUrls = await self._performWebSearch(
                    instruction=instruction,
                    maxNumberPages=maxNumberPages - len(allUrls),
                    timeRange=timeRange,
                    country=countryCode,
                    language=languageCode
                )
                # Add search URLs to the list
                allUrls = self._mergeUrls(allUrls, searchUrls)
                self.services.workflow.progressLogUpdate(operationId, 0.5, f"Found {len(allUrls)} total URLs")

            # Step 3: Filter to maxNumberPages (simple cut, no intelligent filtering)
            if len(allUrls) > maxNumberPages:
                allUrls = allUrls[:maxNumberPages]
                logger.info(f"Limited URLs to {maxNumberPages}")
            if not allUrls:
                return {"error": "No URLs found to crawl"}

            # Step 4: Translate researchDepth to maxDepth (unknown values mean "general")
            maxDepth = self._DEPTH_MAP.get(str(finalResearchDepth).strip().lower(), 2)

            # Step 5: Crawl all URLs
            self.services.workflow.progressLogUpdate(operationId, 0.6, f"Crawling {len(allUrls)} URLs")
            crawlResult = await self._performWebCrawl(
                instruction=instruction,
                urls=allUrls,
                maxDepth=maxDepth
            )
            self.services.workflow.progressLogUpdate(operationId, 0.9, "Consolidating results")
            # Return consolidated result
            return {
                "instruction": instruction,
                "urls_crawled": allUrls,
                "total_urls": len(allUrls),
                "results": crawlResult,
                "total_results": len(crawlResult) if isinstance(crawlResult, list) else 1
            }
        except Exception as e:
            logger.error(f"Error in web research: {str(e)}")
            raise

    async def _analyzeResearchIntent(
        self,
        prompt: str,
        urls: List[str],
        country: Optional[str],
        language: Optional[str],
        researchDepth: str = "general"
    ) -> Dict[str, Any]:
        """
        Use AI to analyze prompt and extract:
        - URLs from the prompt text
        - Research instruction
        - maxNumberPages, timeRange, country, language from context
        Returns:
            The parsed JSON object from the AI. When the call fails or the
            response is not a JSON object, a fallback dictionary built from the
            caller-supplied values is returned instead (never raises).
        """
        # Build analysis prompt for AI
        analysisPrompt = f"""Analyze this web research request and extract structured information.
 RESEARCH REQUEST:
 {prompt}
 USER PROVIDED:
 - URLs: {json.dumps(urls) if urls else "None"}
 - Country: {country or "Not specified"}
 - Language: {language or "Not specified"}
 Extract and provide a JSON response with:
 1. instruction: The core research instruction (cleaned prompt without URLs)
 2. urls: List of URLs found in the prompt text
 3. needsSearch: true if web search is needed to identify url's to crawl, false if only crawling of provided URLs is wanted
 4. maxNumberPages: Recommended number of URLs to crawl (based on research scope, typical: 2-20)
 5. timeRange: Time range if mentioned (d, w, m, y, or null)
 6. country: Country code if specified (2-digit lowercase, e.g., ch, us, de)
 7. language: Language code if specified (lowercase, e.g., de, en, fr)
 8. researchDepth: Research depth based on instruction complexity - "fast" (quick overview, maxDepth=1), "general" (standard research, maxDepth=2), or "deep" (comprehensive research, maxDepth=3)
 Return ONLY valid JSON, no additional text:
 {{
    "instruction": "cleaned research instruction",
    "urls": ["url1", "url2"],
    "needsSearch": true,
    "maxNumberPages": 10,
    "timeRange": null,
    "country": "ch",
    "language": "de",
    "researchDepth": "general"
 }}"""
        try:
            # Call AI planning to analyze intent
            analysisJson = await self.services.ai.callAiPlanning(analysisPrompt)
            # Parse JSON response (models sometimes wrap it in a Markdown code fence)
            result = json.loads(self._stripCodeFence(analysisJson))
            if not isinstance(result, dict):
                # Callers use .get() on the result, so anything but an object is unusable
                raise ValueError(f"expected a JSON object, got {type(result).__name__}")
            logger.info(f"Intent analysis result: {result}")
            return result
        except Exception as e:
            logger.warning(f"Error in AI intent analysis: {str(e)}")
            # Fallback to basic extraction
            return {
                "instruction": prompt,
                "urls": [],
                "needsSearch": True,
                "maxNumberPages": 10,
                "timeRange": None,
                "country": country,
                "language": language,
                "researchDepth": researchDepth
            }

    async def _performWebSearch(
        self,
        instruction: str,
        maxNumberPages: int,
        timeRange: Optional[str],
        country: Optional[str],
        language: Optional[str]
    ) -> List[str]:
        """
        Perform web search to find URLs.
        Returns:
            List of URL strings; empty when the search fails or yields nothing
            (errors are logged, not raised, so research can continue with known URLs).
        """
        try:
            # Build search prompt model
            searchPromptModel = AiCallPromptWebSearch(
                instruction=instruction,
                country=country,
                maxNumberPages=maxNumberPages,
                timeRange=timeRange,
                language=language
            )
            searchPrompt = searchPromptModel.model_dump_json(exclude_none=True, indent=2)
            # Call AI with WEB_SEARCH operation
            searchOptions = AiCallOptions(
                operationType=OperationTypeEnum.WEB_SEARCH,
                resultFormat="json"
            )
            searchResult = await self.services.ai.callAiDocuments(
                prompt=searchPrompt,
                documents=None,
                options=searchOptions,
                outputFormat="json"
            )
            # Parse and extract URLs
            if isinstance(searchResult, str):
                searchData = json.loads(self._stripCodeFence(searchResult))
            else:
                searchData = searchResult
            urls = self._extractUrls(searchData)
            logger.info(f"Web search returned {len(urls)} URLs")
            return urls
        except Exception as e:
            logger.error(f"Error in web search: {str(e)}")
            return []

    async def _performWebCrawl(
        self,
        instruction: str,
        urls: List[str],
        maxDepth: int = 2
    ) -> List[Dict[str, Any]]:
        """
        Perform web crawl on list of URLs - calls plugin for each URL individually.
        Returns:
            Flat list of result dictionaries. A URL whose crawl fails contributes
            {"url": ..., "error": ...} instead of aborting the remaining URLs.
        """
        crawlResults = []
        # Loop over each URL and crawl one at a time
        for url in urls:
            try:
                logger.info(f"Crawling URL: {url}")
                # Build crawl prompt model for single URL
                crawlPromptModel = AiCallPromptWebCrawl(
                    instruction=instruction,
                    url=url,  # Single URL
                    maxDepth=maxDepth,
                    maxWidth=10
                )
                crawlPrompt = crawlPromptModel.model_dump_json(exclude_none=True, indent=2)
                # Call AI with WEB_CRAWL operation
                crawlOptions = AiCallOptions(
                    operationType=OperationTypeEnum.WEB_CRAWL,
                    resultFormat="json"
                )
                crawlResult = await self.services.ai.callAiDocuments(
                    prompt=crawlPrompt,
                    documents=None,
                    options=crawlOptions,
                    outputFormat="json"
                )
                # Parse crawl result; non-JSON text is kept as raw page content
                if isinstance(crawlResult, str):
                    try:
                        crawlData = json.loads(crawlResult)
                    except ValueError:
                        crawlData = {"url": url, "content": crawlResult}
                else:
                    crawlData = crawlResult
                # Ensure it's a list of results
                if isinstance(crawlData, list):
                    crawlResults.extend(crawlData)
                elif isinstance(crawlData, dict):
                    # Only unpack "results" when it really is a list; extending with a
                    # string or dict would scatter characters/keys into the output.
                    if isinstance(crawlData.get("results"), list):
                        crawlResults.extend(crawlData["results"])
                    else:
                        crawlResults.append(crawlData)
                else:
                    crawlResults.append({"url": url, "content": str(crawlData)})
            except Exception as e:
                logger.error(f"Error crawling URL {url}: {str(e)}")
                crawlResults.append({"url": url, "error": str(e)})
        return crawlResults


 # The service registry imports this class under the name WebService
 # (from .serviceWeb.mainServiceWeb import WebService); keep both names importable.
 WebService = WebcrawlService
--- a/modules/workflows/methods/methodAi.py
+++ b/modules/workflows/methods/methodAi.py
@ -10,9 +10,7 @@ from datetime import datetime, UTC
 from modules.workflows.methods.methodBase import MethodBase, action
 from modules.datamodels.datamodelChat import ActionResult
-from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum
+from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum, AiCallPromptImage
 from modules.datamodels.datamodelChat import ChatDocument
 from modules.aicore.aicorePluginTavily import WebResearchRequest
 logger = logging.getLogger(__name__)
@ -163,272 +161,50 @@ class MethodAi(MethodBase):
            )
    @action
    async def webSearch(self, parameters: Dict[str, Any]) -> ActionResult:
        """
        GENERAL:
        - Purpose: Search the web and return a list of relevant URLs only.
        - Input requirements: searchPrompt (required); optional maxResults, timeRange, country, language.
        - Output format: JSON with search results and URLs.
        Parameters:
        - searchPrompt (str, required): Natural language search prompt describing what to search for.
        - maxResults (int, optional): Maximum number of search results. Default: 5.
        - timeRange (str, optional): d | w | m | y for time filtering.
        - country (str, optional): Country name for localized results.
        - language (str, optional): Language code (e.g., de, en, fr).
        """
        try:
            # Guard clause: nothing to do without a search prompt
            query = parameters.get("searchPrompt")
            if not query:
                return ActionResult.isFailure(error="Search prompt is required")

            # Request the WEB_SEARCH operation with a JSON result
            searchOptions = AiCallOptions(
                operationType=OperationTypeEnum.WEB_SEARCH,
                resultFormat="json"
            )

            # One JSON payload carrying the query, the optional filters and the
            # output instructions, in a shape both search plugins can read
            payload = {
                "searchPrompt": query,
                "maxResults": parameters.get("maxResults", 5),
                "timeRange": parameters.get("timeRange"),
                "country": parameters.get("country"),
                "language": parameters.get("language"),
                "instructions": "Search the web and return a JSON response with a 'results' array containing objects with 'title', 'url', and optionally 'content' fields. Focus on finding relevant URLs for the search prompt."
            }
            import json
            serializedPayload = json.dumps(payload, indent=2)

            # Route the request through the unified AI service entry point
            rawResult = await self.services.ai.callAiDocuments(
                prompt=serializedPayload,
                documents=None,
                options=searchOptions,
                outputFormat="json"
            )

            # Normalize the response, then wrap it in a named JSON document
            normalizedResult = self._processWebSearchResult(rawResult)
            documentName = self._generateMeaningfulFileName(
                base_name="web_search",
                extension="json",
                action_name="search"
            )
            from modules.datamodels.datamodelChat import ActionDocument
            searchDocument = ActionDocument(
                documentName=documentName,
                documentData=normalizedResult,
                mimeType="application/json"
            )
            return ActionResult.isSuccess(documents=[searchDocument])
        except Exception as e:
            logger.error(f"Error in web search: {str(e)}")
            return ActionResult.isFailure(error=str(e))
    def _processWebSearchResult(self, result: str) -> str:
        """
        Process web search result to ensure consistent JSON format with URL list.
        Both Tavily and Perplexity now return proper JSON format.
        """
        try:
            import json
            data = json.loads(result)
            # If it's already a proper search result format, return as-is
            if isinstance(data, dict) and "results" in data:
                return result
            # If it's a different JSON format, try to extract URLs
            if isinstance(data, dict):
                # Look for URL patterns in the JSON
                urls = self._extractUrlsFromJson(data)
                if urls:
                    processedData = {
                        "query": data.get("query", "web search"),
                        "results": [{"title": f"Result {i+1}", "url": url} for i, url in enumerate(urls)],
                        "total_count": len(urls)
                    }
                    return json.dumps(processedData, indent=2)
            # No URLs found, return original result in a structured format
            processedData = {
                "query": "web search",
                "results": [],
                "total_count": 0,
                "raw_response": result
            }
            return json.dumps(processedData, indent=2)
        except Exception as e:
            logger.warning(f"Error processing web search result: {str(e)}")
            # Return original result wrapped in error format
            errorData = {
                "query": "web search",
                "results": [],
                "total_count": 0,
                "error": f"Failed to process result: {str(e)}",
                "raw_response": result
            }
            return json.dumps(errorData, indent=2)
    def _extractUrlsFromJson(self, data: Dict[str, Any]) -> List[str]:
        """Extract URLs from JSON data structure."""
        urls = []
        def _extractFromValue(value):
            if isinstance(value, str):
                # Check if it's a URL
                if value.startswith(('http://', 'https://')):
                    urls.append(value)
            elif isinstance(value, dict):
                for v in value.values():
                    _extractFromValue(v)
            elif isinstance(value, list):
                for item in value:
                    _extractFromValue(item)
        _extractFromValue(data)
        return list(set(urls))  # Remove duplicates
    @action
    async def webCrawl(self, parameters: Dict[str, Any]) -> ActionResult:
        """
        GENERAL:
        - Purpose: Extract content from specific URLs.
        - Input requirements: urls (required); optional extractDepth, format.
        - Output format: JSON with extracted content from URLs.
        Parameters:
        - urls (list, required): List of URLs to crawl and extract content from.
        - extractDepth (str, optional): basic | advanced. Default: advanced.
        - format (str, optional): markdown | html | text. Default: markdown.
        """
        # NOTE(review): the GENERAL / Parameters docstring layout looks like it may
        # be read by tooling, so its wording is left untouched - confirm.
        try:
            # Guard clause: a non-empty list of URLs is mandatory
            targetUrls = parameters.get("urls")
            if not (targetUrls and isinstance(targetUrls, list)):
                return ActionResult.isFailure(error="URLs list is required")

            # Options that mark this AI call as a web crawl returning JSON
            crawlOptions = AiCallOptions(
                operationType=OperationTypeEnum.WEB_CRAWL,
                resultFormat="json"
            )

            # Request payload; optional settings fall back to their defaults
            crawlRequest = {
                "urls": targetUrls,
                "extractDepth": parameters.get("extractDepth", "advanced"),
                "format": parameters.get("format", "markdown"),
                "instructions": "Extract content from the provided URLs and return a JSON response with 'results' array containing objects with 'url', 'title', 'content', and 'extractedAt' fields."
            }
            import json
            crawlPrompt = json.dumps(crawlRequest, indent=2)

            # The payload is sent as the prompt of a document-less AI call
            crawlOutput = await self.services.ai.callAiDocuments(
                prompt=crawlPrompt,
                documents=None,
                options=crawlOptions,
                outputFormat="json"
            )

            # Wrap the raw result in a single JSON document for the caller
            fileName = self._generateMeaningfulFileName(
                base_name="web_crawl",
                extension="json",
                action_name="crawl"
            )
            from modules.datamodels.datamodelChat import ActionDocument
            return ActionResult.isSuccess(documents=[
                ActionDocument(
                    documentName=fileName,
                    documentData=crawlOutput,
                    mimeType="application/json"
                )
            ])
        except Exception as exc:
            # Any failure is logged and reported as a failed action result
            logger.error(f"Error in web crawl: {str(exc)}")
            return ActionResult.isFailure(error=str(exc))
    @action
    async def webResearch(self, parameters: Dict[str, Any]) -> ActionResult:
        """
        GENERAL:
-        - Purpose: Comprehensive web research combining search and content extraction.
+        - Purpose: Web research with two-step process: search for URLs, then crawl content.
-        - Input requirements: researchPrompt (required); optional maxResults, urls, timeRange, country, language.
+        - Input requirements: prompt (required); optional list(url), country, language, researchDepth.
-        - Output format: JSON with research results, sources, and analysis.
+        - Output format: JSON with research results including URLs and content.
        Parameters:
-        - researchPrompt (str, required): Natural language research prompt describing what to research.
+        - prompt (str, required): Natural language research instruction, including time range if relevant.
-        - maxResults (int, optional): Maximum search results. Default: 5.
+        - list(url) (list, optional): Specific URLs to crawl, if needed.
-        - urls (list, optional): Specific URLs to include in research.
+        - country (str, optional): Two-digit country code (lowercase, e.g., ch, us, de).
-        - timeRange (str, optional): d | w | m | y for time filtering.
+        - language (str, optional): Language code (lowercase, e.g., de, en, fr).
-        - country (str, optional): Country name for localized results.
+        - researchDepth (str, optional): Research depth - fast, general, or deep. Default: general.
        - language (str, optional): Language code (e.g., de, en, fr).
        """
        try:
-            researchPrompt = parameters.get("researchPrompt")
+            prompt = parameters.get("prompt")
-            if not researchPrompt:
+            if not prompt:
                return ActionResult.isFailure(error="Research prompt is required")
-            # Extract optional parameters
+            # Init progress logger
-            maxResults = parameters.get("maxResults", 5)
+            operationId = f"web_research_{self.services.currentWorkflow.id}_{int(time.time())}"
            urls = parameters.get("urls")
            timeRange = parameters.get("timeRange")
            country = parameters.get("country")
            language = parameters.get("language")
-            # Build AI call options for web research
+            # Start progress tracking
-            options = AiCallOptions(
+            self.services.workflow.progressLogStart(
-                operationType=OperationTypeEnum.WEB_RESEARCH,
+                operationId,
-                resultFormat="json"
+                "Web Research",
                "Searching and Crawling",
                "Extracting URLs and Content"
            )
-            # Create unified prompt JSON for web research
+            # Call webcrawl service - service handles all AI intention analysis and processing
-            promptData = {
+            result = await self.services.web.performWebResearch(
                "researchPrompt": researchPrompt,
                "maxResults": maxResults,
                "urls": urls,
                "timeRange": timeRange,
                "country": country,
                "language": language,
                "instructions": "Conduct comprehensive web research and return a JSON response with 'results' array containing objects with 'title', 'url', 'content', and 'analysis' fields. Provide detailed analysis and insights."
            }
            import json
            prompt = json.dumps(promptData, indent=2)
            # Call AI service through unified path
            result = await self.services.ai.callAiDocuments(
                prompt=prompt,
-                documents=None,
+                urls=parameters.get("list(url)", []),
-                options=options,
+                country=parameters.get("country"),
-                outputFormat="json"
+                language=parameters.get("language"),
                researchDepth=parameters.get("researchDepth", "general"),
                operationId=operationId
            )
            # Complete progress tracking
            self.services.workflow.progressLogFinish(operationId, True)
            # Create meaningful filename
            meaningfulName = self._generateMeaningfulFileName(
                base_name="web_research",
@ -447,157 +223,10 @@ class MethodAi(MethodBase):
        except Exception as e:
            logger.error(f"Error in web research: {str(e)}")
-            return ActionResult.isFailure(error=str(e))
+            try:
-
+                self.services.workflow.progressLogFinish(operationId, False)
-
+            except:
-    @action
+                pass
    async def webQuestions(self, parameters: Dict[str, Any]) -> ActionResult:
        """
        GENERAL:
        - Purpose: Answer questions using web research and AI analysis.
        - Input requirements: question (required); optional context, maxResults, timeRange, country, language.
        - Output format: JSON with question answer and supporting sources.
        Parameters:
        - question (str, required): Question to be answered using web research.
        - context (str, optional): Additional context for the question.
        - maxResults (int, optional): Maximum search results. Default: 5.
        - timeRange (str, optional): d | w | m | y for time filtering.
        - country (str, optional): Country name for localized results.
        - language (str, optional): Language code (e.g., de, en, fr).
        """
        # NOTE(review): the GENERAL / Parameters docstring layout looks like it may
        # be read by tooling, so its wording is left untouched - confirm.
        try:
            # Guard clause: without a question there is nothing to answer
            userQuestion = parameters.get("question")
            if not userQuestion:
                return ActionResult.isFailure(error="Question is required")

            # Request payload; missing optional filters are passed on as None
            questionRequest = {
                "question": userQuestion,
                "context": parameters.get("context", ""),
                "maxResults": parameters.get("maxResults", 5),
                "timeRange": parameters.get("timeRange"),
                "country": parameters.get("country"),
                "language": parameters.get("language"),
                "instructions": "Answer the question using web research and return a JSON response with 'answer', 'sources' array containing objects with 'title', 'url', 'content', and 'relevance' fields."
            }

            # Options that mark this AI call as a web-questions operation returning JSON
            questionOptions = AiCallOptions(
                operationType=OperationTypeEnum.WEB_QUESTIONS,
                resultFormat="json"
            )
            import json
            questionPrompt = json.dumps(questionRequest, indent=2)

            # The payload is sent as the prompt of a document-less AI call
            answerOutput = await self.services.ai.callAiDocuments(
                prompt=questionPrompt,
                documents=None,
                options=questionOptions,
                outputFormat="json"
            )

            # Wrap the raw result in a single JSON document for the caller
            fileName = self._generateMeaningfulFileName(
                base_name="web_questions",
                extension="json",
                action_name="questions"
            )
            from modules.datamodels.datamodelChat import ActionDocument
            answerDocument = ActionDocument(
                documentName=fileName,
                documentData=answerOutput,
                mimeType="application/json"
            )
            return ActionResult.isSuccess(documents=[answerDocument])
        except Exception as exc:
            # Any failure is logged and reported as a failed action result
            logger.error(f"Error in web questions: {str(exc)}")
            return ActionResult.isFailure(error=str(exc))
    @action
    async def webNews(self, parameters: Dict[str, Any]) -> ActionResult:
        """
        GENERAL:
        - Purpose: Search and analyze news articles on specific topics.
        - Input requirements: newsPrompt (required); optional maxResults, timeRange, country, language.
        - Output format: JSON with news articles, summaries, and analysis.
        Parameters:
        - newsPrompt (str, required): Natural language prompt describing what news to search for.
        - maxResults (int, optional): Maximum news articles. Default: 5.
        - timeRange (str, optional): d | w | m | y for time filtering. Default: w.
        - country (str, optional): Country name for localized news.
        - language (str, optional): Language code (e.g., de, en, fr).
        """
        # NOTE(review): the GENERAL / Parameters docstring layout looks like it may
        # be read by tooling, so its wording is left untouched - confirm.
        try:
            # Guard clause: a news prompt is mandatory
            topicPrompt = parameters.get("newsPrompt")
            if not topicPrompt:
                return ActionResult.isFailure(error="News prompt is required")

            # Optional filters; only maxResults and timeRange have defaults
            articleLimit = parameters.get("maxResults", 5)
            period = parameters.get("timeRange", "w")
            region = parameters.get("country")
            locale = parameters.get("language")

            # Options that mark this AI call as a web-news operation returning JSON
            newsOptions = AiCallOptions(
                operationType=OperationTypeEnum.WEB_NEWS,
                resultFormat="json"
            )

            import json
            newsPromptJson = json.dumps(
                {
                    "newsPrompt": topicPrompt,
                    "maxResults": articleLimit,
                    "timeRange": period,
                    "country": region,
                    "language": locale,
                    "instructions": "Find and analyze recent news articles and return a JSON response with 'articles' array containing objects with 'title', 'url', 'content', 'date', 'source', and 'summary' fields."
                },
                indent=2
            )

            # The payload is sent as the prompt of a document-less AI call
            newsOutput = await self.services.ai.callAiDocuments(
                prompt=newsPromptJson,
                documents=None,
                options=newsOptions,
                outputFormat="json"
            )

            # Wrap the raw result in a single JSON document for the caller
            fileName = self._generateMeaningfulFileName(
                base_name="web_news",
                extension="json",
                action_name="news"
            )
            from modules.datamodels.datamodelChat import ActionDocument
            newsDocument = ActionDocument(
                documentName=fileName,
                documentData=newsOutput,
                mimeType="application/json"
            )
            return ActionResult.isSuccess(documents=[newsDocument])
        except Exception as exc:
            # Any failure is logged and reported as a failed action result
            logger.error(f"Error in web news: {str(exc)}")
            return ActionResult.isFailure(error=str(exc))
@ -631,17 +260,16 @@ class MethodAi(MethodBase):
                resultFormat="base64"
            )
-            # Create unified prompt JSON for image generation
+            # Create structured prompt using Pydantic model
-            promptData = {
+            promptModel = AiCallPromptImage(
-                "prompt": prompt,
+                prompt=prompt,
-                "size": size,
+                size=size,
-                "quality": quality,
+                quality=quality,
-                "style": style,
+                style=style
-                "instructions": "Generate an image based on the prompt and return the base64 encoded image data."
+            )
            }
-            import json
+            # Convert to JSON string for prompt
-            promptJson = json.dumps(promptData, indent=2)
+            promptJson = promptModel.model_dump_json(exclude_none=True, indent=2)
            # Call AI service through unified path
            result = await self.services.ai.callAiDocuments(
--- a/test_ai_models.py
+++ b/test_ai_models.py
@ -91,26 +91,18 @@ class AIModelsTester:
        print(f"TESTING MODEL: {modelName}")
        print(f"{'='*60}")
-        # Choose test prompt based on model type - Web models get JSON formatted prompts
+        # Use same prompt for all web models
        import json
-        if "tavily" in modelName.lower():
+        if "tavily" in modelName.lower() or "perplexity" in modelName.lower() or "llama" in modelName.lower() or "sonar" in modelName.lower() or "mistral" in modelName.lower():
-            # Tavily models get web search prompt in JSON format (from methodAi.py)
+            # All web models use the same JSON formatted prompt
            # Country format: Use full name for Tavily (Switzerland), Perplexity converts ISO codes to names
            testPrompt = json.dumps({
-                "searchPrompt": "Search for recent news about artificial intelligence developments in 2024. Return the top 3 results as JSON with fields: title, url, snippet.",
+                "prompt": "Research, what ValueOn company in switzerland does and who works there? Return as JSON.",
                "maxResults": 3,
                "timeRange": "y",
                "country": "United States",
                "instructions": "Search the web and return a JSON response with a 'results' array containing objects with 'title', 'url', and optionally 'content' fields. Focus on finding relevant URLs for the search prompt."
            }, indent=2)
        elif "perplexity" in modelName.lower() or "llama" in modelName.lower() or "sonar" in modelName.lower() or "mistral" in modelName.lower():
            # Perplexity models get web research prompt in JSON format (from methodAi.py)
            testPrompt = json.dumps({
                "researchPrompt": "Research the latest trends in renewable energy technology. Provide a comprehensive overview with key developments, companies involved, and future prospects. Return as JSON.",
                "maxResults": 5,
                "timeRange": "y",
-                "country": "United States",
+                "country": "CH",  # ISO-2 code, Perplexity will convert to "Switzerland"
-                "instructions": "Conduct comprehensive web research and return a JSON response with 'results' array containing objects with 'title', 'url', 'content', and 'analysis' fields. Provide detailed analysis and insights."
+                "format": "json"
            }, indent=2)
        else:
            # Fallback for other models
@ -444,9 +436,7 @@ Is Valid JSON: {result.get('isValidJson', False)}
            # "dall-e-3",  # Skipped - image generation, test later
            "sonar",  # Perplexity web model
            "sonar-pro",  # Perplexity web model
-            "tavily-search",  # Tavily web model
+            "tavily-search",  # Tavily web model (unified research)
            "tavily-extract",  # Tavily web model
            "tavily-search-extract",  # Tavily web model
            # "internal-extractor",  # Skipped - internal model, test later
            # "internal-generator",  # Skipped - internal model, test later
            # "internal-renderer"  # Skipped - internal model, test later