# gateway/modules/workflows/methods/methodAi/actions/webResearch.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.

"""
Web Research action for AI operations.
Web research with two-step process: search for URLs, then crawl content.
"""

import logging
import time
import re
from typing import Dict, Any
from modules.workflows.methods.methodBase import action
from modules.datamodels.datamodelChat import ActionResult, ActionDocument

logger = logging.getLogger(__name__)

# Longest AI-suggested document name (without extension) that is accepted as-is.
_MAX_SUGGESTED_FILENAME_LENGTH = 60

# Characters accepted in an AI-suggested document name: ASCII letters, digits,
# underscore and hyphen. Used with fullmatch(), so no anchors are needed.
_SAFE_FILENAME_PATTERN = re.compile(r'[a-zA-Z0-9_\-]+')


def _cleanSuggestedFilename(suggestedFilename: Any) -> str:
    """
    Normalize a suggested document name and return it without extension.

    Surrounding whitespace and quote characters are stripped, line breaks are
    replaced by spaces and a trailing ".json" (any case) is removed.

    Returns:
        The cleaned name when it is non-empty, at most
        _MAX_SUGGESTED_FILENAME_LENGTH characters long and made only of
        characters matched by _SAFE_FILENAME_PATTERN; otherwise an empty
        string, which tells the caller to use its generic fallback name.
    """
    # A missing or non-string suggestion must not abort an otherwise successful
    # research run, so it is treated as "no usable suggestion".
    if not isinstance(suggestedFilename, str):
        return ""

    cleaned = suggestedFilename.strip().strip('"\'')
    cleaned = cleaned.replace('\n', ' ').replace('\r', ' ').strip()

    # The caller appends the extension itself, so drop one that is already there.
    if cleaned.lower().endswith('.json'):
        cleaned = cleaned[:-len('.json')]

    if len(cleaned) > _MAX_SUGGESTED_FILENAME_LENGTH:
        return ""
    if not _SAFE_FILENAME_PATTERN.fullmatch(cleaned):
        return ""
    return cleaned


@action
async def webResearch(self, parameters: Dict[str, Any]) -> ActionResult:
    """
    GENERAL:
    - Purpose: Web research with two-step process: search for URLs, then crawl content.
    - Input requirements: prompt (required); optional list(url), country, language, researchDepth.
    - Output format: JSON with research results including URLs and content.

    Parameters:
    - prompt (str, required): Natural language research instruction.
    - urlList (list, optional): Specific URLs to crawl, if needed.
    - country (str, optional): Two-digit country code (lowercase, e.g., ch, us, de).
    - language (str, optional): Language code (lowercase, e.g., de, en, fr).
    - researchDepth (str, optional): Research depth - fast, general, or deep. Default: general.
    """
    # NOTE(review): the docstring above is intentionally left unchanged; its
    # GENERAL/Parameters layout looks like it may be consumed by the @action
    # framework - confirm before rewording it.
    #
    # Flow: validate the prompt, open a progress log entry, delegate the whole
    # research to self.services.web.performWebResearch, close the progress log
    # and wrap the returned data in a single JSON ActionDocument.
    # Besides the documented parameters, an optional "parentOperationId" entry
    # is read and forwarded to the progress log.
    # Every exception is converted into ActionResult.isFailure; nothing is raised.

    # Bound before the try block so the error handler can tell whether a
    # progress operation id was ever created.
    operationId = None
    try:
        prompt = parameters.get("prompt")
        if not prompt:
            return ActionResult.isFailure(error="Research prompt is required")

        country = parameters.get("country")
        language = parameters.get("language")
        researchDepth = parameters.get("researchDepth", "general")

        # Init progress logger
        workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
        operationId = f"web_research_{workflowId}_{int(time.time())}"

        # Start progress tracking
        self.services.chat.progressLogStart(
            operationId,
            "Web Research",
            "Searching and Crawling",
            "Extracting URLs and Content",
            parentOperationId=parameters.get('parentOperationId')
        )

        # Call webcrawl service - service handles all AI intention analysis and processing
        result = await self.services.web.performWebResearch(
            prompt=prompt,
            urls=parameters.get("urlList", []),
            country=country,
            language=language,
            researchDepth=researchDepth,
            operationId=operationId
        )

        # Complete progress tracking
        self.services.chat.progressLogFinish(operationId, True)

        # Prefer the filename suggested in the research result; fall back to a
        # generic generated name when the suggestion is missing or unsafe.
        cleanedName = _cleanSuggestedFilename(result.get("suggested_filename"))
        if cleanedName:
            meaningfulName = f"{cleanedName}.json"
        else:
            meaningfulName = self._generateMeaningfulFileName(
                base_name="web_research",
                extension="json",
                action_name="research"
            )

        validationMetadata = {
            "actionType": "ai.webResearch",
            "prompt": prompt,
            "urlList": parameters.get("urlList", []),
            "country": country,
            "language": language,
            "researchDepth": researchDepth,
            "resultFormat": "json"
        }
        actionDocument = ActionDocument(
            documentName=meaningfulName,
            documentData=result,
            mimeType="application/json",
            validationMetadata=validationMetadata
        )

        return ActionResult.isSuccess(documents=[actionDocument])

    except Exception as e:
        # logger.exception keeps the traceback alongside the message.
        logger.exception("Error in web research: %s", e)
        # Closing the progress log is best effort: it must never mask the
        # original error, but a failure to close it is still worth a log line.
        if operationId is not None:
            try:
                self.services.chat.progressLogFinish(operationId, False)
            except Exception as finishError:
                logger.warning(
                    "Could not close progress log for %s: %s",
                    operationId,
                    finishError
                )
        return ActionResult.isFailure(error=str(e))