# gateway/modules/workflows/methods/methodAi/actions/webResearch.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.

import logging
import time
import re
import json
from typing import Dict, Any
from modules.datamodels.datamodelChat import ActionResult, ActionDocument
from modules.serviceCenter import ServiceCenterContext, getService, can_access_service
from modules.serviceCenter.services.serviceSubscription.mainServiceSubscription import SubscriptionInactiveException
from modules.serviceCenter.services.serviceBilling.mainServiceBilling import BillingContextError

logger = logging.getLogger(__name__)

def _build_research_prompt(parameters: Dict[str, Any]) -> str:
    """Assemble the final research prompt from prompt + optional context/documentList."""
    base_prompt = (parameters.get("prompt") or "").strip()
    context_val = parameters.get("context")
    doc_list = parameters.get("documentList")

    parts: list[str] = []

    # Prepend context string if provided
    if context_val and isinstance(context_val, str) and context_val.strip():
        parts.append(f"Kontext:\n{context_val.strip()}")

    # Extract text from documentList items if provided
    if doc_list:
        docs: list = []
        if isinstance(doc_list, dict):
            docs = doc_list.get("documents", []) or doc_list.get("items", [])
        elif isinstance(doc_list, list):
            docs = doc_list
        doc_texts = []
        for d in docs:
            if isinstance(d, dict):
                text = d.get("documentData") or d.get("text") or d.get("content") or ""
                if text and isinstance(text, str):
                    doc_texts.append(text.strip())
        if doc_texts:
            parts.append("Dokumente:\n" + "\n---\n".join(doc_texts))

    parts.append(base_prompt)
    return "\n\n".join(p for p in parts if p)


def _clean_suggested_filename(suggested: Any) -> str:
    """Turn a suggested base name into a safe ``<name>.json`` filename.

    Returns an empty string when *suggested* is not a string or when, after
    trimming quotes/whitespace and a trailing ``.json``, it is empty, longer
    than 60 characters, or contains anything other than ASCII letters, digits,
    ``_`` and ``-``.
    """
    if not isinstance(suggested, str):
        return ""
    cleaned = suggested.strip().strip('"\'')
    cleaned = cleaned.replace('\n', ' ').replace('\r', ' ').strip()
    # Ensure it doesn't already have extension
    if cleaned.lower().endswith('.json'):
        cleaned = cleaned[:-5]
    if cleaned and len(cleaned) <= 60 and re.fullmatch(r'[a-zA-Z0-9_\-]+', cleaned):
        return f"{cleaned}.json"
    return ""


def _mark_research_failed(self, operationId) -> None:
    """Best-effort: close the progress log for *operationId* as failed.

    Never raises; a failure to update the progress log must not mask the
    error that is currently being handled.
    """
    if not operationId:
        return
    try:
        self.services.chat.progressLogFinish(operationId, False)
    except Exception:
        logger.debug("Could not close progress log %s", operationId, exc_info=True)


async def webResearch(self, parameters: Dict[str, Any]) -> ActionResult:
    """Run a web research via the "web" service and return the result as a JSON document.

    Parameters read from *parameters*:
        prompt / context / documentList: combined into the research prompt by
            ``_build_research_prompt``.
        urlList: passed to the service as ``urls`` (default ``[]``).
        country, language: passed through unchanged.
        researchDepth: passed through (default ``"general"``).
        parentOperationId: optional parent for the progress log entry.

    Returns:
        ``ActionResult.isSuccess`` carrying a single ``ActionDocument`` with
        mime type ``application/json``, or ``ActionResult.isFailure`` when the
        prompt is empty, access to the web service is denied, or any other
        error occurs.

    Raises:
        SubscriptionInactiveException, BillingContextError: re-raised after the
            progress log has been closed as failed.
    """
    operationId = None
    try:
        prompt = _build_research_prompt(parameters)
        if not prompt:
            return ActionResult.isFailure(error="Research prompt is required")

        services = self.services
        mandateId = getattr(services, "mandateId", None)
        featureInstanceId = getattr(services, "featureInstanceId", None)

        # RBAC: Check service-level permission
        rbac = getattr(services, "rbac", None)
        if rbac and not can_access_service(
            services.user,
            rbac,
            "web",
            mandate_id=mandateId,
            feature_instance_id=featureInstanceId,
        ):
            return ActionResult.isFailure(error="Permission denied: Web research service")

        # Build context for service center
        workflow = services.workflow
        context = ServiceCenterContext(
            user=services.user,
            mandate_id=mandateId,
            feature_instance_id=featureInstanceId,
            workflow_id=workflow.id if workflow else None,
            workflow=workflow,
        )
        web_service = getService("web", context)

        # Init progress logger
        workflowId = workflow.id if workflow else f"no-workflow-{int(time.time())}"
        operationId = f"web_research_{workflowId}_{int(time.time())}"

        # Start progress tracking
        services.chat.progressLogStart(
            operationId,
            "Web Research",
            "Searching and Crawling",
            "Extracting URLs and Content",
            parentOperationId=parameters.get('parentOperationId')
        )

        # Call webcrawl service - service handles all AI intention analysis and processing
        result = await web_service.performWebResearch(
            prompt=prompt,
            urls=parameters.get("urlList", []),
            country=parameters.get("country"),
            language=parameters.get("language"),
            researchDepth=parameters.get("researchDepth", "general"),
            operationId=operationId
        )

        # Get meaningful filename from research result (generated by intent analyzer).
        # The result is only inspected when it is a dict; the serialization below
        # already allows for non-dict results.
        suggestedFilename = result.get("suggested_filename") if isinstance(result, dict) else None
        meaningfulName = _clean_suggested_filename(suggestedFilename)
        if not meaningfulName:
            # Fallback to generic meaningful filename
            meaningfulName = self._generateMeaningfulFileName(
                base_name="web_research",
                extension="json",
                action_name="research"
            )

        validationMetadata = {
            "actionType": "ai.webResearch",
            "prompt": prompt,
            "urlList": parameters.get("urlList", []),
            "country": parameters.get("country"),
            "language": parameters.get("language"),
            "researchDepth": parameters.get("researchDepth", "general"),
            "resultFormat": "json"
        }
        documentData = json.dumps(result, ensure_ascii=False) if isinstance(result, dict) else result
        actionDocument = ActionDocument(
            documentName=meaningfulName,
            documentData=documentData,
            mimeType="application/json",
            validationMetadata=validationMetadata
        )

        # Complete progress tracking only once the document has been built, so a
        # packaging error is not reported as success followed by failure.
        services.chat.progressLogFinish(operationId, True)

        return ActionResult.isSuccess(documents=[actionDocument])

    except (SubscriptionInactiveException, BillingContextError):
        # Subscription/billing problems are handled by the caller; just close
        # the progress entry and propagate.
        _mark_research_failed(self, operationId)
        raise
    except Exception as e:
        # logger.exception keeps the traceback, which the plain message lacks.
        logger.exception("Error in web research: %s", e)
        _mark_research_failed(self, operationId)
        return ActionResult.isFailure(error=str(e))