# gateway/modules/workflows/methods/methodAi/actions/webResearch.py
#
# 164 lines
# 6.5 KiB
# Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
import logging
import time
import re
import json
from typing import Dict, Any
from modules.datamodels.datamodelChat import ActionResult, ActionDocument
from modules.serviceCenter import ServiceCenterContext, getService, can_access_service
from modules.serviceCenter.services.serviceSubscription.mainServiceSubscription import SubscriptionInactiveException
from modules.serviceCenter.services.serviceBilling.mainServiceBilling import BillingContextError
logger = logging.getLogger(__name__)
def _build_research_prompt(parameters: Dict[str, Any]) -> str:
"""Assemble the final research prompt from prompt + optional context/documentList."""
base_prompt = (parameters.get("prompt") or "").strip()
context_val = parameters.get("context")
doc_list = parameters.get("documentList")
parts: list[str] = []
# Prepend context string if provided
if context_val and isinstance(context_val, str) and context_val.strip():
parts.append(f"Kontext:\n{context_val.strip()}")
# Extract text from documentList items if provided
if doc_list:
docs: list = []
if isinstance(doc_list, dict):
docs = doc_list.get("documents", []) or doc_list.get("items", [])
elif isinstance(doc_list, list):
docs = doc_list
doc_texts = []
for d in docs:
if isinstance(d, dict):
text = d.get("documentData") or d.get("text") or d.get("content") or ""
if text and isinstance(text, str):
doc_texts.append(text.strip())
if doc_texts:
parts.append("Dokumente:\n" + "\n---\n".join(doc_texts))
parts.append(base_prompt)
return "\n\n".join(p for p in parts if p)
async def webResearch(self, parameters: Dict[str, Any]) -> ActionResult:
    """Run a web research request and return the result as one JSON document.

    The prompt is assembled by ``_build_research_prompt``. When an RBAC object
    is available on ``self.services`` access to the "web" service is checked
    first. The research itself is delegated to the "web" service's
    ``performWebResearch`` and progress is reported via ``self.services.chat``.

    Args:
        parameters: Action parameters. Keys read in this function: ``urlList``,
            ``country``, ``language``, ``researchDepth`` (defaults to
            ``"general"``) and ``parentOperationId``; the prompt builder
            additionally reads ``prompt``, ``context`` and ``documentList``.

    Returns:
        ``ActionResult.isSuccess`` with a single ``ActionDocument`` holding the
        research result, or ``ActionResult.isFailure`` when the prompt is
        empty, access is denied, or any other exception is raised.

    Raises:
        SubscriptionInactiveException, BillingContextError: Re-raised unchanged
            after the progress entry (if one was started) is marked as failed.
    """
    # Stays None until progress tracking has started, so the error handlers
    # know whether there is a progress entry to close.
    operationId = None
    try:
        prompt = _build_research_prompt(parameters)
        if not prompt:
            return ActionResult.isFailure(error="Research prompt is required")

        # RBAC: check service-level permission (skipped when no rbac object exists)
        rbac = getattr(self.services, "rbac", None)
        if rbac and not can_access_service(
            self.services.user,
            rbac,
            "web",
            mandate_id=getattr(self.services, "mandateId", None),
            feature_instance_id=getattr(self.services, "featureInstanceId", None),
        ):
            return ActionResult.isFailure(error="Permission denied: Web research service")

        # Build context for service center
        context = ServiceCenterContext(
            user=self.services.user,
            mandate_id=getattr(self.services, "mandateId", None),
            feature_instance_id=getattr(self.services, "featureInstanceId", None),
            workflow_id=self.services.workflow.id if self.services.workflow else None,
            workflow=self.services.workflow,
        )
        web_service = getService("web", context)

        # Operation id for progress logging; falls back to a timestamp-based
        # placeholder when the action runs outside a workflow.
        workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
        operationId = f"web_research_{workflowId}_{int(time.time())}"

        # Start progress tracking
        parentOperationId = parameters.get('parentOperationId')
        self.services.chat.progressLogStart(
            operationId,
            "Web Research",
            "Searching and Crawling",
            "Extracting URLs and Content",
            parentOperationId=parentOperationId
        )

        # Delegate the research to the web service
        result = await web_service.performWebResearch(
            prompt=prompt,
            urls=parameters.get("urlList", []),
            country=parameters.get("country"),
            language=parameters.get("language"),
            researchDepth=parameters.get("researchDepth", "general"),
            operationId=operationId
        )

        # Complete progress tracking
        self.services.chat.progressLogFinish(operationId, True)

        # Use the filename suggested in the research result only when it is a
        # short string of safe characters. The result is allowed to be a
        # non-dict (see documentData below) and the suggestion may be malformed;
        # neither may turn an already successful research run into a failure,
        # so everything unexpected falls through to the generated name.
        meaningfulName = None
        suggestedFilename = result.get("suggested_filename") if isinstance(result, dict) else None
        if isinstance(suggestedFilename, str):
            # Drop surrounding whitespace/quotes and flatten line breaks
            cleaned = suggestedFilename.strip().strip('"\'')
            cleaned = cleaned.replace('\n', ' ').replace('\r', ' ').strip()
            # The extension is appended below, so remove an existing one
            if cleaned.lower().endswith('.json'):
                cleaned = cleaned[:-5]
            # Validate: reasonable length and only safe characters
            if cleaned and len(cleaned) <= 60 and re.match(r'^[a-zA-Z0-9_\-]+$', cleaned):
                meaningfulName = f"{cleaned}.json"
        if meaningfulName is None:
            # Fallback to generic meaningful filename
            meaningfulName = self._generateMeaningfulFileName(
                base_name="web_research",
                extension="json",
                action_name="research"
            )

        validationMetadata = {
            "actionType": "ai.webResearch",
            "prompt": prompt,
            "urlList": parameters.get("urlList", []),
            "country": parameters.get("country"),
            "language": parameters.get("language"),
            "researchDepth": parameters.get("researchDepth", "general"),
            "resultFormat": "json"
        }

        # Dict results are serialized to JSON text; anything else is passed through as-is
        documentData = json.dumps(result, ensure_ascii=False) if isinstance(result, dict) else result
        actionDocument = ActionDocument(
            documentName=meaningfulName,
            documentData=documentData,
            mimeType="application/json",
            validationMetadata=validationMetadata
        )
        return ActionResult.isSuccess(documents=[actionDocument])
    except (SubscriptionInactiveException, BillingContextError):
        # Subscription/billing errors are not converted into an ActionResult;
        # close the progress entry on a best-effort basis and re-raise.
        try:
            if operationId:
                self.services.chat.progressLogFinish(operationId, False)
        except Exception:
            pass
        raise
    except Exception as e:
        logger.error(f"Error in web research: {str(e)}")
        # Best effort: a failure while closing the progress entry must not
        # mask the original error reported below.
        try:
            if operationId:
                self.services.chat.progressLogFinish(operationId, False)
        except Exception:
            pass
        return ActionResult.isFailure(error=str(e))