460 lines
22 KiB
Python
460 lines
22 KiB
Python
# Copyright (c) 2025 Patrick Motsch
|
|
# All rights reserved.
|
|
"""
|
|
Context and workflow information method module.
|
|
Handles workflow context queries and document indexing.
|
|
"""
|
|
|
|
import time
|
|
import json
|
|
import logging
|
|
import aiohttp
|
|
from typing import Dict, Any, List
|
|
from datetime import datetime, UTC
|
|
|
|
from modules.workflows.methods.methodBase import MethodBase, action
|
|
from modules.datamodels.datamodelChat import ActionResult, ActionDocument
|
|
from modules.datamodels.datamodelExtraction import ExtractionOptions, MergeStrategy
|
|
from modules.shared.configuration import APP_CONFIG
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
class MethodContext(MethodBase):
    """Context and workflow information methods."""

    def __init__(self, services):
        super().__init__(services)
        self.name = "context"
        self.description = "Context and workflow information methods"

    @action
    async def getDocumentIndex(self, parameters: Dict[str, Any]) -> ActionResult:
        """
        GENERAL:
        - Purpose: Generate a comprehensive index of all documents available in the current workflow, including documents from all rounds and tasks.
        - Input requirements: No input documents required. Optional resultType parameter.
        - Output format: Structured document index in JSON format (default) or text format, listing all documents with their references, metadata, and organization by rounds/tasks.

        Parameters:
        - resultType (str, optional): Output format (json, txt, md). Default: json.
        """
        supportedResultTypes = ("json", "txt", "md")
        # Values of the chat-service index that this action treats as "no documents".
        noDocumentMarkers = (
            "No documents available",
            "NO DOCUMENTS AVAILABLE - This workflow has no documents to process.",
        )

        try:
            workflow = self.services.workflow
            if not workflow:
                return ActionResult.isFailure(
                    error="No workflow available"
                )

            # `or "json"` also covers an explicit None / empty value, which would
            # otherwise crash on .lower().
            requestedType = (parameters or {}).get("resultType") or "json"
            resultType = str(requestedType).lower().strip().lstrip('.')
            if resultType not in supportedResultTypes:
                # Keep content, file extension and metadata consistent: an unknown
                # format falls back to the documented default instead of producing
                # text content under a .json filename.
                logger.warning(
                    "Unsupported resultType %r for getDocumentIndex, falling back to json",
                    requestedType,
                )
                resultType = "json"

            # Get available documents index from chat service
            documentsIndex = self.services.chat.getAvailableDocuments(workflow)
            hasDocuments = bool(documentsIndex) and documentsIndex not in noDocumentMarkers

            if hasDocuments:
                # Parse the document index string to extract structured information
                indexData = self._parseDocumentIndex(documentsIndex, workflow)

                if resultType == "json":
                    indexContent = json.dumps(indexData, indent=2, ensure_ascii=False)
                elif resultType == "md":
                    indexContent = self._formatAsMarkdown(indexData)
                else:  # txt
                    indexContent = self._formatAsText(indexData, documentsIndex)
            else:
                # Always build the empty structure: it is needed below for the
                # validation metadata regardless of the output format.
                indexData = {
                    "workflowId": getattr(workflow, 'id', 'unknown'),
                    "totalDocuments": 0,
                    "rounds": [],
                    "documentReferences": []
                }
                if resultType == "json":
                    indexContent = json.dumps(indexData, indent=2, ensure_ascii=False)
                else:
                    indexContent = "Document Index\n==============\n\nNo documents available in this workflow.\n"

            # Generate meaningful filename
            workflowContext = self.services.chat.getWorkflowContext()
            filename = self._generateMeaningfulFileName(
                "document_index",
                resultType,
                workflowContext,
                "getDocumentIndex"
            )

            validationMetadata = {
                "actionType": "context.getDocumentIndex",
                "resultType": resultType,
                "workflowId": getattr(workflow, 'id', 'unknown'),
                "totalDocuments": indexData.get("totalDocuments", 0) if isinstance(indexData, dict) else 0
            }

            # Create ActionDocument
            document = ActionDocument(
                documentName=filename,
                documentData=indexContent,
                mimeType="application/json" if resultType == "json" else "text/plain",
                validationMetadata=validationMetadata
            )

            return ActionResult.isSuccess(documents=[document])

        except Exception as e:
            logger.error(f"Error generating document index: {str(e)}")
            return ActionResult.isFailure(
                error=f"Failed to generate document index: {str(e)}"
            )

    def _parseDocumentIndex(self, documentsIndex: str, workflow: Any) -> Dict[str, Any]:
        """Parse the document index string into structured data.

        The index is read line by line. Lines containing
        "Current round documents:" / "Past rounds documents:" switch the active
        round, "- docList:<ref>" lines open a new document list, and
        "- docItem:<ref>" lines add a document to the active list.

        Args:
            documentsIndex: Index text to parse.
            workflow: Workflow object; only its ``id`` attribute is read.

        Returns:
            Dict with ``workflowId``, ``generatedAt``, ``totalDocuments``,
            ``rounds`` and ``documentReferences``. If parsing raises, a dict
            with ``workflowId``, ``error`` and ``rawIndex`` is returned instead.
        """
        docListPrefix = "- docList:"
        docItemPrefix = "- docItem:"

        try:
            indexData = {
                "workflowId": getattr(workflow, 'id', 'unknown'),
                "generatedAt": datetime.now(UTC).isoformat(),
                "totalDocuments": 0,
                "rounds": [],
                "documentReferences": []
            }

            currentRound = None
            currentDocList = None

            for rawLine in documentsIndex.split('\n'):
                # Stripping first means indented and unindented markers look the same.
                line = rawLine.strip()
                if not line:
                    continue

                # Check for round headers
                if "Current round documents:" in line:
                    currentRound = "current"
                    continue
                if "Past rounds documents:" in line:
                    currentRound = "past"
                    continue

                # Check for document list references (docList:...)
                if line.startswith(docListPrefix):
                    currentDocList = {
                        "reference": line.removeprefix(docListPrefix).strip(),
                        "round": currentRound,
                        "documents": []
                    }
                    indexData["rounds"].append(currentDocList)
                    continue

                # Check for individual document references (docItem:...)
                if line.startswith(docItemPrefix):
                    # removeprefix only drops the leading marker, so a reference that
                    # happens to contain the marker text is kept intact.
                    docItemRef = line.removeprefix(docItemPrefix).strip()
                    indexData["documentReferences"].append({
                        "reference": docItemRef,
                        "round": currentRound,
                        "docList": currentDocList["reference"] if currentDocList else None
                    })
                    indexData["totalDocuments"] += 1
                    if currentDocList:
                        currentDocList["documents"].append(docItemRef)

            return indexData

        except Exception as e:
            logger.error(f"Error parsing document index: {str(e)}")
            return {
                "workflowId": getattr(workflow, 'id', 'unknown'),
                "error": f"Failed to parse document index: {str(e)}",
                "rawIndex": documentsIndex
            }

    def _formatAsMarkdown(self, indexData: Dict[str, Any]) -> str:
        """Format document index as Markdown.

        Args:
            indexData: Structure as produced by ``_parseDocumentIndex``.

        Returns:
            Markdown text with a header section, one section per document list
            and a flat list of all document references. On a formatting error a
            short Markdown error stub is returned.
        """
        try:
            parts = [
                "# Document Index\n\n",
                f"**Workflow ID:** {indexData.get('workflowId', 'unknown')}\n\n",
                f"**Generated At:** {indexData.get('generatedAt', 'unknown')}\n\n",
                f"**Total Documents:** {indexData.get('totalDocuments', 0)}\n\n",
            ]

            if indexData.get('rounds'):
                parts.append("## Documents by Round\n\n")
                for roundInfo in indexData['rounds']:
                    # The parser stores round=None for lists that appear before any
                    # round header, so the key can exist with a None value.
                    roundLabel = str(roundInfo.get('round') or 'unknown').title()
                    parts.append(f"### {roundLabel} Round\n\n")
                    parts.append(f"**Document List:** `{roundInfo.get('reference', 'unknown')}`\n\n")
                    if roundInfo.get('documents'):
                        parts.append("**Documents:**\n\n")
                        parts.extend(f"- `{docRef}`\n" for docRef in roundInfo['documents'])
                        parts.append("\n")

            if indexData.get('documentReferences'):
                parts.append("## All Document References\n\n")
                parts.extend(
                    f"- `{docRef.get('reference', 'unknown')}`\n"
                    for docRef in indexData['documentReferences']
                )

            return "".join(parts)

        except Exception as e:
            logger.error(f"Error formatting as Markdown: {str(e)}")
            return f"# Document Index\n\nError formatting index: {str(e)}\n"

    def _formatAsText(self, indexData: Dict[str, Any], rawIndex: str) -> str:
        """Format document index as plain text.

        Args:
            indexData: Structure as produced by ``_parseDocumentIndex``.
            rawIndex: Original index text; appended unchanged after the header.

        Returns:
            Header lines followed by ``rawIndex``. On a formatting error a short
            error text including the raw index is returned.
        """
        try:
            header = [
                "Document Index\n",
                "=" * 50 + "\n\n",
                f"Workflow ID: {indexData.get('workflowId', 'unknown')}\n",
                f"Generated At: {indexData.get('generatedAt', 'unknown')}\n",
                f"Total Documents: {indexData.get('totalDocuments', 0)}\n\n",
            ]

            # Include the raw formatted index for readability
            return "".join(header) + rawIndex

        except Exception as e:
            logger.error(f"Error formatting as text: {str(e)}")
            return f"Document Index\n\nError formatting index: {str(e)}\n\nRaw index:\n{rawIndex}\n"

    @action
    async def extractContent(self, parameters: Dict[str, Any]) -> ActionResult:
        """
        Extract content from documents (separate from AI calls).

        This action performs pure content extraction without AI processing.
        The extracted ContentParts can then be used by subsequent AI processing actions.

        Parameters:
        - documentList (list, required): Document reference(s) to extract content from.
        - extractionOptions (dict, optional): Extraction options (if not provided, defaults are used).

        Returns:
        - ActionResult with ActionDocument containing ContentExtracted objects
        - ContentExtracted.parts contains List[ContentPart] (already chunked if needed)
        """
        # Bound before the try block so the failure handler can never hit an
        # unbound name, and so it only closes a progress log that was opened.
        operationId = None
        progressStarted = False

        try:
            # Init progress logger
            workflow = self.services.workflow
            workflowId = workflow.id if workflow else f"no-workflow-{int(time.time())}"
            operationId = f"context_extract_{workflowId}_{int(time.time())}"

            # Extract documentList from parameters dict
            from modules.datamodels.datamodelDocref import DocumentReferenceList
            documentListParam = parameters.get("documentList")
            if not documentListParam:
                return ActionResult.isFailure(error="documentList is required")

            # Convert to DocumentReferenceList if needed
            if isinstance(documentListParam, DocumentReferenceList):
                documentList = documentListParam
            elif isinstance(documentListParam, str):
                documentList = DocumentReferenceList.from_string_list([documentListParam])
            elif isinstance(documentListParam, list):
                documentList = DocumentReferenceList.from_string_list(documentListParam)
            else:
                return ActionResult.isFailure(error=f"Invalid documentList type: {type(documentListParam)}")

            # Start progress tracking
            parentOperationId = parameters.get('parentOperationId')
            self.services.chat.progressLogStart(
                operationId,
                "Extracting content from documents",
                "Content Extraction",
                f"Documents: {len(documentList.references)}",
                parentOperationId=parentOperationId
            )
            progressStarted = True

            # Get ChatDocuments from documentList
            self.services.chat.progressLogUpdate(operationId, 0.2, "Loading documents")
            chatDocuments = self.services.chat.getChatDocumentsFromDocumentList(documentList)

            if not chatDocuments:
                self.services.chat.progressLogFinish(operationId, False)
                return ActionResult.isFailure(error="No documents found in documentList")

            logger.info(f"Extracting content from {len(chatDocuments)} documents")

            # Prepare extraction options
            self.services.chat.progressLogUpdate(operationId, 0.3, "Preparing extraction options")
            extractionOptionsParam = parameters.get("extractionOptions")

            # Convert dict to ExtractionOptions object if needed, or create defaults
            extractionOptions = None
            if isinstance(extractionOptionsParam, ExtractionOptions):
                extractionOptions = extractionOptionsParam
            elif isinstance(extractionOptionsParam, dict):
                if extractionOptionsParam:
                    extractionOptions = ExtractionOptions(**extractionOptionsParam)
            elif extractionOptionsParam:
                # Invalid type: fall back to defaults, but leave a trace in the log
                logger.warning(
                    "Ignoring extractionOptions of unsupported type %s, using defaults",
                    type(extractionOptionsParam),
                )

            # If extractionOptions not provided, create defaults
            if not extractionOptions:
                # Default extraction options for pure content extraction (no AI processing)
                extractionOptions = ExtractionOptions(
                    prompt="Extract all content from the document",
                    mergeStrategy=MergeStrategy(
                        mergeType="concatenate",
                        groupBy="typeGroup",
                        orderBy="id"
                    ),
                    processDocumentsIndividually=True
                )

            # Call extraction service with hierarchical progress logging
            self.services.chat.progressLogUpdate(operationId, 0.4, "Initiating")
            self.services.chat.progressLogUpdate(operationId, 0.5, f"Extracting content from {len(chatDocuments)} documents")
            # Pass operationId for hierarchical per-document progress logging
            extractedResults = self.services.extraction.extractContent(chatDocuments, extractionOptions, operationId=operationId)

            # Build ActionDocuments from ContentExtracted results
            self.services.chat.progressLogUpdate(operationId, 0.8, "Building result documents")
            actionDocuments = []
            # Map extracted results back to original documents by index (results are in same order)
            for i, extracted in enumerate(extractedResults):
                # Get original document name if available
                originalDoc = chatDocuments[i] if i < len(chatDocuments) else None
                originalFileName = getattr(originalDoc, 'fileName', None) if originalDoc else None

                if originalFileName:
                    # Use original filename (without its last extension) plus "_extracted_" suffix
                    baseName = originalFileName.rsplit('.', 1)[0]
                    documentName = f"{baseName}_extracted_{extracted.id}.json"
                else:
                    # Fallback to generic name with index
                    documentName = f"document_{i+1:03d}_extracted_{extracted.id}.json"

                # Store ContentExtracted object in ActionDocument.documentData
                validationMetadata = {
                    "actionType": "context.extractContent",
                    "documentIndex": i,
                    "extractedId": extracted.id,
                    "partCount": len(extracted.parts) if extracted.parts else 0,
                    "originalFileName": originalFileName
                }
                actionDocuments.append(ActionDocument(
                    documentName=documentName,
                    documentData=extracted,  # ContentExtracted object
                    mimeType="application/json",
                    validationMetadata=validationMetadata
                ))

            self.services.chat.progressLogFinish(operationId, True)

            return ActionResult.isSuccess(documents=actionDocuments)

        except Exception as e:
            logger.error(f"Error in content extraction: {str(e)}")

            # Complete progress tracking with failure, but only if it was started
            if progressStarted:
                try:
                    self.services.chat.progressLogFinish(operationId, False)
                except Exception as progressError:
                    # Don't fail on progress logging errors
                    logger.debug("Could not finish progress log %s: %s", operationId, progressError)

            return ActionResult.isFailure(error=str(e))

    @action
    async def triggerPreprocessingServer(self, parameters: Dict[str, Any]) -> ActionResult:
        """
        Trigger preprocessing server at customer tenant to update database with configuration.

        This action makes a POST request to the preprocessing server endpoint with the provided
        configuration JSON. The authorization secret is retrieved from APP_CONFIG using the provided config key.

        Parameters:
        - endpoint (str, required): The full URL endpoint for the preprocessing server API.
        - configJson (dict or str, required): Configuration JSON object to send to the preprocessing server. Can be provided as a dict or as a JSON string that will be parsed.
        - authSecretConfigKey (str, required): The APP_CONFIG key name to retrieve the authorization secret from.

        Returns:
        - ActionResult with ActionDocument containing "ok" on success, or error message on failure.
        """
        try:
            endpoint = parameters.get("endpoint")
            if not endpoint:
                return ActionResult.isFailure(error="endpoint parameter is required")

            configJsonParam = parameters.get("configJson")
            if not configJsonParam:
                return ActionResult.isFailure(error="configJson parameter is required")

            authSecretConfigKey = parameters.get("authSecretConfigKey")
            if not authSecretConfigKey:
                return ActionResult.isFailure(error="authSecretConfigKey parameter is required")

            # Handle configJson as either dict or JSON string
            if isinstance(configJsonParam, str):
                try:
                    configJson = json.loads(configJsonParam)
                except json.JSONDecodeError as e:
                    return ActionResult.isFailure(error=f"configJson is not valid JSON: {str(e)}")
            elif isinstance(configJsonParam, dict):
                configJson = configJsonParam
            else:
                return ActionResult.isFailure(error=f"configJson must be a dict or JSON string, got {type(configJsonParam)}")

            # Get authorization secret from APP_CONFIG using the provided config key
            authSecret = APP_CONFIG.get(authSecretConfigKey)
            if not authSecret:
                errorMsg = f"{authSecretConfigKey} not found in APP_CONFIG"
                logger.error(errorMsg)
                return ActionResult.isFailure(error=errorMsg)

            # Prepare headers with authorization (default headers as in original function)
            headers = {
                "X-PP-API-Key": authSecret,
                "Content-Type": "application/json"
            }

            # Make POST request; the body is read inside the context managers so the
            # connection can be released before the result document is built.
            timeout = aiohttp.ClientTimeout(total=60)
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.post(
                    endpoint,
                    headers=headers,
                    json=configJson
                ) as response:
                    statusCode = response.status
                    responseText = await response.text()

            if statusCode not in (200, 201):
                errorMsg = f"Preprocessing server trigger failed: {statusCode} - {responseText}"
                logger.error(errorMsg)
                return ActionResult.isFailure(error=errorMsg)

            logger.info(f"Preprocessing server trigger successful: {statusCode}")
            logger.debug(f"Response: {responseText}")

            # Generate meaningful filename
            workflowContext = self.services.chat.getWorkflowContext() if hasattr(self.services, 'chat') else None
            filename = self._generateMeaningfulFileName(
                "preprocessing_result",
                "txt",
                workflowContext,
                "triggerPreprocessingServer"
            )

            # Create validation metadata
            validationMetadata = self._createValidationMetadata(
                "triggerPreprocessingServer",
                endpoint=endpoint,
                statusCode=statusCode,
                responseText=responseText
            )

            # Return success with "ok" document
            document = ActionDocument(
                documentName=filename,
                documentData="ok",
                mimeType="text/plain",
                validationMetadata=validationMetadata
            )

            return ActionResult.isSuccess(documents=[document])

        except Exception as e:
            errorMsg = f"Error triggering preprocessing server: {str(e)}"
            logger.error(errorMsg)
            return ActionResult.isFailure(error=errorMsg)
|
|
|