gateway/modules/workflows/methods/methodContext/actions/neutralizeData.py
2026-03-03 18:57:20 +01:00

237 lines
11 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
import logging
import time
from typing import Dict, Any
from modules.datamodels.datamodelChat import ActionResult, ActionDocument
from modules.datamodels.datamodelDocref import DocumentReferenceList
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart
logger = logging.getLogger(__name__)
async def neutralizeData(self, parameters: Dict[str, Any]) -> ActionResult:
operationId = None
try:
workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
operationId = f"context_neutralize_{workflowId}_{int(time.time())}"
# Check if neutralization is enabled
neutralizationEnabled = False
try:
config = self.services.neutralization.getConfig()
neutralizationEnabled = config and config.enabled
except Exception as e:
logger.debug(f"Could not check neutralization config: {str(e)}")
if not neutralizationEnabled:
logger.info("Neutralization is not enabled, returning documents unchanged")
# Return original documents if neutralization is disabled
# Get documents from documentList
documentListParam = parameters.get("documentList")
if not documentListParam:
return ActionResult.isFailure(error="documentList is required")
# Convert to DocumentReferenceList if needed
if isinstance(documentListParam, DocumentReferenceList):
documentList = documentListParam
elif isinstance(documentListParam, str):
documentList = DocumentReferenceList.from_string_list([documentListParam])
elif isinstance(documentListParam, list):
documentList = DocumentReferenceList.from_string_list(documentListParam)
else:
return ActionResult.isFailure(error=f"Invalid documentList type: {type(documentListParam)}")
# Get ChatDocuments from documentList
chatDocuments = self.services.chat.getChatDocumentsFromDocumentList(documentList)
if not chatDocuments:
return ActionResult.isFailure(error="No documents found in documentList")
# Return original documents as ActionDocuments
actionDocuments = []
for chatDoc in chatDocuments:
# Extract ContentExtracted from documentData if available
if hasattr(chatDoc, 'documentData') and chatDoc.documentData:
actionDoc = ActionDocument(
documentName=getattr(chatDoc, 'fileName', 'unknown'),
documentData=chatDoc.documentData,
mimeType=getattr(chatDoc, 'mimeType', 'application/json'),
validationMetadata={
"actionType": "context.neutralizeData",
"neutralized": False,
"reason": "Neutralization disabled"
}
)
actionDocuments.append(actionDoc)
return ActionResult.isSuccess(documents=actionDocuments)
# Extract documentList from parameters dict
documentListParam = parameters.get("documentList")
if not documentListParam:
return ActionResult.isFailure(error="documentList is required")
# Convert to DocumentReferenceList if needed
if isinstance(documentListParam, DocumentReferenceList):
documentList = documentListParam
elif isinstance(documentListParam, str):
documentList = DocumentReferenceList.from_string_list([documentListParam])
elif isinstance(documentListParam, list):
documentList = DocumentReferenceList.from_string_list(documentListParam)
else:
return ActionResult.isFailure(error=f"Invalid documentList type: {type(documentListParam)}")
# Start progress tracking
parentOperationId = parameters.get('parentOperationId')
self.services.chat.progressLogStart(
operationId,
"Neutralizing data from documents",
"Data Neutralization",
f"Documents: {len(documentList.references)}",
parentOperationId=parentOperationId
)
# Get ChatDocuments from documentList
self.services.chat.progressLogUpdate(operationId, 0.2, "Loading documents")
chatDocuments = self.services.chat.getChatDocumentsFromDocumentList(documentList)
if not chatDocuments:
self.services.chat.progressLogFinish(operationId, False)
return ActionResult.isFailure(error="No documents found in documentList")
logger.info(f"Neutralizing data from {len(chatDocuments)} documents")
# Process each document
self.services.chat.progressLogUpdate(operationId, 0.3, "Processing documents")
actionDocuments = []
for i, chatDoc in enumerate(chatDocuments):
try:
# Extract ContentExtracted from documentData
if not hasattr(chatDoc, 'documentData') or not chatDoc.documentData:
logger.warning(f"Document {i+1} has no documentData, skipping")
continue
documentData = chatDoc.documentData
# Check if it's a ContentExtracted object
if isinstance(documentData, ContentExtracted):
contentExtracted = documentData
elif isinstance(documentData, dict):
# Try to parse as ContentExtracted
try:
contentExtracted = ContentExtracted(**documentData)
except Exception as e:
logger.warning(f"Document {i+1} documentData is not ContentExtracted: {str(e)}")
continue
else:
logger.warning(f"Document {i+1} documentData is not ContentExtracted or dict")
continue
# Neutralize each ContentPart's data field
neutralizedParts = []
for part in contentExtracted.parts:
if not isinstance(part, ContentPart):
# Try to parse as ContentPart
if isinstance(part, dict):
try:
part = ContentPart(**part)
except Exception as e:
logger.warning(f"Could not parse ContentPart: {str(e)}")
neutralizedParts.append(part)
continue
else:
neutralizedParts.append(part)
continue
# Neutralize the data field if it contains text
if part.data:
try:
self.services.chat.progressLogUpdate(
operationId,
0.3 + (i / len(chatDocuments)) * 0.6,
f"Neutralizing part {len(neutralizedParts) + 1} of document {i+1}"
)
# Call neutralization service
neutralizationResult = self.services.neutralization.processText(part.data)
if neutralizationResult and 'neutralized_text' in neutralizationResult:
# Replace data with neutralized text
neutralizedData = neutralizationResult['neutralized_text']
# Create new ContentPart with neutralized data
neutralizedPart = ContentPart(
id=part.id,
parentId=part.parentId,
label=part.label,
typeGroup=part.typeGroup,
mimeType=part.mimeType,
data=neutralizedData,
metadata=part.metadata.copy() if part.metadata else {}
)
neutralizedParts.append(neutralizedPart)
else:
# Neutralization failed, use original part
logger.warning(f"Neutralization did not return neutralized_text for part {part.id}")
neutralizedParts.append(part)
except Exception as e:
logger.error(f"Error neutralizing part {part.id}: {str(e)}")
# On error, use original part
neutralizedParts.append(part)
else:
# No data to neutralize, keep original part
neutralizedParts.append(part)
# Create neutralized ContentExtracted object
neutralizedContentExtracted = ContentExtracted(
id=contentExtracted.id,
parts=neutralizedParts,
summary=contentExtracted.summary
)
# Create ActionDocument
originalFileName = getattr(chatDoc, 'fileName', f"document_{i+1}.json")
baseName = originalFileName.rsplit('.', 1)[0] if '.' in originalFileName else originalFileName
documentName = f"{baseName}_neutralized_{contentExtracted.id}.json"
validationMetadata = {
"actionType": "context.neutralizeData",
"documentIndex": i,
"extractedId": contentExtracted.id,
"partCount": len(neutralizedParts),
"neutralized": True,
"originalFileName": originalFileName
}
actionDoc = ActionDocument(
documentName=documentName,
documentData=neutralizedContentExtracted,
mimeType="application/json",
validationMetadata=validationMetadata
)
actionDocuments.append(actionDoc)
except Exception as e:
logger.error(f"Error processing document {i+1}: {str(e)}")
# Continue with other documents
continue
if not actionDocuments:
self.services.chat.progressLogFinish(operationId, False)
return ActionResult.isFailure(error="No valid ContentExtracted documents found to neutralize")
self.services.chat.progressLogFinish(operationId, True)
return ActionResult.isSuccess(documents=actionDocuments)
except Exception as e:
logger.error(f"Error in data neutralization: {str(e)}")
try:
if operationId:
self.services.chat.progressLogFinish(operationId, False)
except Exception:
pass
return ActionResult.isFailure(error=str(e))