# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Neutralize Data action for Context operations.
Neutralizes extracted content data from ContentExtracted documents.
"""

import logging
import time
from typing import Any, Dict, List, Optional

from modules.workflows.methods.methodBase import action
from modules.datamodels.datamodelChat import ActionResult, ActionDocument
from modules.datamodels.datamodelDocref import DocumentReferenceList
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart

logger = logging.getLogger(__name__)


def _toDocumentReferenceList(documentListParam: Any) -> Optional[DocumentReferenceList]:
    """Convert the raw ``documentList`` parameter into a DocumentReferenceList.

    Accepts an existing DocumentReferenceList, a single reference string, or a
    list of reference strings. Returns None for any other type so the caller
    can report the invalid type.
    """
    if isinstance(documentListParam, DocumentReferenceList):
        return documentListParam
    if isinstance(documentListParam, str):
        return DocumentReferenceList.from_string_list([documentListParam])
    if isinstance(documentListParam, list):
        return DocumentReferenceList.from_string_list(documentListParam)
    return None


def _buildPassThroughDocuments(chatDocuments) -> List[ActionDocument]:
    """Wrap the documents unchanged (used when neutralization is disabled).

    Documents without ``documentData`` are left out of the result.
    """
    actionDocuments = []
    for chatDoc in chatDocuments:
        documentData = getattr(chatDoc, 'documentData', None)
        if not documentData:
            continue
        actionDocuments.append(
            ActionDocument(
                # `or` also covers attributes that exist but are None
                documentName=getattr(chatDoc, 'fileName', None) or 'unknown',
                documentData=documentData,
                mimeType=getattr(chatDoc, 'mimeType', None) or 'application/json',
                validationMetadata={
                    "actionType": "context.neutralizeData",
                    "neutralized": False,
                    "reason": "Neutralization disabled"
                }
            )
        )
    return actionDocuments


def _coerceContentExtracted(documentData: Any, docNumber: int) -> Optional[ContentExtracted]:
    """Return ``documentData`` as a ContentExtracted, or None if it is not one.

    Dicts are parsed via ``ContentExtracted(**documentData)``; every rejection
    is logged as a warning using the 1-based ``docNumber``.
    """
    if isinstance(documentData, ContentExtracted):
        return documentData
    if isinstance(documentData, dict):
        try:
            return ContentExtracted(**documentData)
        except Exception as e:
            logger.warning(f"Document {docNumber} documentData is not ContentExtracted: {str(e)}")
            return None
    logger.warning(f"Document {docNumber} documentData is not ContentExtracted or dict")
    return None


def _neutralizeParts(services, operationId: str, contentExtracted: ContentExtracted,
                     docIndex: int, docCount: int) -> list:
    """Neutralize the ``data`` of every part of ``contentExtracted``.

    Returns a list with one entry per input part, in order. A part is kept
    as-is when it cannot be parsed as a ContentPart, has no data, the service
    returns no ``neutralized_text``, or neutralization raises.
    """
    neutralizedParts = []
    for part in contentExtracted.parts:
        if not isinstance(part, ContentPart):
            if not isinstance(part, dict):
                neutralizedParts.append(part)
                continue
            try:
                part = ContentPart(**part)
            except Exception as e:
                logger.warning(f"Could not parse ContentPart: {str(e)}")
                neutralizedParts.append(part)
                continue

        if not part.data:
            # No data to neutralize, keep original part
            neutralizedParts.append(part)
            continue

        try:
            # Progress moves from 0.3 to 0.9 across the documents
            services.chat.progressLogUpdate(
                operationId,
                0.3 + (docIndex / docCount) * 0.6,
                f"Neutralizing part {len(neutralizedParts) + 1} of document {docIndex+1}"
            )
            neutralizationResult = services.neutralization.processText(part.data)
            if neutralizationResult and 'neutralized_text' in neutralizationResult:
                # Build a new part so the original object is not mutated
                neutralizedParts.append(
                    ContentPart(
                        id=part.id,
                        parentId=part.parentId,
                        label=part.label,
                        typeGroup=part.typeGroup,
                        mimeType=part.mimeType,
                        data=neutralizationResult['neutralized_text'],
                        metadata=part.metadata.copy() if part.metadata else {}
                    )
                )
            else:
                # Neutralization failed, use original part
                logger.warning(f"Neutralization did not return neutralized_text for part {part.id}")
                neutralizedParts.append(part)
        except Exception as e:
            logger.error(f"Error neutralizing part {part.id}: {str(e)}")
            # On error, use original part
            neutralizedParts.append(part)
    return neutralizedParts


@action
async def neutralizeData(self, parameters: Dict[str, Any]) -> ActionResult:
    """
    Neutralize data from ContentExtracted documents.

    This action takes documents containing ContentExtracted objects (from extractContent)
    and neutralizes the text data in ContentPart.data fields.

    Parameters:
    - documentList (list, required): Document reference(s) containing ContentExtracted objects.

    Returns:
    - ActionResult with ActionDocument containing neutralized ContentExtracted objects
    """
    # Bound before the try so the outer handler can tell whether it was set
    operationId = None
    try:
        # Init progress logger
        workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
        operationId = f"context_neutralize_{workflowId}_{int(time.time())}"

        # Check if neutralization is enabled; a failing config lookup means disabled
        neutralizationEnabled = False
        try:
            config = self.services.neutralization.getConfig()
            neutralizationEnabled = bool(config and config.enabled)
        except Exception as e:
            logger.debug(f"Could not check neutralization config: {str(e)}")

        # Validate and convert the documentList parameter (shared by both paths)
        documentListParam = parameters.get("documentList")
        if not documentListParam:
            return ActionResult.isFailure(error="documentList is required")

        documentList = _toDocumentReferenceList(documentListParam)
        if documentList is None:
            return ActionResult.isFailure(error=f"Invalid documentList type: {type(documentListParam)}")

        if not neutralizationEnabled:
            # Return original documents if neutralization is disabled
            logger.info("Neutralization is not enabled, returning documents unchanged")
            chatDocuments = self.services.chat.getChatDocumentsFromDocumentList(documentList)
            if not chatDocuments:
                return ActionResult.isFailure(error="No documents found in documentList")
            return ActionResult.isSuccess(documents=_buildPassThroughDocuments(chatDocuments))

        # Start progress tracking
        parentOperationId = parameters.get('parentOperationId')
        self.services.chat.progressLogStart(
            operationId,
            "Neutralizing data from documents",
            "Data Neutralization",
            f"Documents: {len(documentList.references)}",
            parentOperationId=parentOperationId
        )

        # Get ChatDocuments from documentList
        self.services.chat.progressLogUpdate(operationId, 0.2, "Loading documents")
        chatDocuments = self.services.chat.getChatDocumentsFromDocumentList(documentList)
        if not chatDocuments:
            self.services.chat.progressLogFinish(operationId, False)
            return ActionResult.isFailure(error="No documents found in documentList")

        logger.info(f"Neutralizing data from {len(chatDocuments)} documents")

        # Process each document; a failure in one document does not stop the others
        self.services.chat.progressLogUpdate(operationId, 0.3, "Processing documents")
        actionDocuments = []
        for i, chatDoc in enumerate(chatDocuments):
            try:
                documentData = getattr(chatDoc, 'documentData', None)
                if not documentData:
                    logger.warning(f"Document {i+1} has no documentData, skipping")
                    continue

                contentExtracted = _coerceContentExtracted(documentData, i + 1)
                if contentExtracted is None:
                    continue

                neutralizedParts = _neutralizeParts(
                    self.services, operationId, contentExtracted, i, len(chatDocuments)
                )

                # Create neutralized ContentExtracted object
                neutralizedContentExtracted = ContentExtracted(
                    id=contentExtracted.id,
                    parts=neutralizedParts,
                    summary=contentExtracted.summary
                )

                # `or` also covers a fileName attribute that exists but is None,
                # which would otherwise raise on the '.' membership test below
                originalFileName = getattr(chatDoc, 'fileName', None) or f"document_{i+1}.json"
                baseName = originalFileName.rsplit('.', 1)[0] if '.' in originalFileName else originalFileName
                documentName = f"{baseName}_neutralized_{contentExtracted.id}.json"

                validationMetadata = {
                    "actionType": "context.neutralizeData",
                    "documentIndex": i,
                    "extractedId": contentExtracted.id,
                    "partCount": len(neutralizedParts),
                    "neutralized": True,
                    "originalFileName": originalFileName
                }

                actionDocuments.append(
                    ActionDocument(
                        documentName=documentName,
                        documentData=neutralizedContentExtracted,
                        mimeType="application/json",
                        validationMetadata=validationMetadata
                    )
                )
            except Exception as e:
                logger.error(f"Error processing document {i+1}: {str(e)}")
                # Continue with other documents
                continue

        if not actionDocuments:
            self.services.chat.progressLogFinish(operationId, False)
            return ActionResult.isFailure(error="No valid ContentExtracted documents found to neutralize")

        self.services.chat.progressLogFinish(operationId, True)
        return ActionResult.isSuccess(documents=actionDocuments)

    except Exception as e:
        logger.error(f"Error in data neutralization: {str(e)}")
        # Complete progress tracking with failure; best-effort only, so a
        # progress logging error must not mask the original failure
        if operationId is not None:
            try:
                self.services.chat.progressLogFinish(operationId, False)
            except Exception as finishError:
                logger.debug(f"Could not finish progress log: {str(finishError)}")
        return ActionResult.isFailure(error=str(e))