# Copyright (c) 2025 Patrick Motsch
# All rights reserved.

import base64 as _b64
import logging
import time
from typing import Any, Dict

from modules.datamodels.datamodelChat import ActionResult, ActionDocument
from modules.datamodels.datamodelDocref import coerceDocumentReferenceList
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart

from .extractContent import _one_file_bucket

logger = logging.getLogger(__name__)

HANDOVER_KIND = "context.extractContent.handover.v1"
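
# Shape of the extractContent handover envelope, as read by this module.
# Field names are inferred from the parsing in neutralizeData below; the
# authoritative schema is whatever context.extractContent emits.
#
#   {
#       "kind": "context.extractContent.handover.v1",
#       "files": {
#           "<fileKey>": {
#               "extractedId": "...",
#               "sourceFileName": "...",
#               "parts": [<ContentPart as dict>, ...],
#               "summary": {...},
#           },
#       },
#   }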


async def _neutralize_one_content_extracted(
    *,
    svc,
    content_extracted: ContentExtracted,
    operation_id: str,
    chat_doc_slot: int,
    chat_documents_len: int,
) -> ContentExtracted:
    """Neutralize every part inside a ContentExtracted (copied semantics from legacy inline loop)."""
    neutralized_parts = []
    for part in content_extracted.parts:
        # Coerce dict parts into ContentPart; anything unparseable passes
        # through untouched (legacy behavior).
        if not isinstance(part, ContentPart):
            if isinstance(part, dict):
                try:
                    part = ContentPart(**part)
                except Exception as e:
                    logger.warning(f"Could not parse ContentPart: {str(e)}")
                    neutralized_parts.append(part)
                    continue
            else:
                neutralized_parts.append(part)
                continue
        _type_group = getattr(part, "typeGroup", "") or ""
        prog = 0.3 + (chat_doc_slot / max(1, chat_documents_len)) * 0.6
        if _type_group == "image" and part.data:
            # Images pass the PII scan unchanged or are dropped entirely (fail-safe).
            try:
                svc.services.chat.progressLogUpdate(
                    operation_id,
                    prog,
                    f"Checking image part {len(neutralized_parts) + 1}",
                )
                _img_bytes = _b64.b64decode(str(part.data))
                _img_result = await svc.services.neutralization.processImageAsync(_img_bytes, f"part_{part.id}")
                if _img_result.get("status") == "ok":
                    neutralized_parts.append(part)
                else:
                    logger.warning("Fail-Safe: Image part %s blocked (PII), SKIPPING", part.id)
            except Exception as _img_err:
                logger.error(f"Fail-Safe: Image check failed for part {part.id}: {_img_err}, SKIPPING")
        elif part.data:
            # Text parts are rewritten with the neutralized text; a part is
            # never passed through un-neutralized (fail-safe).
            try:
                svc.services.chat.progressLogUpdate(
                    operation_id,
                    prog,
                    f"Neutralizing part {len(neutralized_parts) + 1}",
                )
                neut_res = await svc.services.neutralization.processTextAsync(part.data)
                if neut_res and "neutralized_text" in neut_res:
                    neutral_data = neut_res["neutralized_text"]
                    neutralized_parts.append(
                        ContentPart(
                            id=part.id,
                            parentId=part.parentId,
                            label=part.label,
                            typeGroup=part.typeGroup,
                            mimeType=part.mimeType,
                            data=neutral_data,
                            metadata=part.metadata.copy() if part.metadata else {},
                        )
                    )
                else:
                    logger.warning(
                        "Fail-Safe: Neutralization incomplete for part %s, SKIPPING (not passing original)",
                        part.id,
                    )
                    continue
            except Exception as e:
                logger.error(f"Fail-Safe: Error neutralizing part {part.id}: {str(e)}, SKIPPING")
                continue
        else:
            # Parts without data carry nothing to neutralize.
            neutralized_parts.append(part)
    return ContentExtracted(
        id=content_extracted.id,
        parts=neutralized_parts,
        summary=content_extracted.summary,
    )


async def neutralizeData(self, parameters: Dict[str, Any]) -> ActionResult:
    """Neutralize PII in the documents referenced by parameters["documentList"].

    Handles three payload kinds per document: image sidecars from
    context.extractContent (passed through), the unified handover bundle
    (HANDOVER_KIND), and legacy per-document ContentExtracted payloads.
    """
    operation_id = None
    try:
        workflow_id = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
        operation_id = f"context_neutralize_{workflow_id}_{int(time.time())}"

        neutralization_enabled = False
        try:
            config = self.services.neutralization.getConfig()
            neutralization_enabled = bool(config and config.enabled)
        except Exception as e:
            logger.debug(f"Could not check neutralization config: {str(e)}")

        if not neutralization_enabled:
            # Pass documents through unchanged, but flag them as not neutralized.
            logger.info("Neutralization is not enabled, returning documents unchanged")
            document_list_param = parameters.get("documentList")
            if not document_list_param:
                return ActionResult.isFailure(error="documentList is required")
            doc_list = coerceDocumentReferenceList(document_list_param)
            if not doc_list.references:
                return ActionResult.isFailure(error="documentList invalid (empty)")
            chat_docs = self.services.chat.getChatDocumentsFromDocumentList(doc_list)
            if not chat_docs:
                return ActionResult.isFailure(error="No documents found in documentList")
            action_documents = []
            for chat_doc in chat_docs:
                if hasattr(chat_doc, "documentData") and chat_doc.documentData:
                    action_documents.append(
                        ActionDocument(
                            documentName=getattr(chat_doc, "fileName", "unknown"),
                            documentData=chat_doc.documentData,
                            mimeType=getattr(chat_doc, "mimeType", "application/json"),
                            validationMetadata={
                                "actionType": "context.neutralizeData",
                                "neutralized": False,
                                "reason": "Neutralization disabled",
                            },
                        )
                    )
            return ActionResult.isSuccess(documents=action_documents)

        document_list_param = parameters.get("documentList")
        if not document_list_param:
            return ActionResult.isFailure(error="documentList is required")
        doc_list = coerceDocumentReferenceList(document_list_param)
        if not doc_list.references:
            return ActionResult.isFailure(error="documentList invalid (empty)")

        parent_operation_id = parameters.get("parentOperationId")
        self.services.chat.progressLogStart(
            operation_id,
            "Neutralizing data from documents",
            "Data Neutralization",
            f"Documents: {len(doc_list.references)}",
            parentOperationId=parent_operation_id,
        )
        self.services.chat.progressLogUpdate(operation_id, 0.2, "Loading documents")
        chat_documents = self.services.chat.getChatDocumentsFromDocumentList(doc_list)
        if not chat_documents:
            self.services.chat.progressLogFinish(operation_id, False)
            return ActionResult.isFailure(error="No documents found in documentList")

        logger.info(f"Neutralizing data from {len(chat_documents)} document(s)")
        self.services.chat.progressLogUpdate(operation_id, 0.3, "Processing documents")

        action_documents = []
        for i, chat_doc in enumerate(chat_documents):
            try:
                dd = getattr(chat_doc, "documentData", None)
                if not dd:
                    logger.warning(f"Document {i + 1} has no documentData, skipping")
                    continue
                fn = str(getattr(chat_doc, "fileName", "") or "")
                mime_guess = str(getattr(chat_doc, "mimeType", "") or "").lower()
                # Media sidecars produced by context.extractContent are passed
                # through untouched; the image check happens on the bundle parts.
                if (
                    mime_guess.startswith("image/")
                    and fn.startswith("extract_media_")
                    and not (isinstance(dd, dict) and dd.get("kind") == HANDOVER_KIND)
                ):
                    action_documents.append(
                        ActionDocument(
                            documentName=fn or f"media_{i + 1}",
                            documentData=dd,
                            mimeType=mime_guess or "application/octet-stream",
                            validationMetadata={
                                "actionType": "context.neutralizeData",
                                "neutralized": False,
                                "reason": "extractContent_media_sidecar_pass_through",
                            },
                        )
                    )
                    continue

                # --- Unified JSON envelope from context.extractContent (v1) ---
                if isinstance(dd, dict) and dd.get("kind") == HANDOVER_KIND:
                    bundle = dict(dd)
                    files_section = dd.get("files") or {}
                    new_files = {}
                    for fk, bucket in files_section.items():
                        if not isinstance(bucket, dict):
                            continue
                        parts_raw = bucket.get("parts") or []
                        parsed_parts = []
                        for pd in parts_raw:
                            parsed_parts.append(ContentPart(**pd) if isinstance(pd, dict) else pd)
                        summary = bucket.get("summary") or {}
                        if hasattr(summary, "model_dump"):
                            summary = summary.model_dump(mode="json")
                        ce = ContentExtracted(
                            id=str(bucket.get("extractedId") or ""),
                            parts=parsed_parts,
                            summary=summary if isinstance(summary, dict) else {},
                        )
                        ce_out = await _neutralize_one_content_extracted(
                            svc=self,
                            content_extracted=ce,
                            operation_id=operation_id,
                            chat_doc_slot=i,
                            chat_documents_len=max(len(chat_documents), 1),
                        )
                        new_files[fk] = _one_file_bucket(ce_out, str(bucket.get("sourceFileName") or fk))
                    bundle["files"] = new_files
                    original_filename = getattr(chat_doc, "fileName", f"neutralized_bundle_{workflow_id}.json")
                    bn = original_filename.rsplit(".", 1)[0] if "." in original_filename else original_filename
                    action_documents.append(
                        ActionDocument(
                            documentName=f"{bn}_neutralized.json",
                            documentData=bundle,
                            mimeType="application/json",
                            validationMetadata={
                                "actionType": "context.neutralizeData",
                                "neutralized": True,
                                "handoverKind": HANDOVER_KIND,
                                "bundleFileCount": len(new_files),
                            },
                        )
                    )
                    continue

                # --- Legacy ContentExtracted per persisted document ---
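                # Legacy payloads arrive either as a ContentExtracted instance
                # or as its dict form; the minimal dict shape this branch parses
                # (field names taken from the ContentExtracted constructor) is:
                #   {"id": "...", "parts": [<ContentPart dict>, ...], "summary": {...}}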
                if isinstance(dd, ContentExtracted):
                    content_extracted = dd
                elif isinstance(dd, dict):
                    try:
                        content_extracted = ContentExtracted(**dd)
                    except Exception:
                        logger.warning(f"Document {i + 1} documentData cannot be parsed as ContentExtracted dict")
                        continue
                else:
                    logger.warning(f"Document {i + 1} documentData is not supported")
                    continue

                neut_out = await _neutralize_one_content_extracted(
                    svc=self,
                    content_extracted=content_extracted,
                    operation_id=operation_id,
                    chat_doc_slot=i,
                    chat_documents_len=max(len(chat_documents), 1),
                )
                original_file_name = getattr(chat_doc, "fileName", f"document_{i + 1}.json")
                base_name = original_file_name.rsplit(".", 1)[0] if "." in original_file_name else original_file_name
                document_name = f"{base_name}_neutralized_{neut_out.id}.json"
                action_documents.append(
                    ActionDocument(
                        documentName=document_name,
                        documentData=neut_out,
                        mimeType="application/json",
                        validationMetadata={
                            "actionType": "context.neutralizeData",
                            "documentIndex": i,
                            "extractedId": neut_out.id,
                            "partCount": len(neut_out.parts),
                            "neutralized": True,
                            "originalFileName": original_file_name,
                        },
                    )
                )
            except Exception as e:
                logger.error(f"Error processing document {i + 1}: {str(e)}")
                continue

        if not action_documents:
            self.services.chat.progressLogFinish(operation_id, False)
            return ActionResult.isFailure(error="No valid documents found to neutralize")

        self.services.chat.progressLogFinish(operation_id, True)
        return ActionResult.isSuccess(documents=action_documents)

    except Exception as e:
        logger.error(f"Error in data neutralization: {str(e)}")
        try:
            if operation_id:
                self.services.chat.progressLogFinish(operation_id, False)
        except Exception:
            pass
        return ActionResult.isFailure(error=str(e))
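

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the module): how an action host
# might invoke neutralizeData. The reference shape accepted by
# coerceDocumentReferenceList and the exact service wiring are assumptions for
# this example, not a documented contract.
#
#   result = await neutralizeData(
#       action_host,  # hypothetical object exposing .services.chat,
#                     # .services.neutralization and .services.workflow
#       {
#           "documentList": {"references": [{"documentId": "doc-123"}]},
#           "parentOperationId": "op-parent-1",
#       },
#   )
# ---------------------------------------------------------------------------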