# gateway/modules/workflows/methods/methodContext/actions/neutralizeData.py
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
import base64 as _b64
import logging
import time
from typing import Any, Dict
from modules.datamodels.datamodelChat import ActionResult, ActionDocument
from modules.datamodels.datamodelDocref import coerceDocumentReferenceList
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart
from .extractContent import _one_file_bucket

logger = logging.getLogger(__name__)

HANDOVER_KIND = "context.extractContent.handover.v1"
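
# Shape of the context.extractContent handover envelope, as consumed below
# (a sketch inferred from the parsing code; only the fields read here are
# listed, and anything else in the envelope is preserved untouched):
#
#     {
#         "kind": "context.extractContent.handover.v1",
#         "files": {
#             "<fileKey>": {
#                 "extractedId": "...",
#                 "sourceFileName": "...",
#                 "parts": [<ContentPart dicts>],
#                 "summary": {...},
#             },
#         },
#     }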


async def _neutralize_one_content_extracted(
*,
svc,
content_extracted: ContentExtracted,
operation_id: str,
chat_doc_slot: int,
chat_documents_len: int,
) -> ContentExtracted:
"""Neutralize every part inside a ContentExtracted (copied semantics from legacy inline loop)."""
neutralized_parts = []
for part in content_extracted.parts:
if not isinstance(part, ContentPart):
if isinstance(part, dict):
try:
part = ContentPart(**part)
except Exception as e:
logger.warning(f"Could not parse ContentPart: {str(e)}")
neutralized_parts.append(part)
continue
else:
neutralized_parts.append(part)
continue
_type_group = getattr(part, "typeGroup", "") or ""
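        # Map the current document slot onto the 0.3-0.9 progress window.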
prog = 0.3 + (chat_doc_slot / max(1, chat_documents_len)) * 0.6
if _type_group == "image" and part.data:
try:
svc.services.chat.progressLogUpdate(
operation_id,
prog,
f"Checking image part {len(neutralized_parts) + 1}",
)
_img_bytes = _b64.b64decode(str(part.data))
_img_result = await svc.services.neutralization.processImageAsync(_img_bytes, f"part_{part.id}")
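                # The image PII check is pass/fail: anything other than
                # {"status": "ok"} (or an exception) drops the part.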
if _img_result.get("status") == "ok":
neutralized_parts.append(part)
else:
logger.warning("Fail-Safe: Image part %s blocked (PII), SKIPPING", part.id)
except Exception as _img_err:
logger.error(f"Fail-Safe: Image check failed for part {part.id}: {_img_err}, SKIPPING")
elif part.data:
try:
svc.services.chat.progressLogUpdate(
operation_id,
prog,
f"Neutralizing part {len(neutralized_parts) + 1}",
)
neut_res = await svc.services.neutralization.processTextAsync(part.data)
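                # A valid neutralization result must contain "neutralized_text";
                # otherwise the part is dropped rather than passed through.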
if neut_res and "neutralized_text" in neut_res:
neutral_data = neut_res["neutralized_text"]
neutralized_parts.append(
ContentPart(
id=part.id,
parentId=part.parentId,
label=part.label,
typeGroup=part.typeGroup,
mimeType=part.mimeType,
data=neutral_data,
metadata=part.metadata.copy() if part.metadata else {},
)
)
else:
logger.warning(
"Fail-Safe: Neutralization incomplete for part %s — SKIPPING (not passing original)",
part.id,
)
continue
except Exception as e:
logger.error(f"Fail-Safe: Error neutralizing part {part.id}: {str(e)}, SKIPPING")
continue
else:
neutralized_parts.append(part)
return ContentExtracted(
id=content_extracted.id,
parts=neutralized_parts,
summary=content_extracted.summary,
)


async def neutralizeData(self, parameters: Dict[str, Any]) -> ActionResult:
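    """Neutralize PII across the documents referenced by parameters["documentList"].

    Handles three document shapes: binary media sidecars (passed through
    untouched), the unified extractContent handover envelope (each file bucket
    is neutralized in place), and legacy per-document ContentExtracted
    payloads. Image parts that fail the PII check and text parts whose
    neutralization is incomplete are dropped (fail-safe). When neutralization
    is disabled, the originals are returned flagged as not neutralized.
    """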
operation_id = None
try:
workflow_id = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
operation_id = f"context_neutralize_{workflow_id}_{int(time.time())}"
neutralization_enabled = False
try:
config = self.services.neutralization.getConfig()
            neutralization_enabled = bool(config and config.enabled)
except Exception as e:
logger.debug(f"Could not check neutralization config: {str(e)}")
if not neutralization_enabled:
logger.info("Neutralization is not enabled, returning documents unchanged")
document_list_param = parameters.get("documentList")
if not document_list_param:
return ActionResult.isFailure(error="documentList is required")
doc_list = coerceDocumentReferenceList(document_list_param)
if not doc_list.references:
return ActionResult.isFailure(error=f"documentList invalid (empty)")
chat_docs = self.services.chat.getChatDocumentsFromDocumentList(doc_list)
if not chat_docs:
return ActionResult.isFailure(error="No documents found in documentList")
action_documents = []
for chat_doc in chat_docs:
if hasattr(chat_doc, "documentData") and chat_doc.documentData:
action_documents.append(
ActionDocument(
documentName=getattr(chat_doc, "fileName", "unknown"),
documentData=chat_doc.documentData,
mimeType=getattr(chat_doc, "mimeType", "application/json"),
validationMetadata={
"actionType": "context.neutralizeData",
"neutralized": False,
"reason": "Neutralization disabled",
},
)
)
return ActionResult.isSuccess(documents=action_documents)
document_list_param = parameters.get("documentList")
if not document_list_param:
return ActionResult.isFailure(error="documentList is required")
doc_list = coerceDocumentReferenceList(document_list_param)
if not doc_list.references:
return ActionResult.isFailure(error=f"documentList invalid")
parent_operation_id = parameters.get("parentOperationId")
self.services.chat.progressLogStart(
operation_id,
"Neutralizing data from documents",
"Data Neutralization",
f"Documents: {len(doc_list.references)}",
parentOperationId=parent_operation_id,
)
self.services.chat.progressLogUpdate(operation_id, 0.2, "Loading documents")
chat_documents = self.services.chat.getChatDocumentsFromDocumentList(doc_list)
if not chat_documents:
self.services.chat.progressLogFinish(operation_id, False)
return ActionResult.isFailure(error="No documents found in documentList")
logger.info(f"Neutralizing data from {len(chat_documents)} document(s)")
self.services.chat.progressLogUpdate(operation_id, 0.3, "Processing documents")
action_documents = []
for i, chat_doc in enumerate(chat_documents):
try:
dd = getattr(chat_doc, "documentData", None)
if not dd:
logger.warning(f"Document {i + 1} has no documentData, skipping")
continue
fn = str(getattr(chat_doc, "fileName", "") or "")
mime_guess = str(getattr(chat_doc, "mimeType", "") or "").lower()
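                # Binary media sidecars from context.extractContent
                # ("extract_media_*" images) are forwarded untouched and
                # flagged as not neutralized.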
if (
mime_guess.startswith("image/")
and fn.startswith("extract_media_")
and not (isinstance(dd, dict) and dd.get("kind") == HANDOVER_KIND)
):
action_documents.append(
ActionDocument(
documentName=fn or f"media_{i + 1}",
documentData=dd,
mimeType=mime_guess or "application/octet-stream",
validationMetadata={
"actionType": "context.neutralizeData",
"neutralized": False,
"reason": "extractContent_media_sidecar_pass_through",
},
)
)
continue
# --- Unified JSON envelope from context.extractContent (v1) ---
if isinstance(dd, dict) and dd.get("kind") == HANDOVER_KIND:
bundle = dict(dd)
files_section = dd.get("files") or {}
new_files = {}
for fk, bucket in files_section.items():
if not isinstance(bucket, dict):
continue
parts_raw = bucket.get("parts") or []
parsed_parts = []
for pd in parts_raw:
parsed_parts.append(ContentPart(**pd) if isinstance(pd, dict) else pd)
summary = bucket.get("summary") or {}
if hasattr(summary, "model_dump"):
summary = summary.model_dump(mode="json")
ce = ContentExtracted(
id=str(bucket.get("extractedId") or ""),
parts=parsed_parts,
summary=summary if isinstance(summary, dict) else {},
)
ce_out = await _neutralize_one_content_extracted(
svc=self,
content_extracted=ce,
operation_id=operation_id,
chat_doc_slot=i,
chat_documents_len=max(len(chat_documents), 1),
)
new_files[fk] = _one_file_bucket(ce_out, str(bucket.get("sourceFileName") or fk))
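                        # _one_file_bucket (shared with extractContent) is
                        # assumed to rebuild the per-file bucket dict from the
                        # neutralized ContentExtracted and source file name.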
bundle["files"] = new_files
                    original_filename = getattr(chat_doc, "fileName", None) or f"neutralized_bundle_{workflow_id}.json"
                    bn = original_filename.rsplit(".", 1)[0] if "." in original_filename else original_filename
action_documents.append(
ActionDocument(
documentName=f"{bn}_neutralized.json",
documentData=bundle,
mimeType="application/json",
validationMetadata={
"actionType": "context.neutralizeData",
"neutralized": True,
"handoverKind": HANDOVER_KIND,
"bundleFileCount": len(new_files),
},
)
)
continue
# --- Legacy ContentExtracted per persisted document ---
if isinstance(dd, ContentExtracted):
content_extracted = dd
elif isinstance(dd, dict):
try:
content_extracted = ContentExtracted(**dd)
except Exception:
logger.warning(f"Document {i + 1} documentData cannot be parsed as ContentExtracted dict")
continue
else:
logger.warning(f"Document {i + 1} documentData is not supported")
continue
neut_out = await _neutralize_one_content_extracted(
svc=self,
content_extracted=content_extracted,
operation_id=operation_id,
chat_doc_slot=i,
chat_documents_len=max(len(chat_documents), 1),
)
                original_file_name = getattr(chat_doc, "fileName", None) or f"document_{i + 1}.json"
                base_name = original_file_name.rsplit(".", 1)[0] if "." in original_file_name else original_file_name
document_name = f"{base_name}_neutralized_{neut_out.id}.json"
action_documents.append(
ActionDocument(
documentName=document_name,
documentData=neut_out,
mimeType="application/json",
validationMetadata={
"actionType": "context.neutralizeData",
"documentIndex": i,
"extractedId": neut_out.id,
"partCount": len(neut_out.parts),
"neutralized": True,
"originalFileName": original_file_name,
},
)
)
except Exception as e:
logger.error(f"Error processing document {i + 1}: {str(e)}")
continue
if not action_documents:
self.services.chat.progressLogFinish(operation_id, False)
return ActionResult.isFailure(error="No valid documents found to neutralize")
self.services.chat.progressLogFinish(operation_id, True)
return ActionResult.isSuccess(documents=action_documents)
except Exception as e:
logger.error(f"Error in data neutralization: {str(e)}")
try:
if operation_id:
self.services.chat.progressLogFinish(operation_id, False)
except Exception:
pass
return ActionResult.isFailure(error=str(e))
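

# Example invocation (a hedged sketch: "ctx", the reference shape, and the
# ActionResult attributes shown are assumptions about the surrounding method
# context, not guaranteed by this module):
#
#     result = await ctx.neutralizeData({
#         "documentList": {"references": [{"documentId": "doc-123"}]},
#         "parentOperationId": "op-456",  # optional
#     })
#     for doc in result.documents:
#         print(doc.documentName, doc.validationMetadata.get("neutralized"))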