# gateway/modules/workflows/methods/methodContext/actions/neutralizeData.py
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
import base64 as _b64
import logging
import time
from typing import Any, Dict
from modules.datamodels.datamodelChat import ActionResult, ActionDocument
from modules.datamodels.datamodelDocref import coerceDocumentReferenceList
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart
from .extractContent import _one_file_bucket

logger = logging.getLogger(__name__)

HANDOVER_KIND = "context.extractContent.handover.v1"
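
# Shape of the context.extractContent handover envelope, as consumed below
# (a sketch inferred from the parsing code; only the fields read here are
# listed, and anything else in the envelope is preserved untouched):
#
#     {
#         "kind": "context.extractContent.handover.v1",
#         "files": {
#             "<fileKey>": {
#                 "extractedId": "...",
#                 "sourceFileName": "...",
#                 "parts": [<ContentPart dicts>],
#                 "summary": {...},
#             },
#         },
#     }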


async def _neutralize_one_content_extracted(
*,
svc,
content_extracted: ContentExtracted,
operation_id: str,
chat_doc_slot: int,
chat_documents_len: int,
) -> ContentExtracted:
"""Neutralize every part inside a ContentExtracted (copied semantics from legacy inline loop)."""
neutralized_parts = []
for part in content_extracted.parts:
if not isinstance(part, ContentPart):
if isinstance(part, dict):
try:
part = ContentPart(**part)
except Exception as e:
logger.warning(f"Could not parse ContentPart: {str(e)}")
neutralized_parts.append(part)
continue
else:
neutralized_parts.append(part)
continue
_type_group = getattr(part, "typeGroup", "") or ""
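        # Map the current document slot onto the 0.3-0.9 progress window.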
prog = 0.3 + (chat_doc_slot / max(1, chat_documents_len)) * 0.6
if _type_group == "image" and part.data:
try:
svc.services.chat.progressLogUpdate(
operation_id,
prog,
f"Checking image part {len(neutralized_parts) + 1}",
)
_img_bytes = _b64.b64decode(str(part.data))
_img_result = await svc.services.neutralization.processImageAsync(_img_bytes, f"part_{part.id}")
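                # The image PII check is pass/fail: anything other than
                # {"status": "ok"} (or an exception) drops the part.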
if _img_result.get("status") == "ok":
neutralized_parts.append(part)
else:
logger.warning("Fail-Safe: Image part %s blocked (PII), SKIPPING", part.id)
except Exception as _img_err:
logger.error(f"Fail-Safe: Image check failed for part {part.id}: {_img_err}, SKIPPING")
elif part.data:
try:
svc.services.chat.progressLogUpdate(
operation_id,
prog,
f"Neutralizing part {len(neutralized_parts) + 1}",
)
neut_res = await svc.services.neutralization.processTextAsync(part.data)
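                # A valid neutralization result must contain "neutralized_text";
                # otherwise the part is dropped rather than passed through.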
if neut_res and "neutralized_text" in neut_res:
neutral_data = neut_res["neutralized_text"]
neutralized_parts.append(
ContentPart(
id=part.id,
parentId=part.parentId,
label=part.label,
typeGroup=part.typeGroup,
mimeType=part.mimeType,
data=neutral_data,
metadata=part.metadata.copy() if part.metadata else {},
)
)
else:
logger.warning(
"Fail-Safe: Neutralization incomplete for part %s — SKIPPING (not passing original)",
part.id,
)
continue
except Exception as e:
logger.error(f"Fail-Safe: Error neutralizing part {part.id}: {str(e)}, SKIPPING")
continue
else:
neutralized_parts.append(part)
return ContentExtracted(
id=content_extracted.id,
parts=neutralized_parts,
summary=content_extracted.summary,
)


async def neutralizeData(self, parameters: Dict[str, Any]) -> ActionResult:
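    """Neutralize PII across the documents referenced by parameters["documentList"].

    Handles three document shapes: binary media sidecars (passed through
    untouched), the unified extractContent handover envelope (each file bucket
    is neutralized in place), and legacy per-document ContentExtracted
    payloads. Image parts that fail the PII check and text parts whose
    neutralization is incomplete are dropped (fail-safe). When neutralization
    is disabled, the originals are returned flagged as not neutralized.
    """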
operation_id = None
try:
workflow_id = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
operation_id = f"context_neutralize_{workflow_id}_{int(time.time())}"
neutralization_enabled = False
try:
config = self.services.neutralization.getConfig()
            neutralization_enabled = bool(config and config.enabled)
except Exception as e:
logger.debug(f"Could not check neutralization config: {str(e)}")
if not neutralization_enabled:
logger.info("Neutralization is not enabled, returning documents unchanged")
document_list_param = parameters.get("documentList")
if not document_list_param:
return ActionResult.isFailure(error="documentList is required")
doc_list = coerceDocumentReferenceList(document_list_param)
if not doc_list.references:
return ActionResult.isFailure(error=f"documentList invalid (empty)")
chat_docs = self.services.chat.getChatDocumentsFromDocumentList(doc_list)
if not chat_docs:
return ActionResult.isFailure(error="No documents found in documentList")
action_documents = []
for chat_doc in chat_docs:
if hasattr(chat_doc, "documentData") and chat_doc.documentData:
action_documents.append(
ActionDocument(
documentName=getattr(chat_doc, "fileName", "unknown"),
documentData=chat_doc.documentData,
mimeType=getattr(chat_doc, "mimeType", "application/json"),
validationMetadata={
"actionType": "context.neutralizeData",
"neutralized": False,
"reason": "Neutralization disabled",
},
)
)
return ActionResult.isSuccess(documents=action_documents)
document_list_param = parameters.get("documentList")
if not document_list_param:
return ActionResult.isFailure(error="documentList is required")
doc_list = coerceDocumentReferenceList(document_list_param)
if not doc_list.references:
return ActionResult.isFailure(error=f"documentList invalid")
parent_operation_id = parameters.get("parentOperationId")
self.services.chat.progressLogStart(
operation_id,
"Neutralizing data from documents",
"Data Neutralization",
f"Documents: {len(doc_list.references)}",
parentOperationId=parent_operation_id,
)
self.services.chat.progressLogUpdate(operation_id, 0.2, "Loading documents")
chat_documents = self.services.chat.getChatDocumentsFromDocumentList(doc_list)
if not chat_documents:
self.services.chat.progressLogFinish(operation_id, False)
return ActionResult.isFailure(error="No documents found in documentList")
logger.info(f"Neutralizing data from {len(chat_documents)} document(s)")
self.services.chat.progressLogUpdate(operation_id, 0.3, "Processing documents")
action_documents = []
for i, chat_doc in enumerate(chat_documents):
try:
dd = getattr(chat_doc, "documentData", None)
if not dd:
logger.warning(f"Document {i + 1} has no documentData, skipping")
continue
fn = str(getattr(chat_doc, "fileName", "") or "")
mime_guess = str(getattr(chat_doc, "mimeType", "") or "").lower()
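                # Binary media sidecars from context.extractContent
                # ("extract_media_*" images) are forwarded untouched and
                # flagged as not neutralized.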
if (
mime_guess.startswith("image/")
and fn.startswith("extract_media_")
and not (isinstance(dd, dict) and dd.get("kind") == HANDOVER_KIND)
):
action_documents.append(
ActionDocument(
documentName=fn or f"media_{i + 1}",
documentData=dd,
mimeType=mime_guess or "application/octet-stream",
validationMetadata={
"actionType": "context.neutralizeData",
"neutralized": False,
"reason": "extractContent_media_sidecar_pass_through",
},
)
)
continue
# --- Unified JSON envelope from context.extractContent (v1) ---
if isinstance(dd, dict) and dd.get("kind") == HANDOVER_KIND:
bundle = dict(dd)
files_section = dd.get("files") or {}
new_files = {}
for fk, bucket in files_section.items():
if not isinstance(bucket, dict):
continue
parts_raw = bucket.get("parts") or []
parsed_parts = []
for pd in parts_raw:
parsed_parts.append(ContentPart(**pd) if isinstance(pd, dict) else pd)
summary = bucket.get("summary") or {}
if hasattr(summary, "model_dump"):
summary = summary.model_dump(mode="json")
ce = ContentExtracted(
id=str(bucket.get("extractedId") or ""),
parts=parsed_parts,
summary=summary if isinstance(summary, dict) else {},
)
ce_out = await _neutralize_one_content_extracted(
svc=self,
content_extracted=ce,
operation_id=operation_id,
chat_doc_slot=i,
chat_documents_len=max(len(chat_documents), 1),
)
new_files[fk] = _one_file_bucket(ce_out, str(bucket.get("sourceFileName") or fk))
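                        # _one_file_bucket (shared with extractContent) is
                        # assumed to rebuild the per-file bucket dict from the
                        # neutralized ContentExtracted and source file name.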
bundle["files"] = new_files
                    original_filename = getattr(chat_doc, "fileName", None) or f"neutralized_bundle_{workflow_id}.json"
                    bn = original_filename.rsplit(".", 1)[0] if "." in original_filename else original_filename
action_documents.append(
ActionDocument(
documentName=f"{bn}_neutralized.json",
documentData=bundle,
mimeType="application/json",
validationMetadata={
"actionType": "context.neutralizeData",
"neutralized": True,
"handoverKind": HANDOVER_KIND,
"bundleFileCount": len(new_files),
},
)
)
continue
# --- Legacy ContentExtracted per persisted document ---
if isinstance(dd, ContentExtracted):
content_extracted = dd
elif isinstance(dd, dict):
try:
content_extracted = ContentExtracted(**dd)
except Exception:
logger.warning(f"Document {i + 1} documentData cannot be parsed as ContentExtracted dict")
continue
else:
logger.warning(f"Document {i + 1} documentData is not supported")
continue
neut_out = await _neutralize_one_content_extracted(
svc=self,
content_extracted=content_extracted,
operation_id=operation_id,
chat_doc_slot=i,
chat_documents_len=max(len(chat_documents), 1),
)
                original_file_name = getattr(chat_doc, "fileName", None) or f"document_{i + 1}.json"
                base_name = original_file_name.rsplit(".", 1)[0] if "." in original_file_name else original_file_name
document_name = f"{base_name}_neutralized_{neut_out.id}.json"
action_documents.append(
ActionDocument(
documentName=document_name,
documentData=neut_out,
mimeType="application/json",
validationMetadata={
"actionType": "context.neutralizeData",
"documentIndex": i,
"extractedId": neut_out.id,
"partCount": len(neut_out.parts),
"neutralized": True,
"originalFileName": original_file_name,
},
)
)
except Exception as e:
logger.error(f"Error processing document {i + 1}: {str(e)}")
continue
if not action_documents:
self.services.chat.progressLogFinish(operation_id, False)
return ActionResult.isFailure(error="No valid documents found to neutralize")
self.services.chat.progressLogFinish(operation_id, True)
return ActionResult.isSuccess(documents=action_documents)
except Exception as e:
logger.error(f"Error in data neutralization: {str(e)}")
try:
if operation_id:
self.services.chat.progressLogFinish(operation_id, False)
except Exception:
pass
return ActionResult.isFailure(error=str(e))
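

# Example invocation (a hedged sketch: "ctx", the reference shape, and the
# ActionResult attributes shown are assumptions about the surrounding method
# context, not guaranteed by this module):
#
#     result = await ctx.neutralizeData({
#         "documentList": {"references": [{"documentId": "doc-123"}]},
#         "parentOperationId": "op-456",  # optional
#     })
#     for doc in result.documents:
#         print(doc.documentName, doc.validationMetadata.get("neutralized"))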