# Copyright (c) 2025 Patrick Motsch
# All rights reserved.

import base64 as _b64
import logging
import time
from typing import Any, Dict

from modules.datamodels.datamodelChat import ActionResult, ActionDocument
from modules.datamodels.datamodelDocref import coerceDocumentReferenceList
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart

from .extractContent import _one_file_bucket

logger = logging.getLogger(__name__)

HANDOVER_KIND = "context.extractContent.handover.v1"
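
# Assumed shape of a HANDOVER_KIND envelope, inferred from the parsing in
# neutralizeData() below; the field names are the ones actually read there,
# the values are illustrative:
#
#   {
#       "kind": "context.extractContent.handover.v1",
#       "files": {
#           "<file-key>": {
#               "extractedId": "...",
#               "sourceFileName": "report.pdf",
#               "parts": [ { ...ContentPart fields... } ],
#               "summary": { ... },
#           },
#       },
#   }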


async def _neutralize_one_content_extracted(
    *,
    svc,
    content_extracted: ContentExtracted,
    operation_id: str,
    chat_doc_slot: int,
    chat_documents_len: int,
) -> ContentExtracted:
    """Neutralize every part inside a ContentExtracted (semantics copied from the legacy inline loop)."""
    neutralized_parts = []
    for part in content_extracted.parts:
        # Coerce raw dicts into ContentPart; anything unparseable is passed
        # through untouched so the part count is preserved.
        if not isinstance(part, ContentPart):
            if isinstance(part, dict):
                try:
                    part = ContentPart(**part)
                except Exception as e:
                    logger.warning(f"Could not parse ContentPart: {str(e)}")
                    neutralized_parts.append(part)
                    continue
            else:
                neutralized_parts.append(part)
                continue

        _type_group = getattr(part, "typeGroup", "") or ""
        # Map this document's slot onto the 0.3-0.9 progress band.
        prog = 0.3 + (chat_doc_slot / max(1, chat_documents_len)) * 0.6

        if _type_group == "image" and part.data:
            # Images are checked, not rewritten: a part that fails the PII
            # check is dropped entirely (fail-safe), never passed through.
            try:
                svc.services.chat.progressLogUpdate(
                    operation_id,
                    prog,
                    f"Checking image part {len(neutralized_parts) + 1}",
                )
                _img_bytes = _b64.b64decode(str(part.data))
                _img_result = await svc.services.neutralization.processImageAsync(_img_bytes, f"part_{part.id}")
                if _img_result.get("status") == "ok":
                    neutralized_parts.append(part)
                else:
                    logger.warning("Fail-Safe: Image part %s blocked (PII), SKIPPING", part.id)
            except Exception as _img_err:
                logger.error(f"Fail-Safe: Image check failed for part {part.id}: {_img_err}, SKIPPING")
        elif part.data:
            # Text parts are rebuilt around the neutralized text; if the
            # service returns nothing usable, the part is dropped (fail-safe).
            try:
                svc.services.chat.progressLogUpdate(
                    operation_id,
                    prog,
                    f"Neutralizing part {len(neutralized_parts) + 1}",
                )
                neut_res = await svc.services.neutralization.processTextAsync(part.data)
                if neut_res and "neutralized_text" in neut_res:
                    neutral_data = neut_res["neutralized_text"]
                    neutralized_parts.append(
                        ContentPart(
                            id=part.id,
                            parentId=part.parentId,
                            label=part.label,
                            typeGroup=part.typeGroup,
                            mimeType=part.mimeType,
                            data=neutral_data,
                            metadata=part.metadata.copy() if part.metadata else {},
                        )
                    )
                else:
                    logger.warning(
                        "Fail-Safe: Neutralization incomplete for part %s, SKIPPING (not passing original)",
                        part.id,
                    )
                    continue
            except Exception as e:
                logger.error(f"Fail-Safe: Error neutralizing part {part.id}: {str(e)}, SKIPPING")
                continue
        else:
            # Parts without data carry nothing to neutralize.
            neutralized_parts.append(part)

    return ContentExtracted(
        id=content_extracted.id,
        parts=neutralized_parts,
        summary=content_extracted.summary,
    )
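
# Neutralization service interface assumed by this module (inferred from the
# call sites above and below, not from an authoritative spec):
#
#   getConfig() -> config                  # object with a boolean `.enabled`
#   await processTextAsync(text)           # -> {"neutralized_text": "..."} on success
#   await processImageAsync(data, ref)     # -> {"status": "ok"} when the image is clean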


async def neutralizeData(self, parameters: Dict[str, Any]) -> ActionResult:
    operation_id = None
    try:
        workflow_id = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
        operation_id = f"context_neutralize_{workflow_id}_{int(time.time())}"

        # Treat a missing or unreadable config as "disabled" instead of failing.
        neutralization_enabled = False
        try:
            config = self.services.neutralization.getConfig()
            neutralization_enabled = config and config.enabled
        except Exception as e:
            logger.debug(f"Could not check neutralization config: {str(e)}")

        if not neutralization_enabled:
            # Pass-through path: return the documents unchanged, flagged as
            # not neutralized in the validation metadata.
            logger.info("Neutralization is not enabled, returning documents unchanged")
            document_list_param = parameters.get("documentList")
            if not document_list_param:
                return ActionResult.isFailure(error="documentList is required")

            doc_list = coerceDocumentReferenceList(document_list_param)
            if not doc_list.references:
                return ActionResult.isFailure(error="documentList invalid (empty)")

            chat_docs = self.services.chat.getChatDocumentsFromDocumentList(doc_list)
            if not chat_docs:
                return ActionResult.isFailure(error="No documents found in documentList")

            action_documents = []
            for chat_doc in chat_docs:
                if hasattr(chat_doc, "documentData") and chat_doc.documentData:
                    action_documents.append(
                        ActionDocument(
                            documentName=getattr(chat_doc, "fileName", "unknown"),
                            documentData=chat_doc.documentData,
                            mimeType=getattr(chat_doc, "mimeType", "application/json"),
                            validationMetadata={
                                "actionType": "context.neutralizeData",
                                "neutralized": False,
                                "reason": "Neutralization disabled",
                            },
                        )
                    )
            return ActionResult.isSuccess(documents=action_documents)

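        # Enabled path. Progress milestones emitted below: 0.2 while loading
        # documents, 0.3 when processing starts, 0.3-0.9 for per-part work in
        # _neutralize_one_content_extracted, then progressLogFinish at the end.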
        document_list_param = parameters.get("documentList")
        if not document_list_param:
            return ActionResult.isFailure(error="documentList is required")

        doc_list = coerceDocumentReferenceList(document_list_param)
        if not doc_list.references:
            return ActionResult.isFailure(error="documentList invalid (empty)")

        parent_operation_id = parameters.get("parentOperationId")
        self.services.chat.progressLogStart(
            operation_id,
            "Neutralizing data from documents",
            "Data Neutralization",
            f"Documents: {len(doc_list.references)}",
            parentOperationId=parent_operation_id,
        )

        self.services.chat.progressLogUpdate(operation_id, 0.2, "Loading documents")
        chat_documents = self.services.chat.getChatDocumentsFromDocumentList(doc_list)
        if not chat_documents:
            self.services.chat.progressLogFinish(operation_id, False)
            return ActionResult.isFailure(error="No documents found in documentList")

        logger.info(f"Neutralizing data from {len(chat_documents)} document(s)")
        self.services.chat.progressLogUpdate(operation_id, 0.3, "Processing documents")
        action_documents = []

        for i, chat_doc in enumerate(chat_documents):
            try:
                dd = getattr(chat_doc, "documentData", None)
                if not dd:
                    logger.warning(f"Document {i + 1} has no documentData, skipping")
                    continue

                # Image sidecars emitted by context.extractContent are passed
                # through unchanged and flagged as not neutralized.
                fn = str(getattr(chat_doc, "fileName", "") or "")
                mime_guess = str(getattr(chat_doc, "mimeType", "") or "").lower()
                if (
                    mime_guess.startswith("image/")
                    and fn.startswith("extract_media_")
                    and not (isinstance(dd, dict) and dd.get("kind") == HANDOVER_KIND)
                ):
                    action_documents.append(
                        ActionDocument(
                            documentName=fn or f"media_{i + 1}",
                            documentData=dd,
                            mimeType=mime_guess or "application/octet-stream",
                            validationMetadata={
                                "actionType": "context.neutralizeData",
                                "neutralized": False,
                                "reason": "extractContent_media_sidecar_pass_through",
                            },
                        )
                    )
                    continue

                # --- Unified JSON envelope from context.extractContent (v1) ---
                if isinstance(dd, dict) and dd.get("kind") == HANDOVER_KIND:
                    bundle = dict(dd)
                    files_section = dd.get("files") or {}
                    new_files = {}
                    for fk, bucket in files_section.items():
                        if not isinstance(bucket, dict):
                            continue
                        parts_raw = bucket.get("parts") or []
                        parsed_parts = []
                        for pd in parts_raw:
                            parsed_parts.append(ContentPart(**pd) if isinstance(pd, dict) else pd)

                        summary = bucket.get("summary") or {}
                        if hasattr(summary, "model_dump"):
                            summary = summary.model_dump(mode="json")

                        ce = ContentExtracted(
                            id=str(bucket.get("extractedId") or ""),
                            parts=parsed_parts,
                            summary=summary if isinstance(summary, dict) else {},
                        )

                        ce_out = await _neutralize_one_content_extracted(
                            svc=self,
                            content_extracted=ce,
                            operation_id=operation_id,
                            chat_doc_slot=i,
                            chat_documents_len=max(len(chat_documents), 1),
                        )
                        new_files[fk] = _one_file_bucket(ce_out, str(bucket.get("sourceFileName") or fk))

                    bundle["files"] = new_files
                    original_filename = getattr(chat_doc, "fileName", f"neutralized_bundle_{workflow_id}.json")
                    bn = original_filename.rsplit(".", 1)[0] if "." in original_filename else original_filename
                    action_documents.append(
                        ActionDocument(
                            documentName=f"{bn}_neutralized.json",
                            documentData=bundle,
                            mimeType="application/json",
                            validationMetadata={
                                "actionType": "context.neutralizeData",
                                "neutralized": True,
                                "handoverKind": HANDOVER_KIND,
                                "bundleFileCount": len(new_files),
                            },
                        )
                    )
                    continue

                # --- Legacy ContentExtracted per persisted document ---
                if isinstance(dd, ContentExtracted):
                    content_extracted = dd
                elif isinstance(dd, dict):
                    try:
                        content_extracted = ContentExtracted(**dd)
                    except Exception:
                        logger.warning(f"Document {i + 1} documentData cannot be parsed as ContentExtracted dict")
                        continue
                else:
                    logger.warning(f"Document {i + 1} documentData is not supported")
                    continue

                neut_out = await _neutralize_one_content_extracted(
                    svc=self,
                    content_extracted=content_extracted,
                    operation_id=operation_id,
                    chat_doc_slot=i,
                    chat_documents_len=max(len(chat_documents), 1),
                )

                original_file_name = getattr(chat_doc, "fileName", f"document_{i + 1}.json")
                base_name = original_file_name.rsplit(".", 1)[0] if "." in original_file_name else original_file_name
                document_name = f"{base_name}_neutralized_{neut_out.id}.json"

                action_documents.append(
                    ActionDocument(
                        documentName=document_name,
                        documentData=neut_out,
                        mimeType="application/json",
                        validationMetadata={
                            "actionType": "context.neutralizeData",
                            "documentIndex": i,
                            "extractedId": neut_out.id,
                            "partCount": len(neut_out.parts),
                            "neutralized": True,
                            "originalFileName": original_file_name,
                        },
                    )
                )

            except Exception as e:
                # Per-document fail-safe: a broken document is skipped and the
                # rest of the batch is still processed.
                logger.error(f"Error processing document {i + 1}: {str(e)}")
                continue

        if not action_documents:
            self.services.chat.progressLogFinish(operation_id, False)
            return ActionResult.isFailure(error="No valid documents found to neutralize")

        self.services.chat.progressLogFinish(operation_id, True)
        return ActionResult.isSuccess(documents=action_documents)

    except Exception as e:
        logger.error(f"Error in data neutralization: {str(e)}")
        try:
            if operation_id:
                self.services.chat.progressLogFinish(operation_id, False)
        except Exception:
            pass

        return ActionResult.isFailure(error=str(e))
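
# --- Usage sketch (illustrative only, not part of the module) ---------------
# neutralizeData is written as an unbound method: `self` must provide the
# `.services` container used above. A call might look like the following;
# only the "documentList" and "parentOperationId" keys are taken from the
# parsing above, the variable names are hypothetical:
#
#   result = await action_handler.neutralizeData({
#       "documentList": document_list,             # anything coerceDocumentReferenceList accepts
#       "parentOperationId": parent_operation_id,  # optional, nests the progress log
#   })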