neutralization integrated into extractor

2025-12-17 22:03:58 +01:00 · 2025-12-17 22:03:58 +01:00 · b2196bc6a3
commit b2196bc6a3
parent 56d6ecf978
5 changed files with 364 additions and 89 deletions
--- a/modules/workflows/methods/methodContext/actions/init.py
+++ b/modules/workflows/methods/methodContext/actions/init.py
@ -6,11 +6,13 @@
 # Export all actions
 from .getDocumentIndex import getDocumentIndex
 from .extractContent import extractContent
 from .neutralizeData import neutralizeData
 from .triggerPreprocessingServer import triggerPreprocessingServer
 __all__ = [
    'getDocumentIndex',
    'extractContent',
    'neutralizeData',
    'triggerPreprocessingServer',
 ]
--- a/modules/workflows/methods/methodContext/actions/extractContent.py
+++ b/modules/workflows/methods/methodContext/actions/extractContent.py
@ -12,7 +12,7 @@ from typing import Dict, Any
 from modules.workflows.methods.methodBase import action
 from modules.datamodels.datamodelChat import ActionResult, ActionDocument
 from modules.datamodels.datamodelDocref import DocumentReferenceList
-from modules.datamodels.datamodelExtraction import ExtractionOptions, MergeStrategy
+from modules.datamodels.datamodelExtraction import ExtractionOptions, MergeStrategy, ContentExtracted, ContentPart
 logger = logging.getLogger(__name__)
@ -108,6 +108,74 @@ async def extractContent(self, parameters: Dict[str, Any]) -> ActionResult:
        # Pass operationId for hierarchical per-document progress logging
        extractedResults = self.services.extraction.extractContent(chatDocuments, extractionOptions, operationId=operationId)
        # Check if neutralization is enabled and should be applied automatically
        neutralizationEnabled = False
        try:
            config = self.services.neutralization.getConfig()
            neutralizationEnabled = config and config.enabled
        except Exception as e:
            logger.debug(f"Could not check neutralization config: {str(e)}")
        # Neutralize extracted data if enabled (for dynamic mode: after extraction, before AI processing)
        if neutralizationEnabled:
            self.services.chat.progressLogUpdate(operationId, 0.7, "Neutralizing extracted data")
            logger.info("Neutralization enabled - neutralizing extracted content data")
            # Neutralize each ContentExtracted result
            for extracted in extractedResults:
                if extracted.parts:
                    neutralizedParts = []
                    for part in extracted.parts:
                        if not isinstance(part, ContentPart):
                            # Try to parse as ContentPart if it's a dict
                            if isinstance(part, dict):
                                try:
                                    part = ContentPart(**part)
                                except Exception as e:
                                    logger.warning(f"Could not parse ContentPart: {str(e)}")
                                    neutralizedParts.append(part)
                                    continue
                            else:
                                neutralizedParts.append(part)
                                continue
                        # Neutralize the data field if it contains text
                        if part.data:
                            try:
                                # Call neutralization service
                                neutralizationResult = self.services.neutralization.processText(part.data)
                                if neutralizationResult and 'neutralized_text' in neutralizationResult:
                                    # Replace data with neutralized text
                                    neutralizedData = neutralizationResult['neutralized_text']
                                    # Create new ContentPart with neutralized data
                                    neutralizedPart = ContentPart(
                                        id=part.id,
                                        parentId=part.parentId,
                                        label=part.label,
                                        typeGroup=part.typeGroup,
                                        mimeType=part.mimeType,
                                        data=neutralizedData,
                                        metadata=part.metadata.copy() if part.metadata else {}
                                    )
                                    neutralizedParts.append(neutralizedPart)
                                else:
                                    # Neutralization failed, use original part
                                    logger.warning(f"Neutralization did not return neutralized_text for part {part.id}")
                                    neutralizedParts.append(part)
                            except Exception as e:
                                logger.error(f"Error neutralizing part {part.id}: {str(e)}")
                                # On error, use original part
                                neutralizedParts.append(part)
                        else:
                            # No data to neutralize, keep original part
                            neutralizedParts.append(part)
                    # Update extracted result with neutralized parts
                    extracted.parts = neutralizedParts
                    logger.info(f"Neutralized {len(neutralizedParts)} content parts")
        # Build ActionDocuments from ContentExtracted results
        self.services.chat.progressLogUpdate(operationId, 0.8, "Building result documents")
        actionDocuments = []
@ -129,6 +197,7 @@ async def extractContent(self, parameters: Dict[str, Any]) -> ActionResult:
                "documentIndex": i,
                "extractedId": extracted.id,
                "partCount": len(extracted.parts) if extracted.parts else 0,
                "neutralized": neutralizationEnabled,
                "originalFileName": originalDoc.fileName if originalDoc and hasattr(originalDoc, 'fileName') else None
            }
            actionDoc = ActionDocument(
--- a/modules/workflows/methods/methodContext/actions/neutralizeData.py
+++ b/modules/workflows/methods/methodContext/actions/neutralizeData.py
@ -0,0 +1,256 @@
 # Copyright (c) 2025 Patrick Motsch
 # All rights reserved.
 """
 Neutralize Data action for Context operations.
 Neutralizes extracted content data from ContentExtracted documents.
 """
 import logging
 import time
 from typing import Dict, Any
 from modules.workflows.methods.methodBase import action
 from modules.datamodels.datamodelChat import ActionResult, ActionDocument
 from modules.datamodels.datamodelDocref import DocumentReferenceList
 from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart
 logger = logging.getLogger(__name__)
@action
 async def neutralizeData(self, parameters: Dict[str, Any]) -> ActionResult:
    """
    Neutralize data from ContentExtracted documents.
    This action takes documents containing ContentExtracted objects (from extractContent)
    and neutralizes the text data in ContentPart.data fields.
    Parameters:
    - documentList (list, required): Document reference(s) containing ContentExtracted objects.
    Returns:
    - ActionResult with ActionDocument containing neutralized ContentExtracted objects
    """
    try:
        # Init progress logger
        workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
        operationId = f"context_neutralize_{workflowId}_{int(time.time())}"
        # Check if neutralization is enabled
        neutralizationEnabled = False
        try:
            config = self.services.neutralization.getConfig()
            neutralizationEnabled = config and config.enabled
        except Exception as e:
            logger.debug(f"Could not check neutralization config: {str(e)}")
        if not neutralizationEnabled:
            logger.info("Neutralization is not enabled, returning documents unchanged")
            # Return original documents if neutralization is disabled
            # Get documents from documentList
            documentListParam = parameters.get("documentList")
            if not documentListParam:
                return ActionResult.isFailure(error="documentList is required")
            # Convert to DocumentReferenceList if needed
            if isinstance(documentListParam, DocumentReferenceList):
                documentList = documentListParam
            elif isinstance(documentListParam, str):
                documentList = DocumentReferenceList.from_string_list([documentListParam])
            elif isinstance(documentListParam, list):
                documentList = DocumentReferenceList.from_string_list(documentListParam)
            else:
                return ActionResult.isFailure(error=f"Invalid documentList type: {type(documentListParam)}")
            # Get ChatDocuments from documentList
            chatDocuments = self.services.chat.getChatDocumentsFromDocumentList(documentList)
            if not chatDocuments:
                return ActionResult.isFailure(error="No documents found in documentList")
            # Return original documents as ActionDocuments
            actionDocuments = []
            for chatDoc in chatDocuments:
                # Extract ContentExtracted from documentData if available
                if hasattr(chatDoc, 'documentData') and chatDoc.documentData:
                    actionDoc = ActionDocument(
                        documentName=getattr(chatDoc, 'fileName', 'unknown'),
                        documentData=chatDoc.documentData,
                        mimeType=getattr(chatDoc, 'mimeType', 'application/json'),
                        validationMetadata={
                            "actionType": "context.neutralizeData",
                            "neutralized": False,
                            "reason": "Neutralization disabled"
                        }
                    )
                    actionDocuments.append(actionDoc)
            return ActionResult.isSuccess(documents=actionDocuments)
        # Extract documentList from parameters dict
        documentListParam = parameters.get("documentList")
        if not documentListParam:
            return ActionResult.isFailure(error="documentList is required")
        # Convert to DocumentReferenceList if needed
        if isinstance(documentListParam, DocumentReferenceList):
            documentList = documentListParam
        elif isinstance(documentListParam, str):
            documentList = DocumentReferenceList.from_string_list([documentListParam])
        elif isinstance(documentListParam, list):
            documentList = DocumentReferenceList.from_string_list(documentListParam)
        else:
            return ActionResult.isFailure(error=f"Invalid documentList type: {type(documentListParam)}")
        # Start progress tracking
        parentOperationId = parameters.get('parentOperationId')
        self.services.chat.progressLogStart(
            operationId,
            "Neutralizing data from documents",
            "Data Neutralization",
            f"Documents: {len(documentList.references)}",
            parentOperationId=parentOperationId
        )
        # Get ChatDocuments from documentList
        self.services.chat.progressLogUpdate(operationId, 0.2, "Loading documents")
        chatDocuments = self.services.chat.getChatDocumentsFromDocumentList(documentList)
        if not chatDocuments:
            self.services.chat.progressLogFinish(operationId, False)
            return ActionResult.isFailure(error="No documents found in documentList")
        logger.info(f"Neutralizing data from {len(chatDocuments)} documents")
        # Process each document
        self.services.chat.progressLogUpdate(operationId, 0.3, "Processing documents")
        actionDocuments = []
        for i, chatDoc in enumerate(chatDocuments):
            try:
                # Extract ContentExtracted from documentData
                if not hasattr(chatDoc, 'documentData') or not chatDoc.documentData:
                    logger.warning(f"Document {i+1} has no documentData, skipping")
                    continue
                documentData = chatDoc.documentData
                # Check if it's a ContentExtracted object
                if isinstance(documentData, ContentExtracted):
                    contentExtracted = documentData
                elif isinstance(documentData, dict):
                    # Try to parse as ContentExtracted
                    try:
                        contentExtracted = ContentExtracted(**documentData)
                    except Exception as e:
                        logger.warning(f"Document {i+1} documentData is not ContentExtracted: {str(e)}")
                        continue
                else:
                    logger.warning(f"Document {i+1} documentData is not ContentExtracted or dict")
                    continue
                # Neutralize each ContentPart's data field
                neutralizedParts = []
                for part in contentExtracted.parts:
                    if not isinstance(part, ContentPart):
                        # Try to parse as ContentPart
                        if isinstance(part, dict):
                            try:
                                part = ContentPart(**part)
                            except Exception as e:
                                logger.warning(f"Could not parse ContentPart: {str(e)}")
                                neutralizedParts.append(part)
                                continue
                        else:
                            neutralizedParts.append(part)
                            continue
                    # Neutralize the data field if it contains text
                    if part.data:
                        try:
                            self.services.chat.progressLogUpdate(
                                operationId,
                                0.3 + (i / len(chatDocuments)) * 0.6,
                                f"Neutralizing part {len(neutralizedParts) + 1} of document {i+1}"
                            )
                            # Call neutralization service
                            neutralizationResult = self.services.neutralization.processText(part.data)
                            if neutralizationResult and 'neutralized_text' in neutralizationResult:
                                # Replace data with neutralized text
                                neutralizedData = neutralizationResult['neutralized_text']
                                # Create new ContentPart with neutralized data
                                neutralizedPart = ContentPart(
                                    id=part.id,
                                    parentId=part.parentId,
                                    label=part.label,
                                    typeGroup=part.typeGroup,
                                    mimeType=part.mimeType,
                                    data=neutralizedData,
                                    metadata=part.metadata.copy() if part.metadata else {}
                                )
                                neutralizedParts.append(neutralizedPart)
                            else:
                                # Neutralization failed, use original part
                                logger.warning(f"Neutralization did not return neutralized_text for part {part.id}")
                                neutralizedParts.append(part)
                        except Exception as e:
                            logger.error(f"Error neutralizing part {part.id}: {str(e)}")
                            # On error, use original part
                            neutralizedParts.append(part)
                    else:
                        # No data to neutralize, keep original part
                        neutralizedParts.append(part)
                # Create neutralized ContentExtracted object
                neutralizedContentExtracted = ContentExtracted(
                    id=contentExtracted.id,
                    parts=neutralizedParts,
                    summary=contentExtracted.summary
                )
                # Create ActionDocument
                originalFileName = getattr(chatDoc, 'fileName', f"document_{i+1}.json")
                baseName = originalFileName.rsplit('.', 1)[0] if '.' in originalFileName else originalFileName
                documentName = f"{baseName}_neutralized_{contentExtracted.id}.json"
                validationMetadata = {
                    "actionType": "context.neutralizeData",
                    "documentIndex": i,
                    "extractedId": contentExtracted.id,
                    "partCount": len(neutralizedParts),
                    "neutralized": True,
                    "originalFileName": originalFileName
                }
                actionDoc = ActionDocument(
                    documentName=documentName,
                    documentData=neutralizedContentExtracted,
                    mimeType="application/json",
                    validationMetadata=validationMetadata
                )
                actionDocuments.append(actionDoc)
            except Exception as e:
                logger.error(f"Error processing document {i+1}: {str(e)}")
                # Continue with other documents
                continue
        if not actionDocuments:
            self.services.chat.progressLogFinish(operationId, False)
            return ActionResult.isFailure(error="No valid ContentExtracted documents found to neutralize")
        self.services.chat.progressLogFinish(operationId, True)
        return ActionResult.isSuccess(documents=actionDocuments)
    except Exception as e:
        logger.error(f"Error in data neutralization: {str(e)}")
        # Complete progress tracking with failure
        try:
            self.services.chat.progressLogFinish(operationId, False)
        except:
            pass  # Don't fail on progress logging errors
        return ActionResult.isFailure(error=str(e))
--- a/modules/workflows/methods/methodContext/methodContext.py
+++ b/modules/workflows/methods/methodContext/methodContext.py
@ -13,6 +13,7 @@ from .helpers.formatting import FormattingHelper
 # Import actions
 from .actions.getDocumentIndex import getDocumentIndex
 from .actions.extractContent import extractContent
 from .actions.neutralizeData import neutralizeData
 from .actions.triggerPreprocessingServer import triggerPreprocessingServer
 logger = logging.getLogger(__name__)
@ -68,6 +69,20 @@ class MethodContext(MethodBase):
                },
                execute=extractContent.__get__(self, self.__class__)
            ),
            "neutralizeData": WorkflowActionDefinition(
                actionId="context.neutralizeData",
                description="Neutralize extracted data from ContentExtracted documents (for use after extractContent)",
                parameters={
                    "documentList": WorkflowActionParameter(
                        name="documentList",
                        type="List[str]",
                        frontendType=FrontendType.DOCUMENT_REFERENCE,
                        required=True,
                        description="Document reference(s) containing ContentExtracted objects to neutralize"
                    )
                },
                execute=neutralizeData.__get__(self, self.__class__)
            ),
            "triggerPreprocessingServer": WorkflowActionDefinition(
                actionId="context.triggerPreprocessingServer",
                description="Trigger preprocessing server at customer tenant to update database with configuration",
@ -104,5 +119,6 @@ class MethodContext(MethodBase):
        # Register actions as methods (optional, für direkten Zugriff)
        self.getDocumentIndex = getDocumentIndex.__get__(self, self.__class__)
        self.extractContent = extractContent.__get__(self, self.__class__)
        self.neutralizeData = neutralizeData.__get__(self, self.__class__)
        self.triggerPreprocessingServer = triggerPreprocessingServer.__get__(self, self.__class__)
--- a/modules/workflows/workflowManager.py
+++ b/modules/workflows/workflowManager.py
@ -960,17 +960,13 @@ class WorkflowManager:
    async def _processFileIds(self, fileIds: List[str], messageId: str = None) -> List[ChatDocument]:
        """Process file IDs from existing files and return ChatDocument objects.
        If neutralization is enabled, files are neutralized and new files are created with neutralized content.
        If neutralization fails, the document is not included and an error is logged to ChatLog."""
        documents = []
-        # Check if neutralization is enabled
+        NOTE: Neutralization is NOT performed here. For dynamic workflows, neutralization
-        neutralizationEnabled = False
+        should happen AFTER content extraction (in extractContent action) to neutralize
-        try:
+        extracted data (ContentPart.data), not ChatDocuments. This ensures neutralization
-            config = self.services.neutralization.getConfig()
+        happens after extraction but before AI processing.
-            neutralizationEnabled = config and config.enabled
+        """
-        except Exception as e:
+        documents = []
            logger.debug(f"Could not check neutralization config: {str(e)}")
        workflow = self.services.workflow
@ -984,87 +980,23 @@ class WorkflowManager:
                originalFileName = fileInfo.get("fileName", "unknown")
                originalMimeType = fileInfo.get("mimeType", "application/octet-stream")
                fileIdToUse = fileId
                fileNameToUse = originalFileName
                fileSizeToUse = fileInfo.get("size", 0)
                neutralizationFailed = False
-                # Neutralize file if enabled
+                # NOTE: Neutralization removed from here - it should happen in extractContent action
-                if neutralizationEnabled:
+                # after content extraction but before AI processing (for dynamic workflows)
-                    try:
+                # This ensures we neutralize extracted data (ContentPart.data), not ChatDocuments
                        # Neutralize the file using the neutralization service
                        neutralizationResult = self.services.neutralization.processFile(fileId)
                        # Check if file is binary (not neutralized)
                        if neutralizationResult.get('is_binary', False):
                            # Binary file - log INFO and use original file
                            infoMsg = f"File '{originalFileName}' (MIME type: {neutralizationResult.get('mime_type', 'unknown')}) is a binary file. Binary file neutralization will be implemented in the future. Using original file without neutralization."
                            logger.info(infoMsg)
                            self.services.chat.storeLog(workflow, {
                                "message": infoMsg,
                                "type": "info",
                                "status": "running",
                                "progress": 50
                            })
                            # Use original file (fileIdToUse already set to fileId)
                        elif neutralizationResult and 'neutralized_text' in neutralizationResult:
                            neutralizedText = neutralizationResult['neutralized_text']
                            # Create new file with neutralized content
                            neutralizedFileName = neutralizationResult.get('neutralized_file_name', f"neutralized_{originalFileName}")
                            neutralizedContentBytes = neutralizedText.encode('utf-8')
                            # Create file in component storage
                            neutralizedFileItem = self.services.interfaceDbComponent.createFile(
                                name=neutralizedFileName,
                                mimeType=originalMimeType,
                                content=neutralizedContentBytes
                            )
                            # Persist file data
                            self.services.interfaceDbComponent.createFileData(neutralizedFileItem.id, neutralizedContentBytes)
                            # Use the neutralized file ID and actual size
                            fileIdToUse = neutralizedFileItem.id
                            fileNameToUse = neutralizedFileName
                            fileSizeToUse = len(neutralizedContentBytes)
                            logger.info(f"Neutralized file {fileId} -> {fileIdToUse} ({fileNameToUse})")
                        else:
                            neutralizationFailed = True
                            errorMsg = f"Neutralization did not return neutralized_text for file '{originalFileName}' (ID: {fileId})"
                            logger.warning(errorMsg)
                            self.services.chat.storeLog(workflow, {
                                "message": errorMsg,
                                "type": "error",
                                "status": "error",
                                "progress": -1
                            })
                    except Exception as e:
                        neutralizationFailed = True
                        errorMsg = f"Failed to neutralize file '{originalFileName}' (ID: {fileId}): {str(e)}"
                        logger.error(errorMsg)
                        self.services.chat.storeLog(workflow, {
                            "message": errorMsg,
                            "type": "error",
                            "status": "error",
                            "progress": -1
                        })
-                # Only skip document if neutralization failed (not for binary files)
+                # Create document with original file ID (no neutralization)
-                if not neutralizationFailed:
+                document = ChatDocument(
-                    # Create document with file ID (neutralized or original)
+                    id=str(uuid.uuid4()),
-                    document = ChatDocument(
+                    messageId=messageId or "",
-                        id=str(uuid.uuid4()),
+                    fileId=fileId,
-                        messageId=messageId or "",
+                    fileName=originalFileName,
-                        fileId=fileIdToUse,
+                    fileSize=fileSizeToUse,
-                        fileName=fileNameToUse,
+                    mimeType=originalMimeType
-                        fileSize=fileSizeToUse,
+                )
-                        mimeType=originalMimeType
+                documents.append(document)
-                    )
+                logger.info(f"Processed file ID {fileId} -> {document.fileName}")
                    documents.append(document)
                    logger.info(f"Processed file ID {fileId} -> {document.fileName} (using fileId: {fileIdToUse})")
                else:
                    logger.warning(f"Skipping document for file ID {fileId} due to neutralization failure")
            except Exception as e:
                errorMsg = f"Error processing file ID {fileId}: {str(e)}"
                logger.error(errorMsg)