diff --git a/modules/features/neutralization/interfaceFeatureNeutralizer.py b/modules/features/neutralization/interfaceFeatureNeutralizer.py index bea7c7b3..54e3e368 100644 --- a/modules/features/neutralization/interfaceFeatureNeutralizer.py +++ b/modules/features/neutralization/interfaceFeatureNeutralizer.py @@ -83,14 +83,16 @@ class InterfaceFeatureNeutralizer: self.featureInstanceId = featureInstanceId def getNeutralizationConfig(self) -> Optional[DataNeutraliserConfig]: - """Get the data neutralization configuration for the current user's mandate""" + """Get the data neutralization configuration for the current user's mandate and instance""" try: - # Use RBAC filtering + record_filter = {"mandateId": self.mandateId} + if self.featureInstanceId: + record_filter["featureInstanceId"] = self.featureInstanceId filteredConfigs = getRecordsetWithRBAC( self.db, DataNeutraliserConfig, self.currentUser, - recordFilter={"mandateId": self.mandateId}, + recordFilter=record_filter, mandateId=self.mandateId ) @@ -130,6 +132,8 @@ class InterfaceFeatureNeutralizer: # Create new config configData["mandateId"] = self.mandateId configData["userId"] = self.userId + if self.featureInstanceId: + configData["featureInstanceId"] = self.featureInstanceId newConfig = DataNeutraliserConfig(**configData) createdRecord = self.db.recordCreate(DataNeutraliserConfig, newConfig) @@ -200,13 +204,44 @@ class InterfaceFeatureNeutralizer: DataNeutralizerAttributes, recordFilter={"mandateId": self.mandateId, "id": attributeId} ) - if attributes: - return attributes[0] - return None + if not attributes: + return None + attr = attributes[0] + return {k: v for k, v in attr.items() if not k.startswith("_")} except Exception as e: logger.error(f"Error getting attribute by ID: {str(e)}") return None + def createAttribute( + self, + attributeId: str, + originalText: str, + patternType: str, + fileId: Optional[str] = None + ) -> Optional[DataNeutralizerAttributes]: + """Create a neutralization attribute for placeholder resolution.""" + try: + mandate_id = self.mandateId or "" + feature_instance_id = self.featureInstanceId or "" + if not self.userId: + logger.warning("Cannot create attribute: missing userId") + return None + attr = DataNeutralizerAttributes( + id=attributeId, + mandateId=self.mandateId, + featureInstanceId=self.featureInstanceId, + userId=self.userId, + originalText=originalText, + fileId=fileId, + patternType=patternType, + ) + + created = self.db.recordCreate(DataNeutralizerAttributes, attr.model_dump()) + return DataNeutralizerAttributes(**{k: v for k, v in created.items() if not k.startswith("_")}) + except Exception as e: + logger.error(f"Error creating attribute: {str(e)}") + return None + def getInterface(currentUser: Optional[User] = None, mandateId: Optional[str] = None, featureInstanceId: Optional[str] = None) -> InterfaceFeatureNeutralizer: """ diff --git a/modules/features/neutralization/mainNeutralization.py b/modules/features/neutralization/mainNeutralization.py index d05f2b3f..d32b441f 100644 --- a/modules/features/neutralization/mainNeutralization.py +++ b/modules/features/neutralization/mainNeutralization.py @@ -21,17 +21,7 @@ UI_OBJECTS = [ "objectKey": "ui.feature.neutralization.playground", "label": {"en": "Playground", "de": "Spielwiese", "fr": "Bac à sable"}, "meta": {"area": "playground"} - }, - { - "objectKey": "ui.feature.neutralization.config", - "label": {"en": "Configuration", "de": "Konfiguration", "fr": "Configuration"}, - "meta": {"area": "config"} - }, - { - "objectKey": "ui.feature.neutralization.attributes", - "label": {"en": "Attributes", "de": "Attribute", "fr": "Attributs"}, - "meta": {"area": "attributes"} - }, + } ] # Resource Objects for RBAC catalog @@ -130,9 +120,106 @@ def registerFeature(catalogService) -> bool: meta=resObj.get("meta") ) + # Sync template roles to database + _syncTemplateRolesToDb() + logger.info(f"Feature '{FEATURE_CODE}' registered {len(UI_OBJECTS)} UI objects and {len(RESOURCE_OBJECTS)} resource objects") return True except Exception as e: logger.error(f"Failed to register feature '{FEATURE_CODE}': {e}") return False + + +def _syncTemplateRolesToDb() -> int: + """ + Sync template roles and their AccessRules to the database. + Creates global template roles (mandateId=None) if they don't exist. + + Returns: + Number of roles created + """ + try: + from modules.interfaces.interfaceDbApp import getRootInterface + from modules.datamodels.datamodelRbac import Role, AccessRule, AccessRuleContext + + rootInterface = getRootInterface() + + existingRoles = rootInterface.getRolesByFeatureCode(FEATURE_CODE) + templateRoles = [r for r in existingRoles if r.mandateId is None] + existingRoleLabels = {r.roleLabel: str(r.id) for r in templateRoles} + + createdCount = 0 + for roleTemplate in TEMPLATE_ROLES: + roleLabel = roleTemplate["roleLabel"] + + if roleLabel in existingRoleLabels: + roleId = existingRoleLabels[roleLabel] + _ensureAccessRulesForRole(rootInterface, roleId, roleTemplate.get("accessRules", [])) + else: + newRole = Role( + roleLabel=roleLabel, + description=roleTemplate.get("description", {}), + featureCode=FEATURE_CODE, + mandateId=None, + featureInstanceId=None, + isSystemRole=False + ) + createdRole = rootInterface.db.recordCreate(Role, newRole.model_dump()) + roleId = createdRole.get("id") + _ensureAccessRulesForRole(rootInterface, roleId, roleTemplate.get("accessRules", [])) + logger.info(f"Created template role '{roleLabel}' with ID {roleId}") + createdCount += 1 + + if createdCount > 0: + logger.info(f"Feature '{FEATURE_CODE}': Created {createdCount} template roles") + + return createdCount + + except Exception as e: + logger.error(f"Error syncing template roles for feature '{FEATURE_CODE}': {e}") + return 0 + + +def _ensureAccessRulesForRole(rootInterface, roleId: str, ruleTemplates: List[Dict[str, Any]]) -> int: + """Ensure AccessRules exist for a role based on templates.""" + from modules.datamodels.datamodelRbac import AccessRule, AccessRuleContext + + existingRules = rootInterface.getAccessRulesByRole(roleId) + existingSignatures = set() + for rule in existingRules: + sig = (rule.context.value if rule.context else None, rule.item) + existingSignatures.add(sig) + + createdCount = 0 + for template in ruleTemplates: + context = template.get("context", "UI") + item = template.get("item") + sig = (context, item) + + if sig in existingSignatures: + continue + + if context == "UI": + contextEnum = AccessRuleContext.UI + elif context == "DATA": + contextEnum = AccessRuleContext.DATA + elif context == "RESOURCE": + contextEnum = AccessRuleContext.RESOURCE + else: + contextEnum = context + + newRule = AccessRule( + roleId=roleId, + context=contextEnum, + item=item, + view=template.get("view", False), + read=template.get("read"), + create=template.get("create"), + update=template.get("update"), + delete=template.get("delete"), + ) + rootInterface.db.recordCreate(AccessRule, newRule.model_dump()) + createdCount += 1 + + return createdCount diff --git a/modules/features/neutralization/neutralizePlayground.py b/modules/features/neutralization/neutralizePlayground.py index 159faf04..660c2c39 100644 --- a/modules/features/neutralization/neutralizePlayground.py +++ b/modules/features/neutralization/neutralizePlayground.py @@ -15,14 +15,96 @@ logger = logging.getLogger(__name__) class NeutralizationPlayground: """Feature/UI wrapper around NeutralizationService for playground & routes.""" - def __init__(self, currentUser: User, mandateId: str): + def __init__(self, currentUser: User, mandateId: str, featureInstanceId: Optional[str] = None): self.currentUser = currentUser self.mandateId = mandateId - self.services = getServices(currentUser, None, mandateId=mandateId) + self.featureInstanceId = featureInstanceId + self.services = getServices(currentUser, None, mandateId=mandateId, featureInstanceId=featureInstanceId) def processText(self, text: str) -> Dict[str, Any]: return self.services.neutralization.processText(text) + async def processUploadedFileAsync(self, file_bytes: bytes, filename: str) -> Dict[str, Any]: + """Process an uploaded file (bytes + filename). Returns neutralized result for text or binary. + Saves both original and neutralized files to user files (component storage) when available.""" + import base64 + name_lower = (filename or '').lower() + mime_map = { + '.pdf': 'application/pdf', + '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + '.xlsm': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', + } + mime = next((mime_map[ext] for ext in mime_map if name_lower.endswith(ext)), 'text/plain') + binary_exts = {'.pdf', '.docx', '.xlsx', '.xlsm', '.pptx'} + is_binary = any(name_lower.endswith(ext) for ext in binary_exts) + + original_file_id = None + neutralized_file_id = None + + # Save original file to user files + if self.services.interfaceDbComponent: + try: + file_item, _ = self.services.interfaceDbComponent.saveUploadedFile(file_bytes, filename) + original_file_id = str(file_item.id) + except Exception as e: + logger.warning(f"Could not save original file to user files: {e}") + + if is_binary: + result = await self.services.neutralization.processBinaryBytesAsync(file_bytes, filename, mime) + neu_bytes = result.get('neutralized_bytes') + logger.debug(f"Binary result: neu_bytes type={type(neu_bytes).__name__}, len={len(neu_bytes) if neu_bytes is not None else 0}") + if neu_bytes is not None and len(neu_bytes) > 0: + result['neutralized_file_base64'] = base64.b64encode(neu_bytes).decode('ascii') + result['neutralized_file_name'] = result.get('neutralized_file_name', f'neutralized_{filename}') + result['mime_type'] = result.get('mime_type', mime) + # Save neutralized binary to user files + if self.services.interfaceDbComponent: + try: + neu_name = result['neutralized_file_name'] + file_item, _ = self.services.interfaceDbComponent.saveUploadedFile(neu_bytes, neu_name) + neutralized_file_id = str(file_item.id) + except Exception as e: + logger.warning(f"Could not save neutralized file to user files: {e}") + # Remove raw bytes before JSON response (avoid serialization issues; use base64 only) + result.pop('neutralized_bytes', None) + result['original_file_id'] = original_file_id + result['neutralized_file_id'] = neutralized_file_id + return result + + try: + text_content = file_bytes.decode('utf-8') + except UnicodeDecodeError: + try: + text_content = file_bytes.decode('latin-1') + except UnicodeDecodeError: + return { + 'neutralized_text': None, + 'original_file_id': original_file_id, + 'neutralized_file_id': None, + 'processed_info': {'type': 'error', 'error': 'File could not be decoded as text. Supported: UTF-8, Latin-1. For PDF/Word/Excel, use supported binary formats.'} + } + result = self.services.neutralization.processText(text_content) + result['neutralized_file_name'] = f'neutralized_{filename}' + # Save neutralized text as file to user files + if self.services.interfaceDbComponent and result.get('neutralized_text') is not None: + try: + neu_text = result['neutralized_text'] + neu_bytes = neu_text.encode('utf-8') + neu_name = result['neutralized_file_name'] + file_item, _ = self.services.interfaceDbComponent.saveUploadedFile(neu_bytes, neu_name) + neutralized_file_id = str(file_item.id) + except Exception as e: + logger.warning(f"Could not save neutralized text file to user files: {e}") + result['original_file_id'] = original_file_id + result['neutralized_file_id'] = neutralized_file_id + return result + + def processUploadedFile(self, file_bytes: bytes, filename: str) -> Dict[str, Any]: + """Sync wrapper for sync callers. Uses asyncio.run; do NOT call from async routes (use processUploadedFileAsync).""" + return asyncio.run(self.processUploadedFileAsync(file_bytes, filename)) + def processFiles(self, fileIds: List[str]) -> Dict[str, Any]: results: List[Dict[str, Any]] = [] errors: List[str] = [] @@ -273,18 +355,42 @@ class SharepointProcessor: processed: List[Dict[str, Any]] = [] errors: List[str] = [] + BINARY_EXTS = {'.pdf', '.docx', '.doc', '.xlsx', '.xlsm', '.pptx', '.ppt'} + async def _processSingle(fileInfo: Dict[str, Any]): try: fileContent = await self.services.sharepoint.downloadFile(sourceSiteInfo['id'], fileInfo['id']) if not fileContent: return {'error': f"Failed to download file: {fileInfo['name']}"} - try: - textContent = fileContent.decode('utf-8') - except UnicodeDecodeError: - textContent = fileContent.decode('latin-1') - result = self.services.neutralization.processText(textContent) + name_lower = (fileInfo.get('name') or '').lower() + is_binary = any(name_lower.endswith(ext) for ext in BINARY_EXTS) + mime_map = { + '.pdf': 'application/pdf', + '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + '.doc': 'application/msword', + '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + '.xlsm': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', + '.ppt': 'application/vnd.ms-powerpoint', + } + mime = next((mime_map[ext] for ext in BINARY_EXTS if name_lower.endswith(ext)), 'text/plain') + + if is_binary: + result = self.services.neutralization.processBinaryBytes(fileContent, fileInfo['name'], mime) + if result.get('neutralized_bytes'): + content_to_upload = result['neutralized_bytes'] + else: + return {'error': f"Failed to neutralize binary file {fileInfo['name']}: {result.get('processed_info', {}).get('error', 'Unknown error')}"} + else: + try: + textContent = fileContent.decode('utf-8') + except UnicodeDecodeError: + textContent = fileContent.decode('latin-1') + result = self.services.neutralization.processText(textContent) + content_to_upload = (result.get('neutralized_text') or '').encode('utf-8') + neutralizedFilename = f"neutralized_{fileInfo['name']}" - uploadResult = await self.services.sharepoint.uploadFile(targetSiteInfo['id'], targetFolder, neutralizedFilename, result['neutralized_text'].encode('utf-8')) + uploadResult = await self.services.sharepoint.uploadFile(targetSiteInfo['id'], targetFolder, neutralizedFilename, content_to_upload) if 'error' in uploadResult: return {'error': f"Failed to upload neutralized file: {neutralizedFilename} - {uploadResult['error']}"} return { diff --git a/modules/features/neutralization/routeFeatureNeutralizer.py b/modules/features/neutralization/routeFeatureNeutralizer.py index d1590d28..de49f50d 100644 --- a/modules/features/neutralization/routeFeatureNeutralizer.py +++ b/modules/features/neutralization/routeFeatureNeutralizer.py @@ -1,6 +1,6 @@ # Copyright (c) 2025 Patrick Motsch # All rights reserved. -from fastapi import APIRouter, HTTPException, Depends, Path, Request, status, Query, Body +from fastapi import APIRouter, HTTPException, Depends, Path, Request, status, Query, Body, File, UploadFile from typing import List, Dict, Any, Optional import logging @@ -35,13 +35,18 @@ def get_neutralization_config( ) -> DataNeutraliserConfig: """Get data neutralization configuration""" try: - service = NeutralizationPlayground(context.user, str(context.mandateId)) + mandate_id = str(context.mandateId) if context.mandateId else "" + feature_instance_id = str(context.featureInstanceId) if context.featureInstanceId else "" + service = NeutralizationPlayground( + context.user, mandate_id, featureInstanceId=feature_instance_id or None + ) config = service.getConfig() if not config: - # Return default config instead of 404 + # Return default config instead of 404 (requires mandateId and featureInstanceId for instance-scoped config) return DataNeutraliserConfig( - mandateId=context.mandateId, + mandateId=mandate_id, + featureInstanceId=feature_instance_id, userId=context.user.id, enabled=True, namesToParse="", @@ -69,7 +74,11 @@ def save_neutralization_config( ) -> DataNeutraliserConfig: """Save or update data neutralization configuration""" try: - service = NeutralizationPlayground(context.user, str(context.mandateId)) + service = NeutralizationPlayground( + context.user, + str(context.mandateId) if context.mandateId else "", + featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None + ) config = service.saveConfig(config_data) return config @@ -81,6 +90,44 @@ def save_neutralization_config( detail=f"Error saving neutralization config: {str(e)}" ) +@router.post("/neutralize-file") +@limiter.limit("20/minute") +async def neutralize_file( + request: Request, + file: UploadFile = File(..., description="File to neutralize (PDF, DOCX, XLSX, PPTX, TXT, CSV, JSON)"), + context: RequestContext = Depends(getRequestContext) +) -> Dict[str, Any]: + """Upload and neutralize a file. Returns neutralized text or base64-encoded file for download.""" + try: + if not file.filename or not file.filename.strip(): + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="File name is required" + ) + content = await file.read() + if not content: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="File is empty" + ) + service = NeutralizationPlayground( + context.user, + str(context.mandateId) if context.mandateId else "", + featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None + ) + result = await service.processUploadedFileAsync(content, file.filename or "file") + logger.info(f"Neutralize file result keys: {list(result.keys())}, has_base64={bool(result.get('neutralized_file_base64'))}, has_text={result.get('neutralized_text') is not None}") + return result + except HTTPException: + raise + except Exception as e: + logger.error(f"Error neutralizing file: {str(e)}", exc_info=True) + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Error neutralizing file: {str(e)}" + ) + + @router.post("/neutralize-text", response_model=Dict[str, Any]) @limiter.limit("20/minute") def neutralize_text( @@ -99,7 +146,11 @@ def neutralize_text( detail="Text content is required" ) - service = NeutralizationPlayground(context.user, str(context.mandateId)) + service = NeutralizationPlayground( + context.user, + str(context.mandateId) if context.mandateId else "", + featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None + ) result = service.neutralizeText(text, file_id) return result @@ -130,7 +181,11 @@ def resolve_text( detail="Text content is required" ) - service = NeutralizationPlayground(context.user, str(context.mandateId)) + service = NeutralizationPlayground( + context.user, + str(context.mandateId) if context.mandateId else "", + featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None + ) resolved_text = service.resolveText(text) return {"resolved_text": resolved_text} @@ -153,7 +208,11 @@ def get_neutralization_attributes( ) -> List[DataNeutralizerAttributes]: """Get neutralization attributes, optionally filtered by file ID""" try: - service = NeutralizationPlayground(context.user, str(context.mandateId)) + service = NeutralizationPlayground( + context.user, + str(context.mandateId) if context.mandateId else "", + featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None + ) attributes = service.getAttributes(fileId) return attributes @@ -183,7 +242,11 @@ async def process_sharepoint_files( detail="Both source and target paths are required" ) - service = NeutralizationPlayground(context.user, str(context.mandateId)) + service = NeutralizationPlayground( + context.user, + str(context.mandateId) if context.mandateId else "", + featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None + ) result = await service.processSharepointFiles(source_path, target_path) return result @@ -212,7 +275,11 @@ def batch_process_files( detail="Files data is required" ) - service = NeutralizationPlayground(context.user, str(context.mandateId)) + service = NeutralizationPlayground( + context.user, + str(context.mandateId) if context.mandateId else "", + featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None + ) result = service.batchNeutralizeFiles(files_data) return result @@ -234,7 +301,11 @@ def get_neutralization_stats( ) -> Dict[str, Any]: """Get neutralization processing statistics""" try: - service = NeutralizationPlayground(context.user, str(context.mandateId)) + service = NeutralizationPlayground( + context.user, + str(context.mandateId) if context.mandateId else "", + featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None + ) stats = service.getProcessingStats() return stats @@ -255,7 +326,11 @@ def cleanup_file_attributes( ) -> Dict[str, str]: """Clean up neutralization attributes for a specific file""" try: - service = NeutralizationPlayground(context.user, str(context.mandateId)) + service = NeutralizationPlayground( + context.user, + str(context.mandateId) if context.mandateId else "", + featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None + ) success = service.cleanupFileAttributes(fileId) if success: diff --git a/modules/features/neutralization/serviceNeutralization/mainServiceNeutralization.py b/modules/features/neutralization/serviceNeutralization/mainServiceNeutralization.py index b4b34cf7..b3d5040e 100644 --- a/modules/features/neutralization/serviceNeutralization/mainServiceNeutralization.py +++ b/modules/features/neutralization/serviceNeutralization/mainServiceNeutralization.py @@ -4,10 +4,11 @@ Data Neutralization Service Handles file processing for data neutralization including SharePoint integration DSGVO-konformer Daten-Neutralisierer für KI-Agentensysteme -Unterstützt TXT, JSON, CSV, Excel und Word-Dateien +Supports TXT, JSON, CSV, PDF, DOCX, XLSX, PPTX (extract -> neutralize -> generate) Mehrsprachig: DE, EN, FR, IT """ +import asyncio import logging import re import json @@ -21,10 +22,20 @@ from .subProcessCommon import CommonUtils, NeutralizationResult, NeutralizationA from .subProcessText import TextProcessor, PlainText from .subProcessList import ListProcessor, TableData from .subProcessBinary import BinaryProcessor +from .subProcessPdfInPlace import neutralize_pdf_in_place from .subPatterns import HeaderPatterns, DataPatterns, TextTablePatterns +from .subContentPartAdapter import content_parts_to_renderer_schema logger = logging.getLogger(__name__) +# MIME types that can be processed via extract -> neutralize -> generate +EXTRACTABLE_BINARY_MIME_TYPES = frozenset({ + "application/pdf", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", +}) + class NeutralizationService: """Service for handling data neutralization operations""" @@ -44,8 +55,8 @@ class NeutralizationService: dbApp = serviceCenter.interfaceDbApp self.interfaceNeutralizer = getNeutralizerInterface( currentUser=dbApp.currentUser, - mandateId=dbApp.mandateId, - featureInstanceId=getattr(dbApp, 'featureInstanceId', None) + mandateId=serviceCenter.mandateId or dbApp.mandateId, + featureInstanceId=getattr(serviceCenter, 'featureInstanceId', None) or getattr(dbApp, 'featureInstanceId', None) ) # Initialize anonymization processors @@ -71,47 +82,61 @@ class NeutralizationService: def processText(self, text: str) -> Dict[str, Any]: """Neutralize a raw text string and return a standard result dict.""" - return self._neutralizeText(text, 'text') + result = self._neutralizeText(text, 'text') + self._persistAttributes(result.get('mapping', {}), None) + return result def processFile(self, fileId: str) -> Dict[str, Any]: """Neutralize a file referenced by its fileId using component interface. - Binary files are not neutralized but will be indicated in the result.""" + Supports text files directly; PDF/DOCX/XLSX/PPTX via extract -> neutralize -> generate.""" if not self.interfaceDbComponent: raise ValueError("Component interface is required to process a file by fileId") - # Fetch file data and metadata fileInfo = None try: - # getFile returns an object; fallback to dict-like fileInfo = self.interfaceDbComponent.getFile(fileId) except Exception: fileInfo = None fileName = getattr(fileInfo, 'fileName', None) if fileInfo else None mimeType = getattr(fileInfo, 'mimeType', None) if fileInfo else None - - # Check if file is binary and cannot be neutralized + + fileData = self.interfaceDbComponent.getFileData(fileId) + if not fileData: + raise ValueError(f"No file data found for fileId: {fileId}") + + mime_lower = (mimeType or '').lower() + + # Binary but extractable: PDF, DOCX, XLSX, PPTX + if mime_lower in EXTRACTABLE_BINARY_MIME_TYPES: + try: + result = asyncio.run(self._processBinaryFile(fileData, fileName or "document", mime_lower, fileId)) + if result: + result['file_id'] = fileId + result['neutralized_file_name'] = f"neutralized_{fileName}" if fileName else "neutralized_document" + return result + except Exception as e: + logger.error(f"Binary file neutralization failed: {str(e)}") + return { + 'file_id': fileId, + 'is_binary': True, + 'mime_type': mimeType or 'unknown', + 'file_name': fileName or 'unknown', + 'neutralized_text': None, + 'processed_info': {'type': 'binary', 'status': 'error', 'error': str(e)} + } + + # Binary but not extractable if self._isBinaryMimeType(mimeType or ''): - # Return a result indicating binary file (not neutralized) return { 'file_id': fileId, 'is_binary': True, 'mime_type': mimeType or 'unknown', 'file_name': fileName or 'unknown', 'neutralized_text': None, - 'processed_info': { - 'type': 'binary', - 'status': 'skipped', - 'message': 'Binary file neutralization will be implemented in the future' - } + 'processed_info': {'type': 'binary', 'status': 'skipped', 'message': 'File type not supported for neutralization'} } - - fileData = self.interfaceDbComponent.getFileData(fileId) - if not fileData: - raise ValueError(f"No file data found for fileId: {fileId}") - # Determine textType from mime + # Text-based file textType = self._getContentTypeFromMime(mimeType or '') - - # Decode to text try: textContent = fileData.decode('utf-8') except UnicodeDecodeError: @@ -123,17 +148,59 @@ class NeutralizationService: except UnicodeDecodeError: continue if decoded is None: - raise ValueError("Unable to decode file content as text. This may indicate a binary file that cannot be neutralized.") + raise ValueError("Unable to decode file content as text.") textContent = decoded result = self._neutralizeText(textContent, textType) - # Add a reasonable output filename if original known + self._persistAttributes(result.get('mapping', {}), fileId) if fileName: result['neutralized_file_name'] = f"neutralized_{fileName}" result['file_id'] = fileId result['is_binary'] = False return result + def processBinaryBytes(self, fileBytes: bytes, fileName: str, mimeType: str) -> Dict[str, Any]: + """Neutralize binary file bytes (sync - use from sync callers). Uses asyncio.run when event loop not running.""" + mime_lower = (mimeType or '').lower() + if mime_lower not in EXTRACTABLE_BINARY_MIME_TYPES: + return { + 'neutralized_text': None, + 'neutralized_bytes': None, + 'is_binary': True, + 'processed_info': {'type': 'binary', 'status': 'skipped', 'message': 'File type not supported'} + } + try: + return asyncio.run(self._processBinaryFile(fileBytes, fileName, mime_lower, None)) + except Exception as e: + logger.error(f"Binary neutralization failed: {str(e)}") + return { + 'neutralized_text': None, + 'neutralized_bytes': None, + 'is_binary': True, + 'processed_info': {'type': 'binary', 'status': 'error', 'error': str(e)} + } + + async def processBinaryBytesAsync(self, fileBytes: bytes, fileName: str, mimeType: str) -> Dict[str, Any]: + """Neutralize binary file bytes (async - use from async routes to avoid event loop conflict).""" + mime_lower = (mimeType or '').lower() + if mime_lower not in EXTRACTABLE_BINARY_MIME_TYPES: + return { + 'neutralized_text': None, + 'neutralized_bytes': None, + 'is_binary': True, + 'processed_info': {'type': 'binary', 'status': 'skipped', 'message': 'File type not supported'} + } + try: + return await self._processBinaryFile(fileBytes, fileName, mime_lower, None) + except Exception as e: + logger.error(f"Binary neutralization failed: {str(e)}") + return { + 'neutralized_text': None, + 'neutralized_bytes': None, + 'is_binary': True, + 'processed_info': {'type': 'binary', 'status': 'error', 'error': str(e)} + } + def resolveText(self, text: str) -> str: if not self.interfaceNeutralizer: return text @@ -167,6 +234,195 @@ class NeutralizationService: return False return self.interfaceNeutralizer.deleteNeutralizationAttributes(fileId) + def _persistAttributes(self, mapping: Dict[str, str], fileId: Optional[str]) -> None: + """Persist mapping to DB for resolve to work. mapping: originalText -> placeholder e.g. '[email.uuid]'""" + if not self.interfaceNeutralizer or not mapping: + return + import re + placeholder_re = re.compile(r'^\[([a-z]+)\.([a-f0-9-]{36})\]$') + for original_text, placeholder in mapping.items(): + m = placeholder_re.match(placeholder) + if m: + pattern_type, uid = m.group(1), m.group(2) + try: + self.interfaceNeutralizer.createAttribute( + attributeId=uid, + originalText=original_text, + patternType=pattern_type, + fileId=fileId + ) + except Exception as e: + logger.debug(f"Could not persist attribute {uid}: {e}") + + async def _processBinaryFile( + self, + fileBytes: bytes, + fileName: str, + mimeType: str, + fileId: Optional[str] + ) -> Dict[str, Any]: + """Extract -> neutralize -> adapt -> generate for PDF/DOCX/XLSX/PPTX.""" + from modules.services.serviceExtraction.mainServiceExtraction import ExtractionService + from modules.services.serviceExtraction.subPipeline import runExtraction + from modules.datamodels.datamodelExtraction import ExtractionOptions, MergeStrategy + + # Ensure registries exist + if ExtractionService._sharedExtractorRegistry is None: + ExtractionService(self.services) + registry = ExtractionService._sharedExtractorRegistry + chunker = ExtractionService._sharedChunkerRegistry + opts = ExtractionOptions(prompt="neutralize", mergeStrategy=MergeStrategy(preserveChunks=True)) + + # 1. Extract + extracted = runExtraction(registry, chunker, fileBytes, fileName, mimeType, opts) + parts = extracted.parts if hasattr(extracted, 'parts') else [] + + if not parts: + return { + 'neutralized_text': None, + 'neutralized_bytes': None, + 'is_binary': True, + 'processed_info': {'type': 'binary', 'status': 'error', 'error': 'No content extracted'} + } + + # 2. Neutralize each text/table part + all_mapping: Dict[str, str] = {} + neutralized_parts: List[Any] = [] + neutralization_error: Optional[str] = None + for part in parts: + p = part if isinstance(part, dict) else part.model_dump() if hasattr(part, 'model_dump') else part + type_group = p.get('typeGroup', '') + data = p.get('data', '') + if type_group in ('binary', 'image') or not (data and str(data).strip()): + neutralized_parts.append(part) + continue + nr = self._neutralizeText(str(data), 'text' if type_group != 'table' else 'csv') + proc = nr.get('processed_info', {}) or {} + if isinstance(proc, dict) and proc.get('type') == 'error': + neutralization_error = proc.get('error', 'Neutralization failed') + neu_text = nr.get('neutralized_text', str(data)) + mapping = nr.get('mapping', {}) + all_mapping.update(mapping) + new_part = {**p, 'data': neu_text} + neutralized_parts.append(new_part) + self._persistAttributes(all_mapping, fileId) + + # 3. PDF: Use in-place only; no fallback to render + if mimeType == "application/pdf": + if neutralization_error: + logger.error(f"PDF neutralization aborted: {neutralization_error}") + return { + 'neutralized_text': None, + 'neutralized_bytes': None, + 'is_binary': True, + 'processed_info': {'type': 'binary', 'status': 'error', 'error': neutralization_error} + } + in_place_bytes = neutralize_pdf_in_place(fileBytes, all_mapping) + if in_place_bytes is not None: + logger.info("PDF neutralization completed via in-place redaction (layout preserved)") + return { + 'neutralized_text': None, + 'neutralized_bytes': in_place_bytes, + 'neutralized_file_name': f"neutralized_{fileName}", + 'is_binary': True, + 'mime_type': 'application/pdf', + 'attributes': [{'original': k, 'placeholder': v} for k, v in all_mapping.items()], + 'processed_info': {'type': 'binary', 'status': 'success', 'format': 'pdf', 'method': 'in-place'} + } + logger.error("PDF in-place neutralization failed") + return { + 'neutralized_text': None, + 'neutralized_bytes': None, + 'is_binary': True, + 'processed_info': {'type': 'binary', 'status': 'error', 'error': 'PDF in-place neutralization failed'} + } + + # 4. Adapter: ContentPart list -> renderer schema (non-PDF only) + schema = content_parts_to_renderer_schema(neutralized_parts, title=fileName or "Neutralized") + + # 5. Render to format + renderer, output_mime = self._getRendererForMime(mimeType) + if not renderer: + return { + 'neutralized_text': None, + 'neutralized_bytes': None, + 'is_binary': True, + 'processed_info': {'type': 'binary', 'status': 'error', 'error': f'No renderer for {mimeType}'} + } + + try: + logger.info(f"Calling renderer.render for mime={mimeType}, renderer={type(renderer).__name__}") + rendered = await renderer.render(schema, fileName or "document", None, None) + logger.info(f"Renderer returned: type={type(rendered).__name__}, len={len(rendered) if rendered else 0}") + if not rendered or len(rendered) == 0: + logger.error("Renderer returned empty list") + return { + 'neutralized_text': None, + 'neutralized_bytes': None, + 'is_binary': True, + 'processed_info': {'type': 'binary', 'status': 'error', 'error': 'Render produced no output'} + } + doc = rendered[0] + logger.info(f"First doc: type={type(doc).__name__}, isinstance(dict)={isinstance(doc, dict)}, has documentData attr={hasattr(doc, 'documentData')}") + # Extract documentData: Pydantic v2 models may need model_dump() for reliable access + if isinstance(doc, dict): + doc_data = doc.get('documentData') + elif hasattr(doc, 'model_dump'): + d = doc.model_dump(mode='python') + doc_data = d.get('documentData') + else: + doc_data = getattr(doc, 'documentData', None) + logger.info(f"doc_data: type={type(doc_data).__name__ if doc_data is not None else 'None'}, len={len(doc_data) if doc_data else 0}") + if doc_data is None: + logger.error("Renderer returned document with no documentData") + return { + 'neutralized_text': None, + 'neutralized_bytes': None, + 'is_binary': True, + 'processed_info': {'type': 'binary', 'status': 'error', 'error': 'Renderer returned no data'} + } + if isinstance(doc_data, str): + doc_data = doc_data.encode('utf-8') + return { + 'neutralized_text': None, + 'neutralized_bytes': doc_data, + 'neutralized_file_name': f"neutralized_{fileName}", + 'is_binary': True, + 'mime_type': output_mime, + 'attributes': [{'original': k, 'placeholder': v} for k, v in all_mapping.items()], + 'processed_info': {'type': 'binary', 'status': 'success', 'format': mimeType} + } + except Exception as e: + logger.error(f"Render failed for {mimeType}: {str(e)}", exc_info=True) + raise + + return { + 'neutralized_text': None, + 'neutralized_bytes': None, + 'is_binary': True, + 'processed_info': {'type': 'binary', 'status': 'error', 'error': 'Render produced no output'} + } + + def _getRendererForMime(self, mimeType: str): + """Get renderer instance and output mime for the given input MIME type.""" + from modules.services.serviceGeneration.renderers.rendererPdf import RendererPdf + from modules.services.serviceGeneration.renderers.rendererDocx import RendererDocx + from modules.services.serviceGeneration.renderers.rendererXlsx import RendererXlsx + from modules.services.serviceGeneration.renderers.rendererPptx import RendererPptx + + mime_map = { + "application/pdf": (RendererPdf, "application/pdf"), + "application/vnd.openxmlformats-officedocument.wordprocessingml.document": (RendererDocx, "application/vnd.openxmlformats-officedocument.wordprocessingml.document"), + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": (RendererXlsx, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"), + "application/vnd.openxmlformats-officedocument.presentationml.presentation": (RendererPptx, "application/vnd.openxmlformats-officedocument.presentationml.presentation"), + } + pair = mime_map.get(mimeType) + if not pair: + return None, None + cls, out_mime = pair + renderer = cls(self.services) + return renderer, out_mime + def _reloadNamesFromConfig(self) -> None: """Reload names from config and update processors""" try: diff --git a/modules/features/neutralization/serviceNeutralization/subContentPartAdapter.py b/modules/features/neutralization/serviceNeutralization/subContentPartAdapter.py new file mode 100644 index 00000000..b7de66ca --- /dev/null +++ b/modules/features/neutralization/serviceNeutralization/subContentPartAdapter.py @@ -0,0 +1,115 @@ +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +""" +Adapter to convert ContentPart list (from extraction) to renderer JSON schema. +Schema: { metadata: {...}, documents: [{ sections: [{ content_type, elements: [...] }] }] } +""" + +import csv +import io +from typing import Dict, List, Any + +from modules.datamodels.datamodelExtraction import ContentPart + + +def content_parts_to_renderer_schema(parts: List[ContentPart], title: str = "Neutralized Document") -> Dict[str, Any]: + """ + Convert ContentPart list to the standardized renderer schema. + + Args: + parts: List of ContentPart from extraction + title: Document title for metadata + + Returns: + Dict with metadata, documents[0].sections structure for renderers + """ + sections: List[Dict[str, Any]] = [] + + for part in parts: + if not hasattr(part, 'typeGroup') or not hasattr(part, 'data'): + part_dict = part if isinstance(part, dict) else part.model_dump() + type_group = part_dict.get("typeGroup", "text") + data = part_dict.get("data", "") + label = part_dict.get("label", "") + else: + type_group = part.typeGroup + data = part.data or "" + label = part.label or "" + + # Skip binary/image parts without text - they can't be neutralized meaningfully + if type_group in ("binary", "image"): + continue + + # Skip empty data + if not (data and str(data).strip()): + continue + + section = _part_to_section(type_group, data, label) + if section: + sections.append(section) + + # Ensure at least one section (renderers require it) + if not sections: + sections = [{ + "content_type": "paragraph", + "elements": [{"type": "paragraph", "content": {"text": ""}}] + }] + + return { + "metadata": {"title": title}, + "documents": [{ + "sections": sections + }] + } + + +def _part_to_section(type_group: str, data: str, label: str) -> Dict[str, Any]: + """Convert a single ContentPart to a section dict.""" + data_str = str(data).strip() + + if type_group == "table" and ("csv" in label.lower() or "," in data_str or "\t" in data_str): + # Parse CSV/TSV into table structure + try: + rows = list(csv.reader(io.StringIO(data_str))) + if rows: + headers = rows[0] + rows_data = rows[1:] + return { + "content_type": "table", + "elements": [{ + "type": "table", + "content": {"headers": headers, "rows": rows_data} + }] + } + except Exception: + pass + # Fallback: treat as paragraph + return { + "content_type": "paragraph", + "elements": [{ + "type": "extracted_text", + "content": data_str, + "source": label + }] + } + + if type_group == "structure": + # PPTX slide content - often markdown-like + return { + "content_type": "paragraph", + "elements": [{ + "type": "extracted_text", + "content": data_str, + "source": label + }] + } + + # Default: text/paragraph + return { + "content_type": "paragraph", + "elements": [{ + "type": "extracted_text", + "content": data_str, + "source": label + }] + } diff --git a/modules/features/neutralization/serviceNeutralization/subParseString.py b/modules/features/neutralization/serviceNeutralization/subParseString.py index 1a1b54ad..d80e1a04 100644 --- a/modules/features/neutralization/serviceNeutralization/subParseString.py +++ b/modules/features/neutralization/serviceNeutralization/subParseString.py @@ -10,6 +10,18 @@ import uuid from typing import Dict, List, Tuple, Any from .subPatterns import DataPatterns, findPatternsInText +# Phrases or words that must never be neutralized (labels, Anrede, etc.) +_NEUTRALIZATION_BLACKLIST = frozenset({ + "Für Sie", "Ihre Ansprechperson", "AXA 24", "General Agent", + "Your Contact", "Contact Person", "Bei Fragen", "Mit Freundlichen", + "Frau", "Herr", # Anrede + "Reise", "Reisebeginn", "Reiseende", "Vertragsbeginn", "Zahlbar", + "Versicherte", "Versicherungsnehmer", "Versicherung", "Insurance", + "Leistungen", "Basis", "Benefits", # Section labels + "Start", "Beginn", "Ende", "End", "trip", # Contract labels (Start of trip, End of trip, etc.) +}) + + class StringParser: """Handles string parsing and replacement operations""" @@ -48,7 +60,17 @@ class StringParser: """ patternMatches = findPatternsInText(text, self.data_patterns) - # Process pattern matches from right to left to avoid position shifts + # Exclude matches that are fully contained in a longer match (e.g. skip "2026" inside "17.02.2026") + def is_contained(m, all_matches): + for other in all_matches: + if other is m: + continue + if other[2] <= m[2] and m[3] <= other[3] and (other[3] - other[2]) > (m[3] - m[2]): + return True + return False + patternMatches = [m for m in patternMatches if not is_contained(m, patternMatches)] + + # Process from right to left to avoid position shifts for patternName, matchedText, start, end in reversed(patternMatches): # Skip if already a placeholder if self._isPlaceholder(matchedText): @@ -58,15 +80,27 @@ class StringParser: if '[' in matchedText or ']' in matchedText: continue + # Skip blacklisted text (labels, Anrede, etc.) – never neutralize + if matchedText.strip() in _NEUTRALIZATION_BLACKLIST: + continue + # Skip if match contains any blacklisted word (e.g. "2026 Reise" or "2026 Reisebeginn" from address pattern) + if any(w in _NEUTRALIZATION_BLACKLIST for w in matchedText.split()): + continue + if matchedText not in self.mapping: # Generate a UUID for the placeholder placeholderId = str(uuid.uuid4()) # Create placeholder in format [type.uuid] typeMapping = { 'email': 'email', - 'phone': 'phone', + 'phone': 'phone', 'address': 'address', - 'id': 'id' + 'date': 'date', + 'policy': 'policy', + 'name': 'name', + 'id': 'id', + 'iban': 'iban', + 'ssn': 'ssn', } placeholderType = typeMapping.get(patternName, 'data') self.mapping[matchedText] = f"[{placeholderType}.{placeholderId}]" diff --git a/modules/features/neutralization/serviceNeutralization/subPatterns.py b/modules/features/neutralization/serviceNeutralization/subPatterns.py index f8408be9..d5a5d570 100644 --- a/modules/features/neutralization/serviceNeutralization/subPatterns.py +++ b/modules/features/neutralization/serviceNeutralization/subPatterns.py @@ -234,6 +234,29 @@ class HeaderPatterns: class DataPatterns: """Patterns for identifying sensitive data in content""" patterns = [ + # Name patterns (before email so "name@domain" is not matched as name) + Pattern( + name="name", + patterns=[ + # Contact person context (fixed-width lookbehind for Python re) + r'(?<=Ansprechperson: )[A-Za-zäöüßÄÖÜ][a-zäöüß]+\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]+', + r'(?<=Leiter: )[A-Za-zäöüßÄÖÜ][a-zäöüß]+\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]+', + r'(?<=Kontaktperson: )[A-Za-zäöüßÄÖÜ][a-zäöüß]+\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]+', + # Name only after Anrede (keep Frau/Herr; replace only the name) – fixed-width lookbehind + r'(?<=Frau )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+', + r'(?<=Herr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+', + r'(?<=Mr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+', + r'(?<=Mr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+', + r'(?<=Mrs )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+', + r'(?<=Mrs\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+', + r'(?<=Ms )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+', + r'(?<=Ms\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+', + r'(?<=Dr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+', + r'(?<=Dr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+', + ], + replacement_template="[NAME_{}]" + ), + # Email pattern for plain text Pattern( name="email", @@ -276,12 +299,18 @@ class DataPatterns: replacement_template="[IBAN_{}]" ), - # Address patterns + # Address patterns (compound first so full footer = one UUID) Pattern( name="address", patterns=[ - r'\b(?:[A-Za-zäöüßÄÖÜ]+(?:strasse|str\.|gasse|weg|platz|allee|boulevard|avenue|via|strada|rue|chemin|route))\s+\d{1,4}(?:[a-z])?\b', - r'\b\d{4}\s+[A-Za-zäöüßÄÖÜ]+\b' + # Full address block: company, street, postfach, postal+city (stop before domain like , AXA.ch) + r'\b[^,\n]+(?:,\s*[^,\n]+)*,\s*\d{4}\s+[A-Za-zäöüßÄÖÜ]+\s*(?=,\s*[a-zA-Z0-9.-]+\.(?:ch|com|org|net)\b|$)', + # Street + house number (standalone) + r'\b(?:[A-Za-zäöüßÄÖÜ]+(?:-[A-Za-zäöüßÄÖÜ]+)*(?:strasse|str\.|gasse|weg|platz|allee|boulevard|avenue|via|strada|rue|chemin|route))\s+\d{1,4}(?:/\d{1,4})?(?:[a-z])?\b', + # Postfach / PO Box (standalone) + r'\b(?:Postfach|Postbox|P\.?O\.?\s*Box|Case\s+postale|Casella\s+postale|Boîte\s+postale)\s+\d{1,6}\b', + # Postal code + city (standalone) + r'\b\d{4}\s+[A-Za-zäöüßÄÖÜ]+\b(?!\s*:)' ], replacement_template="[ADDRESS_{}]" ), @@ -290,25 +319,58 @@ class DataPatterns: Pattern( name="date", patterns=[ - # Specific date formats with context - r'\b(?:geboren|birth|né|nato)\s+am\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b', # Birth dates - r'\b(?:geboren|birth|né|nato)\s+am\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b', # Birth dates - r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b', # Contract dates - r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b', # Contract dates - # Specific date formats with month names - r'\b(?:geboren|birth|né|nato)\s+am\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|januar|februar|märz|april|mai|juni|juli|august|september|oktober|november|dezember)[a-z]*\s+\d{4}\b', # Birth dates with month - r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|januar|februar|märz|april|mai|juni|juli|august|september|oktober|november|dezember)[a-z]*\s+\d{4}\b' # Contract dates with month + # Standalone date values – require valid day (1–31) and month (1–12) to avoid decimals (e.g. 53.37 CHF) + r'\b(0?[1-9]|[12]\d|3[01])[./-](0?[1-9]|1[0-2])[./-]\d{2,4}\b', # 17.02.2026, 29-03-2026 + r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\.[\s]*\d{2,4}\b', # 17.02. 2026 (split across lines) + r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\.(?!\d)\b', # 17.02., 29.03. + r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\b(?!\.?\d)(?!/\d)', # 17.02, 29.03; exclude ratings (4.7/5) + # Context-specific date formats + r'\b(?:geboren|birth|né|nato)\s+am\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b', + r'\b(?:geboren|birth|né|nato)\s+am\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b', + r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b', + r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b', + r'\b(?:geboren|birth|né|nato)\s+am\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|januar|februar|märz|april|mai|juni|juli|august|september|oktober|november|dezember)[a-z]*\s+\d{4}\b', + r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|januar|februar|märz|april|mai|juni|juli|august|september|oktober|november|dezember)[a-z]*\s+\d{4}\b' ], replacement_template="[DATE_{}]" ), + # Policy number patterns (replaces only the number, keeps labels like "Police Nr.") + Pattern( + name="policy", + patterns=[ + # Number after "Police Nr." etc. (fixed-width lookbehind – Python re requirement) + r'(?<=Police Nr\. )[\d.]+', + r'(?<=Police Nr\. )[\d.]+', + r'(?<=Police Nr\.: )[\d.]+', + r'(?<=Police Nr )[\d.]+', + r'(?<=Police Nr: )[\d.]+', + r'(?<=Polizzenr\. )[\d.]+', + r'(?<=Polizzenummer: )[\d.]+', + r'(?<=Polizzenummer )[\d.]+', + r'(?<=Policy No\. )[\d.]+', + r'(?<=Policy No )[\d.]+', + r'(?<=Policy Number: )[\d.]+', + r'(?<=Policy Number )[\d.]+', + r'(?<=Polizza n° )[\d.]+', + r'(?<=Numéro de police: )[\d.]+', + r'(?<=Numéro de police )[\d.]+', + r'(?<=Numero polizza: )[\d.]+', + r'(?<=Numero polizza )[\d.]+', + # Standalone policy number format (e.g. 11.559.499) – require 2+ digit prefix to avoid amounts + r'\b\d{2,4}(?:\.\d{3}){2,}\b' + ], + replacement_template="[POLICY_{}]" + ), + # SSN patterns Pattern( name="ssn", patterns=[ - r'\b(?:756|757|758|759)\.\d{4}\.\d{4}\.\d{2}\b', # Swiss AHV + r'\b(?:756|757|758|759)\.\d{4}\.\d{4}\.\d{2}\b(?!,)', # Swiss AHV - exclude before decimal r'\b(?:CHE|DE|FR|IT)-\d{3}\.\d{3}\.\d{3}\b', # Company IDs - r'\b\d{3}\.\d{3}\.\d{3}\b' # Generic SSN format + # Generic SSN format - exclude when followed by comma+digit (European decimal) + r'\b\d{3}\.\d{3}\.\d{3}\b(?!,\d)' ], replacement_template="[SSN_{}]" ) diff --git a/modules/features/neutralization/serviceNeutralization/subProcessPdfInPlace.py b/modules/features/neutralization/serviceNeutralization/subProcessPdfInPlace.py new file mode 100644 index 00000000..be0d5207 --- /dev/null +++ b/modules/features/neutralization/serviceNeutralization/subProcessPdfInPlace.py @@ -0,0 +1,110 @@ +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +""" +PDF in-place neutralization using PyMuPDF. +Removes original text completely and inserts full UUID placeholders. +PyMuPDF uses insert_textbox which wraps long placeholders to preserve layout. +""" + +import io +import logging +from typing import Dict, Optional + +logger = logging.getLogger(__name__) + + +def neutralize_pdf_in_place( + pdf_bytes: bytes, + mapping: Dict[str, str], +) -> Optional[bytes]: + """ + Remove sensitive text and replace with UUID placeholders in-place. + Content is fully removed (not just covered) so it cannot be copied. + + Args: + pdf_bytes: Original PDF file content + mapping: Dict of original_text -> placeholder (e.g. [address.uuid]) + + Returns: + Modified PDF bytes, or None on failure + """ + if not mapping: + return pdf_bytes + + try: + import fitz # PyMuPDF + except ImportError: + logger.warning("PyMuPDF (fitz) not available for PDF in-place neutralization") + return None + + try: + doc = fitz.open(stream=pdf_bytes, filetype="pdf") + except Exception as e: + logger.error(f"Failed to open PDF: {e}") + return None + + sorted_items = sorted(mapping.items(), key=lambda x: -len(x[0])) + fill_color = (1, 1, 1) + text_color = (0, 0, 0) + fontname = "helv" + fontsize = 8 + + try: + for page_num in range(len(doc)): + page = doc[page_num] + + for original_text, placeholder in sorted_items: + if not original_text or not placeholder: + continue + + search_text = original_text + insert_text = placeholder + if placeholder.startswith("[policy."): + # Try label+number to get wider rect; insert UUID only (label+UUID would overflow) + for prefix in ("Police Nr. ", "Police Nr.: ", "Polizzenr. ", "Policy no. ", "Policy No. "): + candidate = prefix + original_text + try: + hits = page.search_for(candidate, quads=False) + if hits: + search_text = candidate + insert_text = placeholder # UUID only so it fits in rect + break + except Exception: + continue + + try: + instances = page.search_for(search_text, quads=False) + except Exception: + instances = [] + + for rect in instances: + try: + fs = 5 if placeholder.startswith(("[policy.", "[address.")) else fontsize + page.add_redact_annot( + rect, + text=insert_text, + fill=fill_color, + text_color=text_color, + fontname=fontname, + fontsize=fs, + ) + except Exception as e: + logger.warning(f"Redact failed for {original_text[:40]!r}: {e}") + + try: + page.apply_redactions() + except Exception as e: + logger.debug(f"apply_redactions page {page_num + 1}: {e}") + + buf = io.BytesIO() + doc.save(buf, garbage=4, deflate=True) + doc.close() + return buf.getvalue() + + except Exception as e: + logger.error(f"PDF in-place neutralization failed: {e}", exc_info=True) + try: + doc.close() + except Exception: + pass + return None diff --git a/modules/routes/routeDataFiles.py b/modules/routes/routeDataFiles.py index 49d7e365..e8fceaff 100644 --- a/modules/routes/routeDataFiles.py +++ b/modules/routes/routeDataFiles.py @@ -7,7 +7,7 @@ import logging import json # Import auth module -from modules.auth import limiter, getCurrentUser +from modules.auth import limiter, getCurrentUser, getRequestContext, RequestContext # Import interfaces import modules.interfaces.interfaceDbManagement as interfaceDbManagement @@ -40,7 +40,8 @@ router = APIRouter( def get_files( request: Request, pagination: Optional[str] = Query(None, description="JSON-encoded PaginationParams object"), - currentUser: User = Depends(getCurrentUser) + currentUser: User = Depends(getCurrentUser), + context: RequestContext = Depends(getRequestContext) ) -> PaginatedResponse[FileItem]: """ Get files with optional pagination, sorting, and filtering. @@ -69,7 +70,11 @@ def get_files( detail=f"Invalid pagination parameter: {str(e)}" ) - managementInterface = interfaceDbManagement.getInterface(currentUser) + managementInterface = interfaceDbManagement.getInterface( + currentUser, + mandateId=str(context.mandateId) if context.mandateId else None, + featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None + ) result = managementInterface.getAllFiles(pagination=paginationParams) # If pagination was requested, result is PaginatedResult @@ -330,11 +335,16 @@ def get_file_stats( def download_file( request: Request, fileId: str = Path(..., description="ID of the file to download"), - currentUser: User = Depends(getCurrentUser) + currentUser: User = Depends(getCurrentUser), + context: RequestContext = Depends(getRequestContext) ) -> Response: - """Download a file""" + """Download a file. Uses mandate/instance context when present (e.g. from feature pages).""" try: - managementInterface = interfaceDbManagement.getInterface(currentUser) + managementInterface = interfaceDbManagement.getInterface( + currentUser, + mandateId=str(context.mandateId) if context.mandateId else None, + featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None + ) # Get file data fileData = managementInterface.getFile(fileId) @@ -378,11 +388,16 @@ def download_file( def preview_file( request: Request, fileId: str = Path(..., description="ID of the file to preview"), - currentUser: User = Depends(getCurrentUser) + currentUser: User = Depends(getCurrentUser), + context: RequestContext = Depends(getRequestContext) ) -> FilePreview: - """Preview a file's content""" + """Preview a file's content. Uses mandate/instance context when present.""" try: - managementInterface = interfaceDbManagement.getInterface(currentUser) + managementInterface = interfaceDbManagement.getInterface( + currentUser, + mandateId=str(context.mandateId) if context.mandateId else None, + featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None + ) # Get file preview using the correct method preview = managementInterface.getFileContent(fileId) diff --git a/modules/routes/routeSystem.py b/modules/routes/routeSystem.py index e3b001b1..5be43820 100644 --- a/modules/routes/routeSystem.py +++ b/modules/routes/routeSystem.py @@ -114,6 +114,9 @@ def _getFeatureUiObjects(featureCode: str) -> List[Dict[str, Any]]: elif featureCode == "teamsbot": from modules.features.teamsbot.mainTeamsbot import UI_OBJECTS return UI_OBJECTS + elif featureCode == "neutralization": + from modules.features.neutralization.mainNeutralization import UI_OBJECTS + return UI_OBJECTS else: logger.warning(f"Unknown feature code: {featureCode}") return []