neutralize pdf files and improve pattern recognition

This commit is contained in:
Ida Dittrich 2026-02-23 11:18:28 +01:00
parent 7163397fd3
commit 5120fbc503
11 changed files with 984 additions and 86 deletions

View file

@ -83,14 +83,16 @@ class InterfaceFeatureNeutralizer:
self.featureInstanceId = featureInstanceId
def getNeutralizationConfig(self) -> Optional[DataNeutraliserConfig]:
"""Get the data neutralization configuration for the current user's mandate"""
"""Get the data neutralization configuration for the current user's mandate and instance"""
try:
# Use RBAC filtering
record_filter = {"mandateId": self.mandateId}
if self.featureInstanceId:
record_filter["featureInstanceId"] = self.featureInstanceId
filteredConfigs = getRecordsetWithRBAC(
self.db,
DataNeutraliserConfig,
self.currentUser,
recordFilter={"mandateId": self.mandateId},
recordFilter=record_filter,
mandateId=self.mandateId
)
@ -130,6 +132,8 @@ class InterfaceFeatureNeutralizer:
# Create new config
configData["mandateId"] = self.mandateId
configData["userId"] = self.userId
if self.featureInstanceId:
configData["featureInstanceId"] = self.featureInstanceId
newConfig = DataNeutraliserConfig(**configData)
createdRecord = self.db.recordCreate(DataNeutraliserConfig, newConfig)
@ -200,13 +204,44 @@ class InterfaceFeatureNeutralizer:
DataNeutralizerAttributes,
recordFilter={"mandateId": self.mandateId, "id": attributeId}
)
if attributes:
return attributes[0]
return None
if not attributes:
return None
attr = attributes[0]
return {k: v for k, v in attr.items() if not k.startswith("_")}
except Exception as e:
logger.error(f"Error getting attribute by ID: {str(e)}")
return None
def createAttribute(
self,
attributeId: str,
originalText: str,
patternType: str,
fileId: Optional[str] = None
) -> Optional[DataNeutralizerAttributes]:
"""Create a neutralization attribute for placeholder resolution."""
try:
mandate_id = self.mandateId or ""
feature_instance_id = self.featureInstanceId or ""
if not self.userId:
logger.warning("Cannot create attribute: missing userId")
return None
attr = DataNeutralizerAttributes(
id=attributeId,
mandateId=self.mandateId,
featureInstanceId=self.featureInstanceId,
userId=self.userId,
originalText=originalText,
fileId=fileId,
patternType=patternType,
)
created = self.db.recordCreate(DataNeutralizerAttributes, attr.model_dump())
return DataNeutralizerAttributes(**{k: v for k, v in created.items() if not k.startswith("_")})
except Exception as e:
logger.error(f"Error creating attribute: {str(e)}")
return None
def getInterface(currentUser: Optional[User] = None, mandateId: Optional[str] = None, featureInstanceId: Optional[str] = None) -> InterfaceFeatureNeutralizer:
"""

View file

@ -21,17 +21,7 @@ UI_OBJECTS = [
"objectKey": "ui.feature.neutralization.playground",
"label": {"en": "Playground", "de": "Spielwiese", "fr": "Bac à sable"},
"meta": {"area": "playground"}
},
{
"objectKey": "ui.feature.neutralization.config",
"label": {"en": "Configuration", "de": "Konfiguration", "fr": "Configuration"},
"meta": {"area": "config"}
},
{
"objectKey": "ui.feature.neutralization.attributes",
"label": {"en": "Attributes", "de": "Attribute", "fr": "Attributs"},
"meta": {"area": "attributes"}
},
}
]
# Resource Objects for RBAC catalog
@ -130,9 +120,106 @@ def registerFeature(catalogService) -> bool:
meta=resObj.get("meta")
)
# Sync template roles to database
_syncTemplateRolesToDb()
logger.info(f"Feature '{FEATURE_CODE}' registered {len(UI_OBJECTS)} UI objects and {len(RESOURCE_OBJECTS)} resource objects")
return True
except Exception as e:
logger.error(f"Failed to register feature '{FEATURE_CODE}': {e}")
return False
def _syncTemplateRolesToDb() -> int:
"""
Sync template roles and their AccessRules to the database.
Creates global template roles (mandateId=None) if they don't exist.
Returns:
Number of roles created
"""
try:
from modules.interfaces.interfaceDbApp import getRootInterface
from modules.datamodels.datamodelRbac import Role, AccessRule, AccessRuleContext
rootInterface = getRootInterface()
existingRoles = rootInterface.getRolesByFeatureCode(FEATURE_CODE)
templateRoles = [r for r in existingRoles if r.mandateId is None]
existingRoleLabels = {r.roleLabel: str(r.id) for r in templateRoles}
createdCount = 0
for roleTemplate in TEMPLATE_ROLES:
roleLabel = roleTemplate["roleLabel"]
if roleLabel in existingRoleLabels:
roleId = existingRoleLabels[roleLabel]
_ensureAccessRulesForRole(rootInterface, roleId, roleTemplate.get("accessRules", []))
else:
newRole = Role(
roleLabel=roleLabel,
description=roleTemplate.get("description", {}),
featureCode=FEATURE_CODE,
mandateId=None,
featureInstanceId=None,
isSystemRole=False
)
createdRole = rootInterface.db.recordCreate(Role, newRole.model_dump())
roleId = createdRole.get("id")
_ensureAccessRulesForRole(rootInterface, roleId, roleTemplate.get("accessRules", []))
logger.info(f"Created template role '{roleLabel}' with ID {roleId}")
createdCount += 1
if createdCount > 0:
logger.info(f"Feature '{FEATURE_CODE}': Created {createdCount} template roles")
return createdCount
except Exception as e:
logger.error(f"Error syncing template roles for feature '{FEATURE_CODE}': {e}")
return 0
def _ensureAccessRulesForRole(rootInterface, roleId: str, ruleTemplates: List[Dict[str, Any]]) -> int:
"""Ensure AccessRules exist for a role based on templates."""
from modules.datamodels.datamodelRbac import AccessRule, AccessRuleContext
existingRules = rootInterface.getAccessRulesByRole(roleId)
existingSignatures = set()
for rule in existingRules:
sig = (rule.context.value if rule.context else None, rule.item)
existingSignatures.add(sig)
createdCount = 0
for template in ruleTemplates:
context = template.get("context", "UI")
item = template.get("item")
sig = (context, item)
if sig in existingSignatures:
continue
if context == "UI":
contextEnum = AccessRuleContext.UI
elif context == "DATA":
contextEnum = AccessRuleContext.DATA
elif context == "RESOURCE":
contextEnum = AccessRuleContext.RESOURCE
else:
contextEnum = context
newRule = AccessRule(
roleId=roleId,
context=contextEnum,
item=item,
view=template.get("view", False),
read=template.get("read"),
create=template.get("create"),
update=template.get("update"),
delete=template.get("delete"),
)
rootInterface.db.recordCreate(AccessRule, newRule.model_dump())
createdCount += 1
return createdCount
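# Illustrative sketch of a TEMPLATE_ROLES entry as consumed above (assumed shape,
# derived from the fields read in _syncTemplateRolesToDb/_ensureAccessRulesForRole;
# the roleLabel value is hypothetical):
#
#   {
#       "roleLabel": "neutralization.user",
#       "description": {"en": "May use the neutralization playground"},
#       "accessRules": [
#           {"context": "UI", "item": "ui.feature.neutralization.playground", "view": True},
#       ],
#   }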

View file

@ -15,14 +15,96 @@ logger = logging.getLogger(__name__)
class NeutralizationPlayground:
"""Feature/UI wrapper around NeutralizationService for playground & routes."""
def __init__(self, currentUser: User, mandateId: str):
def __init__(self, currentUser: User, mandateId: str, featureInstanceId: Optional[str] = None):
self.currentUser = currentUser
self.mandateId = mandateId
self.services = getServices(currentUser, None, mandateId=mandateId)
self.featureInstanceId = featureInstanceId
self.services = getServices(currentUser, None, mandateId=mandateId, featureInstanceId=featureInstanceId)
def processText(self, text: str) -> Dict[str, Any]:
return self.services.neutralization.processText(text)
async def processUploadedFileAsync(self, file_bytes: bytes, filename: str) -> Dict[str, Any]:
"""Process an uploaded file (bytes + filename). Returns neutralized result for text or binary.
Saves both original and neutralized files to user files (component storage) when available."""
import base64
name_lower = (filename or '').lower()
mime_map = {
'.pdf': 'application/pdf',
'.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'.xlsm': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
}
mime = next((mime_map[ext] for ext in mime_map if name_lower.endswith(ext)), 'text/plain')
binary_exts = {'.pdf', '.docx', '.xlsx', '.xlsm', '.pptx'}
is_binary = any(name_lower.endswith(ext) for ext in binary_exts)
original_file_id = None
neutralized_file_id = None
# Save original file to user files
if self.services.interfaceDbComponent:
try:
file_item, _ = self.services.interfaceDbComponent.saveUploadedFile(file_bytes, filename)
original_file_id = str(file_item.id)
except Exception as e:
logger.warning(f"Could not save original file to user files: {e}")
if is_binary:
result = await self.services.neutralization.processBinaryBytesAsync(file_bytes, filename, mime)
neu_bytes = result.get('neutralized_bytes')
logger.debug(f"Binary result: neu_bytes type={type(neu_bytes).__name__}, len={len(neu_bytes) if neu_bytes is not None else 0}")
if neu_bytes is not None and len(neu_bytes) > 0:
result['neutralized_file_base64'] = base64.b64encode(neu_bytes).decode('ascii')
result['neutralized_file_name'] = result.get('neutralized_file_name', f'neutralized_{filename}')
result['mime_type'] = result.get('mime_type', mime)
# Save neutralized binary to user files
if self.services.interfaceDbComponent:
try:
neu_name = result['neutralized_file_name']
file_item, _ = self.services.interfaceDbComponent.saveUploadedFile(neu_bytes, neu_name)
neutralized_file_id = str(file_item.id)
except Exception as e:
logger.warning(f"Could not save neutralized file to user files: {e}")
# Remove raw bytes before JSON response (avoid serialization issues; use base64 only)
result.pop('neutralized_bytes', None)
result['original_file_id'] = original_file_id
result['neutralized_file_id'] = neutralized_file_id
return result
try:
text_content = file_bytes.decode('utf-8')
except UnicodeDecodeError:
try:
text_content = file_bytes.decode('latin-1')
except UnicodeDecodeError:
return {
'neutralized_text': None,
'original_file_id': original_file_id,
'neutralized_file_id': None,
'processed_info': {'type': 'error', 'error': 'File could not be decoded as text. Supported: UTF-8, Latin-1. For PDF/Word/Excel, use supported binary formats.'}
}
result = self.services.neutralization.processText(text_content)
result['neutralized_file_name'] = f'neutralized_{filename}'
# Save neutralized text as file to user files
if self.services.interfaceDbComponent and result.get('neutralized_text') is not None:
try:
neu_text = result['neutralized_text']
neu_bytes = neu_text.encode('utf-8')
neu_name = result['neutralized_file_name']
file_item, _ = self.services.interfaceDbComponent.saveUploadedFile(neu_bytes, neu_name)
neutralized_file_id = str(file_item.id)
except Exception as e:
logger.warning(f"Could not save neutralized text file to user files: {e}")
result['original_file_id'] = original_file_id
result['neutralized_file_id'] = neutralized_file_id
return result
def processUploadedFile(self, file_bytes: bytes, filename: str) -> Dict[str, Any]:
"""Sync wrapper for sync callers. Uses asyncio.run; do NOT call from async routes (use processUploadedFileAsync)."""
return asyncio.run(self.processUploadedFileAsync(file_bytes, filename))
def processFiles(self, fileIds: List[str]) -> Dict[str, Any]:
results: List[Dict[str, Any]] = []
errors: List[str] = []
@ -273,18 +355,42 @@ class SharepointProcessor:
processed: List[Dict[str, Any]] = []
errors: List[str] = []
BINARY_EXTS = {'.pdf', '.docx', '.doc', '.xlsx', '.xlsm', '.pptx', '.ppt'}
async def _processSingle(fileInfo: Dict[str, Any]):
try:
fileContent = await self.services.sharepoint.downloadFile(sourceSiteInfo['id'], fileInfo['id'])
if not fileContent:
return {'error': f"Failed to download file: {fileInfo['name']}"}
try:
textContent = fileContent.decode('utf-8')
except UnicodeDecodeError:
textContent = fileContent.decode('latin-1')
result = self.services.neutralization.processText(textContent)
name_lower = (fileInfo.get('name') or '').lower()
is_binary = any(name_lower.endswith(ext) for ext in BINARY_EXTS)
mime_map = {
'.pdf': 'application/pdf',
'.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'.doc': 'application/msword',
'.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'.xlsm': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
'.ppt': 'application/vnd.ms-powerpoint',
}
mime = next((mime_map[ext] for ext in BINARY_EXTS if name_lower.endswith(ext)), 'text/plain')
if is_binary:
result = self.services.neutralization.processBinaryBytes(fileContent, fileInfo['name'], mime)
if result.get('neutralized_bytes'):
content_to_upload = result['neutralized_bytes']
else:
return {'error': f"Failed to neutralize binary file {fileInfo['name']}: {result.get('processed_info', {}).get('error', 'Unknown error')}"}
else:
try:
textContent = fileContent.decode('utf-8')
except UnicodeDecodeError:
textContent = fileContent.decode('latin-1')
result = self.services.neutralization.processText(textContent)
content_to_upload = (result.get('neutralized_text') or '').encode('utf-8')
neutralizedFilename = f"neutralized_{fileInfo['name']}"
uploadResult = await self.services.sharepoint.uploadFile(targetSiteInfo['id'], targetFolder, neutralizedFilename, result['neutralized_text'].encode('utf-8'))
uploadResult = await self.services.sharepoint.uploadFile(targetSiteInfo['id'], targetFolder, neutralizedFilename, content_to_upload)
if 'error' in uploadResult:
return {'error': f"Failed to upload neutralized file: {neutralizedFilename} - {uploadResult['error']}"}
return {

View file

@ -1,6 +1,6 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from fastapi import APIRouter, HTTPException, Depends, Path, Request, status, Query, Body
from fastapi import APIRouter, HTTPException, Depends, Path, Request, status, Query, Body, File, UploadFile
from typing import List, Dict, Any, Optional
import logging
@ -35,13 +35,18 @@ def get_neutralization_config(
) -> DataNeutraliserConfig:
"""Get data neutralization configuration"""
try:
service = NeutralizationPlayground(context.user, str(context.mandateId))
mandate_id = str(context.mandateId) if context.mandateId else ""
feature_instance_id = str(context.featureInstanceId) if context.featureInstanceId else ""
service = NeutralizationPlayground(
context.user, mandate_id, featureInstanceId=feature_instance_id or None
)
config = service.getConfig()
if not config:
# Return default config instead of 404
# Return default config instead of 404 (requires mandateId and featureInstanceId for instance-scoped config)
return DataNeutraliserConfig(
mandateId=context.mandateId,
mandateId=mandate_id,
featureInstanceId=feature_instance_id,
userId=context.user.id,
enabled=True,
namesToParse="",
@ -69,7 +74,11 @@ def save_neutralization_config(
) -> DataNeutraliserConfig:
"""Save or update data neutralization configuration"""
try:
service = NeutralizationPlayground(context.user, str(context.mandateId))
service = NeutralizationPlayground(
context.user,
str(context.mandateId) if context.mandateId else "",
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None
)
config = service.saveConfig(config_data)
return config
@ -81,6 +90,44 @@ def save_neutralization_config(
detail=f"Error saving neutralization config: {str(e)}"
)
@router.post("/neutralize-file")
@limiter.limit("20/minute")
async def neutralize_file(
request: Request,
file: UploadFile = File(..., description="File to neutralize (PDF, DOCX, XLSX, PPTX, TXT, CSV, JSON)"),
context: RequestContext = Depends(getRequestContext)
) -> Dict[str, Any]:
"""Upload and neutralize a file. Returns neutralized text or base64-encoded file for download."""
try:
if not file.filename or not file.filename.strip():
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="File name is required"
)
content = await file.read()
if not content:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="File is empty"
)
service = NeutralizationPlayground(
context.user,
str(context.mandateId) if context.mandateId else "",
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None
)
result = await service.processUploadedFileAsync(content, file.filename or "file")
logger.info(f"Neutralize file result keys: {list(result.keys())}, has_base64={bool(result.get('neutralized_file_base64'))}, has_text={result.get('neutralized_text') is not None}")
return result
except HTTPException:
raise
except Exception as e:
logger.error(f"Error neutralizing file: {str(e)}", exc_info=True)
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Error neutralizing file: {str(e)}"
)
@router.post("/neutralize-text", response_model=Dict[str, Any])
@limiter.limit("20/minute")
def neutralize_text(
@ -99,7 +146,11 @@ def neutralize_text(
detail="Text content is required"
)
service = NeutralizationPlayground(context.user, str(context.mandateId))
service = NeutralizationPlayground(
context.user,
str(context.mandateId) if context.mandateId else "",
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None
)
result = service.neutralizeText(text, file_id)
return result
@ -130,7 +181,11 @@ def resolve_text(
detail="Text content is required"
)
service = NeutralizationPlayground(context.user, str(context.mandateId))
service = NeutralizationPlayground(
context.user,
str(context.mandateId) if context.mandateId else "",
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None
)
resolved_text = service.resolveText(text)
return {"resolved_text": resolved_text}
@ -153,7 +208,11 @@ def get_neutralization_attributes(
) -> List[DataNeutralizerAttributes]:
"""Get neutralization attributes, optionally filtered by file ID"""
try:
service = NeutralizationPlayground(context.user, str(context.mandateId))
service = NeutralizationPlayground(
context.user,
str(context.mandateId) if context.mandateId else "",
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None
)
attributes = service.getAttributes(fileId)
return attributes
@ -183,7 +242,11 @@ async def process_sharepoint_files(
detail="Both source and target paths are required"
)
service = NeutralizationPlayground(context.user, str(context.mandateId))
service = NeutralizationPlayground(
context.user,
str(context.mandateId) if context.mandateId else "",
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None
)
result = await service.processSharepointFiles(source_path, target_path)
return result
@ -212,7 +275,11 @@ def batch_process_files(
detail="Files data is required"
)
service = NeutralizationPlayground(context.user, str(context.mandateId))
service = NeutralizationPlayground(
context.user,
str(context.mandateId) if context.mandateId else "",
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None
)
result = service.batchNeutralizeFiles(files_data)
return result
@ -234,7 +301,11 @@ def get_neutralization_stats(
) -> Dict[str, Any]:
"""Get neutralization processing statistics"""
try:
service = NeutralizationPlayground(context.user, str(context.mandateId))
service = NeutralizationPlayground(
context.user,
str(context.mandateId) if context.mandateId else "",
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None
)
stats = service.getProcessingStats()
return stats
@ -255,7 +326,11 @@ def cleanup_file_attributes(
) -> Dict[str, str]:
"""Clean up neutralization attributes for a specific file"""
try:
service = NeutralizationPlayground(context.user, str(context.mandateId))
service = NeutralizationPlayground(
context.user,
str(context.mandateId) if context.mandateId else "",
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None
)
success = service.cleanupFileAttributes(fileId)
if success:

View file

@ -4,10 +4,11 @@
Data Neutralization Service
Handles file processing for data neutralization including SharePoint integration
GDPR-compliant data neutralizer for AI agent systems
Supports TXT, JSON, CSV, Excel and Word files
Supports TXT, JSON, CSV, PDF, DOCX, XLSX, PPTX (extract -> neutralize -> generate)
Multilingual: DE, EN, FR, IT
"""
import asyncio
import logging
import re
import json
@ -21,10 +22,20 @@ from .subProcessCommon import CommonUtils, NeutralizationResult, NeutralizationA
from .subProcessText import TextProcessor, PlainText
from .subProcessList import ListProcessor, TableData
from .subProcessBinary import BinaryProcessor
from .subProcessPdfInPlace import neutralize_pdf_in_place
from .subPatterns import HeaderPatterns, DataPatterns, TextTablePatterns
from .subContentPartAdapter import content_parts_to_renderer_schema
logger = logging.getLogger(__name__)
# MIME types that can be processed via extract -> neutralize -> generate
EXTRACTABLE_BINARY_MIME_TYPES = frozenset({
"application/pdf",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
})
class NeutralizationService:
"""Service for handling data neutralization operations"""
@ -44,8 +55,8 @@ class NeutralizationService:
dbApp = serviceCenter.interfaceDbApp
self.interfaceNeutralizer = getNeutralizerInterface(
currentUser=dbApp.currentUser,
mandateId=dbApp.mandateId,
featureInstanceId=getattr(dbApp, 'featureInstanceId', None)
mandateId=serviceCenter.mandateId or dbApp.mandateId,
featureInstanceId=getattr(serviceCenter, 'featureInstanceId', None) or getattr(dbApp, 'featureInstanceId', None)
)
# Initialize anonymization processors
@ -71,47 +82,61 @@ class NeutralizationService:
def processText(self, text: str) -> Dict[str, Any]:
"""Neutralize a raw text string and return a standard result dict."""
return self._neutralizeText(text, 'text')
result = self._neutralizeText(text, 'text')
self._persistAttributes(result.get('mapping', {}), None)
return result
def processFile(self, fileId: str) -> Dict[str, Any]:
"""Neutralize a file referenced by its fileId using component interface.
Binary files are not neutralized but will be indicated in the result."""
Supports text files directly; PDF/DOCX/XLSX/PPTX via extract -> neutralize -> generate."""
if not self.interfaceDbComponent:
raise ValueError("Component interface is required to process a file by fileId")
# Fetch file data and metadata
fileInfo = None
try:
# getFile returns an object; fallback to dict-like
fileInfo = self.interfaceDbComponent.getFile(fileId)
except Exception:
fileInfo = None
fileName = getattr(fileInfo, 'fileName', None) if fileInfo else None
mimeType = getattr(fileInfo, 'mimeType', None) if fileInfo else None
# Check if file is binary and cannot be neutralized
fileData = self.interfaceDbComponent.getFileData(fileId)
if not fileData:
raise ValueError(f"No file data found for fileId: {fileId}")
mime_lower = (mimeType or '').lower()
# Binary but extractable: PDF, DOCX, XLSX, PPTX
if mime_lower in EXTRACTABLE_BINARY_MIME_TYPES:
try:
result = asyncio.run(self._processBinaryFile(fileData, fileName or "document", mime_lower, fileId))
if result:
result['file_id'] = fileId
result['neutralized_file_name'] = f"neutralized_{fileName}" if fileName else "neutralized_document"
return result
except Exception as e:
logger.error(f"Binary file neutralization failed: {str(e)}")
return {
'file_id': fileId,
'is_binary': True,
'mime_type': mimeType or 'unknown',
'file_name': fileName or 'unknown',
'neutralized_text': None,
'processed_info': {'type': 'binary', 'status': 'error', 'error': str(e)}
}
# Binary but not extractable
if self._isBinaryMimeType(mimeType or ''):
# Return a result indicating binary file (not neutralized)
return {
'file_id': fileId,
'is_binary': True,
'mime_type': mimeType or 'unknown',
'file_name': fileName or 'unknown',
'neutralized_text': None,
'processed_info': {
'type': 'binary',
'status': 'skipped',
'message': 'Binary file neutralization will be implemented in the future'
}
'processed_info': {'type': 'binary', 'status': 'skipped', 'message': 'File type not supported for neutralization'}
}
fileData = self.interfaceDbComponent.getFileData(fileId)
if not fileData:
raise ValueError(f"No file data found for fileId: {fileId}")
# Determine textType from mime
# Text-based file
textType = self._getContentTypeFromMime(mimeType or '')
# Decode to text
try:
textContent = fileData.decode('utf-8')
except UnicodeDecodeError:
@ -123,17 +148,59 @@ class NeutralizationService:
except UnicodeDecodeError:
continue
if decoded is None:
raise ValueError("Unable to decode file content as text. This may indicate a binary file that cannot be neutralized.")
raise ValueError("Unable to decode file content as text.")
textContent = decoded
result = self._neutralizeText(textContent, textType)
# Add a reasonable output filename if original known
self._persistAttributes(result.get('mapping', {}), fileId)
if fileName:
result['neutralized_file_name'] = f"neutralized_{fileName}"
result['file_id'] = fileId
result['is_binary'] = False
return result
def processBinaryBytes(self, fileBytes: bytes, fileName: str, mimeType: str) -> Dict[str, Any]:
"""Neutralize binary file bytes (sync - use from sync callers). Uses asyncio.run when event loop not running."""
mime_lower = (mimeType or '').lower()
if mime_lower not in EXTRACTABLE_BINARY_MIME_TYPES:
return {
'neutralized_text': None,
'neutralized_bytes': None,
'is_binary': True,
'processed_info': {'type': 'binary', 'status': 'skipped', 'message': 'File type not supported'}
}
try:
return asyncio.run(self._processBinaryFile(fileBytes, fileName, mime_lower, None))
except Exception as e:
logger.error(f"Binary neutralization failed: {str(e)}")
return {
'neutralized_text': None,
'neutralized_bytes': None,
'is_binary': True,
'processed_info': {'type': 'binary', 'status': 'error', 'error': str(e)}
}
async def processBinaryBytesAsync(self, fileBytes: bytes, fileName: str, mimeType: str) -> Dict[str, Any]:
"""Neutralize binary file bytes (async - use from async routes to avoid event loop conflict)."""
mime_lower = (mimeType or '').lower()
if mime_lower not in EXTRACTABLE_BINARY_MIME_TYPES:
return {
'neutralized_text': None,
'neutralized_bytes': None,
'is_binary': True,
'processed_info': {'type': 'binary', 'status': 'skipped', 'message': 'File type not supported'}
}
try:
return await self._processBinaryFile(fileBytes, fileName, mime_lower, None)
except Exception as e:
logger.error(f"Binary neutralization failed: {str(e)}")
return {
'neutralized_text': None,
'neutralized_bytes': None,
'is_binary': True,
'processed_info': {'type': 'binary', 'status': 'error', 'error': str(e)}
}
def resolveText(self, text: str) -> str:
if not self.interfaceNeutralizer:
return text
@ -167,6 +234,195 @@ class NeutralizationService:
return False
return self.interfaceNeutralizer.deleteNeutralizationAttributes(fileId)
def _persistAttributes(self, mapping: Dict[str, str], fileId: Optional[str]) -> None:
"""Persist mapping to DB for resolve to work. mapping: originalText -> placeholder e.g. '[email.uuid]'"""
if not self.interfaceNeutralizer or not mapping:
return
import re
placeholder_re = re.compile(r'^\[([a-z]+)\.([a-f0-9-]{36})\]$')
for original_text, placeholder in mapping.items():
m = placeholder_re.match(placeholder)
if m:
pattern_type, uid = m.group(1), m.group(2)
try:
self.interfaceNeutralizer.createAttribute(
attributeId=uid,
originalText=original_text,
patternType=pattern_type,
fileId=fileId
)
except Exception as e:
logger.debug(f"Could not persist attribute {uid}: {e}")
async def _processBinaryFile(
self,
fileBytes: bytes,
fileName: str,
mimeType: str,
fileId: Optional[str]
) -> Dict[str, Any]:
"""Extract -> neutralize -> adapt -> generate for PDF/DOCX/XLSX/PPTX."""
from modules.services.serviceExtraction.mainServiceExtraction import ExtractionService
from modules.services.serviceExtraction.subPipeline import runExtraction
from modules.datamodels.datamodelExtraction import ExtractionOptions, MergeStrategy
# Ensure registries exist
if ExtractionService._sharedExtractorRegistry is None:
ExtractionService(self.services)
registry = ExtractionService._sharedExtractorRegistry
chunker = ExtractionService._sharedChunkerRegistry
opts = ExtractionOptions(prompt="neutralize", mergeStrategy=MergeStrategy(preserveChunks=True))
# 1. Extract
extracted = runExtraction(registry, chunker, fileBytes, fileName, mimeType, opts)
parts = extracted.parts if hasattr(extracted, 'parts') else []
if not parts:
return {
'neutralized_text': None,
'neutralized_bytes': None,
'is_binary': True,
'processed_info': {'type': 'binary', 'status': 'error', 'error': 'No content extracted'}
}
# 2. Neutralize each text/table part
all_mapping: Dict[str, str] = {}
neutralized_parts: List[Any] = []
neutralization_error: Optional[str] = None
for part in parts:
p = part if isinstance(part, dict) else part.model_dump() if hasattr(part, 'model_dump') else part
type_group = p.get('typeGroup', '')
data = p.get('data', '')
if type_group in ('binary', 'image') or not (data and str(data).strip()):
neutralized_parts.append(part)
continue
nr = self._neutralizeText(str(data), 'text' if type_group != 'table' else 'csv')
proc = nr.get('processed_info', {}) or {}
if isinstance(proc, dict) and proc.get('type') == 'error':
neutralization_error = proc.get('error', 'Neutralization failed')
neu_text = nr.get('neutralized_text', str(data))
mapping = nr.get('mapping', {})
all_mapping.update(mapping)
new_part = {**p, 'data': neu_text}
neutralized_parts.append(new_part)
self._persistAttributes(all_mapping, fileId)
# 3. PDF: Use in-place only; no fallback to render
if mimeType == "application/pdf":
if neutralization_error:
logger.error(f"PDF neutralization aborted: {neutralization_error}")
return {
'neutralized_text': None,
'neutralized_bytes': None,
'is_binary': True,
'processed_info': {'type': 'binary', 'status': 'error', 'error': neutralization_error}
}
in_place_bytes = neutralize_pdf_in_place(fileBytes, all_mapping)
if in_place_bytes is not None:
logger.info("PDF neutralization completed via in-place redaction (layout preserved)")
return {
'neutralized_text': None,
'neutralized_bytes': in_place_bytes,
'neutralized_file_name': f"neutralized_{fileName}",
'is_binary': True,
'mime_type': 'application/pdf',
'attributes': [{'original': k, 'placeholder': v} for k, v in all_mapping.items()],
'processed_info': {'type': 'binary', 'status': 'success', 'format': 'pdf', 'method': 'in-place'}
}
logger.error("PDF in-place neutralization failed")
return {
'neutralized_text': None,
'neutralized_bytes': None,
'is_binary': True,
'processed_info': {'type': 'binary', 'status': 'error', 'error': 'PDF in-place neutralization failed'}
}
# 4. Adapter: ContentPart list -> renderer schema (non-PDF only)
schema = content_parts_to_renderer_schema(neutralized_parts, title=fileName or "Neutralized")
# 5. Render to format
renderer, output_mime = self._getRendererForMime(mimeType)
if not renderer:
return {
'neutralized_text': None,
'neutralized_bytes': None,
'is_binary': True,
'processed_info': {'type': 'binary', 'status': 'error', 'error': f'No renderer for {mimeType}'}
}
try:
logger.info(f"Calling renderer.render for mime={mimeType}, renderer={type(renderer).__name__}")
rendered = await renderer.render(schema, fileName or "document", None, None)
logger.info(f"Renderer returned: type={type(rendered).__name__}, len={len(rendered) if rendered else 0}")
if not rendered or len(rendered) == 0:
logger.error("Renderer returned empty list")
return {
'neutralized_text': None,
'neutralized_bytes': None,
'is_binary': True,
'processed_info': {'type': 'binary', 'status': 'error', 'error': 'Render produced no output'}
}
doc = rendered[0]
logger.info(f"First doc: type={type(doc).__name__}, isinstance(dict)={isinstance(doc, dict)}, has documentData attr={hasattr(doc, 'documentData')}")
# Extract documentData: Pydantic v2 models may need model_dump() for reliable access
if isinstance(doc, dict):
doc_data = doc.get('documentData')
elif hasattr(doc, 'model_dump'):
d = doc.model_dump(mode='python')
doc_data = d.get('documentData')
else:
doc_data = getattr(doc, 'documentData', None)
logger.info(f"doc_data: type={type(doc_data).__name__ if doc_data is not None else 'None'}, len={len(doc_data) if doc_data else 0}")
if doc_data is None:
logger.error("Renderer returned document with no documentData")
return {
'neutralized_text': None,
'neutralized_bytes': None,
'is_binary': True,
'processed_info': {'type': 'binary', 'status': 'error', 'error': 'Renderer returned no data'}
}
if isinstance(doc_data, str):
doc_data = doc_data.encode('utf-8')
return {
'neutralized_text': None,
'neutralized_bytes': doc_data,
'neutralized_file_name': f"neutralized_{fileName}",
'is_binary': True,
'mime_type': output_mime,
'attributes': [{'original': k, 'placeholder': v} for k, v in all_mapping.items()],
'processed_info': {'type': 'binary', 'status': 'success', 'format': mimeType}
}
except Exception as e:
logger.error(f"Render failed for {mimeType}: {str(e)}", exc_info=True)
raise
return {
'neutralized_text': None,
'neutralized_bytes': None,
'is_binary': True,
'processed_info': {'type': 'binary', 'status': 'error', 'error': 'Render produced no output'}
}
def _getRendererForMime(self, mimeType: str):
"""Get renderer instance and output mime for the given input MIME type."""
from modules.services.serviceGeneration.renderers.rendererPdf import RendererPdf
from modules.services.serviceGeneration.renderers.rendererDocx import RendererDocx
from modules.services.serviceGeneration.renderers.rendererXlsx import RendererXlsx
from modules.services.serviceGeneration.renderers.rendererPptx import RendererPptx
mime_map = {
"application/pdf": (RendererPdf, "application/pdf"),
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": (RendererDocx, "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": (RendererXlsx, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
"application/vnd.openxmlformats-officedocument.presentationml.presentation": (RendererPptx, "application/vnd.openxmlformats-officedocument.presentationml.presentation"),
}
pair = mime_map.get(mimeType)
if not pair:
return None, None
cls, out_mime = pair
renderer = cls(self.services)
return renderer, out_mime
def _reloadNamesFromConfig(self) -> None:
"""Reload names from config and update processors"""
try:

View file

@ -0,0 +1,115 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Adapter to convert ContentPart list (from extraction) to renderer JSON schema.
Schema: { metadata: {...}, documents: [{ sections: [{ content_type, elements: [...] }] }] }
"""
import csv
import io
from typing import Dict, List, Any
from modules.datamodels.datamodelExtraction import ContentPart
def content_parts_to_renderer_schema(parts: List[ContentPart], title: str = "Neutralized Document") -> Dict[str, Any]:
"""
Convert ContentPart list to the standardized renderer schema.
Args:
parts: List of ContentPart from extraction
title: Document title for metadata
Returns:
Dict with metadata, documents[0].sections structure for renderers
"""
sections: List[Dict[str, Any]] = []
for part in parts:
if not hasattr(part, 'typeGroup') or not hasattr(part, 'data'):
part_dict = part if isinstance(part, dict) else part.model_dump()
type_group = part_dict.get("typeGroup", "text")
data = part_dict.get("data", "")
label = part_dict.get("label", "")
else:
type_group = part.typeGroup
data = part.data or ""
label = part.label or ""
# Skip binary/image parts without text - they can't be neutralized meaningfully
if type_group in ("binary", "image"):
continue
# Skip empty data
if not (data and str(data).strip()):
continue
section = _part_to_section(type_group, data, label)
if section:
sections.append(section)
# Ensure at least one section (renderers require it)
if not sections:
sections = [{
"content_type": "paragraph",
"elements": [{"type": "paragraph", "content": {"text": ""}}]
}]
return {
"metadata": {"title": title},
"documents": [{
"sections": sections
}]
}
def _part_to_section(type_group: str, data: str, label: str) -> Dict[str, Any]:
"""Convert a single ContentPart to a section dict."""
data_str = str(data).strip()
if type_group == "table" and ("csv" in label.lower() or "," in data_str or "\t" in data_str):
# Parse CSV/TSV into table structure
try:
rows = list(csv.reader(io.StringIO(data_str)))
if rows:
headers = rows[0]
rows_data = rows[1:]
return {
"content_type": "table",
"elements": [{
"type": "table",
"content": {"headers": headers, "rows": rows_data}
}]
}
except Exception:
pass
# Fallback: treat as paragraph
return {
"content_type": "paragraph",
"elements": [{
"type": "extracted_text",
"content": data_str,
"source": label
}]
}
if type_group == "structure":
# PPTX slide content - often markdown-like
return {
"content_type": "paragraph",
"elements": [{
"type": "extracted_text",
"content": data_str,
"source": label
}]
}
# Default: text/paragraph
return {
"content_type": "paragraph",
"elements": [{
"type": "extracted_text",
"content": data_str,
"source": label
}]
}
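# Minimal usage sketch (part data and title are illustrative; dict parts are
# accepted by the hasattr/model_dump fallback above):
#
#   parts = [{"typeGroup": "text", "data": "Frau Muster, Bahnhofstrasse 1", "label": "page_1"}]
#   schema = content_parts_to_renderer_schema(parts, title="offer.pdf")
#   # -> {"metadata": {"title": "offer.pdf"},
#   #     "documents": [{"sections": [{
#   #         "content_type": "paragraph",
#   #         "elements": [{"type": "extracted_text",
#   #                       "content": "Frau Muster, Bahnhofstrasse 1",
#   #                       "source": "page_1"}]}]}]}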

View file

@ -10,6 +10,18 @@ import uuid
from typing import Dict, List, Tuple, Any
from .subPatterns import DataPatterns, findPatternsInText
# Phrases or words that must never be neutralized (labels, salutations, etc.)
_NEUTRALIZATION_BLACKLIST = frozenset({
"Für Sie", "Ihre Ansprechperson", "AXA 24", "General Agent",
"Your Contact", "Contact Person", "Bei Fragen", "Mit Freundlichen",
"Frau", "Herr", # Anrede
"Reise", "Reisebeginn", "Reiseende", "Vertragsbeginn", "Zahlbar",
"Versicherte", "Versicherungsnehmer", "Versicherung", "Insurance",
"Leistungen", "Basis", "Benefits", # Section labels
"Start", "Beginn", "Ende", "End", "trip", # Contract labels (Start of trip, End of trip, etc.)
})
class StringParser:
"""Handles string parsing and replacement operations"""
@ -48,7 +60,17 @@ class StringParser:
"""
patternMatches = findPatternsInText(text, self.data_patterns)
# Process pattern matches from right to left to avoid position shifts
# Exclude matches that are fully contained in a longer match (e.g. skip "2026" inside "17.02.2026")
def is_contained(m, all_matches):
for other in all_matches:
if other is m:
continue
if other[2] <= m[2] and m[3] <= other[3] and (other[3] - other[2]) > (m[3] - m[2]):
return True
return False
patternMatches = [m for m in patternMatches if not is_contained(m, patternMatches)]
# Process from right to left to avoid position shifts
for patternName, matchedText, start, end in reversed(patternMatches):
# Skip if already a placeholder
if self._isPlaceholder(matchedText):
@ -58,15 +80,27 @@ class StringParser:
if '[' in matchedText or ']' in matchedText:
continue
# Skip blacklisted text (labels, salutations, etc.); never neutralize these
if matchedText.strip() in _NEUTRALIZATION_BLACKLIST:
continue
# Skip if match contains any blacklisted word (e.g. "2026 Reise" or "2026 Reisebeginn" from address pattern)
if any(w in _NEUTRALIZATION_BLACKLIST for w in matchedText.split()):
continue
if matchedText not in self.mapping:
# Generate a UUID for the placeholder
placeholderId = str(uuid.uuid4())
# Create placeholder in format [type.uuid]
typeMapping = {
'email': 'email',
'phone': 'phone',
'phone': 'phone',
'address': 'address',
'id': 'id'
'date': 'date',
'policy': 'policy',
'name': 'name',
'id': 'id',
'iban': 'iban',
'ssn': 'ssn',
}
placeholderType = typeMapping.get(patternName, 'data')
self.mapping[matchedText] = f"[{placeholderType}.{placeholderId}]"

View file

@ -234,6 +234,29 @@ class HeaderPatterns:
class DataPatterns:
"""Patterns for identifying sensitive data in content"""
patterns = [
# Name patterns (before email so "name@domain" is not matched as name)
Pattern(
name="name",
patterns=[
# Contact person context (fixed-width lookbehind for Python re)
r'(?<=Ansprechperson: )[A-Za-zäöüßÄÖÜ][a-zäöüß]+\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]+',
r'(?<=Leiter: )[A-Za-zäöüßÄÖÜ][a-zäöüß]+\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]+',
r'(?<=Kontaktperson: )[A-Za-zäöüßÄÖÜ][a-zäöüß]+\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]+',
# Name only after the salutation (keep Frau/Herr; replace only the name); fixed-width lookbehind
r'(?<=Frau )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
r'(?<=Herr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
r'(?<=Mr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
r'(?<=Mr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
r'(?<=Mrs )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
r'(?<=Mrs\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
r'(?<=Ms )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
r'(?<=Ms\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
r'(?<=Dr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
r'(?<=Dr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
],
replacement_template="[NAME_{}]"
),
# Email pattern for plain text
Pattern(
name="email",
@ -276,12 +299,18 @@ class DataPatterns:
replacement_template="[IBAN_{}]"
),
# Address patterns
# Address patterns (compound first so full footer = one UUID)
Pattern(
name="address",
patterns=[
r'\b(?:[A-Za-zäöüßÄÖÜ]+(?:strasse|str\.|gasse|weg|platz|allee|boulevard|avenue|via|strada|rue|chemin|route))\s+\d{1,4}(?:[a-z])?\b',
r'\b\d{4}\s+[A-Za-zäöüßÄÖÜ]+\b'
# Full address block: company, street, Postfach, postal code + city (stop before a trailing domain such as ", AXA.ch")
r'\b[^,\n]+(?:,\s*[^,\n]+)*,\s*\d{4}\s+[A-Za-zäöüßÄÖÜ]+\s*(?=,\s*[a-zA-Z0-9.-]+\.(?:ch|com|org|net)\b|$)',
# Street + house number (standalone)
r'\b(?:[A-Za-zäöüßÄÖÜ]+(?:-[A-Za-zäöüßÄÖÜ]+)*(?:strasse|str\.|gasse|weg|platz|allee|boulevard|avenue|via|strada|rue|chemin|route))\s+\d{1,4}(?:/\d{1,4})?(?:[a-z])?\b',
# Postfach / PO Box (standalone)
r'\b(?:Postfach|Postbox|P\.?O\.?\s*Box|Case\s+postale|Casella\s+postale|Boîte\s+postale)\s+\d{1,6}\b',
# Postal code + city (standalone)
r'\b\d{4}\s+[A-Za-zäöüßÄÖÜ]+\b(?!\s*:)'
],
replacement_template="[ADDRESS_{}]"
),
@ -290,25 +319,58 @@ class DataPatterns:
Pattern(
name="date",
patterns=[
# Specific date formats with context
r'\b(?:geboren|birth|né|nato)\s+am\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b', # Birth dates
r'\b(?:geboren|birth|né|nato)\s+am\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b', # Birth dates
r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b', # Contract dates
r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b', # Contract dates
# Specific date formats with month names
r'\b(?:geboren|birth|né|nato)\s+am\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|januar|februar|märz|april|mai|juni|juli|august|september|oktober|november|dezember)[a-z]*\s+\d{4}\b', # Birth dates with month
r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|januar|februar|märz|april|mai|juni|juli|august|september|oktober|november|dezember)[a-z]*\s+\d{4}\b' # Contract dates with month
# Standalone date values: require a valid day (1-31) and month (1-12) to avoid matching decimals (e.g. 53.37 CHF)
r'\b(0?[1-9]|[12]\d|3[01])[./-](0?[1-9]|1[0-2])[./-]\d{2,4}\b', # 17.02.2026, 29-03-2026
r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\.[\s]*\d{2,4}\b', # 17.02. 2026 (split across lines)
r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\.(?!\d)\b', # 17.02., 29.03.
r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\b(?!\.?\d)(?!/\d)', # 17.02, 29.03; exclude ratings (4.7/5)
# Context-specific date formats
r'\b(?:geboren|birth|né|nato)\s+am\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
r'\b(?:geboren|birth|né|nato)\s+am\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
r'\b(?:geboren|birth|né|nato)\s+am\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|januar|februar|märz|april|mai|juni|juli|august|september|oktober|november|dezember)[a-z]*\s+\d{4}\b',
r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|januar|februar|märz|april|mai|juni|juli|august|september|oktober|november|dezember)[a-z]*\s+\d{4}\b'
],
replacement_template="[DATE_{}]"
),
# Policy number patterns (replaces only the number, keeps labels like "Police Nr.")
Pattern(
name="policy",
patterns=[
# Number after "Police Nr." etc. (fixed-width lookbehind Python re requirement)
r'(?<=Police Nr\. )[\d.]+',
r'(?<=Police Nr\. )[\d.]+',
r'(?<=Police Nr\.: )[\d.]+',
r'(?<=Police Nr )[\d.]+',
r'(?<=Police Nr: )[\d.]+',
r'(?<=Polizzenr\. )[\d.]+',
r'(?<=Polizzenummer: )[\d.]+',
r'(?<=Polizzenummer )[\d.]+',
r'(?<=Policy No\. )[\d.]+',
r'(?<=Policy No )[\d.]+',
r'(?<=Policy Number: )[\d.]+',
r'(?<=Policy Number )[\d.]+',
r'(?<=Polizza n° )[\d.]+',
r'(?<=Numéro de police: )[\d.]+',
r'(?<=Numéro de police )[\d.]+',
r'(?<=Numero polizza: )[\d.]+',
r'(?<=Numero polizza )[\d.]+',
# Standalone policy number format (e.g. 11.559.499); requires a 2-4 digit prefix to avoid matching amounts
r'\b\d{2,4}(?:\.\d{3}){2,}\b'
],
replacement_template="[POLICY_{}]"
),
# SSN patterns
Pattern(
name="ssn",
patterns=[
r'\b(?:756|757|758|759)\.\d{4}\.\d{4}\.\d{2}\b', # Swiss AHV
r'\b(?:756|757|758|759)\.\d{4}\.\d{4}\.\d{2}\b(?!,)', # Swiss AHV - exclude before decimal
r'\b(?:CHE|DE|FR|IT)-\d{3}\.\d{3}\.\d{3}\b', # Company IDs
r'\b\d{3}\.\d{3}\.\d{3}\b' # Generic SSN format
# Generic SSN format - exclude when followed by comma+digit (European decimal)
r'\b\d{3}\.\d{3}\.\d{3}\b(?!,\d)'
],
replacement_template="[SSN_{}]"
)
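# Quick sanity sketch of the standalone formats using the standard re module
# (sample strings are illustrative):
#
#   import re
#   re.search(r'\b\d{2,4}(?:\.\d{3}){2,}\b', "Police Nr. 11.559.499")   # policy number -> match
#   re.search(r'\b\d{2,4}(?:\.\d{3}){2,}\b', "Praemie 1.234 CHF")       # single group -> no match
#   re.search(r'\b(0?[1-9]|[12]\d|3[01])[./-](0?[1-9]|1[0-2])[./-]\d{2,4}\b', "Reisebeginn 17.02.2026")  # -> match
#   re.search(r'\b(0?[1-9]|[12]\d|3[01])[./-](0?[1-9]|1[0-2])[./-]\d{2,4}\b', "Betrag 53.37 CHF")        # -> no match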

View file

@ -0,0 +1,110 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
PDF in-place neutralization using PyMuPDF.
Removes original text completely and inserts full UUID placeholders.
PyMuPDF uses insert_textbox which wraps long placeholders to preserve layout.
"""
import io
import logging
from typing import Dict, Optional
logger = logging.getLogger(__name__)
def neutralize_pdf_in_place(
pdf_bytes: bytes,
mapping: Dict[str, str],
) -> Optional[bytes]:
"""
Remove sensitive text and replace with UUID placeholders in-place.
Content is fully removed (not just covered) so it cannot be copied.
Args:
pdf_bytes: Original PDF file content
mapping: Dict of original_text -> placeholder (e.g. [address.uuid])
Returns:
Modified PDF bytes, or None on failure
"""
if not mapping:
return pdf_bytes
try:
import fitz # PyMuPDF
except ImportError:
logger.warning("PyMuPDF (fitz) not available for PDF in-place neutralization")
return None
try:
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
except Exception as e:
logger.error(f"Failed to open PDF: {e}")
return None
sorted_items = sorted(mapping.items(), key=lambda x: -len(x[0]))
fill_color = (1, 1, 1)
text_color = (0, 0, 0)
fontname = "helv"
fontsize = 8
try:
for page_num in range(len(doc)):
page = doc[page_num]
for original_text, placeholder in sorted_items:
if not original_text or not placeholder:
continue
search_text = original_text
insert_text = placeholder
if placeholder.startswith("[policy."):
# Try label+number to get wider rect; insert UUID only (label+UUID would overflow)
for prefix in ("Police Nr. ", "Police Nr.: ", "Polizzenr. ", "Policy no. ", "Policy No. "):
candidate = prefix + original_text
try:
hits = page.search_for(candidate, quads=False)
if hits:
search_text = candidate
insert_text = placeholder # UUID only so it fits in rect
break
except Exception:
continue
try:
instances = page.search_for(search_text, quads=False)
except Exception:
instances = []
for rect in instances:
try:
fs = 5 if placeholder.startswith(("[policy.", "[address.")) else fontsize
page.add_redact_annot(
rect,
text=insert_text,
fill=fill_color,
text_color=text_color,
fontname=fontname,
fontsize=fs,
)
except Exception as e:
logger.warning(f"Redact failed for {original_text[:40]!r}: {e}")
try:
page.apply_redactions()
except Exception as e:
logger.debug(f"apply_redactions page {page_num + 1}: {e}")
buf = io.BytesIO()
doc.save(buf, garbage=4, deflate=True)
doc.close()
return buf.getvalue()
except Exception as e:
logger.error(f"PDF in-place neutralization failed: {e}", exc_info=True)
try:
doc.close()
except Exception:
pass
return None
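# Minimal usage sketch (file names and mapping values are illustrative):
#
#   with open("offer.pdf", "rb") as fh:
#       original = fh.read()
#   mapping = {"Max Muster": "[name.3f0c2a9e-1b2d-4c5e-8f6a-7b8c9d0e1f2a]"}
#   redacted = neutralize_pdf_in_place(original, mapping)
#   if redacted is not None:
#       with open("neutralized_offer.pdf", "wb") as fh:
#           fh.write(redacted)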

View file

@ -7,7 +7,7 @@ import logging
import json
# Import auth module
from modules.auth import limiter, getCurrentUser
from modules.auth import limiter, getCurrentUser, getRequestContext, RequestContext
# Import interfaces
import modules.interfaces.interfaceDbManagement as interfaceDbManagement
@ -40,7 +40,8 @@ router = APIRouter(
def get_files(
request: Request,
pagination: Optional[str] = Query(None, description="JSON-encoded PaginationParams object"),
currentUser: User = Depends(getCurrentUser)
currentUser: User = Depends(getCurrentUser),
context: RequestContext = Depends(getRequestContext)
) -> PaginatedResponse[FileItem]:
"""
Get files with optional pagination, sorting, and filtering.
@ -69,7 +70,11 @@ def get_files(
detail=f"Invalid pagination parameter: {str(e)}"
)
managementInterface = interfaceDbManagement.getInterface(currentUser)
managementInterface = interfaceDbManagement.getInterface(
currentUser,
mandateId=str(context.mandateId) if context.mandateId else None,
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None
)
result = managementInterface.getAllFiles(pagination=paginationParams)
# If pagination was requested, result is PaginatedResult
@ -330,11 +335,16 @@ def get_file_stats(
def download_file(
request: Request,
fileId: str = Path(..., description="ID of the file to download"),
currentUser: User = Depends(getCurrentUser)
currentUser: User = Depends(getCurrentUser),
context: RequestContext = Depends(getRequestContext)
) -> Response:
"""Download a file"""
"""Download a file. Uses mandate/instance context when present (e.g. from feature pages)."""
try:
managementInterface = interfaceDbManagement.getInterface(currentUser)
managementInterface = interfaceDbManagement.getInterface(
currentUser,
mandateId=str(context.mandateId) if context.mandateId else None,
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None
)
# Get file data
fileData = managementInterface.getFile(fileId)
@ -378,11 +388,16 @@ def download_file(
def preview_file(
request: Request,
fileId: str = Path(..., description="ID of the file to preview"),
currentUser: User = Depends(getCurrentUser)
currentUser: User = Depends(getCurrentUser),
context: RequestContext = Depends(getRequestContext)
) -> FilePreview:
"""Preview a file's content"""
"""Preview a file's content. Uses mandate/instance context when present."""
try:
managementInterface = interfaceDbManagement.getInterface(currentUser)
managementInterface = interfaceDbManagement.getInterface(
currentUser,
mandateId=str(context.mandateId) if context.mandateId else None,
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None
)
# Get file preview using the correct method
preview = managementInterface.getFileContent(fileId)

View file

@ -114,6 +114,9 @@ def _getFeatureUiObjects(featureCode: str) -> List[Dict[str, Any]]:
elif featureCode == "teamsbot":
from modules.features.teamsbot.mainTeamsbot import UI_OBJECTS
return UI_OBJECTS
elif featureCode == "neutralization":
from modules.features.neutralization.mainNeutralization import UI_OBJECTS
return UI_OBJECTS
else:
logger.warning(f"Unknown feature code: {featureCode}")
return []