gateway/modules/features/neutralization/serviceNeutralization/subProcessBinary.py
2026-01-25 03:01:01 +01:00

103 lines
3.3 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Binary data processing module for data anonymization
Handles binary data types (images, audio, video, etc.)
"""
import base64
import re
from typing import Dict, Any, Tuple
from dataclasses import dataclass
@dataclass
class BinaryData:
    """Container describing one piece of binary data.

    Attributes:
        content: The payload, held as a string.
        data_type: Category label; one of 'image', 'audio', 'video',
            'document' or 'unknown'.
        encoding: How ``content`` is encoded; one of 'base64', 'hex'
            or 'raw'.
    """

    content: str
    data_type: str
    encoding: str
class BinaryProcessor:
    """Detect binary payloads and hand them to the anonymization pipeline.

    Actual neutralization of binary data is not implemented yet:
    ``processBinaryContent`` returns its input unchanged together with a
    ``not_implemented`` status record.
    """

    def __init__(self):
        """Initialize the binary processor with its file-extension table."""
        # Maps a coarse media category to the file extensions that belong to it.
        self.supported_types = {
            'image': ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'],
            'audio': ['.mp3', '.wav', '.flac', '.aac', '.ogg', '.m4a'],
            'video': ['.mp4', '.avi', '.mov', '.wmv', '.flv', '.mkv', '.webm'],
            'document': ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx']
        }

    def _detectBinaryType(self, content: str) -> str:
        """
        Classify content as 'binary' or 'text'.

        Content is considered binary when it is valid base64 whose decoded
        bytes are not valid UTF-8, or when it is longer than 100 characters
        and contains a NUL character. Everything else is 'text'.

        Args:
            content: Content to analyze

        Returns:
            str: 'binary' or 'text'
        """
        candidate = content.strip()

        # Only attempt a base64 decode when the text uses the base64 alphabet.
        if re.match(r'^[A-Za-z0-9+/]*={0,2}$', candidate):
            try:
                # binascii.Error (bad padding / bad length) is a ValueError.
                decoded = base64.b64decode(candidate)
            except ValueError:
                # Not decodable, hence not base64 at all (e.g. a plain word
                # such as "Hello"). Fall through instead of calling it binary.
                decoded = None

            if decoded is not None:
                try:
                    decoded.decode('utf-8')
                except UnicodeDecodeError:
                    # Valid base64 that does not decode to text.
                    # NOTE: short words that happen to be valid base64
                    # (e.g. "test") still land here; the alphabet alone
                    # cannot tell them apart from encoded binary data.
                    return 'binary'
                return 'text'  # base64-encoded text

        # Raw (non-base64) content: a NUL character in a longer string.
        if len(content) > 100 and '\x00' in content:
            return 'binary'
        return 'text'

    def isBinaryContent(self, content: str) -> bool:
        """
        Check if content is binary data.

        Args:
            content: Content to check

        Returns:
            bool: True if ``_detectBinaryType`` classifies it as 'binary'
        """
        return self._detectBinaryType(content) == 'binary'

    def processBinaryContent(self, content: str) -> Tuple[Any, Dict[str, str], list, Dict[str, Any]]:
        """
        Process binary content for anonymization.

        Currently a placeholder: the content is returned unchanged with an
        empty mapping, an empty list of replaced fields, and a status record
        marking the operation as not implemented.

        Args:
            content: Binary content to process

        Returns:
            Tuple of (processed_data, mapping, replaced_fields, processed_info)
        """
        # TODO: Implement binary data neutralization
        # This would require:
        # 1. Detecting binary data types (images, audio, video, etc.)
        # 2. Implementing specific neutralization for each type
        # 3. Handling metadata and embedded content
        # 4. Preserving binary integrity while removing sensitive data
        processedInfo = {
            'type': 'binary',
            'status': 'not_implemented',
            'message': 'Binary data neutralization not yet implemented'
        }
        return content, {}, [], processedInfo

    def getSupportedTypes(self) -> Dict[str, list]:
        """
        Get the supported binary file types.

        The extension lists are copied as well, so mutating the result does
        not alter the processor's own table.

        Returns:
            Dict[str, list]: Dictionary of supported types and their extensions
        """
        return {
            category: list(extensions)
            for category, extensions in self.supported_types.items()
        }