# gateway/modules/neutralizer/subProcessBinary.py
# 2025-09-22 00:39:15 +02:00
#
# 101 lines
# 3.3 KiB
# Python

"""
Binary data processing module for data anonymization
Handles binary data types (images, audio, video, etc.)
"""
import base64
import re
from typing import Dict, Any, Tuple
from dataclasses import dataclass
@dataclass
class BinaryData:
    """Container describing one piece of binary data.

    Attributes:
        content: The payload, held as a string.
        data_type: Category label; one of 'image', 'audio', 'video',
            'document' or 'unknown'.
        encoding: How ``content`` is encoded; one of 'base64', 'hex'
            or 'raw'.
    """

    content: str
    data_type: str
    encoding: str
class BinaryProcessor:
    """Handles binary data processing for anonymization.

    Offers a heuristic text/binary classifier and a placeholder
    neutralization entry point; actual neutralization of binary payloads
    is not implemented yet.
    """

    # Strings consisting solely of base64 alphabet characters followed by at
    # most two '=' padding characters. Compiled once instead of per call.
    _BASE64_PATTERN = re.compile(r'^[A-Za-z0-9+/]*={0,2}$')

    def __init__(self):
        """Initialize the binary processor with its extension table."""
        self.supported_types = {
            'image': ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'],
            'audio': ['.mp3', '.wav', '.flac', '.aac', '.ogg', '.m4a'],
            'video': ['.mp4', '.avi', '.mov', '.wmv', '.flv', '.mkv', '.webm'],
            'document': ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx']
        }

    def detect_binary_type(self, content: str) -> str:
        """
        Classify content as 'binary' or 'text' using two heuristics.

        1. If the whitespace-stripped content is well-formed base64, it is
           decoded: bytes that are valid UTF-8 mean 'text', anything else
           means 'binary'.
        2. Otherwise, content longer than 100 characters that contains a
           NUL character is 'binary'.

        Content that merely uses base64 alphabet characters but cannot be
        decoded (wrong length or padding, e.g. the word "hello") is not
        base64 at all and is therefore no longer reported as 'binary'.

        Args:
            content: Content to analyze

        Returns:
            str: 'binary' or 'text'
        """
        candidate = content.strip()

        if self._BASE64_PATTERN.match(candidate):
            try:
                # Decode the same stripped string the pattern validated.
                decoded = base64.b64decode(candidate, validate=True)
            except ValueError:
                # binascii.Error is a ValueError subclass: the string is not
                # valid base64, so it says nothing about being binary.
                decoded = None

            if decoded is not None:
                # NOTE(review): short plain words whose length is a multiple
                # of four (e.g. "John") are valid base64 and usually decode
                # to non-UTF-8 bytes, so they still classify as 'binary'.
                # Confirm whether a minimum length should apply here.
                try:
                    decoded.decode('utf-8')
                except UnicodeDecodeError:
                    return 'binary'
                return 'text'  # base64-encoded text

        # Raw binary pattern: a long string with an embedded NUL character.
        if len(content) > 100 and '\x00' in content:
            return 'binary'

        return 'text'

    def is_binary_content(self, content: str) -> bool:
        """
        Check if content is binary data.

        Args:
            content: Content to check

        Returns:
            bool: True if detect_binary_type() classifies it as 'binary'
        """
        return self.detect_binary_type(content) == 'binary'

    def process_binary_content(self, content: str) -> Tuple[Any, Dict[str, str], list, Dict[str, Any]]:
        """
        Process binary content for anonymization (placeholder).

        Neutralization is not implemented: the content is returned
        unchanged with an empty mapping, an empty list of replaced fields
        and an info dict whose 'status' is 'not_implemented'.

        Args:
            content: Binary content to process

        Returns:
            Tuple of (processed_data, mapping, replaced_fields, processed_info)
        """
        # TODO: Implement binary data neutralization
        # This would require:
        # 1. Detecting binary data types (images, audio, video, etc.)
        # 2. Implementing specific neutralization for each type
        # 3. Handling metadata and embedded content
        # 4. Preserving binary integrity while removing sensitive data
        processed_info = {
            'type': 'binary',
            'status': 'not_implemented',
            'message': 'Binary data neutralization not yet implemented'
        }
        return content, {}, [], processed_info

    def get_supported_types(self) -> Dict[str, list]:
        """
        Get the supported binary file types.

        The extension lists are copied as well, so callers may modify the
        result without altering this processor's own table.

        Returns:
            Dict[str, list]: Dictionary of supported types and their extensions
        """
        return {
            category: list(extensions)
            for category, extensions in self.supported_types.items()
        }