101 lines
3.3 KiB
Python
101 lines
3.3 KiB
Python
"""
Binary data processing module for data anonymization.

Handles binary data types (images, audio, video, etc.).
"""
|
|
|
|
import base64
|
|
import re
|
|
from typing import Dict, Any, Tuple
|
|
from dataclasses import dataclass
|
|
|
|
@dataclass
class BinaryData:
    """Represents binary data.

    Attributes:
        content: The data itself, carried as a string.
        data_type: Category label; one of 'image', 'audio', 'video',
            'document' or 'unknown'.
        encoding: How ``content`` is encoded; one of 'base64', 'hex'
            or 'raw'.
    """

    content: str
    data_type: str
    encoding: str
|
|
|
|
class BinaryProcessor:
    """Handles binary data processing for anonymization.

    Detection is heuristic: content is classified as either 'binary' or
    'text'. Actual neutralization of binary payloads is not implemented yet;
    ``processBinaryContent`` returns its input unchanged.
    """

    # Shape of a base64 payload: alphabet characters followed by at most two
    # '=' padding characters. Compiled once instead of on every call.
    _BASE64_PATTERN = re.compile(r'^[A-Za-z0-9+/]*={0,2}$')

    def __init__(self):
        """Initialize the binary processor with the known file extensions."""
        self.supported_types = {
            'image': ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'],
            'audio': ['.mp3', '.wav', '.flac', '.aac', '.ogg', '.m4a'],
            'video': ['.mp4', '.avi', '.mov', '.wmv', '.flv', '.mkv', '.webm'],
            'document': ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx']
        }

    def _detectBinaryType(self, content: str) -> str:
        """
        Detect whether content is binary data.

        Args:
            content: Content to analyze

        Returns:
            str: 'binary' if the content is base64 that decodes to bytes which
            are not valid UTF-8, or if it is longer than 100 characters and
            contains a NUL character; otherwise 'text'.
        """
        candidate = content.strip()

        # Check if content looks like base64
        if self._BASE64_PATTERN.match(candidate):
            try:
                decoded = base64.b64decode(candidate)
            except ValueError:
                # binascii.Error (a ValueError subclass) means the string only
                # resembles base64 (e.g. a plain word with wrong padding). That
                # is not evidence of binary data, so fall through to the
                # remaining checks instead of reporting 'binary'.
                decoded = None

            if decoded is not None:
                try:
                    decoded.decode('utf-8')
                except UnicodeDecodeError:
                    # Valid base64 whose payload is not UTF-8 text.
                    return 'binary'
                return 'text'  # It's base64 encoded text

            # NOTE: short alphanumeric words whose length is a multiple of 4
            # (e.g. "test") are valid base64 and can still decode to non-UTF-8
            # bytes, so they are reported as 'binary' above. This ambiguity is
            # inherent to the heuristic.

        # Check for binary patterns
        if len(content) > 100 and '\x00' in content:
            return 'binary'

        return 'text'

    def isBinaryContent(self, content: str) -> bool:
        """
        Check if content is binary data.

        Args:
            content: Content to check

        Returns:
            bool: True if ``_detectBinaryType`` classifies the content as
            'binary'
        """
        return self._detectBinaryType(content) == 'binary'

    def processBinaryContent(self, content: str) -> Tuple[Any, Dict[str, str], list, Dict[str, Any]]:
        """
        Process binary content for anonymization.

        Currently a placeholder: the content is returned unchanged together
        with an empty mapping, an empty list of replaced fields and an info
        dict whose 'status' is 'not_implemented'.

        Args:
            content: Binary content to process

        Returns:
            Tuple of (processed_data, mapping, replaced_fields, processed_info)
        """
        # TODO: Implement binary data neutralization
        # This would require:
        # 1. Detecting binary data types (images, audio, video, etc.)
        # 2. Implementing specific neutralization for each type
        # 3. Handling metadata and embedded content
        # 4. Preserving binary integrity while removing sensitive data

        processedInfo = {
            'type': 'binary',
            'status': 'not_implemented',
            'message': 'Binary data neutralization not yet implemented'
        }

        return content, {}, [], processedInfo

    def getSupportedTypes(self) -> Dict[str, list]:
        """
        Get the supported binary file types.

        Returns:
            Dict[str, list]: Dictionary of supported types and their
            extensions. Both the dict and its lists are copies, so callers
            can modify the result without affecting this processor.
        """
        # dict.copy() alone is shallow and would share the extension lists
        # with self.supported_types.
        return {
            category: list(extensions)
            for category, extensions in self.supported_types.items()
        }
|