gateway/modules/neutralizer/neutralizer.py
2025-09-22 00:39:15 +02:00

112 lines
No EOL
4.3 KiB
Python

"""
DSGVO-konformer Daten-Neutralisierer für KI-Agentensysteme
Unterstützt TXT, JSON, CSV, Excel und Word-Dateien
Mehrsprachig: DE, EN, FR, IT
"""
import logging
from typing import Dict, List, Any
# Import all necessary classes and functions
from modules.neutralizer.subProcessCommon import ProcessResult, CommonUtils
from modules.neutralizer.subProcessText import TextProcessor, PlainText
from modules.neutralizer.subProcessList import ListProcessor, TableData
from modules.neutralizer.subProcessBinary import BinaryProcessor, BinaryData
from modules.neutralizer.subParseString import StringParser
from modules.neutralizer.subPatterns import Pattern, HeaderPatterns, DataPatterns, TextTablePatterns
# Module-level logger, named after this module
logger = logging.getLogger(__name__)

# Public API of this module: the anonymizer class plus the helper classes
# re-exported from the sub-modules imported above.
__all__ = [
    "DataAnonymizer",
    "ProcessResult",
    "CommonUtils",
    "TextProcessor",
    "PlainText",
    "ListProcessor",
    "TableData",
    "BinaryProcessor",
    "BinaryData",
    "StringParser",
    "Pattern",
    "HeaderPatterns",
    "DataPatterns",
    "TextTablePatterns",
]
class DataAnonymizer:
    """Main class for data anonymization.

    Acts as a thin router: it owns a text processor, a list processor
    (used for CSV, JSON and XML content) and a binary processor, and
    forwards content to one of them.
    """

    def __init__(self, names_to_parse: List[str] = None):
        """Initialize the anonymizer with its specialized processors.

        Args:
            names_to_parse: List of names to parse and replace
                (case-insensitive). ``None`` is treated as an empty list.
        """
        self.names_to_parse = names_to_parse or []

        # Hand the normalized list to the processors so they receive the
        # same value this instance stores, never a bare None.
        self.text_processor = TextProcessor(self.names_to_parse)
        self.list_processor = ListProcessor(self.names_to_parse)
        self.binary_processor = BinaryProcessor()

        # Common utilities (content type detection, mapping merge)
        self.common_utils = CommonUtils()

    def process_content(self, content: str, content_type: str = None) -> ProcessResult:
        """Process content and return the anonymized data.

        Args:
            content: Content to process.
            content_type: 'csv', 'json' or 'xml' select the list processor;
                any other value is handled as plain text. If None, the type
                is determined with ``CommonUtils.detect_content_type``.
                Note that binary handling is not selected by this argument:
                it applies whenever the binary processor reports the
                content as binary.

        Returns:
            ProcessResult: Built from the four values returned by the
            selected processor (result, mapping, replaced fields,
            processing info). For binary content, the return value of
            ``process_binary_content`` is passed through unchanged. If any
            exception occurs, it is logged and a ProcessResult carrying
            ``None``, an empty mapping, an empty list and
            ``{'type': 'error', 'error': <message>}`` is returned instead.
        """
        try:
            # Auto-detect content type if not provided
            if content_type is None:
                content_type = self.common_utils.detect_content_type(content)

            # Binary content bypasses the type-based routing entirely
            if self.binary_processor.is_binary_content(content):
                return self.binary_processor.process_binary_content(content)

            # Route to the appropriate processor based on content type
            if content_type == 'csv':
                processed = self.list_processor.process_csv_content(content)
            elif content_type == 'json':
                processed = self.list_processor.process_json_content(content)
            elif content_type == 'xml':
                processed = self.list_processor.process_xml_content(content)
            else:
                # Everything else is handled as text
                processed = self.text_processor.process_text_content(content)

            result, mapping, replaced_fields, processed_info = processed
            return ProcessResult(result, mapping, replaced_fields, processed_info)
        except Exception as e:
            # Best-effort boundary: callers get an error result rather than
            # an exception. logger.exception keeps the traceback in the log.
            logger.exception("Error processing content: %s", e)
            return ProcessResult(None, {}, [], {'type': 'error', 'error': str(e)})

    def get_mapping(self) -> Dict[str, str]:
        """Get the combined mapping of the text and list processors.

        Returns:
            Dict[str, str]: Result of merging both processor mappings via
            ``CommonUtils.merge_mappings``.
        """
        text_mapping = self.text_processor.get_mapping()
        list_mapping = self.list_processor.get_mapping()
        return self.common_utils.merge_mappings(text_mapping, list_mapping)

    def clear_mapping(self) -> None:
        """Clear the mapping in the text and list processors."""
        self.text_processor.clear_mapping()
        self.list_processor.clear_mapping()