112 lines
No EOL
4.3 KiB
Python
112 lines
No EOL
4.3 KiB
Python
"""
|
|
GDPR-compliant data neutralizer for AI agent systems.

Supports TXT, JSON, CSV, Excel and Word files.
Multilingual: DE, EN, FR, IT.
|
|
"""
|
|
|
|
import logging
|
|
from typing import Dict, List, Any
|
|
|
|
# Import all necessary classes and functions
|
|
from modules.neutralizer.subProcessCommon import ProcessResult, CommonUtils
|
|
from modules.neutralizer.subProcessText import TextProcessor, PlainText
|
|
from modules.neutralizer.subProcessList import ListProcessor, TableData
|
|
from modules.neutralizer.subProcessBinary import BinaryProcessor, BinaryData
|
|
from modules.neutralizer.subParseString import StringParser
|
|
from modules.neutralizer.subPatterns import Pattern, HeaderPatterns, DataPatterns, TextTablePatterns
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Public API of this module: the anonymizer facade plus the processor,
# data-container, parser and pattern classes imported above.
__all__ = [
    'DataAnonymizer',
    'ProcessResult', 'CommonUtils',
    'TextProcessor', 'PlainText',
    'ListProcessor', 'TableData',
    'BinaryProcessor', 'BinaryData',
    'StringParser',
    'Pattern', 'HeaderPatterns', 'DataPatterns', 'TextTablePatterns',
]
|
|
|
|
class DataAnonymizer:
    """Main class for data anonymization.

    Holds one text processor, one list processor and one binary processor
    and routes incoming content to one of them.
    """

    def __init__(self, names_to_parse: List[str] = None):
        """Initialize the anonymizer with its specialized processors.

        Args:
            names_to_parse: List of names to parse and replace
                (case-insensitive). ``None`` is treated as an empty list.
        """
        # Normalize once so the processors receive the same value that is
        # stored on the instance, never ``None``.
        self.names_to_parse = names_to_parse or []

        # Specialized processors
        self.text_processor = TextProcessor(self.names_to_parse)
        self.list_processor = ListProcessor(self.names_to_parse)
        self.binary_processor = BinaryProcessor()

        # Common utilities (content-type detection, mapping merge)
        self.common_utils = CommonUtils()

    def process_content(self, content: str, content_type: str = None) -> "ProcessResult":
        """Process content and return the anonymized data.

        Content that the binary processor recognizes as binary is always
        handled by the binary processor, whatever ``content_type`` says.
        Otherwise 'csv', 'json' and 'xml' go to the list processor and every
        other value is processed as plain text.

        Args:
            content: Content to process.
            content_type: Type of content ('csv', 'json', 'xml', 'text').
                If None, it is auto-detected via
                ``CommonUtils.detect_content_type``.

        Returns:
            ProcessResult: Contains anonymized data, mapping, replaced fields
            and processing info. If any exception is raised while processing,
            it is logged and a result with ``None`` data, an empty mapping,
            no replaced fields and ``{'type': 'error', 'error': <message>}``
            as processing info is returned instead of raising.
        """
        try:
            # Binary content short-circuits regardless of content_type, so
            # check it first and skip type detection when it is not needed.
            if self.binary_processor.is_binary_content(content):
                return self.binary_processor.process_binary_content(content)

            # Auto-detect content type if not provided
            if content_type is None:
                content_type = self.common_utils.detect_content_type(content)

            # Route to the appropriate processor; each returns a 4-tuple
            # (result, mapping, replaced_fields, processed_info).
            if content_type == 'csv':
                processed = self.list_processor.process_csv_content(content)
            elif content_type == 'json':
                processed = self.list_processor.process_json_content(content)
            elif content_type == 'xml':
                processed = self.list_processor.process_xml_content(content)
            else:
                # Anything else is handled as text
                processed = self.text_processor.process_text_content(content)

            result, mapping, replaced_fields, processed_info = processed
            return ProcessResult(result, mapping, replaced_fields, processed_info)

        except Exception as e:
            # Top-level boundary: report the failure through the result
            # object; logger.exception keeps the traceback in the log.
            logger.exception("Error processing content: %s", e)
            return ProcessResult(None, {}, [], {'type': 'error', 'error': str(e)})

    def get_mapping(self) -> Dict[str, str]:
        """Get the combined mapping from the text and list processors.

        The binary processor is not consulted.

        Returns:
            Dict[str, str]: Result of ``CommonUtils.merge_mappings`` applied
            to the text processor's and the list processor's mappings.
        """
        text_mapping = self.text_processor.get_mapping()
        list_mapping = self.list_processor.get_mapping()
        return self.common_utils.merge_mappings(text_mapping, list_mapping)

    def clear_mapping(self) -> None:
        """Clear the mapping in the text and list processors."""
        self.text_processor.clear_mapping()
        self.list_processor.clear_mapping()