""" DSGVO-konformer Daten-Neutralisierer für KI-Agentensysteme Unterstützt TXT, JSON, CSV, Excel und Word-Dateien Mehrsprachig: DE, EN, FR, IT """ import re import json import pandas as pd import docx from pathlib import Path from typing import Dict, List, Tuple, Any, Union from dataclasses import dataclass import uuid import logging import sys # Add the parent directory to the Python path sys.path.append(str(Path(__file__).parent.parent)) # Konfiguration für Logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @dataclass class SensitivePattern: """Definiert ein Pattern für sensitive Daten""" name: str pattern: str languages: List[str] replacement_template: str class DataAnonymizer: """Hauptklasse für die Datenanonymisierung""" def __init__(self): self.patterns = self._initialize_patterns() self.anonymization_map = {} def _initialize_patterns(self) -> List[SensitivePattern]: """Initialisiert die Regex-Pattern für verschiedene Sprachen""" return [ # E-Mail Adressen SensitivePattern( name="email", pattern=r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', languages=["de", "en", "fr", "it", "ch"], replacement_template="EMAIL_{}" ), # Telefonnummern (verschiedene Formate) SensitivePattern( name="phone", pattern=r'(\+\d{1,3}[-.\s]?)?\(?\d{1,4}\)?[-.\s]?\d{1,4}[-.\s]?\d{1,9}', languages=["de", "en", "fr", "it", "ch"], replacement_template="PHONE_{}" ), # Deutsche Telefonnummern spezifisch SensitivePattern( name="phone_de", pattern=r'(\+49|0049|0)\s?(\d{2,5})\s?(\d{3,8})', languages=["de"], replacement_template="PHONE_DE_{}" ), # Schweizer Telefonnummern SensitivePattern( name="phone_ch", pattern=r'(\+41|0041|0)\s?(\d{2})\s?(\d{3})\s?(\d{2})\s?(\d{2})', languages=["ch"], replacement_template="PHONE_CH_{}" ), # IBAN SensitivePattern( name="iban", pattern=r'\b[A-Z]{2}\d{2}[A-Z0-9]{4}\d{7}([A-Z0-9]?){0,16}\b', languages=["de", "en", "fr", "it", "ch"], replacement_template="IBAN_{}" ), # Kreditkartennummern SensitivePattern( name="credit_card", pattern=r'\b(?:\d{4}[-\s]?){3}\d{4}\b', languages=["de", "en", "fr", "it", "ch"], replacement_template="CREDITCARD_{}" ), # Deutsche Sozialversicherungsnummer SensitivePattern( name="social_security_de", pattern=r'\b\d{2}\s?\d{6}\s?[A-Z]\s?\d{3}\b', languages=["de"], replacement_template="SSN_DE_{}" ), # Schweizer AHV/AVS Nummer SensitivePattern( name="ahv_number", pattern=r'\b(756\.\d{4}\.\d{4}\.\d{2}|756\s\d{4}\s\d{4}\s\d{2})\b', languages=["ch"], replacement_template="AHV_CH_{}" ), # IP-Adressen SensitivePattern( name="ip_address", pattern=r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b', languages=["de", "en", "fr", "it", "ch"], replacement_template="IP_{}" ), # Postleitzahlen (Deutschland) SensitivePattern( name="postal_code_de", pattern=r'\b\d{5}\b', languages=["de"], replacement_template="PLZ_{}" ), # Schweizer Postleitzahlen SensitivePattern( name="postal_code_ch", pattern=r'\b[1-9]\d{3}\b', languages=["ch"], replacement_template="PLZ_CH_{}" ), # Namen (einfache Heuristik - kann erweitert werden) SensitivePattern( name="names", pattern=r'\b(Herr|Frau|Mr\.|Mrs\.|Ms\.|Monsieur|Madame|Signore|Signora)\s+[A-ZÄÖÜ][a-zäöüß]+\s+[A-ZÄÖÜ][a-zäöüß]+\b', languages=["de", "en", "fr", "it", "ch"], replacement_template="NAME_{}" ), # Adressen (vereinfacht) SensitivePattern( name="address", pattern=r'\b[A-ZÄÖÜ][a-zäöüß]+straße\s+\d+[a-z]?\b|\b[A-ZÄÖÜ][a-zäöüß]+\s+Street\s+\d+\b|\b\d+\s+[A-ZÄÖÜ][a-zäöüß]+\s+Street\b', languages=["de", "en", "ch"], replacement_template="ADDRESS_{}" ), # Schweizer UID/IDE Nummer SensitivePattern( name="uid_number", pattern=r'\bCHE-\d{3}\.\d{3}\.\d{3}\b', languages=["ch"], replacement_template="UID_CH_{}" ), # Schweizer Bankkontonummern (BC-Nummern) SensitivePattern( name="bank_account_ch", pattern=r'\b\d{2}-\d{2,6}-\d{1}\b', languages=["ch"], replacement_template="BANK_CH_{}" ), ] def main(): """Einfaches Beispiel für die Verwendung des Daten-Neutralisierers""" anonymizer = DataAnonymizer() # Beispieltext sample_text = """ Sehr geehrte Frau Müller, vielen Dank für Ihre E-Mail an max.mustermann@beispiel.de. Ihre Telefonnummer 030-12345678 wurde in unserem System hinterlegt. Die Rechnung wird an folgende Adresse gesendet: Musterstraße 123, 12345 Berlin Ihre IBAN: DE89 3704 0044 0532 0130 00 Mit freundlichen Grüßen Max Mustermann """ # Anonymisierung anonymized_text, mapping = anonymizer.anonymize_text(sample_text, "de") print("Originaler Text:") print(sample_text) print("\n" + "="*50 + "\n") print("Anonymisierter Text:") print(anonymized_text) print("\n" + "="*50 + "\n") print("Mapping:") for placeholder, original in mapping.items(): print(f"{placeholder} -> {original}") if __name__ == "__main__": main()