wiki/test_neutralizer/neutralizer.py
2025-06-06 15:50:29 +02:00

201 lines
No EOL
6.5 KiB
Python

"""
DSGVO-konformer Daten-Neutralisierer für KI-Agentensysteme
Unterstützt TXT, JSON, CSV, Excel und Word-Dateien
Mehrsprachig: DE, EN, FR, IT
"""
import re
import json
import pandas as pd
import docx
from pathlib import Path
from typing import Dict, List, Tuple, Any, Union
from dataclasses import dataclass
import uuid
import logging
import sys
# Make the directory above the one containing this file importable
# (presumably so sibling packages can be imported when run as a script).
sys.path.append(str(Path(__file__).parent.parent))
# Logging configuration: root logger at INFO level, plus a module-level logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@dataclass
class SensitivePattern:
    """Describe one category of sensitive data and how to mask it.

    Attributes:
        name: Short identifier of the category (e.g. ``"email"``).
        pattern: Regular-expression source that matches the data.
        languages: Language/region codes the pattern applies to
            (e.g. ``"de"``, ``"ch"``).
        replacement_template: Placeholder template with one ``{}`` slot
            that receives a running number (e.g. ``"EMAIL_{}"``).
    """

    name: str
    pattern: str
    languages: List[str]
    replacement_template: str


class DataAnonymizer:
    """Replace sensitive values in text with stable placeholders.

    Attributes:
        patterns: Detection patterns, built once by ``_initialize_patterns``.
        anonymization_map: Placeholder -> original value for every
            replacement made through this instance so far.
    """

    def __init__(self):
        self.patterns = self._initialize_patterns()
        self.anonymization_map: Dict[str, str] = {}
        # (pattern name, original value) -> placeholder, so a value that
        # occurs several times is always mapped to the same placeholder.
        self._placeholder_lookup: Dict[Tuple[str, str], str] = {}
        # pattern name -> number of placeholders issued for that pattern.
        self._placeholder_counts: Dict[str, int] = {}

    def _initialize_patterns(self) -> List[SensitivePattern]:
        """Build the regex patterns for the supported languages.

        The list order matters: ``anonymize_text`` uses it as the last
        tie-breaker when two patterns match exactly the same span.
        """
        return [
            # E-mail addresses. The TLD class is ``[A-Za-z]``: a ``|`` inside
            # a character class would be a literal pipe, not an alternation.
            SensitivePattern(
                name="email",
                pattern=r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
                languages=["de", "en", "fr", "it", "ch"],
                replacement_template="EMAIL_{}"
            ),
            # Phone numbers (various formats)
            SensitivePattern(
                name="phone",
                pattern=r'(\+\d{1,3}[-.\s]?)?\(?\d{1,4}\)?[-.\s]?\d{1,4}[-.\s]?\d{1,9}',
                languages=["de", "en", "fr", "it", "ch"],
                replacement_template="PHONE_{}"
            ),
            # German phone numbers specifically
            SensitivePattern(
                name="phone_de",
                pattern=r'(\+49|0049|0)\s?(\d{2,5})\s?(\d{3,8})',
                languages=["de"],
                replacement_template="PHONE_DE_{}"
            ),
            # Swiss phone numbers
            SensitivePattern(
                name="phone_ch",
                pattern=r'(\+41|0041|0)\s?(\d{2})\s?(\d{3})\s?(\d{2})\s?(\d{2})',
                languages=["ch"],
                replacement_template="PHONE_CH_{}"
            ),
            # IBAN, compact or printed in space-separated groups of four.
            # Only a plain space is accepted as separator so a match never
            # runs across a line break. A short upper-case token directly
            # after a grouped IBAN may be absorbed (over-redaction).
            SensitivePattern(
                name="iban",
                pattern=r'\b[A-Z]{2}\d{2}(?:[ ]?[A-Z0-9]{4}){2,7}(?:[ ]?[A-Z0-9]{1,3})?\b',
                languages=["de", "en", "fr", "it", "ch"],
                replacement_template="IBAN_{}"
            ),
            # Credit card numbers
            SensitivePattern(
                name="credit_card",
                pattern=r'\b(?:\d{4}[-\s]?){3}\d{4}\b',
                languages=["de", "en", "fr", "it", "ch"],
                replacement_template="CREDITCARD_{}"
            ),
            # German social security number
            SensitivePattern(
                name="social_security_de",
                pattern=r'\b\d{2}\s?\d{6}\s?[A-Z]\s?\d{3}\b',
                languages=["de"],
                replacement_template="SSN_DE_{}"
            ),
            # Swiss AHV/AVS number
            SensitivePattern(
                name="ahv_number",
                pattern=r'\b(756\.\d{4}\.\d{4}\.\d{2}|756\s\d{4}\s\d{4}\s\d{2})\b',
                languages=["ch"],
                replacement_template="AHV_CH_{}"
            ),
            # IP addresses
            SensitivePattern(
                name="ip_address",
                pattern=r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b',
                languages=["de", "en", "fr", "it", "ch"],
                replacement_template="IP_{}"
            ),
            # Postal codes (Germany)
            SensitivePattern(
                name="postal_code_de",
                pattern=r'\b\d{5}\b',
                languages=["de"],
                replacement_template="PLZ_{}"
            ),
            # Swiss postal codes
            SensitivePattern(
                name="postal_code_ch",
                pattern=r'\b[1-9]\d{3}\b',
                languages=["ch"],
                replacement_template="PLZ_CH_{}"
            ),
            # Names (simple heuristic: title followed by two capitalised words)
            SensitivePattern(
                name="names",
                pattern=r'\b(Herr|Frau|Mr\.|Mrs\.|Ms\.|Monsieur|Madame|Signore|Signora)\s+[A-ZÄÖÜ][a-zäöüß]+\s+[A-ZÄÖÜ][a-zäöüß]+\b',
                languages=["de", "en", "fr", "it", "ch"],
                replacement_template="NAME_{}"
            ),
            # Addresses (simplified)
            SensitivePattern(
                name="address",
                pattern=r'\b[A-ZÄÖÜ][a-zäöüß]+straße\s+\d+[a-z]?\b|\b[A-ZÄÖÜ][a-zäöüß]+\s+Street\s+\d+\b|\b\d+\s+[A-ZÄÖÜ][a-zäöüß]+\s+Street\b',
                languages=["de", "en", "ch"],
                replacement_template="ADDRESS_{}"
            ),
            # Swiss UID/IDE number
            SensitivePattern(
                name="uid_number",
                pattern=r'\bCHE-\d{3}\.\d{3}\.\d{3}\b',
                languages=["ch"],
                replacement_template="UID_CH_{}"
            ),
            # Swiss bank account numbers
            SensitivePattern(
                name="bank_account_ch",
                pattern=r'\b\d{2}-\d{2,6}-\d{1}\b',
                languages=["ch"],
                replacement_template="BANK_CH_{}"
            ),
        ]

    def _placeholder_for(self, sensitive: SensitivePattern, original: str) -> str:
        """Return the placeholder for *original*, creating it on first use."""
        key = (sensitive.name, original)
        placeholder = self._placeholder_lookup.get(key)
        if placeholder is None:
            number = self._placeholder_counts.get(sensitive.name, 0) + 1
            self._placeholder_counts[sensitive.name] = number
            placeholder = sensitive.replacement_template.format(number)
            self._placeholder_lookup[key] = placeholder
            self.anonymization_map[placeholder] = original
        return placeholder

    def anonymize_text(self, text: str, language: str = "de") -> Tuple[str, Dict[str, str]]:
        """Replace sensitive values in *text* with placeholders.

        Every pattern whose ``languages`` contains *language* is matched
        against the original text in a single pass, so a placeholder can
        never be re-matched by another pattern. Overlapping matches are
        resolved by leftmost start, then longest match, then pattern order.

        Args:
            text: Text to anonymize. Empty or ``None`` yields ``("", {})``.
            language: Language/region code used to select patterns
                (compared case-insensitively).

        Returns:
            A tuple ``(anonymized_text, mapping)`` where ``mapping`` maps
            each placeholder used in this call to the original value.
        """
        if not text:
            return "", {}

        wanted = language.lower()
        spans = []  # (start, end, pattern index, pattern)
        for index, sensitive in enumerate(self.patterns):
            if wanted not in sensitive.languages:
                continue
            for match in re.finditer(sensitive.pattern, text):
                if match.end() > match.start():  # ignore empty matches
                    spans.append((match.start(), match.end(), index, sensitive))

        # Leftmost first; for equal starts the longer match, then list order.
        spans.sort(key=lambda s: (s[0], s[0] - s[1], s[2]))

        pieces = []
        mapping: Dict[str, str] = {}
        cursor = 0
        for start, end, _, sensitive in spans:
            if start < cursor:
                continue  # overlaps a span that was already replaced
            original = text[start:end]
            placeholder = self._placeholder_for(sensitive, original)
            mapping[placeholder] = original
            pieces.append(text[cursor:start])
            pieces.append(placeholder)
            cursor = end
        pieces.append(text[cursor:])
        return "".join(pieces), mapping
def main():
    """Run a small console demo of the anonymizer on a German sample letter.

    Relies on ``DataAnonymizer.anonymize_text(text, language)`` returning an
    ``(anonymized_text, mapping)`` pair whose mapping yields
    placeholder/original items.
    """
    divider = "\n" + "=" * 50 + "\n"

    # Sample text
    letter = """
Sehr geehrte Frau Müller,
vielen Dank für Ihre E-Mail an max.mustermann@beispiel.de.
Ihre Telefonnummer 030-12345678 wurde in unserem System hinterlegt.
Die Rechnung wird an folgende Adresse gesendet:
Musterstraße 123, 12345 Berlin
Ihre IBAN: DE89 3704 0044 0532 0130 00
Mit freundlichen Grüßen
Max Mustermann
"""

    # Anonymization
    neutralizer = DataAnonymizer()
    redacted, replacements = neutralizer.anonymize_text(letter, "de")

    print("Originaler Text:")
    print(letter)
    print(divider)
    print("Anonymisierter Text:")
    print(redacted)
    print(divider)
    print("Mapping:")
    for token, source_value in replacements.items():
        print(f"{token} -> {source_value}")
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()