# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Pattern definitions for data anonymization
Separates header patterns from data patterns
"""

from dataclasses import dataclass
from typing import List, Optional, Tuple
import re


@dataclass
class Pattern:
    """Base class for patterns"""
    # Category identifier, e.g. "name", "email", "iban"
    name: str
    # Regular expressions (as source strings) that identify this category
    patterns: List[str]
    # Placeholder template with one "{}" slot, e.g. "[NAME_{}]"
    replacement_template: str


class HeaderPatterns:
    """Patterns for identifying sensitive data in headers"""

    # NOTE: getPatternForHeader returns the first Pattern with a matching regex,
    # so the order of this list decides which category wins on overlap.
    patterns = [
        # Name patterns
        Pattern(
            name="name",
            patterns=[
                # Simple variations
                r'\b(?:name|first[-_\s]*name|last[-_\s]*name|full[-_\s]*name)\b',
                r'\b(?:customer[-_\s]*name|client[-_\s]*name|user[-_\s]*name)\b',
                r'\b(?:given[-_\s]*name|family[-_\s]*name|surname)\b',
                # German variations
                r'\b(?:vorname|nachname|vollständiger[-_\s]*name|name)\b',
                r'\b(?:kunden[-_\s]*name|kunde[-_\s]*name|benutzer[-_\s]*name)\b',
                # French variations
                r'\b(?:prénom|nom|nom[-_\s]*complet)\b',
                r'\b(?:nom[-_\s]*du[-_\s]*client|nom[-_\s]*d\'utilisateur)\b',
                # Italian variations
                r'\b(?:nome|cognome|nome[-_\s]*completo)\b',
                r'\b(?:nome[-_\s]*cliente|nome[-_\s]*utente)\b',
                # Common variations
                r'\b(?:nom|name|nome|naam)\b'
            ],
            replacement_template="[NAME_{}]"
        ),
        # Email patterns
        Pattern(
            name="email",
            patterns=[
                # Simple variations - only labels
                r'\b(?:email|e[-_\s]*mail|mail)\s*:?\b',
                r'\b(?:contact[-_\s]*email|user[-_\s]*email|client[-_\s]*email)\s*:?\b',
                r'\b(?:customer[-_\s]*email|customer[-_\s]*mail|customer[-_\s]*e[-_\s]*mail)\s*:?\b',
                # German variations - only labels
                r'\b(?:e[-_\s]*mail|e[-_\s]*post|mail[-_\s]*adresse)\s*:?\b',
                r'\b(?:kontakt[-_\s]*email|benutzer[-_\s]*email|kunden[-_\s]*email)\s*:?\b',
                r'\b(?:kunden[-_\s]*mail|kunden[-_\s]*e[-_\s]*mail|kunden[-_\s]*e[-_\s]*post)\s*:?\b',
                # French variations - only labels
                r'\b(?:courriel|e[-_\s]*mail|adresse[-_\s]*e[-_\s]*mail)\s*:?\b',
                r'\b(?:courriel[-_\s]*de[-_\s]*contact|e[-_\s]*mail[-_\s]*client)\s*:?\b',
                r'\b(?:courriel[-_\s]*client|courriel[-_\s]*utilisateur|mail[-_\s]*client)\s*:?\b',
                # Italian variations - only labels
                r'\b(?:posta[-_\s]*elettronica|e[-_\s]*mail|indirizzo[-_\s]*e[-_\s]*mail)\s*:?\b',
                r'\b(?:email[-_\s]*cliente|email[-_\s]*utente)\s*:?\b',
                r'\b(?:mail[-_\s]*cliente|mail[-_\s]*utente|posta[-_\s]*cliente)\s*:?\b'
            ],
            replacement_template="[EMAIL_{}]"
        ),
        # Phone patterns
        Pattern(
            name="phone",
            patterns=[
                # Simple variations
                r'\b(?:phone|tel|telephone|mobile)\b',
                r'\b(?:contact[-_\s]*number|phone[-_\s]*number|tel[-_\s]*number)\b',
                # German variations
                r'\b(?:telefon|mobil|handy|telefon[-_\s]*nummer)\b',
                r'\b(?:kontakt[-_\s]*nummer|telefon[-_\s]*nummer|tel[-_\s]*nummer)\b',
                # French variations
                r'\b(?:téléphone|portable|mobile|numéro[-_\s]*de[-_\s]*téléphone)\b',
                r'\b(?:numéro[-_\s]*de[-_\s]*contact|tél[-_\s]*fixe|tél[-_\s]*mobile)\b',
                # Italian variations
                r'\b(?:telefono|cellulare|mobile|numero[-_\s]*di[-_\s]*telefono)\b',
                r'\b(?:numero[-_\s]*di[-_\s]*contatto|tel[-_\s]*fisso|tel[-_\s]*mobile)\b'
            ],
            replacement_template="[PHONE_{}]"
        ),
        # IBAN patterns
        Pattern(
            name="iban",
            patterns=[
                # Simple variations
                r'\b(?:iban|bank[-_\s]*account|account[-_\s]*number)\b',
                r'\b(?:bank[-_\s]*details|account[-_\s]*details|banking[-_\s]*info)\b',
                # German variations
                r'\b(?:iban|bank[-_\s]*konto|konto[-_\s]*nummer)\b',
                r'\b(?:bank[-_\s]*verbindung|konto[-_\s]*verbindung|bank[-_\s]*daten)\b',
                # French variations
                r'\b(?:iban|compte[-_\s]*bancaire|numéro[-_\s]*de[-_\s]*compte)\b',
                r'\b(?:coordonnées[-_\s]*bancaires|détails[-_\s]*bancaires)\b',
                # Credit card variations in French
                r'\b(?:carte[-_\s]*de[-_\s]*credit|carte[-_\s]*credit|numero[-_\s]*carte[-_\s]*credit)\b',
                r'\b(?:carte[-_\s]*bancaire|carte[-_\s]*de[-_\s]*paiement)\b',
                r'\b(?:carte[-_\s]*de[-_\s]*crédit|carte[-_\s]*crédit|numéro[-_\s]*carte[-_\s]*crédit)\b',
                r'\b(?:carte[-_\s]*de[-_\s]*débit|carte[-_\s]*débit|numéro[-_\s]*carte[-_\s]*débit)\b',
                # Italian variations
                r'\b(?:iban|conto[-_\s]*bancario|numero[-_\s]*di[-_\s]*conto)\b',
                r'\b(?:coordinate[-_\s]*bancarie|dettagli[-_\s]*bancari)\b',
                # Common variations
                r'\b(?:bankkonto|bank[-_\s]*konto|conto[-_\s]*di[-_\s]*banca)\b',
                # Credit card variations
                r'\b(?:credit[-_\s]*card|credit[-_\s]*card[-_\s]*number|credit[-_\s]*card[-_\s]*no)\b',
                r'\b(?:credit[-_\s]*card[-_\s]*nr|credit[-_\s]*card[-_\s]*num)\b',
                r'\b(?:credit[-_\s]*card[-_\s]*id|credit[-_\s]*card[-_\s]*code)\b',
                r'\b(?:credit[-_\s]*card[-_\s]*reference|credit[-_\s]*card[-_\s]*ref)\b',
                r'\b(?:credit[-_\s]*card[-_\s]*details|credit[-_\s]*card[-_\s]*info)\b',
                r'\b(?:credit[-_\s]*card[-_\s]*data|credit[-_\s]*card[-_\s]*account)\b',
                # Credit card variations in other languages
                r'\b(?:kredit[-_\s]*karte|kreditkarte|kredit[-_\s]*karten[-_\s]*nummer)\b',
                r'\b(?:carta[-_\s]*di[-_\s]*credito|carta[-_\s]*credito|numero[-_\s]*carta[-_\s]*credito)\b',
                # Payment variations
                r'\b(?:payment[-_\s]*details|payment[-_\s]*info|payment[-_\s]*data)\b',
                r'\b(?:zahlungs[-_\s]*details|zahlungs[-_\s]*informationen|zahlungs[-_\s]*daten)\b',
                r'\b(?:détails[-_\s]*de[-_\s]*paiement|informations[-_\s]*de[-_\s]*paiement)\b',
                r'\b(?:dettagli[-_\s]*di[-_\s]*pagamento|informazioni[-_\s]*di[-_\s]*pagamento)\b',
                # Common credit card abbreviations
                r'\b(?:cc[-_\s]*number|cc[-_\s]*no|cc[-_\s]*nr)\b',
                r'\b(?:cc[-_\s]*num|cc[-_\s]*id|cc[-_\s]*code)\b',
                r'\b(?:cc[-_\s]*ref|cc[-_\s]*details|cc[-_\s]*info)\b',
                r'\b(?:cc[-_\s]*data|cc[-_\s]*account)\b',
                # Simple credit card
                r'\b(?:credit[-_\s]*card|credit[-_\s]*card[-_\s]*number)\b',
                # Additional credit card variations
                r'\b(?:card[-_\s]*number|card[-_\s]*no|card[-_\s]*nr)\b',
                r'\b(?:card[-_\s]*num|card[-_\s]*id|card[-_\s]*code)\b',
                r'\b(?:card[-_\s]*ref|card[-_\s]*details|card[-_\s]*info)\b',
                r'\b(?:card[-_\s]*data|card[-_\s]*account)\b'
            ],
            replacement_template="[IBAN_{}]"
        ),
        # Address patterns
        Pattern(
            name="address",
            patterns=[
                # English variations
                r'\b(?:address|street[-_\s]*address|mailing[-_\s]*address)\b',
                r'\b(?:home[-_\s]*address|work[-_\s]*address|billing[-_\s]*address)\b',
                r'\b(?:.*address.*)\b',  # Match any text containing "address"
                # German variations
                r'\b(?:adresse|strassen[-_\s]*adresse|post[-_\s]*adresse)\b',
                r'\b(?:wohn[-_\s]*adresse|geschäfts[-_\s]*adresse|rechnungs[-_\s]*adresse)\b',
                r'\b(?:.*adresse.*)\b',  # Match any text containing "adresse"
                # French variations
                r'\b(?:adresse|adresse[-_\s]*postale|adresse[-_\s]*de[-_\s]*livraison)\b',
                r'\b(?:adresse[-_\s]*personnelle|adresse[-_\s]*professionnelle)\b',
                r'\b(?:.*adresse.*)\b',  # Match any text containing "adresse"
                # Italian variations
                r'\b(?:indirizzo|indirizzo[-_\s]*postale|indirizzo[-_\s]*di[-_\s]*consegna)\b',
                r'\b(?:indirizzo[-_\s]*personale|indirizzo[-_\s]*professionale)\b',
                r'\b(?:.*indirizzo.*)\b',  # Match any text containing "indirizzo"
                # Common variations
                r'\b(?:location|place|residence|domicile)\b',
                r'\b(?:standort|ort|wohnort|domizil)\b',
                r'\b(?:lieu|emplacement|résidence|domicile)\b',
                r'\b(?:luogo|posizione|residenza|domicilio)\b'
            ],
            replacement_template="[ADDRESS_{}]"
        ),
        # Date patterns
        Pattern(
            name="date",
            patterns=[
                # English variations
                r'\b(?:date|birth[-_\s]*date|date[-_\s]*of[-_\s]*birth)\b',
                r'\b(?:dob|birthday|anniversary)\b',
                # German variations
                r'\b(?:datum|geburt[-_\s]*datum|geboren[-_\s]*am)\b',
                r'\b(?:geburtstag|jubiläum|feier[-_\s]*tag)\b',
                r'\b(?:geboren|geb\.|geboren[-_\s]*am)\b',
                # French variations
                r'\b(?:date|date[-_\s]*de[-_\s]*naissance|né[-_\s]*le)\b',
                r'\b(?:anniversaire|date[-_\s]*anniversaire)\b',
                r'\b(?:né|née|né[-_\s]*le)\b',
                # Italian variations
                r'\b(?:data|data[-_\s]*di[-_\s]*nascita|nato[-_\s]*il)\b',
                r'\b(?:compleanno|anniversario)\b',
                r'\b(?:nato|nata|nato[-_\s]*il)\b',
                # Common variations
                r'\b(?:birth|born|geboren|né|nato)\b'
            ],
            replacement_template="[DATE_{}]"
        ),
        # SSN patterns
        Pattern(
            name="ssn",
            patterns=[
                # English variations
                r'\b(?:ssn|social[-_\s]*security[-_\s]*number|tax[-_\s]*id)\b',
                r'\b(?:tax[-_\s]*identification|national[-_\s]*id)\b',
                # German variations
                r'\b(?:ahv[-_\s]*nummer|sozial[-_\s]*versicherungs[-_\s]*nummer)\b',
                r'\b(?:steuer[-_\s]*nummer|steuer[-_\s]*id|svn)\b',
                r'\b(?:ahv[-_\s]*nr|ahv[-_\s]*no|ahv[-_\s]*num)\b',
                # French variations
                r'\b(?:numéro[-_\s]*avs|numéro[-_\s]*de[-_\s]*sécurité[-_\s]*sociale)\b',
                r'\b(?:numéro[-_\s]*fiscal|numéro[-_\s]*d\'identification)\b',
                # Italian variations
                r'\b(?:numero[-_\s]*avs|numero[-_\s]*di[-_\s]*sicurezza[-_\s]*sociale)\b',
                r'\b(?:numero[-_\s]*fiscale|codice[-_\s]*fiscale)\b',
                # Common variations
                r'\b(?:ahv|svn|nss|avs)\b',
                # Additional AHV variations
                r'\b(?:ahv_nummer|ahvnummer|ahv-nummer|ahv_number)\b',
                r'\b(?:ahv[-_\s]*nr|ahv[-_\s]*no|ahv[-_\s]*num)\b',
                r'\b(?:ahv[-_\s]*number|ahv[-_\s]*number)\b',
                r'\b(?:ahv[-_\s]*id|ahv[-_\s]*id)\b',
                r'\b(?:ahv[-_\s]*code|ahv[-_\s]*code)\b',
                r'\b(?:ahv[-_\s]*reference|ahv[-_\s]*reference)\b',
                r'\b(?:ahv[-_\s]*reference[-_\s]*number|ahv[-_\s]*reference[-_\s]*number)\b',
                r'\b(?:ahv[-_\s]*reference[-_\s]*no|ahv[-_\s]*reference[-_\s]*no)\b',
                r'\b(?:ahv[-_\s]*reference[-_\s]*nr|ahv[-_\s]*reference[-_\s]*nr)\b',
                r'\b(?:ahv[-_\s]*reference[-_\s]*num|ahv[-_\s]*reference[-_\s]*num)\b',
                r'\b(?:ahv[-_\s]*reference[-_\s]*id|ahv[-_\s]*reference[-_\s]*id)\b',
                r'\b(?:ahv[-_\s]*reference[-_\s]*code|ahv[-_\s]*reference[-_\s]*code)\b'
            ],
            replacement_template="[SSN_{}]"
        )
    ]


class DataPatterns:
    """Patterns for identifying sensitive data in content"""

    patterns = [
        # Name patterns (before email so "name@domain" is not matched as name)
        Pattern(
            name="name",
            patterns=[
                # Contact person context (fixed-width lookbehind for Python re)
                r'(?<=Ansprechperson: )[A-Za-zäöüßÄÖÜ][a-zäöüß]+\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]+',
                r'(?<=Leiter: )[A-Za-zäöüßÄÖÜ][a-zäöüß]+\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]+',
                r'(?<=Kontaktperson: )[A-Za-zäöüßÄÖÜ][a-zäöüß]+\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]+',
                # Name only after Anrede (keep Frau/Herr; replace only the name) – fixed-width lookbehind
                r'(?<=Frau )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
                r'(?<=Herr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
                r'(?<=Mr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
                r'(?<=Mr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
                r'(?<=Mrs )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
                r'(?<=Mrs\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
                r'(?<=Ms )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
                r'(?<=Ms\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
                r'(?<=Dr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
                r'(?<=Dr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
            ],
            replacement_template="[NAME_{}]"
        ),
        # Email pattern for plain text
        Pattern(
            name="email",
            patterns=[
                # Basic email pattern
                r'[A-Za-z0-9._%+-]+@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)*'
            ],
            replacement_template="[EMAIL_{}]"
        ),
        # Phone patterns
        Pattern(
            name="phone",
            patterns=[
                # International format
                r'\+\d{1,3}[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}\b',
                # Swiss format
                r'\b(?:0\d{1,2}|0041\d{1,2})[-.\s]?\d{3}[-.\s]?\d{2}[-.\s]?\d{2}\b',
                # German format
                r'\b(?:0\d{1,4}|0049\d{1,4})[-.\s]?\d{3,}[-.\s]?\d{3,}\b',
                # French format
                r'\b(?:0\d{1,2}|0033\d{1,2})[-.\s]?\d{1,2}[-.\s]?\d{2}[-.\s]?\d{2}[-.\s]?\d{2}\b',
                # Italian format
                r'\b(?:0\d{1,3}|0039\d{1,3})[-.\s]?\d{3,}[-.\s]?\d{3,}\b',
                # Mobile numbers
                r'\b(?:07|00417|004917|00337|00397)\d{8,9}\b',
                # Emergency numbers
                r'\b(?:112|911|118|117|144|1414)\b'
            ],
            replacement_template="[PHONE_{}]"
        ),
        # IBAN patterns
        Pattern(
            name="iban",
            patterns=[
                # Legacy fixed-length patterns (country code + 24 digits), kept for
                # backward compatibility. They are longer than any real CH/DE IBAN
                # and digit-only, so the country-specific patterns below are what
                # actually match real-world IBANs.
                r'\b(?:CH|DE|FR|IT)\d{2}\s?(?:\d{4}\s?){5}\d{2}\b',
                r'\b(?:CH|DE|FR|IT)\d{2}(?:\d{4}){5}\d{2}\b',
                # Switzerland, 21 characters: CH + 2 check digits + 5-digit bank code
                # + 12-character alphanumeric account
                r'\bCH\d{2}\s?\d{4}\s?\d[A-Z0-9]{3}(?:\s?[A-Z0-9]{4}){2}\s?[A-Z0-9]\b',
                # Germany, 22 characters: DE + 2 check digits + 18 digits
                r'\bDE\d{2}(?:\s?\d{4}){4}\s?\d{2}\b',
                # France, 27 characters: FR + 2 check digits + 10 digits (bank, branch)
                # + 11-character alphanumeric account + 2-digit key
                r'\bFR\d{2}(?:\s?\d{4}){2}\s?\d{2}[A-Z0-9]{2}(?:\s?[A-Z0-9]{4}){2}\s?[A-Z0-9]\d{2}\b',
                # Italy, 27 characters: IT + 2 check digits + 1 letter (CIN)
                # + 10 digits (ABI, CAB) + 12-character alphanumeric account
                r'\bIT\d{2}\s?[A-Z]\d{3}\s?\d{4}\s?\d{3}[A-Z0-9](?:\s?[A-Z0-9]{4}){2}\s?[A-Z0-9]{3}\b'
            ],
            replacement_template="[IBAN_{}]"
        ),
        # Address patterns (compound first so full footer = one UUID)
        Pattern(
            name="address",
            patterns=[
                # Full address block: company, street, postfach, postal+city (stop before domain like , AXA.ch)
                r'\b[^,\n]+(?:,\s*[^,\n]+)*,\s*\d{4}\s+[A-Za-zäöüßÄÖÜ]+\s*(?=,\s*[a-zA-Z0-9.-]+\.(?:ch|com|org|net)\b|$)',
                # Street + house number (standalone)
                r'\b(?:[A-Za-zäöüßÄÖÜ]+(?:-[A-Za-zäöüßÄÖÜ]+)*(?:strasse|str\.|gasse|weg|platz|allee|boulevard|avenue|via|strada|rue|chemin|route))\s+\d{1,4}(?:/\d{1,4})?(?:[a-z])?\b',
                # Postfach / PO Box (standalone)
                r'\b(?:Postfach|Postbox|P\.?O\.?\s*Box|Case\s+postale|Casella\s+postale|Boîte\s+postale)\s+\d{1,6}\b',
                # Postal code + city (standalone)
                r'\b\d{4}\s+[A-Za-zäöüßÄÖÜ]+\b(?!\s*:)'
            ],
            replacement_template="[ADDRESS_{}]"
        ),
        # Date patterns
        Pattern(
            name="date",
            patterns=[
                # Standalone date values – require valid day (1–31) and month (1–12) to avoid decimals (e.g. 53.37 CHF)
                r'\b(0?[1-9]|[12]\d|3[01])[./-](0?[1-9]|1[0-2])[./-]\d{2,4}\b',  # 17.02.2026, 29-03-2026
                r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\.[\s]*\d{2,4}\b',  # 17.02. 2026 (split across lines)
                # NOTE(review): the trailing \b after the final '.' only succeeds when a
                # letter or underscore follows directly; "17.02." before a space or at
                # end of text is covered by the next pattern (without the dot). Confirm
                # this is intended.
                r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\.(?!\d)\b',  # 17.02., 29.03.
                r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\b(?!\.?\d)(?!/\d)',  # 17.02, 29.03; exclude ratings (4.7/5)
                # Context-specific date formats
                r'\b(?:geboren|birth|né|nato)\s+am\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
                r'\b(?:geboren|birth|né|nato)\s+am\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
                r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
                r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
                r'\b(?:geboren|birth|né|nato)\s+am\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|januar|februar|märz|april|mai|juni|juli|august|september|oktober|november|dezember)[a-z]*\s+\d{4}\b',
                r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|januar|februar|märz|april|mai|juni|juli|august|september|oktober|november|dezember)[a-z]*\s+\d{4}\b'
            ],
            replacement_template="[DATE_{}]"
        ),
        # Policy number patterns (replaces only the number, keeps labels like "Police Nr.")
        Pattern(
            name="policy",
            patterns=[
                # Number after "Police Nr." etc. (fixed-width lookbehind – Python re requirement)
                r'(?<=Police Nr\. )[\d.]+',
                r'(?<=Police Nr\. )[\d.]+',
                r'(?<=Police Nr\.: )[\d.]+',
                r'(?<=Police Nr )[\d.]+',
                r'(?<=Police Nr: )[\d.]+',
                r'(?<=Polizzenr\. )[\d.]+',
                r'(?<=Polizzenummer: )[\d.]+',
                r'(?<=Polizzenummer )[\d.]+',
                r'(?<=Policy No\. )[\d.]+',
                r'(?<=Policy No )[\d.]+',
                r'(?<=Policy Number: )[\d.]+',
                r'(?<=Policy Number )[\d.]+',
                r'(?<=Polizza n° )[\d.]+',
                r'(?<=Numéro de police: )[\d.]+',
                r'(?<=Numéro de police )[\d.]+',
                r'(?<=Numero polizza: )[\d.]+',
                r'(?<=Numero polizza )[\d.]+',
                # Standalone policy number format (e.g. 11.559.499) – require 2+ digit prefix to avoid amounts
                r'\b\d{2,4}(?:\.\d{3}){2,}\b'
            ],
            replacement_template="[POLICY_{}]"
        ),
        # SSN patterns
        Pattern(
            name="ssn",
            patterns=[
                r'\b(?:756|757|758|759)\.\d{4}\.\d{4}\.\d{2}\b(?!,)',  # Swiss AHV - exclude before decimal
                r'\b(?:CHE|DE|FR|IT)-\d{3}\.\d{3}\.\d{3}\b',  # Company IDs
                # Generic SSN format - exclude when followed by comma+digit (European decimal)
                r'\b\d{3}\.\d{3}\.\d{3}\b(?!,\d)'
            ],
            replacement_template="[SSN_{}]"
        )
    ]


class TextTablePatterns:
    """Patterns for identifying table-like structures in text"""

    @staticmethod
    def getPatterns() -> List[Tuple[str, str]]:
        """Return (regex, separator) pairs for "key<sep>value" lines.

        Each regex captures the key as group 1 and the value as group 2.
        The last two entries (required whitespace) match a subset of what
        the first two already match, so they never decide the outcome.
        """
        return [
            # key: value pattern (with optional whitespace)
            (r'^([^:]+):\s*(.+)$', ':'),
            # key = value pattern (with optional whitespace)
            (r'^([^=]+)=\s*(.+)$', '='),
            # key = value pattern (with required whitespace)
            (r'^([^=]+)\s+=\s+(.+)$', '='),
            # key: value pattern (with required whitespace)
            (r'^([^:]+)\s+:\s+(.+)$', ':'),
        ]

    @staticmethod
    def _isTableLine(line: str) -> bool:
        """Check if a line matches any table pattern"""
        stripped = line.strip()
        return any(
            re.match(regex, stripped)
            for regex, _separator in TextTablePatterns.getPatterns()
        )

    @staticmethod
    def extractKeyValue(line: str) -> Optional[Tuple[str, str]]:
        """Extract key and value from a table line

        Args:
            line: The line to parse (surrounding whitespace is ignored)

        Returns:
            Optional[Tuple[str, str]]: (key, value) from the first matching
            pattern, both stripped, or None if no pattern matches
        """
        stripped = line.strip()
        for regex, _separator in TextTablePatterns.getPatterns():
            match = re.match(regex, stripped)
            if match:
                return match.group(1).strip(), match.group(2).strip()
        return None


def getPatternForHeader(header: str, patterns: List[Pattern]) -> Optional[Pattern]:
    """
    Find matching pattern for a header

    The first Pattern (in list order) with any regex that matches wins.

    Args:
        header: The header to check
        patterns: List of patterns to check against

    Returns:
        Optional[Pattern]: Matching pattern or None
    """
    if not header:
        return None

    header = header.lower().strip()

    for pattern in patterns:
        for p in pattern.patterns:
            if re.search(p, header, re.IGNORECASE):
                return pattern
    return None


def findPatternsInText(text: str, patterns: List[Pattern]) -> List[tuple]:
    """
    Find all pattern matches in text

    Matches from different patterns may overlap; they are all returned.

    Args:
        text: Text to search (empty or None yields an empty list)
        patterns: List of patterns to check

    Returns:
        List[tuple]: List of (pattern_name, match, start, end), sorted by start
    """
    # Same guard as getPatternForHeader: nothing to search, nothing to find
    # (also avoids a TypeError from re.finditer on None).
    if not text:
        return []

    matches = []
    for pattern in patterns:
        for p in pattern.patterns:
            for match in re.finditer(p, text, re.IGNORECASE):
                matches.append((pattern.name, match.group(0), match.start(), match.end()))
    return sorted(matches, key=lambda x: x[2])  # Sort by start position