gateway/modules/features/neutralization/serviceNeutralization/subPatterns.py

470 lines
No EOL
26 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Pattern definitions for data anonymization
Separates header patterns from data patterns
"""
from dataclasses import dataclass
from typing import List, Optional, Tuple
import re
@dataclass
class Pattern:
    """One category of sensitive data: its regexes and its placeholder.

    The class only stores data; the matching itself is performed by the
    functions that receive ``Pattern`` instances.
    """

    # Category label, e.g. "name", "email", "phone".
    name: str
    # Regular-expression source strings that belong to this category.
    patterns: List[str]
    # Placeholder text with one ``{}`` slot, e.g. "[NAME_{}]".
    replacement_template: str
class HeaderPatterns:
    """Patterns for identifying sensitive data in headers.

    ``patterns`` holds one ``Pattern`` per category (name, email, phone,
    iban, address, date, ssn).  Each category lists label spellings in
    English, German, French and Italian; words may be joined by ``-``,
    ``_`` or whitespace.

    The order of the entries is significant: ``getPatternForHeader`` walks
    the list front to back and returns the first category with a matching
    regex, so for example a header containing both "email" and "address"
    resolves to the email category.
    """

    patterns = [
        # Name patterns
        Pattern(
            name="name",
            patterns=[
                # Simple variations
                r'\b(?:name|first[-_\s]*name|last[-_\s]*name|full[-_\s]*name)\b',
                r'\b(?:customer[-_\s]*name|client[-_\s]*name|user[-_\s]*name)\b',
                r'\b(?:given[-_\s]*name|family[-_\s]*name|surname)\b',
                # German variations
                r'\b(?:vorname|nachname|vollständiger[-_\s]*name|name)\b',
                r'\b(?:kunden[-_\s]*name|kunde[-_\s]*name|benutzer[-_\s]*name)\b',
                # French variations
                r'\b(?:prénom|nom|nom[-_\s]*complet)\b',
                r'\b(?:nom[-_\s]*du[-_\s]*client|nom[-_\s]*d\'utilisateur)\b',
                # Italian variations
                r'\b(?:nome|cognome|nome[-_\s]*completo)\b',
                r'\b(?:nome[-_\s]*cliente|nome[-_\s]*utente)\b',
                # Common variations
                r'\b(?:nom|name|nome|naam)\b'
            ],
            replacement_template="[NAME_{}]"
        ),
        # Email patterns
        Pattern(
            name="email",
            patterns=[
                # Simple variations - only labels
                r'\b(?:email|e[-_\s]*mail|mail)\s*:?\b',
                r'\b(?:contact[-_\s]*email|user[-_\s]*email|client[-_\s]*email)\s*:?\b',
                r'\b(?:customer[-_\s]*email|customer[-_\s]*mail|customer[-_\s]*e[-_\s]*mail)\s*:?\b',
                # German variations - only labels
                r'\b(?:e[-_\s]*mail|e[-_\s]*post|mail[-_\s]*adresse)\s*:?\b',
                r'\b(?:kontakt[-_\s]*email|benutzer[-_\s]*email|kunden[-_\s]*email)\s*:?\b',
                r'\b(?:kunden[-_\s]*mail|kunden[-_\s]*e[-_\s]*mail|kunden[-_\s]*e[-_\s]*post)\s*:?\b',
                # French variations - only labels
                r'\b(?:courriel|e[-_\s]*mail|adresse[-_\s]*e[-_\s]*mail)\s*:?\b',
                r'\b(?:courriel[-_\s]*de[-_\s]*contact|e[-_\s]*mail[-_\s]*client)\s*:?\b',
                r'\b(?:courriel[-_\s]*client|courriel[-_\s]*utilisateur|mail[-_\s]*client)\s*:?\b',
                # Italian variations - only labels
                r'\b(?:posta[-_\s]*elettronica|e[-_\s]*mail|indirizzo[-_\s]*e[-_\s]*mail)\s*:?\b',
                r'\b(?:email[-_\s]*cliente|email[-_\s]*utente)\s*:?\b',
                r'\b(?:mail[-_\s]*cliente|mail[-_\s]*utente|posta[-_\s]*cliente)\s*:?\b'
            ],
            replacement_template="[EMAIL_{}]"
        ),
        # Phone patterns
        Pattern(
            name="phone",
            patterns=[
                # Simple variations
                r'\b(?:phone|tel|telephone|mobile)\b',
                r'\b(?:contact[-_\s]*number|phone[-_\s]*number|tel[-_\s]*number)\b',
                # German variations
                r'\b(?:telefon|mobil|handy|telefon[-_\s]*nummer)\b',
                r'\b(?:kontakt[-_\s]*nummer|telefon[-_\s]*nummer|tel[-_\s]*nummer)\b',
                # French variations
                r'\b(?:téléphone|portable|mobile|numéro[-_\s]*de[-_\s]*téléphone)\b',
                r'\b(?:numéro[-_\s]*de[-_\s]*contact|tél[-_\s]*fixe|tél[-_\s]*mobile)\b',
                # Italian variations
                r'\b(?:telefono|cellulare|mobile|numero[-_\s]*di[-_\s]*telefono)\b',
                r'\b(?:numero[-_\s]*di[-_\s]*contatto|tel[-_\s]*fisso|tel[-_\s]*mobile)\b'
            ],
            replacement_template="[PHONE_{}]"
        ),
        # IBAN patterns (this category also covers credit-card and payment labels)
        Pattern(
            name="iban",
            patterns=[
                # Simple variations
                r'\b(?:iban|bank[-_\s]*account|account[-_\s]*number)\b',
                r'\b(?:bank[-_\s]*details|account[-_\s]*details|banking[-_\s]*info)\b',
                # German variations
                r'\b(?:iban|bank[-_\s]*konto|konto[-_\s]*nummer)\b',
                r'\b(?:bank[-_\s]*verbindung|konto[-_\s]*verbindung|bank[-_\s]*daten)\b',
                # French variations
                r'\b(?:iban|compte[-_\s]*bancaire|numéro[-_\s]*de[-_\s]*compte)\b',
                r'\b(?:coordonnées[-_\s]*bancaires|détails[-_\s]*bancaires)\b',
                # Credit card variations in French
                r'\b(?:carte[-_\s]*de[-_\s]*credit|carte[-_\s]*credit|numero[-_\s]*carte[-_\s]*credit)\b',
                r'\b(?:carte[-_\s]*bancaire|carte[-_\s]*de[-_\s]*paiement)\b',
                r'\b(?:carte[-_\s]*de[-_\s]*crédit|carte[-_\s]*crédit|numéro[-_\s]*carte[-_\s]*crédit)\b',
                r'\b(?:carte[-_\s]*de[-_\s]*débit|carte[-_\s]*débit|numéro[-_\s]*carte[-_\s]*débit)\b',
                # Italian variations
                r'\b(?:iban|conto[-_\s]*bancario|numero[-_\s]*di[-_\s]*conto)\b',
                r'\b(?:coordinate[-_\s]*bancarie|dettagli[-_\s]*bancari)\b',
                # Common variations
                r'\b(?:bankkonto|bank[-_\s]*konto|conto[-_\s]*di[-_\s]*banca)\b',
                # Credit card variations
                r'\b(?:credit[-_\s]*card|credit[-_\s]*card[-_\s]*number|credit[-_\s]*card[-_\s]*no)\b',
                r'\b(?:credit[-_\s]*card[-_\s]*nr|credit[-_\s]*card[-_\s]*num)\b',
                r'\b(?:credit[-_\s]*card[-_\s]*id|credit[-_\s]*card[-_\s]*code)\b',
                r'\b(?:credit[-_\s]*card[-_\s]*reference|credit[-_\s]*card[-_\s]*ref)\b',
                r'\b(?:credit[-_\s]*card[-_\s]*details|credit[-_\s]*card[-_\s]*info)\b',
                r'\b(?:credit[-_\s]*card[-_\s]*data|credit[-_\s]*card[-_\s]*account)\b',
                # Credit card variations in other languages
                r'\b(?:kredit[-_\s]*karte|kreditkarte|kredit[-_\s]*karten[-_\s]*nummer)\b',
                r'\b(?:carta[-_\s]*di[-_\s]*credito|carta[-_\s]*credito|numero[-_\s]*carta[-_\s]*credito)\b',
                # Payment variations
                r'\b(?:payment[-_\s]*details|payment[-_\s]*info|payment[-_\s]*data)\b',
                r'\b(?:zahlungs[-_\s]*details|zahlungs[-_\s]*informationen|zahlungs[-_\s]*daten)\b',
                r'\b(?:détails[-_\s]*de[-_\s]*paiement|informations[-_\s]*de[-_\s]*paiement)\b',
                r'\b(?:dettagli[-_\s]*di[-_\s]*pagamento|informazioni[-_\s]*di[-_\s]*pagamento)\b',
                # Common credit card abbreviations
                r'\b(?:cc[-_\s]*number|cc[-_\s]*no|cc[-_\s]*nr)\b',
                r'\b(?:cc[-_\s]*num|cc[-_\s]*id|cc[-_\s]*code)\b',
                r'\b(?:cc[-_\s]*ref|cc[-_\s]*details|cc[-_\s]*info)\b',
                r'\b(?:cc[-_\s]*data|cc[-_\s]*account)\b',
                # Simple credit card
                r'\b(?:credit[-_\s]*card|credit[-_\s]*card[-_\s]*number)\b',
                # Additional credit card variations
                r'\b(?:card[-_\s]*number|card[-_\s]*no|card[-_\s]*nr)\b',
                r'\b(?:card[-_\s]*num|card[-_\s]*id|card[-_\s]*code)\b',
                r'\b(?:card[-_\s]*ref|card[-_\s]*details|card[-_\s]*info)\b',
                r'\b(?:card[-_\s]*data|card[-_\s]*account)\b'
            ],
            replacement_template="[IBAN_{}]"
        ),
        # Address patterns
        Pattern(
            name="address",
            patterns=[
                # English variations
                r'\b(?:address|street[-_\s]*address|mailing[-_\s]*address)\b',
                r'\b(?:home[-_\s]*address|work[-_\s]*address|billing[-_\s]*address)\b',
                r'\b(?:.*address.*)\b',  # Match any text containing "address"
                # German variations
                r'\b(?:adresse|strassen[-_\s]*adresse|post[-_\s]*adresse)\b',
                r'\b(?:wohn[-_\s]*adresse|geschäfts[-_\s]*adresse|rechnungs[-_\s]*adresse)\b',
                r'\b(?:.*adresse.*)\b',  # Match any text containing "adresse"
                # French variations
                r'\b(?:adresse|adresse[-_\s]*postale|adresse[-_\s]*de[-_\s]*livraison)\b',
                r'\b(?:adresse[-_\s]*personnelle|adresse[-_\s]*professionnelle)\b',
                r'\b(?:.*adresse.*)\b',  # Match any text containing "adresse"
                # Italian variations
                r'\b(?:indirizzo|indirizzo[-_\s]*postale|indirizzo[-_\s]*di[-_\s]*consegna)\b',
                r'\b(?:indirizzo[-_\s]*personale|indirizzo[-_\s]*professionale)\b',
                r'\b(?:.*indirizzo.*)\b',  # Match any text containing "indirizzo"
                # Common variations
                r'\b(?:location|place|residence|domicile)\b',
                r'\b(?:standort|ort|wohnort|domizil)\b',
                r'\b(?:lieu|emplacement|résidence|domicile)\b',
                r'\b(?:luogo|posizione|residenza|domicilio)\b'
            ],
            replacement_template="[ADDRESS_{}]"
        ),
        # Date patterns
        Pattern(
            name="date",
            patterns=[
                # English variations
                r'\b(?:date|birth[-_\s]*date|date[-_\s]*of[-_\s]*birth)\b',
                r'\b(?:dob|birthday|anniversary)\b',
                # German variations
                r'\b(?:datum|geburt[-_\s]*datum|geboren[-_\s]*am)\b',
                r'\b(?:geburtstag|jubiläum|feier[-_\s]*tag)\b',
                # "geb." is kept outside the \b...\b group: a closing \b right
                # after the dot only succeeds when a word character follows, so
                # a bare "geb." header was never recognised.
                r'\b(?:geboren|geboren[-_\s]*am)\b|\bgeb\.',
                # French variations
                r'\b(?:date|date[-_\s]*de[-_\s]*naissance|né[-_\s]*le)\b',
                r'\b(?:anniversaire|date[-_\s]*anniversaire)\b',
                r'\b(?:né|née|né[-_\s]*le)\b',
                # Italian variations
                r'\b(?:data|data[-_\s]*di[-_\s]*nascita|nato[-_\s]*il)\b',
                r'\b(?:compleanno|anniversario)\b',
                r'\b(?:nato|nata|nato[-_\s]*il)\b',
                # Common variations
                r'\b(?:birth|born|geboren|né|nato)\b'
            ],
            replacement_template="[DATE_{}]"
        ),
        # SSN patterns
        Pattern(
            name="ssn",
            patterns=[
                # English variations
                r'\b(?:ssn|social[-_\s]*security[-_\s]*number|tax[-_\s]*id)\b',
                r'\b(?:tax[-_\s]*identification|national[-_\s]*id)\b',
                # German variations
                r'\b(?:ahv[-_\s]*nummer|sozial[-_\s]*versicherungs[-_\s]*nummer)\b',
                r'\b(?:steuer[-_\s]*nummer|steuer[-_\s]*id|svn)\b',
                r'\b(?:ahv[-_\s]*nr|ahv[-_\s]*no|ahv[-_\s]*num)\b',
                # French variations
                r'\b(?:numéro[-_\s]*avs|numéro[-_\s]*de[-_\s]*sécurité[-_\s]*sociale)\b',
                r'\b(?:numéro[-_\s]*fiscal|numéro[-_\s]*d\'identification)\b',
                # Italian variations
                r'\b(?:numero[-_\s]*avs|numero[-_\s]*di[-_\s]*sicurezza[-_\s]*sociale)\b',
                r'\b(?:numero[-_\s]*fiscale|codice[-_\s]*fiscale)\b',
                # Common variations
                r'\b(?:ahv|svn|nss|avs)\b',
                # Additional AHV variations
                r'\b(?:ahv_nummer|ahvnummer|ahv-nummer|ahv_number)\b',
                r'\b(?:ahv[-_\s]*nr|ahv[-_\s]*no|ahv[-_\s]*num)\b',
                r'\b(?:ahv[-_\s]*number|ahv[-_\s]*number)\b',
                r'\b(?:ahv[-_\s]*id|ahv[-_\s]*id)\b',
                r'\b(?:ahv[-_\s]*code|ahv[-_\s]*code)\b',
                r'\b(?:ahv[-_\s]*reference|ahv[-_\s]*reference)\b',
                r'\b(?:ahv[-_\s]*reference[-_\s]*number|ahv[-_\s]*reference[-_\s]*number)\b',
                r'\b(?:ahv[-_\s]*reference[-_\s]*no|ahv[-_\s]*reference[-_\s]*no)\b',
                r'\b(?:ahv[-_\s]*reference[-_\s]*nr|ahv[-_\s]*reference[-_\s]*nr)\b',
                r'\b(?:ahv[-_\s]*reference[-_\s]*num|ahv[-_\s]*reference[-_\s]*num)\b',
                r'\b(?:ahv[-_\s]*reference[-_\s]*id|ahv[-_\s]*reference[-_\s]*id)\b',
                r'\b(?:ahv[-_\s]*reference[-_\s]*code|ahv[-_\s]*reference[-_\s]*code)\b'
            ],
            replacement_template="[SSN_{}]"
        )
    ]
class DataPatterns:
    """Patterns for identifying sensitive data in content.

    ``patterns`` holds one ``Pattern`` per category, in this order: name,
    email, phone, iban, address, date, policy, ssn.  Several regexes use a
    fixed-width lookbehind so that only the value is matched and the label
    in front of it (e.g. "Frau ", "Police Nr. ") stays in the text.

    ``findPatternsInText`` applies every regex with ``re.IGNORECASE``, so
    literal words inside the regexes (including lookbehind labels and
    country codes) match in any letter case.
    """

    patterns = [
        # Name patterns (before email so "name@domain" is not matched as name)
        Pattern(
            name="name",
            patterns=[
                # Contact person context (fixed-width lookbehind for Python re)
                r'(?<=Ansprechperson: )[A-Za-zäöüßÄÖÜ][a-zäöüß]+\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]+',
                r'(?<=Leiter: )[A-Za-zäöüßÄÖÜ][a-zäöüß]+\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]+',
                r'(?<=Kontaktperson: )[A-Za-zäöüßÄÖÜ][a-zäöüß]+\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]+',
                # Name only after Anrede (keep Frau/Herr; replace only the name) fixed-width lookbehind
                # Use [ \t]+ not \s+ so we don't match across line breaks (avoids grabbing "Es" from "Es freut uns sehr")
                r'(?<=Frau )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
                r'(?<=Herr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
                r'(?<=Mr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
                r'(?<=Mr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
                r'(?<=Mrs )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
                r'(?<=Mrs\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
                r'(?<=Ms )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
                r'(?<=Ms\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
                r'(?<=Dr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
                r'(?<=Dr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
            ],
            replacement_template="[NAME_{}]"
        ),
        # Email pattern for plain text
        Pattern(
            name="email",
            patterns=[
                # Basic email pattern
                r'[A-Za-z0-9._%+-]+@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)*'
            ],
            replacement_template="[EMAIL_{}]"
        ),
        # Phone patterns
        Pattern(
            name="phone",
            patterns=[
                # Swiss full format: +41 44 315 19 19 (area + 3 + 2 + 2 digits, flexible separators)
                r'\+\s*41[-.\s]?\d{2}[-.\s]?\d{3}[-.\s]?\d{2}[-.\s]?\d{2}\b',
                # International format
                r'\+\d{1,3}[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}\b',
                # Swiss format
                r'\b(?:0\d{1,2}|0041\d{1,2})[-.\s]?\d{3}[-.\s]?\d{2}[-.\s]?\d{2}\b',
                # German format
                r'\b(?:0\d{1,4}|0049\d{1,4})[-.\s]?\d{3,}[-.\s]?\d{3,}\b',
                # French format
                r'\b(?:0\d{1,2}|0033\d{1,2})[-.\s]?\d{1,2}[-.\s]?\d{2}[-.\s]?\d{2}[-.\s]?\d{2}\b',
                # Italian format
                r'\b(?:0\d{1,3}|0039\d{1,3})[-.\s]?\d{3,}[-.\s]?\d{3,}\b',
                # Mobile numbers
                r'\b(?:07|00417|004917|00337|00397)\d{8,9}\b',
                # Emergency numbers
                r'\b(?:112|911|118|117|144|1414)\b'
            ],
            replacement_template="[PHONE_{}]"
        ),
        # IBAN patterns
        Pattern(
            name="iban",
            patterns=[
                # IBAN length is country specific: CH 21, DE 22, FR and IT 27
                # characters.  Blocks of four may be separated by one space.
                # Swiss IBAN, e.g. CH93 0076 2011 6238 5295 7
                r'\bCH\d{2}(?:\s?[0-9A-Z]{4}){4}\s?[0-9A-Z]\b',
                # German IBAN (digits only), e.g. DE89 3704 0044 0532 0130 00
                r'\bDE\d{2}(?:\s?\d{4}){4}\s?\d{2}\b',
                # French / Italian IBAN, e.g. FR76 3000 6000 0112 3456 7890 189
                r'\b(?:FR|IT)\d{2}(?:\s?[0-9A-Z]{4}){5}\s?[0-9A-Z]{3}\b',
                # Legacy patterns, kept for backward compatibility.  They
                # require 24 digits after the country code, which none of the
                # four national formats has, so on their own they never
                # matched a real IBAN.
                r'\b(?:CH|DE|FR|IT)\d{2}\s?(?:\d{4}\s?){5}\d{2}\b',
                r'\b(?:CH|DE|FR|IT)\d{2}(?:\d{4}){5}\d{2}\b'
            ],
            replacement_template="[IBAN_{}]"
        ),
        # Address patterns (compound first so full footer = one UUID)
        Pattern(
            name="address",
            patterns=[
                # Full address block: company, street, postfach, postal+city (stop before domain like , AXA.ch)
                r'\b[^,\n]+(?:,\s*[^,\n]+)*,\s*\d{4}\s+[A-Za-zäöüßÄÖÜ]+\s*(?=,\s*[a-zA-Z0-9.-]+\.(?:ch|com|org|net)\b|$)',
                # Street + house number (standalone)
                r'\b(?:[A-Za-zäöüßÄÖÜ]+(?:-[A-Za-zäöüßÄÖÜ]+)*(?:strasse|str\.|gasse|weg|platz|allee|boulevard|avenue|via|strada|rue|chemin|route))\s+\d{1,4}(?:/\d{1,4})?(?:[a-z])?\b',
                # Postfach / PO Box (standalone)
                r'\b(?:Postfach|Postbox|P\.?O\.?\s*Box|Case\s+postale|Casella\s+postale|Boîte\s+postale)\s+\d{1,6}\b',
                # Postal code + city (standalone); exclude year+non-city and common non-city words
                # (?<!\d{2}\.\d{2}\.) = not part of date DD.MM.YYYY (e.g. 27.01.2026)
                # NOTE(review): the negative lookahead has no closing word
                # boundary, so every listed word acts as a prefix (a city
                # starting with "die" or "der" is skipped as well) - confirm
                # whether whole-word exclusion was intended.
                r'(?<!\d{2}\.\d{2}\.)\b\d{4}\s+(?!den|der|die|das|dem|des|und|oder|für|bei|mit|Version|Versand|Vertrag|Verfügung|Verschickung|Versicherung|erhalten|Schreiben|Jahr|Jahres|incomplete|Application|Complete|Pending|Matrikel|Student|Studien|Kontakt|Telefon|Rechnung|Invoice)[A-Za-zäöüßÄÖÜ]+\b(?!\s*:)'
            ],
            replacement_template="[ADDRESS_{}]"
        ),
        # Date patterns (all languages and formats)
        Pattern(
            name="date",
            patterns=[
                # DD.MM.YYYY / DD/MM/YYYY / DD-MM-YYYY (European)
                r'\b(0?[1-9]|[12]\d|3[01])[./-](0?[1-9]|1[0-2])[./-]\d{2,4}\b',
                r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\.[\s]*\d{2,4}\b',
                r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\.(?!\d)\b',
                r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\b(?!\.?\d)(?!/\d)',
                # YYYY-MM-DD / YYYY/MM/DD / YYYY.MM.DD (ISO)
                r'\b\d{4}[./-](0?[1-9]|1[0-2])[./-](0?[1-9]|[12]\d|3[01])\b',
                # MM/DD/YYYY / MM-DD-YYYY (US)
                r'\b(0?[1-9]|1[0-2])[./-](0?[1-9]|[12]\d|3[01])[./-]\d{2,4}\b',
                # geboren/birth/né/nato + am/le/on/il/op (DE/EN/FR/IT/NL)
                r'\b(?:geboren|birth|né|nato|nata)\s+(?:am|le|on|il|op)\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
                r'\b(?:geboren|birth|né|nato|nata)\s+(?:am|le|on|il|op)\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
                r'\b(?:geboren|birth|né|nato|nata)\s+(?:am|le|on|il)\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|january|february|march|april|may|june|july|august|september|october|november|december|januar|februar|märz|juni|juli|oktober|november|dezember|gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|dicembre|janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|januari|februari|maart|april|mei|juni|juli|augustus|september|oktober|november|december|janeiro|fevereiro|março|abril|maio|junho|julho|agosto|setembro|outubro|novembro|dezembro)[a-z]*\s+\d{4}\b',
                # vertrag/contract/contrat + vom/from/du/dal/van
                r'\b(?:vertrag|contract|contrat|contratto)\s+(?:vom|from|du|dal|del|van)\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
                r'\b(?:vertrag|contract|contrat|contratto)\s+(?:vom|from|du|dal|del|van)\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
                r'\b(?:vertrag|contract|contrat|contratto)\s+(?:vom|from|du|dal|del)\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|january|february|march|april|may|june|july|august|september|october|november|december|januar|februar|märz|juni|juli|oktober|november|dezember|gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|dicembre|janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|januari|februari|maart|mei|augustus|janeiro|fevereiro|março|maio|junho|julho|setembro|outubro|novembro|dezembro)[a-z]*\s+\d{4}\b',
                # datum/date/data/fecha + numeric (fixed-width lookbehind, keeps label)
                r'(?<=datum: )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b', r'(?<=date: )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
                r'(?<=data: )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b', r'(?<=fecha: )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
                r'(?<=datum )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b', r'(?<=date )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
                r'(?<=datum: )[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b', r'(?<=date: )[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
                r'(?<=data: )[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b', r'(?<=fecha: )[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
                # day + month name + year (17 February 2026, 17. Februar 2026)
                r'\b(0?[1-9]|[12]\d|3[01])\s*(?:\.|\.\s*)?(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|january|february|march|april|may|june|july|august|september|october|november|december|januar|februar|märz|juni|juli|oktober|november|dezember|gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|dicembre|janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|januari|februari|maart|mei|augustus|janeiro|fevereiro|março|maio|junho|julho|setembro|outubro|novembro|dezembro)[a-z]*\s+\d{4}\b',
                # month name + day + year (February 17, 2026)
                r'\b(?:january|february|march|april|may|june|july|august|september|october|november|december|januar|februar|märz|april|mai|juni|juli|oktober|november|dezember|gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|dicembre|janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|januari|februari|maart|mei|augustus|janeiro|fevereiro|março|maio|junho|julho|setembro|outubro|novembro|dezembro)[a-z]*\s+(?:0?[1-9]|[12]\d|3[01])[,\s]+\d{4}\b',
            ],
            replacement_template="[DATE_{}]"
        ),
        # Policy number patterns (replaces only the number, keeps labels like "Police Nr.")
        Pattern(
            name="policy",
            patterns=[
                # Number after "Police Nr." etc. (fixed-width lookbehind Python re requirement)
                # NOTE(review): the next two entries look identical here;
                # possibly one originally used a different whitespace
                # character - verify against the upstream file.
                r'(?<=Police Nr\. )[\d.]+',
                r'(?<=Police Nr\. )[\d.]+',
                r'(?<=Police Nr\.: )[\d.]+',
                r'(?<=Police Nr )[\d.]+',
                r'(?<=Police Nr: )[\d.]+',
                r'(?<=Polizzenr\. )[\d.]+',
                r'(?<=Polizzenummer: )[\d.]+',
                r'(?<=Polizzenummer )[\d.]+',
                r'(?<=Policy No\. )[\d.]+',
                r'(?<=Policy No )[\d.]+',
                r'(?<=Policy Number: )[\d.]+',
                r'(?<=Policy Number )[\d.]+',
                r'(?<=Polizza n° )[\d.]+',
                r'(?<=Numéro de police: )[\d.]+',
                r'(?<=Numéro de police )[\d.]+',
                r'(?<=Numero polizza: )[\d.]+',
                r'(?<=Numero polizza )[\d.]+',
                # Standalone policy number format - exclude when part of UID (CHE-115.665.634)
                r'(?<!CHE-)(?<!DE-)(?<!FR-)(?<!IT-)\b\d{2,4}(?:\.\d{3}){2,}(?:/[A-Za-z0-9]+)?\b'
            ],
            replacement_template="[POLICY_{}]"
        ),
        # SSN patterns
        Pattern(
            name="ssn",
            patterns=[
                r'\b(?:756|757|758|759)\.\d{4}\.\d{4}\.\d{2}\b(?!,)',  # Swiss AHV - exclude before decimal
                r'\b(?:CHE|DE|FR|IT)-\d{3}\.\d{3}\.\d{3}\b',  # Company IDs (must be before generic)
                # Generic SSN format - exclude when part of company ID or before decimal
                r'(?<!CHE-)(?<!DE-)(?<!FR-)(?<!IT-)\b\d{3}\.\d{3}\.\d{3}\b(?!,\d)'
            ],
            replacement_template="[SSN_{}]"
        )
    ]
class TextTablePatterns:
    """Recognise table-like "key<separator>value" lines in plain text.

    Two separators are supported, ":" and "=".  All helpers are static and
    look at one line at a time; the line is stripped before matching.
    """

    @staticmethod
    def getPatterns() -> List[Tuple[str, str]]:
        """Return the ``(regex, separator)`` pairs in the order they are tried.

        Every regex captures the key in group 1 and the value in group 2.
        """
        colonLoose = r'^([^:]+):\s*(.+)$'       # key: value, whitespace optional
        equalsLoose = r'^([^=]+)=\s*(.+)$'      # key = value, whitespace optional
        equalsSpaced = r'^([^=]+)\s+=\s+(.+)$'  # key = value, whitespace required
        colonSpaced = r'^([^:]+)\s+:\s+(.+)$'   # key : value, whitespace required
        return [
            (colonLoose, ':'),
            (equalsLoose, '='),
            (equalsSpaced, '='),
            (colonSpaced, ':'),
        ]

    @staticmethod
    def _isTableLine(line: str) -> bool:
        """Tell whether the stripped line matches at least one table regex."""
        stripped = line.strip()
        for regex, _separator in TextTablePatterns.getPatterns():
            if re.match(regex, stripped):
                return True
        return False

    @staticmethod
    def extractKeyValue(line: str) -> Optional[Tuple[str, str]]:
        """Split a table line into ``(key, value)``, both stripped.

        The regexes are tried in ``getPatterns()`` order and the first one
        that matches wins.  Returns ``None`` when no regex matches.
        """
        stripped = line.strip()
        attempts = (
            re.match(regex, stripped)
            for regex, _separator in TextTablePatterns.getPatterns()
        )
        found = next((hit for hit in attempts if hit), None)
        if found is None:
            return None
        return found.group(1).strip(), found.group(2).strip()
def getPatternForHeader(header: str, patterns: List[Pattern]) -> Optional[Pattern]:
    """
    Return the first pattern whose regexes recognise the given header.

    The header is lower-cased and stripped, then searched case-insensitively
    with every regex of every pattern, in list order.

    Args:
        header: Header text to classify; a falsy value yields None
        patterns: Candidate patterns, checked front to back

    Returns:
        Optional[Pattern]: First pattern with a matching regex, else None
    """
    if not header:
        return None

    normalized = header.lower().strip()
    for candidate in patterns:
        if any(re.search(regex, normalized, re.IGNORECASE) for regex in candidate.patterns):
            return candidate
    return None
def findPatternsInText(text: str, patterns: List[Pattern]) -> List[Tuple[str, str, int, int]]:
    """
    Find all pattern matches in text

    Every regex of every pattern is applied case-insensitively. Matches are
    not de-duplicated or merged: overlapping hits from different regexes
    are all reported.

    Args:
        text: Text to search; None or an empty string yields an empty list
        patterns: List of patterns to check

    Returns:
        List of (pattern_name, matched_text, start, end) tuples sorted by
        start offset; hits with the same start keep pattern/regex order
    """
    # Same guard as getPatternForHeader: without it a None text raised
    # TypeError inside re.finditer.
    if not text:
        return []

    matches = [
        (pattern.name, match.group(0), match.start(), match.end())
        for pattern in patterns
        for regex in pattern.patterns
        for match in re.finditer(regex, text, re.IGNORECASE)
    ]
    # list.sort is stable, so equal start offsets stay in discovery order.
    matches.sort(key=lambda item: item[2])
    return matches