# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Pattern definitions for data anonymization
Separates header patterns from data patterns
"""

from dataclasses import dataclass
from typing import List, Optional, Tuple
import re

@dataclass
class Pattern:
    """Base class for patterns: a named group of regexes with one replacement template."""

    # Category identifier, e.g. "name", "email", "iban"
    name: str
    # Regular-expression source strings belonging to this category
    patterns: List[str]
    # Placeholder text with one "{}" slot, e.g. "[NAME_{}]"
    # (presumably filled with a per-value identifier by the caller - confirm)
    replacement_template: str


class HeaderPatterns:
    """Patterns for identifying sensitive data in headers.

    Each entry groups the label spellings (English, German, French, Italian)
    of one category. The regexes describe column/field labels, not values.
    When the list is passed to getPatternForHeader, the first entry with a
    matching regex wins, so the order of the entries is significant.
    """

    patterns: List[Pattern] = [
        # Name patterns
        Pattern(
            name="name",
            patterns=[
                # Simple variations
                r'\b(?:name|first[-_\s]*name|last[-_\s]*name|full[-_\s]*name)\b',
                r'\b(?:customer[-_\s]*name|client[-_\s]*name|user[-_\s]*name)\b',
                r'\b(?:given[-_\s]*name|family[-_\s]*name|surname)\b',
                # German variations
                r'\b(?:vorname|nachname|vollständiger[-_\s]*name|name)\b',
                r'\b(?:kunden[-_\s]*name|kunde[-_\s]*name|benutzer[-_\s]*name)\b',
                # French variations
                r'\b(?:prénom|nom|nom[-_\s]*complet)\b',
                r'\b(?:nom[-_\s]*du[-_\s]*client|nom[-_\s]*d\'utilisateur)\b',
                # Italian variations
                r'\b(?:nome|cognome|nome[-_\s]*completo)\b',
                r'\b(?:nome[-_\s]*cliente|nome[-_\s]*utente)\b',
                # Common variations
                r'\b(?:nom|name|nome|naam)\b'
            ],
            replacement_template="[NAME_{}]"
        ),

        # Email patterns
        Pattern(
            name="email",
            patterns=[
                # Simple variations - only labels
                r'\b(?:email|e[-_\s]*mail|mail)\s*:?\b',
                r'\b(?:contact[-_\s]*email|user[-_\s]*email|client[-_\s]*email)\s*:?\b',
                r'\b(?:customer[-_\s]*email|customer[-_\s]*mail|customer[-_\s]*e[-_\s]*mail)\s*:?\b',
                # German variations - only labels
                r'\b(?:e[-_\s]*mail|e[-_\s]*post|mail[-_\s]*adresse)\s*:?\b',
                r'\b(?:kontakt[-_\s]*email|benutzer[-_\s]*email|kunden[-_\s]*email)\s*:?\b',
                r'\b(?:kunden[-_\s]*mail|kunden[-_\s]*e[-_\s]*mail|kunden[-_\s]*e[-_\s]*post)\s*:?\b',
                # French variations - only labels
                r'\b(?:courriel|e[-_\s]*mail|adresse[-_\s]*e[-_\s]*mail)\s*:?\b',
                r'\b(?:courriel[-_\s]*de[-_\s]*contact|e[-_\s]*mail[-_\s]*client)\s*:?\b',
                r'\b(?:courriel[-_\s]*client|courriel[-_\s]*utilisateur|mail[-_\s]*client)\s*:?\b',
                # Italian variations - only labels
                r'\b(?:posta[-_\s]*elettronica|e[-_\s]*mail|indirizzo[-_\s]*e[-_\s]*mail)\s*:?\b',
                r'\b(?:email[-_\s]*cliente|email[-_\s]*utente)\s*:?\b',
                r'\b(?:mail[-_\s]*cliente|mail[-_\s]*utente|posta[-_\s]*cliente)\s*:?\b'
            ],
            replacement_template="[EMAIL_{}]"
        ),

        # Phone patterns
        Pattern(
            name="phone",
            patterns=[
                # Simple variations
                r'\b(?:phone|tel|telephone|mobile)\b',
                r'\b(?:contact[-_\s]*number|phone[-_\s]*number|tel[-_\s]*number)\b',
                # German variations
                r'\b(?:telefon|mobil|handy|telefon[-_\s]*nummer)\b',
                r'\b(?:kontakt[-_\s]*nummer|telefon[-_\s]*nummer|tel[-_\s]*nummer)\b',
                # French variations
                r'\b(?:téléphone|portable|mobile|numéro[-_\s]*de[-_\s]*téléphone)\b',
                r'\b(?:numéro[-_\s]*de[-_\s]*contact|tél[-_\s]*fixe|tél[-_\s]*mobile)\b',
                # Italian variations
                r'\b(?:telefono|cellulare|mobile|numero[-_\s]*di[-_\s]*telefono)\b',
                r'\b(?:numero[-_\s]*di[-_\s]*contatto|tel[-_\s]*fisso|tel[-_\s]*mobile)\b'
            ],
            replacement_template="[PHONE_{}]"
        ),

        # IBAN patterns (bank account, credit card and payment labels share this category)
        Pattern(
            name="iban",
            patterns=[
                # Simple variations
                r'\b(?:iban|bank[-_\s]*account|account[-_\s]*number)\b',
                r'\b(?:bank[-_\s]*details|account[-_\s]*details|banking[-_\s]*info)\b',
                # German variations
                r'\b(?:iban|bank[-_\s]*konto|konto[-_\s]*nummer)\b',
                r'\b(?:bank[-_\s]*verbindung|konto[-_\s]*verbindung|bank[-_\s]*daten)\b',
                # French variations
                r'\b(?:iban|compte[-_\s]*bancaire|numéro[-_\s]*de[-_\s]*compte)\b',
                r'\b(?:coordonnées[-_\s]*bancaires|détails[-_\s]*bancaires)\b',
                # Credit card variations in French
                r'\b(?:carte[-_\s]*de[-_\s]*credit|carte[-_\s]*credit|numero[-_\s]*carte[-_\s]*credit)\b',
                r'\b(?:carte[-_\s]*bancaire|carte[-_\s]*de[-_\s]*paiement)\b',
                r'\b(?:carte[-_\s]*de[-_\s]*crédit|carte[-_\s]*crédit|numéro[-_\s]*carte[-_\s]*crédit)\b',
                r'\b(?:carte[-_\s]*de[-_\s]*débit|carte[-_\s]*débit|numéro[-_\s]*carte[-_\s]*débit)\b',
                # Italian variations
                r'\b(?:iban|conto[-_\s]*bancario|numero[-_\s]*di[-_\s]*conto)\b',
                r'\b(?:coordinate[-_\s]*bancarie|dettagli[-_\s]*bancari)\b',
                # Common variations
                r'\b(?:bankkonto|bank[-_\s]*konto|conto[-_\s]*di[-_\s]*banca)\b',
                # Credit card variations
                r'\b(?:credit[-_\s]*card|credit[-_\s]*card[-_\s]*number|credit[-_\s]*card[-_\s]*no)\b',
                r'\b(?:credit[-_\s]*card[-_\s]*nr|credit[-_\s]*card[-_\s]*num)\b',
                r'\b(?:credit[-_\s]*card[-_\s]*id|credit[-_\s]*card[-_\s]*code)\b',
                r'\b(?:credit[-_\s]*card[-_\s]*reference|credit[-_\s]*card[-_\s]*ref)\b',
                r'\b(?:credit[-_\s]*card[-_\s]*details|credit[-_\s]*card[-_\s]*info)\b',
                r'\b(?:credit[-_\s]*card[-_\s]*data|credit[-_\s]*card[-_\s]*account)\b',
                # Credit card variations in other languages
                r'\b(?:kredit[-_\s]*karte|kreditkarte|kredit[-_\s]*karten[-_\s]*nummer)\b',
                r'\b(?:carta[-_\s]*di[-_\s]*credito|carta[-_\s]*credito|numero[-_\s]*carta[-_\s]*credito)\b',
                # Payment variations
                r'\b(?:payment[-_\s]*details|payment[-_\s]*info|payment[-_\s]*data)\b',
                r'\b(?:zahlungs[-_\s]*details|zahlungs[-_\s]*informationen|zahlungs[-_\s]*daten)\b',
                r'\b(?:détails[-_\s]*de[-_\s]*paiement|informations[-_\s]*de[-_\s]*paiement)\b',
                r'\b(?:dettagli[-_\s]*di[-_\s]*pagamento|informazioni[-_\s]*di[-_\s]*pagamento)\b',
                # Common credit card abbreviations
                r'\b(?:cc[-_\s]*number|cc[-_\s]*no|cc[-_\s]*nr)\b',
                r'\b(?:cc[-_\s]*num|cc[-_\s]*id|cc[-_\s]*code)\b',
                r'\b(?:cc[-_\s]*ref|cc[-_\s]*details|cc[-_\s]*info)\b',
                r'\b(?:cc[-_\s]*data|cc[-_\s]*account)\b',
                # Simple credit card
                r'\b(?:credit[-_\s]*card|credit[-_\s]*card[-_\s]*number)\b',
                # Additional credit card variations
                r'\b(?:card[-_\s]*number|card[-_\s]*no|card[-_\s]*nr)\b',
                r'\b(?:card[-_\s]*num|card[-_\s]*id|card[-_\s]*code)\b',
                r'\b(?:card[-_\s]*ref|card[-_\s]*details|card[-_\s]*info)\b',
                r'\b(?:card[-_\s]*data|card[-_\s]*account)\b'
            ],
            replacement_template="[IBAN_{}]"
        ),

        # Address patterns
        Pattern(
            name="address",
            patterns=[
                # English variations
                r'\b(?:address|street[-_\s]*address|mailing[-_\s]*address)\b',
                r'\b(?:home[-_\s]*address|work[-_\s]*address|billing[-_\s]*address)\b',
                r'\b(?:.*address.*)\b',  # Match any text containing "address"
                # German variations
                r'\b(?:adresse|strassen[-_\s]*adresse|post[-_\s]*adresse)\b',
                r'\b(?:wohn[-_\s]*adresse|geschäfts[-_\s]*adresse|rechnungs[-_\s]*adresse)\b',
                r'\b(?:.*adresse.*)\b',  # Match any text containing "adresse"
                # French variations
                r'\b(?:adresse|adresse[-_\s]*postale|adresse[-_\s]*de[-_\s]*livraison)\b',
                r'\b(?:adresse[-_\s]*personnelle|adresse[-_\s]*professionnelle)\b',
                r'\b(?:.*adresse.*)\b',  # Match any text containing "adresse"
                # Italian variations
                r'\b(?:indirizzo|indirizzo[-_\s]*postale|indirizzo[-_\s]*di[-_\s]*consegna)\b',
                r'\b(?:indirizzo[-_\s]*personale|indirizzo[-_\s]*professionale)\b',
                r'\b(?:.*indirizzo.*)\b',  # Match any text containing "indirizzo"
                # Common variations
                r'\b(?:location|place|residence|domicile)\b',
                r'\b(?:standort|ort|wohnort|domizil)\b',
                r'\b(?:lieu|emplacement|résidence|domicile)\b',
                r'\b(?:luogo|posizione|residenza|domicilio)\b'
            ],
            replacement_template="[ADDRESS_{}]"
        ),

        # Date patterns
        Pattern(
            name="date",
            patterns=[
                # English variations
                r'\b(?:date|birth[-_\s]*date|date[-_\s]*of[-_\s]*birth)\b',
                r'\b(?:dob|birthday|anniversary)\b',
                # German variations
                r'\b(?:datum|geburt[-_\s]*datum|geboren[-_\s]*am)\b',
                r'\b(?:geburtstag|jubiläum|feier[-_\s]*tag)\b',
                r'\b(?:geboren|geboren[-_\s]*am)\b',
                # "geb." ends in a dot, so a trailing \b would only hold when a
                # word character follows; anchor the abbreviation on the left only
                r'\bgeb\.',
                # French variations
                r'\b(?:date|date[-_\s]*de[-_\s]*naissance|né[-_\s]*le)\b',
                r'\b(?:anniversaire|date[-_\s]*anniversaire)\b',
                r'\b(?:né|née|né[-_\s]*le)\b',
                # Italian variations
                r'\b(?:data|data[-_\s]*di[-_\s]*nascita|nato[-_\s]*il)\b',
                r'\b(?:compleanno|anniversario)\b',
                r'\b(?:nato|nata|nato[-_\s]*il)\b',
                # Common variations
                r'\b(?:birth|born|geboren|né|nato)\b'
            ],
            replacement_template="[DATE_{}]"
        ),

        # SSN patterns
        Pattern(
            name="ssn",
            patterns=[
                # English variations
                r'\b(?:ssn|social[-_\s]*security[-_\s]*number|tax[-_\s]*id)\b',
                r'\b(?:tax[-_\s]*identification|national[-_\s]*id)\b',
                # German variations
                r'\b(?:ahv[-_\s]*nummer|sozial[-_\s]*versicherungs[-_\s]*nummer)\b',
                r'\b(?:steuer[-_\s]*nummer|steuer[-_\s]*id|svn)\b',
                r'\b(?:ahv[-_\s]*nr|ahv[-_\s]*no|ahv[-_\s]*num)\b',
                # French variations
                r'\b(?:numéro[-_\s]*avs|numéro[-_\s]*de[-_\s]*sécurité[-_\s]*sociale)\b',
                r'\b(?:numéro[-_\s]*fiscal|numéro[-_\s]*d\'identification)\b',
                # Italian variations
                r'\b(?:numero[-_\s]*avs|numero[-_\s]*di[-_\s]*sicurezza[-_\s]*sociale)\b',
                r'\b(?:numero[-_\s]*fiscale|codice[-_\s]*fiscale)\b',
                # Common variations
                r'\b(?:ahv|svn|nss|avs)\b',
                # Additional AHV variations
                r'\b(?:ahv_nummer|ahvnummer|ahv-nummer|ahv_number)\b',
                r'\b(?:ahv[-_\s]*number|ahv[-_\s]*id|ahv[-_\s]*code)\b',
                r'\b(?:ahv[-_\s]*reference)\b',
                r'\b(?:ahv[-_\s]*reference[-_\s]*(?:number|no|nr|num|id|code))\b'
            ],
            replacement_template="[SSN_{}]"
        )
    ]


class DataPatterns:
    """Patterns for identifying sensitive data in content.

    Unlike HeaderPatterns, these regexes describe the values themselves
    (names after a salutation, e-mail addresses, phone numbers, ...).
    findPatternsInText applies every regex with re.IGNORECASE, so character
    classes and literal words below are effectively case-insensitive there.
    """

    patterns: List[Pattern] = [
        # Name patterns (before email so "name@domain" is not matched as name)
        Pattern(
            name="name",
            patterns=[
                # Contact person context (fixed-width lookbehind for Python re)
                r'(?<=Ansprechperson: )[A-Za-zäöüßÄÖÜ][a-zäöüß]+\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]+',
                r'(?<=Leiter: )[A-Za-zäöüßÄÖÜ][a-zäöüß]+\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]+',
                r'(?<=Kontaktperson: )[A-Za-zäöüßÄÖÜ][a-zäöüß]+\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]+',
                # Name only after Anrede (keep Frau/Herr; replace only the name) – fixed-width lookbehind
                # Use [ \t]+ not \s+ so we don't match across line breaks (avoids grabbing "Es" from "Es freut uns sehr")
                r'(?<=Frau )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
                r'(?<=Herr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
                # The short English titles carry a leading \b (zero-width, so the
                # lookbehind stays fixed-width): without it, case-insensitive
                # matching treats any word ending in "ms"/"mr"/"dr" ("Systems ",
                # "Teams ") as a title and swallows the words that follow
                r'(?<=\bMr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
                r'(?<=\bMr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
                r'(?<=\bMrs )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
                r'(?<=\bMrs\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
                r'(?<=\bMs )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
                r'(?<=\bMs\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
                r'(?<=\bDr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
                r'(?<=\bDr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
            ],
            replacement_template="[NAME_{}]"
        ),

        # Email pattern for plain text
        Pattern(
            name="email",
            patterns=[
                # Basic email pattern
                r'[A-Za-z0-9._%+-]+@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)*'
            ],
            replacement_template="[EMAIL_{}]"
        ),

        # Phone patterns
        Pattern(
            name="phone",
            patterns=[
                # Swiss full format: +41 44 315 19 19 (area + 3 + 2 + 2 digits, flexible separators)
                r'\+\s*41[-.\s]?\d{2}[-.\s]?\d{3}[-.\s]?\d{2}[-.\s]?\d{2}\b',
                # International format
                r'\+\d{1,3}[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}\b',
                # Swiss format
                r'\b(?:0\d{1,2}|0041\d{1,2})[-.\s]?\d{3}[-.\s]?\d{2}[-.\s]?\d{2}\b',
                # German format
                r'\b(?:0\d{1,4}|0049\d{1,4})[-.\s]?\d{3,}[-.\s]?\d{3,}\b',
                # French format
                r'\b(?:0\d{1,2}|0033\d{1,2})[-.\s]?\d{1,2}[-.\s]?\d{2}[-.\s]?\d{2}[-.\s]?\d{2}\b',
                # Italian format
                r'\b(?:0\d{1,3}|0039\d{1,3})[-.\s]?\d{3,}[-.\s]?\d{3,}\b',
                # Mobile numbers
                r'\b(?:07|00417|004917|00337|00397)\d{8,9}\b',
                # Emergency numbers
                r'\b(?:112|911|118|117|144|1414)\b'
            ],
            replacement_template="[PHONE_{}]"
        ),

        # IBAN patterns
        Pattern(
            name="iban",
            patterns=[
                # Country code + 2 check digits, then the account part in
                # groups of four with optional single whitespace between groups.
                # Covers CH (21 chars), DE (22) and FR/IT (27, may contain
                # letters) as well as the 26-digit form matched previously.
                # The first group must end in three digits (bank code; for IT
                # a check letter followed by digits) to keep ordinary words
                # after a token like "it22" from matching.
                r'\b(?:CH|DE|FR|IT)\d{2}\s?[A-Za-z0-9]\d{3}(?:\s?[A-Za-z0-9]{4}){3,4}\s?[A-Za-z0-9]{1,3}\b'
            ],
            replacement_template="[IBAN_{}]"
        ),

        # Address patterns (compound first so full footer = one UUID)
        Pattern(
            name="address",
            patterns=[
                # Full address block: company, street, postfach, postal+city (stop before domain like , AXA.ch)
                # Supports Swiss PLZ (4 digits) and German PLZ (5 digits)
                # NOTE(review): "$" only anchors at the end of the input unless the
                # caller passes re.MULTILINE; findPatternsInText passes
                # re.IGNORECASE only - confirm this is intended
                r'\b[^,\n]+(?:,\s*[^,\n]+)*,\s*\d{4,5}\s+[A-Za-zäöüßÄÖÜ]+\s*(?=,\s*[a-zA-Z0-9.-]+\.(?:ch|com|org|net)\b|$)',
                # Street + house number (standalone); includes "straße" for German
                r'\b(?:[A-Za-zäöüßÄÖÜ]+(?:-[A-Za-zäöüßÄÖÜ]+)*(?:straße|strasse|str\.|gasse|weg|platz|allee|boulevard|avenue|via|strada|rue|chemin|route))\s+\d{1,4}(?:/\d{1,4})?(?:[a-z])?\b',
                # Postfach / PO Box (standalone)
                r'\b(?:Postfach|Postbox|P\.?O\.?\s*Box|Case\s+postale|Casella\s+postale|Boîte\s+postale)\s+\d{1,6}\b',
                # Postal code + city (standalone); exclude year+non-city and common non-city words
                # (?<!\d{2}\.\d{2}\.) = not part of date DD.MM.YYYY (e.g. 27.01.2026)
                # Exclude business terms (Marketing, Qualitätsmanagement, etc.) – often follow years
                # Swiss PLZ (4 digits) and German PLZ (5 digits)
                r'(?<!\d{2}\.\d{2}\.)\b\d{4,5}\s+(?!den|der|die|das|dem|des|und|oder|für|bei|mit|Version|Versand|Vertrag|Verfügung|Verschickung|Versicherung|erhalten|Schreiben|Jahr|Jahres|incomplete|Application|Complete|Pending|Matrikel|Student|Studien|Kontakt|Telefon|Rechnung|Invoice|Marketing|Verkaufsstrategien|Qualitätsmanagement|Management|Strategien|Projektmanagement|Vertrieb|Vertriebsstrategien|Ausbildungsstätte|Realschule)[A-Za-zäöüßÄÖÜ]+\b(?!\s*:)'
            ],
            replacement_template="[ADDRESS_{}]"
        ),

        # Date patterns (all languages and formats)
        Pattern(
            name="date",
            patterns=[
                # DD.MM.YYYY / DD/MM/YYYY / DD-MM-YYYY (European)
                r'\b(0?[1-9]|[12]\d|3[01])[./-](0?[1-9]|1[0-2])[./-]\d{2,4}\b',
                r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\.[\s]*\d{2,4}\b',
                r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\.(?!\d)\b',
                r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\b(?!\.?\d)(?!/\d)',
                # YYYY-MM-DD / YYYY/MM/DD / YYYY.MM.DD (ISO)
                r'\b\d{4}[./-](0?[1-9]|1[0-2])[./-](0?[1-9]|[12]\d|3[01])\b',
                # MM/DD/YYYY / MM-DD-YYYY (US)
                r'\b(0?[1-9]|1[0-2])[./-](0?[1-9]|[12]\d|3[01])[./-]\d{2,4}\b',
                # geboren/birth/né/nato + am/le/on/il/op (DE/EN/FR/IT/NL)
                r'\b(?:geboren|birth|né|nato|nata)\s+(?:am|le|on|il|op)\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
                r'\b(?:geboren|birth|né|nato|nata)\s+(?:am|le|on|il|op)\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
                r'\b(?:geboren|birth|né|nato|nata)\s+(?:am|le|on|il)\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|january|february|march|april|may|june|july|august|september|october|november|december|januar|februar|märz|juni|juli|oktober|november|dezember|gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|dicembre|janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|januari|februari|maart|april|mei|juni|juli|augustus|september|oktober|november|december|janeiro|fevereiro|março|abril|maio|junho|julho|agosto|setembro|outubro|novembro|dezembro)[a-z]*\s+\d{4}\b',
                # vertrag/contract/contrat + vom/from/du/dal/van
                r'\b(?:vertrag|contract|contrat|contratto)\s+(?:vom|from|du|dal|del|van)\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
                r'\b(?:vertrag|contract|contrat|contratto)\s+(?:vom|from|du|dal|del|van)\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
                r'\b(?:vertrag|contract|contrat|contratto)\s+(?:vom|from|du|dal|del)\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|january|february|march|april|may|june|july|august|september|october|november|december|januar|februar|märz|juni|juli|oktober|november|dezember|gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|dicembre|janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|januari|februari|maart|mei|augustus|janeiro|fevereiro|março|maio|junho|julho|setembro|outubro|novembro|dezembro)[a-z]*\s+\d{4}\b',
                # datum/date/data/fecha + numeric (fixed-width lookbehind, keeps label)
                r'(?<=datum: )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
                r'(?<=date: )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
                r'(?<=data: )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
                r'(?<=fecha: )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
                r'(?<=datum )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
                r'(?<=date )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
                r'(?<=datum: )[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
                r'(?<=date: )[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
                r'(?<=data: )[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
                r'(?<=fecha: )[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
                # day + month name + year (17 February 2026, 17. Februar 2026)
                r'\b(0?[1-9]|[12]\d|3[01])\s*(?:\.|\.\s*)?(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|january|february|march|april|may|june|july|august|september|october|november|december|januar|februar|märz|juni|juli|oktober|november|dezember|gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|dicembre|janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|januari|februari|maart|mei|augustus|janeiro|fevereiro|março|maio|junho|julho|setembro|outubro|novembro|dezembro)[a-z]*\s+\d{4}\b',
                # month name + day + year (February 17, 2026)
                r'\b(?:january|february|march|april|may|june|july|august|september|october|november|december|januar|februar|märz|april|mai|juni|juli|oktober|november|dezember|gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|dicembre|janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|januari|februari|maart|mei|augustus|janeiro|fevereiro|março|maio|junho|julho|setembro|outubro|novembro|dezembro)[a-z]*\s+(?:0?[1-9]|[12]\d|3[01])[,\s]+\d{4}\b',
            ],
            replacement_template="[DATE_{}]"
        ),

        # Policy number patterns (replaces only the number, keeps labels like "Police Nr.")
        Pattern(
            name="policy",
            patterns=[
                # Number after "Police Nr." etc. (fixed-width lookbehind – Python re requirement)
                r'(?<=Police Nr\. )[\d.]+',
                r'(?<=Police Nr\.: )[\d.]+',
                r'(?<=Police Nr )[\d.]+',
                r'(?<=Police Nr: )[\d.]+',
                r'(?<=Polizzenr\. )[\d.]+',
                r'(?<=Polizzenummer: )[\d.]+',
                r'(?<=Polizzenummer )[\d.]+',
                r'(?<=Policy No\. )[\d.]+',
                r'(?<=Policy No )[\d.]+',
                r'(?<=Policy Number: )[\d.]+',
                r'(?<=Policy Number )[\d.]+',
                r'(?<=Polizza n° )[\d.]+',
                r'(?<=Numéro de police: )[\d.]+',
                r'(?<=Numéro de police )[\d.]+',
                r'(?<=Numero polizza: )[\d.]+',
                r'(?<=Numero polizza )[\d.]+',
                # Standalone policy number format - exclude when part of UID (CHE-115.665.634)
                r'(?<!CHE-)(?<!DE-)(?<!FR-)(?<!IT-)\b\d{2,4}(?:\.\d{3}){2,}(?:/[A-Za-z0-9]+)?\b'
            ],
            replacement_template="[POLICY_{}]"
        ),

        # SSN patterns
        Pattern(
            name="ssn",
            patterns=[
                r'\b(?:756|757|758|759)\.\d{4}\.\d{4}\.\d{2}\b(?!,)',  # Swiss AHV - exclude before decimal
                r'\b(?:CHE|DE|FR|IT)-\d{3}\.\d{3}\.\d{3}\b',  # Company IDs (must be before generic)
                # Generic SSN format - exclude when part of company ID or before decimal
                r'(?<!CHE-)(?<!DE-)(?<!FR-)(?<!IT-)\b\d{3}\.\d{3}\.\d{3}\b(?!,\d)'
            ],
            replacement_template="[SSN_{}]"
        )
    ]


class TextTablePatterns:
    """Patterns for identifying table-like structures in text"""

    @staticmethod
    def getPatterns() -> List[Tuple[str, str]]:
        """Return the (regex, separator) pairs that describe a "key <sep> value" line.

        Each regex captures the key in group 1 and the value in group 2.
        """
        return [
            # key: value pattern (with optional whitespace)
            (r'^([^:]+):\s*(.+)$', ':'),
            # key = value pattern (with optional whitespace)
            (r'^([^=]+)=\s*(.+)$', '='),
            # key = value pattern (with required whitespace)
            (r'^([^=]+)\s+=\s+(.+)$', '='),
            # key: value pattern (with required whitespace)
            (r'^([^:]+)\s+:\s+(.+)$', ':'),
        ]

    @staticmethod
    def _isTableLine(line: str) -> bool:
        """Check if a line matches any table pattern"""
        stripped = line.strip()
        return any(
            re.match(regex, stripped)
            for regex, _separator in TextTablePatterns.getPatterns()
        )

    @staticmethod
    def extractKeyValue(line: str) -> Optional[Tuple[str, str]]:
        """Extract key and value from a table line.

        When a line contains both separators (e.g. "url = http://host"), the
        separator that appears first decides the split, so a ":" or "=" inside
        the value does not end up in the key.

        Args:
            line: Raw text line; surrounding whitespace is ignored

        Returns:
            Optional[Tuple[str, str]]: (key, value), both stripped, or None
            when no pattern matches
        """
        stripped = line.strip()
        best_position = None
        best_pair = None
        for regex, separator in TextTablePatterns.getPatterns():
            match = re.match(regex, stripped)
            if match is None:
                continue
            # The key group cannot contain the separator, so the separator
            # that split this match is its first occurrence in the line
            position = stripped.find(separator)
            # Strict "<" keeps the earlier list entry on ties
            if best_position is None or position < best_position:
                best_position = position
                best_pair = (match.group(1).strip(), match.group(2).strip())
        return best_pair


def getPatternForHeader(header: str, patterns: List["Pattern"]) -> Optional["Pattern"]:
    """
    Find matching pattern for a header

    Args:
        header: The header to check. Falsy values (None, "") yield None;
            non-string values (e.g. numeric column labels) are converted
            with str() before matching
        patterns: List of patterns to check against, in priority order

    Returns:
        Optional[Pattern]: First pattern with a matching regex, or None
    """
    if not header:
        return None

    # Normalise once; matching is additionally case-insensitive
    normalized = str(header).lower().strip()

    for candidate in patterns:
        if any(re.search(regex, normalized, re.IGNORECASE) for regex in candidate.patterns):
            return candidate
    return None


def findPatternsInText(text: str, patterns: List["Pattern"]) -> List[tuple]:
    """
    Find all pattern matches in text

    Every regex of every pattern is applied case-insensitively. Matches of
    different regexes may overlap; no overlap resolution is done here.

    Args:
        text: Text to search; None or an empty string yields an empty list
        patterns: List of patterns to check

    Returns:
        List[tuple]: List of (pattern_name, match, start, end), sorted by
        start position (ties keep pattern order)
    """
    if not text:
        return []

    matches = [
        (group.name, hit.group(0), hit.start(), hit.end())
        for group in patterns
        for regex in group.patterns
        for hit in re.finditer(regex, text, re.IGNORECASE)
    ]
    # list.sort is stable, so equal start positions stay in pattern order
    matches.sort(key=lambda item: item[2])
    return matches