451 lines
No EOL
22 KiB
Python
451 lines
No EOL
22 KiB
Python
# Copyright (c) 2025 Patrick Motsch
|
||
# All rights reserved.
|
||
"""
|
||
Pattern definitions for data anonymization
|
||
Separates header patterns from data patterns
|
||
"""
|
||
|
||
from dataclasses import dataclass
|
||
from typing import List, Optional, Tuple
|
||
import re
|
||
|
||
@dataclass
class Pattern:
    """A named category of regular expressions plus its replacement template."""

    # Category identifier, e.g. "name", "email" or "iban".
    name: str
    # Regular-expression source strings that belong to this category.
    patterns: List[str]
    # Placeholder text containing one "{}" slot (e.g. "[NAME_{}]"); the value
    # substituted into the slot is chosen by the code that uses the pattern.
    replacement_template: str
|
||
|
||
class HeaderPatterns:
    """Regexes that recognise headers/labels announcing sensitive data.

    ``patterns`` is an ordered list of ``Pattern`` categories.  Each regex is
    written in lower case and covers English, German, French and Italian
    label spellings, allowing ``-``, ``_`` or whitespace between word parts.

    The order of the categories matters for any first-match lookup: e.g.
    "email address" hits the ``email`` category before the catch-all
    ``.*address.*`` entry of the ``address`` category is tried.
    """

    patterns = [
        # Name patterns
        Pattern(
            name="name",
            patterns=[
                # Simple variations
                r'\b(?:name|first[-_\s]*name|last[-_\s]*name|full[-_\s]*name)\b',
                r'\b(?:customer[-_\s]*name|client[-_\s]*name|user[-_\s]*name)\b',
                r'\b(?:given[-_\s]*name|family[-_\s]*name|surname)\b',
                # German variations
                r'\b(?:vorname|nachname|vollständiger[-_\s]*name|name)\b',
                r'\b(?:kunden[-_\s]*name|kunde[-_\s]*name|benutzer[-_\s]*name)\b',
                # French variations
                r'\b(?:prénom|nom|nom[-_\s]*complet)\b',
                r'\b(?:nom[-_\s]*du[-_\s]*client|nom[-_\s]*d\'utilisateur)\b',
                # Italian variations
                r'\b(?:nome|cognome|nome[-_\s]*completo)\b',
                r'\b(?:nome[-_\s]*cliente|nome[-_\s]*utente)\b',
                # Common variations
                r'\b(?:nom|name|nome|naam)\b',
            ],
            replacement_template="[NAME_{}]",
        ),

        # Email patterns
        Pattern(
            name="email",
            patterns=[
                # Simple variations - only labels
                r'\b(?:email|e[-_\s]*mail|mail)\s*:?\b',
                r'\b(?:contact[-_\s]*email|user[-_\s]*email|client[-_\s]*email)\s*:?\b',
                r'\b(?:customer[-_\s]*email|customer[-_\s]*mail|customer[-_\s]*e[-_\s]*mail)\s*:?\b',
                # German variations - only labels
                r'\b(?:e[-_\s]*mail|e[-_\s]*post|mail[-_\s]*adresse)\s*:?\b',
                r'\b(?:kontakt[-_\s]*email|benutzer[-_\s]*email|kunden[-_\s]*email)\s*:?\b',
                r'\b(?:kunden[-_\s]*mail|kunden[-_\s]*e[-_\s]*mail|kunden[-_\s]*e[-_\s]*post)\s*:?\b',
                # French variations - only labels
                r'\b(?:courriel|e[-_\s]*mail|adresse[-_\s]*e[-_\s]*mail)\s*:?\b',
                r'\b(?:courriel[-_\s]*de[-_\s]*contact|e[-_\s]*mail[-_\s]*client)\s*:?\b',
                r'\b(?:courriel[-_\s]*client|courriel[-_\s]*utilisateur|mail[-_\s]*client)\s*:?\b',
                # Italian variations - only labels
                r'\b(?:posta[-_\s]*elettronica|e[-_\s]*mail|indirizzo[-_\s]*e[-_\s]*mail)\s*:?\b',
                r'\b(?:email[-_\s]*cliente|email[-_\s]*utente)\s*:?\b',
                r'\b(?:mail[-_\s]*cliente|mail[-_\s]*utente|posta[-_\s]*cliente)\s*:?\b',
            ],
            replacement_template="[EMAIL_{}]",
        ),

        # Phone patterns
        Pattern(
            name="phone",
            patterns=[
                # Simple variations
                r'\b(?:phone|tel|telephone|mobile)\b',
                r'\b(?:contact[-_\s]*number|phone[-_\s]*number|tel[-_\s]*number)\b',
                # German variations
                r'\b(?:telefon|mobil|handy|telefon[-_\s]*nummer)\b',
                r'\b(?:kontakt[-_\s]*nummer|telefon[-_\s]*nummer|tel[-_\s]*nummer)\b',
                # French variations
                r'\b(?:téléphone|portable|mobile|numéro[-_\s]*de[-_\s]*téléphone)\b',
                r'\b(?:numéro[-_\s]*de[-_\s]*contact|tél[-_\s]*fixe|tél[-_\s]*mobile)\b',
                # Italian variations
                r'\b(?:telefono|cellulare|mobile|numero[-_\s]*di[-_\s]*telefono)\b',
                r'\b(?:numero[-_\s]*di[-_\s]*contatto|tel[-_\s]*fisso|tel[-_\s]*mobile)\b',
            ],
            replacement_template="[PHONE_{}]",
        ),

        # IBAN patterns (this category also covers bank-account, credit-card
        # and payment labels; all of them share the "[IBAN_{}]" template)
        Pattern(
            name="iban",
            patterns=[
                # Simple variations
                r'\b(?:iban|bank[-_\s]*account|account[-_\s]*number)\b',
                r'\b(?:bank[-_\s]*details|account[-_\s]*details|banking[-_\s]*info)\b',
                # German variations
                r'\b(?:iban|bank[-_\s]*konto|konto[-_\s]*nummer)\b',
                r'\b(?:bank[-_\s]*verbindung|konto[-_\s]*verbindung|bank[-_\s]*daten)\b',
                # French variations
                r'\b(?:iban|compte[-_\s]*bancaire|numéro[-_\s]*de[-_\s]*compte)\b',
                r'\b(?:coordonnées[-_\s]*bancaires|détails[-_\s]*bancaires)\b',
                # Credit card variations in French
                r'\b(?:carte[-_\s]*de[-_\s]*credit|carte[-_\s]*credit|numero[-_\s]*carte[-_\s]*credit)\b',
                r'\b(?:carte[-_\s]*bancaire|carte[-_\s]*de[-_\s]*paiement)\b',
                r'\b(?:carte[-_\s]*de[-_\s]*crédit|carte[-_\s]*crédit|numéro[-_\s]*carte[-_\s]*crédit)\b',
                r'\b(?:carte[-_\s]*de[-_\s]*débit|carte[-_\s]*débit|numéro[-_\s]*carte[-_\s]*débit)\b',
                # Italian variations
                r'\b(?:iban|conto[-_\s]*bancario|numero[-_\s]*di[-_\s]*conto)\b',
                r'\b(?:coordinate[-_\s]*bancarie|dettagli[-_\s]*bancari)\b',
                # Common variations
                r'\b(?:bankkonto|bank[-_\s]*konto|conto[-_\s]*di[-_\s]*banca)\b',
                # Credit card variations
                r'\b(?:credit[-_\s]*card|credit[-_\s]*card[-_\s]*number|credit[-_\s]*card[-_\s]*no)\b',
                r'\b(?:credit[-_\s]*card[-_\s]*nr|credit[-_\s]*card[-_\s]*num)\b',
                r'\b(?:credit[-_\s]*card[-_\s]*id|credit[-_\s]*card[-_\s]*code)\b',
                r'\b(?:credit[-_\s]*card[-_\s]*reference|credit[-_\s]*card[-_\s]*ref)\b',
                r'\b(?:credit[-_\s]*card[-_\s]*details|credit[-_\s]*card[-_\s]*info)\b',
                r'\b(?:credit[-_\s]*card[-_\s]*data|credit[-_\s]*card[-_\s]*account)\b',
                # Credit card variations in other languages
                r'\b(?:kredit[-_\s]*karte|kreditkarte|kredit[-_\s]*karten[-_\s]*nummer)\b',
                r'\b(?:carta[-_\s]*di[-_\s]*credito|carta[-_\s]*credito|numero[-_\s]*carta[-_\s]*credito)\b',
                # Payment variations
                r'\b(?:payment[-_\s]*details|payment[-_\s]*info|payment[-_\s]*data)\b',
                r'\b(?:zahlungs[-_\s]*details|zahlungs[-_\s]*informationen|zahlungs[-_\s]*daten)\b',
                r'\b(?:détails[-_\s]*de[-_\s]*paiement|informations[-_\s]*de[-_\s]*paiement)\b',
                r'\b(?:dettagli[-_\s]*di[-_\s]*pagamento|informazioni[-_\s]*di[-_\s]*pagamento)\b',
                # Common credit card abbreviations
                r'\b(?:cc[-_\s]*number|cc[-_\s]*no|cc[-_\s]*nr)\b',
                r'\b(?:cc[-_\s]*num|cc[-_\s]*id|cc[-_\s]*code)\b',
                r'\b(?:cc[-_\s]*ref|cc[-_\s]*details|cc[-_\s]*info)\b',
                r'\b(?:cc[-_\s]*data|cc[-_\s]*account)\b',
                # Simple credit card
                r'\b(?:credit[-_\s]*card|credit[-_\s]*card[-_\s]*number)\b',
                # Additional credit card variations
                r'\b(?:card[-_\s]*number|card[-_\s]*no|card[-_\s]*nr)\b',
                r'\b(?:card[-_\s]*num|card[-_\s]*id|card[-_\s]*code)\b',
                r'\b(?:card[-_\s]*ref|card[-_\s]*details|card[-_\s]*info)\b',
                r'\b(?:card[-_\s]*data|card[-_\s]*account)\b',
            ],
            replacement_template="[IBAN_{}]",
        ),

        # Address patterns
        Pattern(
            name="address",
            patterns=[
                # English variations
                r'\b(?:address|street[-_\s]*address|mailing[-_\s]*address)\b',
                r'\b(?:home[-_\s]*address|work[-_\s]*address|billing[-_\s]*address)\b',
                r'\b(?:.*address.*)\b',  # Match any text containing "address"
                # German variations
                r'\b(?:adresse|strassen[-_\s]*adresse|post[-_\s]*adresse)\b',
                r'\b(?:wohn[-_\s]*adresse|geschäfts[-_\s]*adresse|rechnungs[-_\s]*adresse)\b',
                r'\b(?:.*adresse.*)\b',  # Match any text containing "adresse"
                # French variations
                r'\b(?:adresse|adresse[-_\s]*postale|adresse[-_\s]*de[-_\s]*livraison)\b',
                r'\b(?:adresse[-_\s]*personnelle|adresse[-_\s]*professionnelle)\b',
                r'\b(?:.*adresse.*)\b',  # Match any text containing "adresse"
                # Italian variations
                r'\b(?:indirizzo|indirizzo[-_\s]*postale|indirizzo[-_\s]*di[-_\s]*consegna)\b',
                r'\b(?:indirizzo[-_\s]*personale|indirizzo[-_\s]*professionale)\b',
                r'\b(?:.*indirizzo.*)\b',  # Match any text containing "indirizzo"
                # Common variations
                r'\b(?:location|place|residence|domicile)\b',
                r'\b(?:standort|ort|wohnort|domizil)\b',
                r'\b(?:lieu|emplacement|résidence|domicile)\b',
                r'\b(?:luogo|posizione|residenza|domicilio)\b',
            ],
            replacement_template="[ADDRESS_{}]",
        ),

        # Date patterns
        Pattern(
            name="date",
            patterns=[
                # English variations
                r'\b(?:date|birth[-_\s]*date|date[-_\s]*of[-_\s]*birth)\b',
                r'\b(?:dob|birthday|anniversary)\b',
                # German variations
                r'\b(?:datum|geburt[-_\s]*datum|geboren[-_\s]*am)\b',
                r'\b(?:geburtstag|jubiläum|feier[-_\s]*tag)\b',
                r'\b(?:geboren|geboren[-_\s]*am)\b',
                # "geb." gets its own regex without a trailing \b: a word
                # boundary after the literal dot would require a word character
                # to follow, so a header ending in "geb." could never match.
                r'\bgeb\.',
                # French variations
                r'\b(?:date|date[-_\s]*de[-_\s]*naissance|né[-_\s]*le)\b',
                r'\b(?:anniversaire|date[-_\s]*anniversaire)\b',
                r'\b(?:né|née|né[-_\s]*le)\b',
                # Italian variations
                r'\b(?:data|data[-_\s]*di[-_\s]*nascita|nato[-_\s]*il)\b',
                r'\b(?:compleanno|anniversario)\b',
                r'\b(?:nato|nata|nato[-_\s]*il)\b',
                # Common variations
                r'\b(?:birth|born|geboren|né|nato)\b',
            ],
            replacement_template="[DATE_{}]",
        ),

        # SSN patterns
        Pattern(
            name="ssn",
            patterns=[
                # English variations
                r'\b(?:ssn|social[-_\s]*security[-_\s]*number|tax[-_\s]*id)\b',
                r'\b(?:tax[-_\s]*identification|national[-_\s]*id)\b',
                # German variations
                r'\b(?:ahv[-_\s]*nummer|sozial[-_\s]*versicherungs[-_\s]*nummer)\b',
                r'\b(?:steuer[-_\s]*nummer|steuer[-_\s]*id|svn)\b',
                r'\b(?:ahv[-_\s]*nr|ahv[-_\s]*no|ahv[-_\s]*num)\b',
                # French variations
                r'\b(?:numéro[-_\s]*avs|numéro[-_\s]*de[-_\s]*sécurité[-_\s]*sociale)\b',
                r'\b(?:numéro[-_\s]*fiscal|numéro[-_\s]*d\'identification)\b',
                # Italian variations
                r'\b(?:numero[-_\s]*avs|numero[-_\s]*di[-_\s]*sicurezza[-_\s]*sociale)\b',
                r'\b(?:numero[-_\s]*fiscale|codice[-_\s]*fiscale)\b',
                # Common variations
                r'\b(?:ahv|svn|nss|avs)\b',
                # Additional AHV variations (alternatives that merely repeated
                # themselves, "x|x", are written once; matching is unchanged)
                r'\b(?:ahv_nummer|ahvnummer|ahv-nummer|ahv_number)\b',
                r'\b(?:ahv[-_\s]*number)\b',
                r'\b(?:ahv[-_\s]*id)\b',
                r'\b(?:ahv[-_\s]*code)\b',
                r'\b(?:ahv[-_\s]*reference)\b',
                r'\b(?:ahv[-_\s]*reference[-_\s]*number)\b',
                r'\b(?:ahv[-_\s]*reference[-_\s]*no)\b',
                r'\b(?:ahv[-_\s]*reference[-_\s]*nr)\b',
                r'\b(?:ahv[-_\s]*reference[-_\s]*num)\b',
                r'\b(?:ahv[-_\s]*reference[-_\s]*id)\b',
                r'\b(?:ahv[-_\s]*reference[-_\s]*code)\b',
            ],
            replacement_template="[SSN_{}]",
        ),
    ]
|
||
|
||
class DataPatterns:
    """Regexes that recognise sensitive values inside free text.

    ``patterns`` is an ordered list of ``Pattern`` categories.  Several
    regexes of one category (and of different categories) can match the same
    or overlapping spans, so consumers have to be prepared for overlapping
    hits.

    Lookbehinds are used where only the value, not its label, should be
    matched; Python's ``re`` requires them to be fixed-width, which is why
    every label spelling has its own regex.
    """

    patterns = [
        # Name patterns (before email so "name@domain" is not matched as name)
        Pattern(
            name="name",
            patterns=[
                # Contact person context (fixed-width lookbehind for Python re)
                r'(?<=Ansprechperson: )[A-Za-zäöüßÄÖÜ][a-zäöüß]+\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]+',
                r'(?<=Leiter: )[A-Za-zäöüßÄÖÜ][a-zäöüß]+\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]+',
                r'(?<=Kontaktperson: )[A-Za-zäöüßÄÖÜ][a-zäöüß]+\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]+',
                # Name only after a salutation (keep Frau/Herr; replace only the name)
                # - fixed-width lookbehind
                r'(?<=Frau )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
                r'(?<=Herr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
                r'(?<=Mr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
                r'(?<=Mr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
                r'(?<=Mrs )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
                r'(?<=Mrs\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
                r'(?<=Ms )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
                r'(?<=Ms\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
                r'(?<=Dr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
                r'(?<=Dr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
            ],
            replacement_template="[NAME_{}]",
        ),

        # Email pattern for plain text
        Pattern(
            name="email",
            patterns=[
                # Basic email pattern
                r'[A-Za-z0-9._%+-]+@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)*',
            ],
            replacement_template="[EMAIL_{}]",
        ),

        # Phone patterns
        Pattern(
            name="phone",
            patterns=[
                # International format
                r'\+\d{1,3}[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}\b',
                # Swiss format
                r'\b(?:0\d{1,2}|0041\d{1,2})[-.\s]?\d{3}[-.\s]?\d{2}[-.\s]?\d{2}\b',
                # German format
                r'\b(?:0\d{1,4}|0049\d{1,4})[-.\s]?\d{3,}[-.\s]?\d{3,}\b',
                # French format
                r'\b(?:0\d{1,2}|0033\d{1,2})[-.\s]?\d{1,2}[-.\s]?\d{2}[-.\s]?\d{2}[-.\s]?\d{2}\b',
                # Italian format
                r'\b(?:0\d{1,3}|0039\d{1,3})[-.\s]?\d{3,}[-.\s]?\d{3,}\b',
                # Mobile numbers
                r'\b(?:07|00417|004917|00337|00397)\d{8,9}\b',
                # Emergency numbers
                r'\b(?:112|911|118|117|144|1414)\b',
            ],
            replacement_template="[PHONE_{}]",
        ),

        # IBAN patterns
        Pattern(
            name="iban",
            patterns=[
                # Legacy form: country code + 24 digits (26 characters).  No
                # supported country uses that length, but the two regexes are
                # kept so everything that matched before still matches.
                r'\b(?:CH|DE|FR|IT)\d{2}\s?(?:\d{4}\s?){5}\d{2}\b',
                r'\b(?:CH|DE|FR|IT)\d{2}(?:\d{4}){5}\d{2}\b',
                # Real IBAN lengths, written compact or in groups of four:
                # Switzerland - 21 characters (BBAN may contain letters)
                r'\bCH\d{2}(?:\s?[A-Z0-9]{4}){4}\s?[A-Z0-9]\b',
                # Germany - 22 characters, digits only
                r'\bDE\d{2}(?:\s?\d{4}){4}\s?\d{2}\b',
                # France / Italy - 27 characters (BBAN may contain letters)
                r'\b(?:FR|IT)\d{2}(?:\s?[A-Z0-9]{4}){5}\s?[A-Z0-9]{3}\b',
            ],
            replacement_template="[IBAN_{}]",
        ),

        # Address patterns (compound first so full footer = one UUID)
        Pattern(
            name="address",
            patterns=[
                # Full address block: company, street, postfach, postal+city
                # (stop before domain like , AXA.ch)
                r'\b[^,\n]+(?:,\s*[^,\n]+)*,\s*\d{4}\s+[A-Za-zäöüßÄÖÜ]+\s*(?=,\s*[a-zA-Z0-9.-]+\.(?:ch|com|org|net)\b|$)',
                # Street + house number (standalone)
                r'\b(?:[A-Za-zäöüßÄÖÜ]+(?:-[A-Za-zäöüßÄÖÜ]+)*(?:strasse|str\.|gasse|weg|platz|allee|boulevard|avenue|via|strada|rue|chemin|route))\s+\d{1,4}(?:/\d{1,4})?(?:[a-z])?\b',
                # Postfach / PO Box (standalone)
                r'\b(?:Postfach|Postbox|P\.?O\.?\s*Box|Case\s+postale|Casella\s+postale|Boîte\s+postale)\s+\d{1,6}\b',
                # Postal code + city (standalone)
                r'\b\d{4}\s+[A-Za-zäöüßÄÖÜ]+\b(?!\s*:)',
            ],
            replacement_template="[ADDRESS_{}]",
        ),

        # Date patterns
        Pattern(
            name="date",
            patterns=[
                # Standalone date values - require valid day (1-31) and month (1-12)
                # to avoid decimals (e.g. 53.37 CHF)
                r'\b(0?[1-9]|[12]\d|3[01])[./-](0?[1-9]|1[0-2])[./-]\d{2,4}\b',  # 17.02.2026, 29-03-2026
                r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\.[\s]*\d{2,4}\b',  # 17.02. 2026 (split across lines)
                # No trailing \b here: after the literal dot it would demand a
                # following word character, so "17.02." before a space or at
                # the end of the text would never match.
                r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\.(?!\d)',  # 17.02., 29.03.
                r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\b(?!\.?\d)(?!/\d)',  # 17.02, 29.03; exclude ratings (4.7/5)
                # Context-specific date formats
                r'\b(?:geboren|birth|né|nato)\s+am\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
                r'\b(?:geboren|birth|né|nato)\s+am\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
                r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
                r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
                r'\b(?:geboren|birth|né|nato)\s+am\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|januar|februar|märz|april|mai|juni|juli|august|september|oktober|november|dezember)[a-z]*\s+\d{4}\b',
                r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|januar|februar|märz|april|mai|juni|juli|august|september|oktober|november|dezember)[a-z]*\s+\d{4}\b',
            ],
            replacement_template="[DATE_{}]",
        ),

        # Policy number patterns (replaces only the number, keeps labels like "Police Nr.")
        Pattern(
            name="policy",
            patterns=[
                # Number after "Police Nr." etc. (fixed-width lookbehind - Python re requirement)
                r'(?<=Police Nr\. )[\d.]+',
                # NOTE(review): identical to the entry above as far as visible;
                # possibly a variant with different whitespace was intended - confirm.
                r'(?<=Police Nr\. )[\d.]+',
                r'(?<=Police Nr\.: )[\d.]+',
                r'(?<=Police Nr )[\d.]+',
                r'(?<=Police Nr: )[\d.]+',
                r'(?<=Polizzenr\. )[\d.]+',
                r'(?<=Polizzenummer: )[\d.]+',
                r'(?<=Polizzenummer )[\d.]+',
                r'(?<=Policy No\. )[\d.]+',
                r'(?<=Policy No )[\d.]+',
                r'(?<=Policy Number: )[\d.]+',
                r'(?<=Policy Number )[\d.]+',
                r'(?<=Polizza n° )[\d.]+',
                r'(?<=Numéro de police: )[\d.]+',
                r'(?<=Numéro de police )[\d.]+',
                r'(?<=Numero polizza: )[\d.]+',
                r'(?<=Numero polizza )[\d.]+',
                # Standalone policy number format (e.g. 11.559.499) - require 2+ digit
                # prefix to avoid amounts
                r'\b\d{2,4}(?:\.\d{3}){2,}\b',
            ],
            replacement_template="[POLICY_{}]",
        ),

        # SSN patterns
        Pattern(
            name="ssn",
            patterns=[
                r'\b(?:756|757|758|759)\.\d{4}\.\d{4}\.\d{2}\b(?!,)',  # Swiss AHV - exclude before decimal
                r'\b(?:CHE|DE|FR|IT)-\d{3}\.\d{3}\.\d{3}\b',  # Company IDs
                # Generic SSN format - exclude when followed by comma+digit (European decimal)
                r'\b\d{3}\.\d{3}\.\d{3}\b(?!,\d)',
            ],
            replacement_template="[SSN_{}]",
        ),
    ]
|
||
|
||
class TextTablePatterns:
    """Helpers for recognising ``key: value`` / ``key = value`` lines in text."""

    @staticmethod
    def getPatterns() -> List[Tuple[str, str]]:
        """Return the ``(regex, separator)`` pairs describing a table line.

        Every regex is anchored to the whole (stripped) line; group 1 is the
        key part and group 2 the value part.
        """
        return [
            # key: value pattern (with optional whitespace)
            (r'^([^:]+):\s*(.+)$', ':'),
            # key = value pattern (with optional whitespace)
            (r'^([^=]+)=\s*(.+)$', '='),
            # key = value pattern (with required whitespace)
            (r'^([^=]+)\s+=\s+(.+)$', '='),
            # key: value pattern (with required whitespace)
            (r'^([^:]+)\s+:\s+(.+)$', ':'),
        ]

    @staticmethod
    def _isTableLine(line: str) -> bool:
        """Return True if the stripped line matches any table pattern."""
        stripped = line.strip()
        return any(
            re.match(regex, stripped)
            for regex, _separator in TextTablePatterns.getPatterns()
        )

    @staticmethod
    def extractKeyValue(line: str) -> Optional[Tuple[str, str]]:
        """Split a table line into ``(key, value)``.

        When both ``:`` and ``=`` occur in the line, the separator that comes
        first wins, so a value may itself contain the other separator
        (``"Zeit = 12:30"`` -> ``("Zeit", "12:30")``).

        Args:
            line: The text line to split; surrounding whitespace is ignored.

        Returns:
            The stripped key and value, or None if the line matches no
            table pattern.
        """
        stripped = line.strip()
        best = None
        for regex, _separator in TextTablePatterns.getPatterns():
            found = re.match(regex, stripped)
            # The key group ends where the separator starts, so the smallest
            # end offset identifies the earliest separator in the line.
            if found and (best is None or found.end(1) < best.end(1)):
                best = found
        if best is None:
            return None
        return best.group(1).strip(), best.group(2).strip()
|
||
|
||
def getPatternForHeader(header: str, patterns: List[Pattern]) -> Optional[Pattern]:
    """
    Look up the first pattern category whose regexes match a header.

    The header is lower-cased and stripped before searching, and every regex
    is applied case-insensitively with ``re.search``.

    Args:
        header: The header text to classify
        patterns: Pattern categories to try, in priority order

    Returns:
        Optional[Pattern]: The first category with a matching regex, or None
        when the header is empty or nothing matches
    """
    if not header:
        return None

    normalized = header.lower().strip()

    # First category (in list order) with at least one matching regex wins.
    return next(
        (
            candidate
            for candidate in patterns
            if any(
                re.search(regex, normalized, re.IGNORECASE)
                for regex in candidate.patterns
            )
        ),
        None,
    )
|
||
|
||
def findPatternsInText(text: str, patterns: List[Pattern]) -> List[Tuple[str, str, int, int]]:
    """
    Find all pattern matches in text

    Every regex of every pattern is applied case-insensitively with
    ``re.finditer``.  Matches of different regexes may overlap or coincide;
    they are all reported.

    Args:
        text: Text to search; empty or None yields an empty result
        patterns: List of patterns to check

    Returns:
        List of (pattern_name, match, start, end) tuples ordered by start
        position; matches with equal start keep the order of ``patterns``
    """
    # Guard falsy input the same way getPatternForHeader does, instead of
    # letting re.finditer raise TypeError on None.
    if not text:
        return []

    matches = [
        (pattern.name, found.group(0), found.start(), found.end())
        for pattern in patterns
        for regex in pattern.patterns
        for found in re.finditer(regex, text, re.IGNORECASE)
    ]
    # list.sort is stable, so ties on the start offset stay in pattern order.
    matches.sort(key=lambda item: item[2])
    return matches