396 lines
No EOL
18 KiB
Python
396 lines
No EOL
18 KiB
Python
"""
|
|
Pattern definitions for data anonymization
|
|
Separates header patterns from data patterns
|
|
"""
|
|
|
|
from dataclasses import dataclass
|
|
from typing import List, Optional, Tuple
|
|
import re
|
|
|
|
@dataclass
class Pattern:
    """One category of sensitive data: a label, its regexes and a placeholder."""

    # Category label, e.g. "name", "email", "iban".
    name: str

    # Regular-expression source strings that identify this category.
    patterns: List[str]

    # Placeholder text for a match; every template in this module contains
    # a single "{}" slot (e.g. "[NAME_{}]") - presumably filled by the caller.
    replacement_template: str
|
|
|
|
class HeaderPatterns:
    """Patterns for identifying sensitive data in headers.

    ``patterns`` is an ordered list of :class:`Pattern` entries, one per
    category (name, email, phone, iban, address, date, ssn).  Each entry
    holds label expressions in English, German, French and Italian.

    The expressions are written in lower case.  When the list is handed to
    ``get_pattern_for_header`` the first category with a matching expression
    wins, so the order of the entries is significant.
    """

    patterns = [
        # Name patterns
        Pattern(
            name="name",
            patterns=[
                # Simple variations
                r'\b(?:name|first[-_\s]*name|last[-_\s]*name|full[-_\s]*name)\b',
                r'\b(?:customer[-_\s]*name|client[-_\s]*name|user[-_\s]*name)\b',
                r'\b(?:given[-_\s]*name|family[-_\s]*name|surname)\b',
                # German variations
                r'\b(?:vorname|nachname|vollständiger[-_\s]*name|name)\b',
                r'\b(?:kunden[-_\s]*name|kunde[-_\s]*name|benutzer[-_\s]*name)\b',
                # French variations
                r'\b(?:prénom|nom|nom[-_\s]*complet)\b',
                r'\b(?:nom[-_\s]*du[-_\s]*client|nom[-_\s]*d\'utilisateur)\b',
                # Italian variations
                r'\b(?:nome|cognome|nome[-_\s]*completo)\b',
                r'\b(?:nome[-_\s]*cliente|nome[-_\s]*utente)\b',
                # Common variations
                r'\b(?:nom|name|nome|naam)\b'
            ],
            replacement_template="[NAME_{}]"
        ),

        # Email patterns
        Pattern(
            name="email",
            patterns=[
                # Simple variations - only labels
                r'\b(?:email|e[-_\s]*mail|mail)\s*:?\b',
                r'\b(?:contact[-_\s]*email|user[-_\s]*email|client[-_\s]*email)\s*:?\b',
                r'\b(?:customer[-_\s]*email|customer[-_\s]*mail|customer[-_\s]*e[-_\s]*mail)\s*:?\b',
                # German variations - only labels
                r'\b(?:e[-_\s]*mail|e[-_\s]*post|mail[-_\s]*adresse)\s*:?\b',
                r'\b(?:kontakt[-_\s]*email|benutzer[-_\s]*email|kunden[-_\s]*email)\s*:?\b',
                r'\b(?:kunden[-_\s]*mail|kunden[-_\s]*e[-_\s]*mail|kunden[-_\s]*e[-_\s]*post)\s*:?\b',
                # French variations - only labels
                r'\b(?:courriel|e[-_\s]*mail|adresse[-_\s]*e[-_\s]*mail)\s*:?\b',
                r'\b(?:courriel[-_\s]*de[-_\s]*contact|e[-_\s]*mail[-_\s]*client)\s*:?\b',
                r'\b(?:courriel[-_\s]*client|courriel[-_\s]*utilisateur|mail[-_\s]*client)\s*:?\b',
                # Italian variations - only labels
                r'\b(?:posta[-_\s]*elettronica|e[-_\s]*mail|indirizzo[-_\s]*e[-_\s]*mail)\s*:?\b',
                r'\b(?:email[-_\s]*cliente|email[-_\s]*utente)\s*:?\b',
                r'\b(?:mail[-_\s]*cliente|mail[-_\s]*utente|posta[-_\s]*cliente)\s*:?\b'
            ],
            replacement_template="[EMAIL_{}]"
        ),

        # Phone patterns
        Pattern(
            name="phone",
            patterns=[
                # Simple variations
                r'\b(?:phone|tel|telephone|mobile)\b',
                r'\b(?:contact[-_\s]*number|phone[-_\s]*number|tel[-_\s]*number)\b',
                # German variations
                r'\b(?:telefon|mobil|handy|telefon[-_\s]*nummer)\b',
                r'\b(?:kontakt[-_\s]*nummer|telefon[-_\s]*nummer|tel[-_\s]*nummer)\b',
                # French variations
                r'\b(?:téléphone|portable|mobile|numéro[-_\s]*de[-_\s]*téléphone)\b',
                r'\b(?:numéro[-_\s]*de[-_\s]*contact|tél[-_\s]*fixe|tél[-_\s]*mobile)\b',
                # Italian variations
                r'\b(?:telefono|cellulare|mobile|numero[-_\s]*di[-_\s]*telefono)\b',
                r'\b(?:numero[-_\s]*di[-_\s]*contatto|tel[-_\s]*fisso|tel[-_\s]*mobile)\b'
            ],
            replacement_template="[PHONE_{}]"
        ),

        # IBAN patterns (this category also covers credit-card and payment
        # labels; they all share the "[IBAN_{}]" placeholder)
        Pattern(
            name="iban",
            patterns=[
                # Simple variations
                r'\b(?:iban|bank[-_\s]*account|account[-_\s]*number)\b',
                r'\b(?:bank[-_\s]*details|account[-_\s]*details|banking[-_\s]*info)\b',
                # German variations
                r'\b(?:iban|bank[-_\s]*konto|konto[-_\s]*nummer)\b',
                r'\b(?:bank[-_\s]*verbindung|konto[-_\s]*verbindung|bank[-_\s]*daten)\b',
                # French variations
                r'\b(?:iban|compte[-_\s]*bancaire|numéro[-_\s]*de[-_\s]*compte)\b',
                r'\b(?:coordonnées[-_\s]*bancaires|détails[-_\s]*bancaires)\b',
                # Credit card variations in French
                r'\b(?:carte[-_\s]*de[-_\s]*credit|carte[-_\s]*credit|numero[-_\s]*carte[-_\s]*credit)\b',
                r'\b(?:carte[-_\s]*bancaire|carte[-_\s]*de[-_\s]*paiement)\b',
                r'\b(?:carte[-_\s]*de[-_\s]*crédit|carte[-_\s]*crédit|numéro[-_\s]*carte[-_\s]*crédit)\b',
                r'\b(?:carte[-_\s]*de[-_\s]*débit|carte[-_\s]*débit|numéro[-_\s]*carte[-_\s]*débit)\b',
                # Italian variations
                r'\b(?:iban|conto[-_\s]*bancario|numero[-_\s]*di[-_\s]*conto)\b',
                r'\b(?:coordinate[-_\s]*bancarie|dettagli[-_\s]*bancari)\b',
                # Common variations
                r'\b(?:bankkonto|bank[-_\s]*konto|conto[-_\s]*di[-_\s]*banca)\b',
                # Credit card variations
                r'\b(?:credit[-_\s]*card|credit[-_\s]*card[-_\s]*number|credit[-_\s]*card[-_\s]*no)\b',
                r'\b(?:credit[-_\s]*card[-_\s]*nr|credit[-_\s]*card[-_\s]*num)\b',
                r'\b(?:credit[-_\s]*card[-_\s]*id|credit[-_\s]*card[-_\s]*code)\b',
                r'\b(?:credit[-_\s]*card[-_\s]*reference|credit[-_\s]*card[-_\s]*ref)\b',
                r'\b(?:credit[-_\s]*card[-_\s]*details|credit[-_\s]*card[-_\s]*info)\b',
                r'\b(?:credit[-_\s]*card[-_\s]*data|credit[-_\s]*card[-_\s]*account)\b',
                # Credit card variations in other languages
                r'\b(?:kredit[-_\s]*karte|kreditkarte|kredit[-_\s]*karten[-_\s]*nummer)\b',
                r'\b(?:carta[-_\s]*di[-_\s]*credito|carta[-_\s]*credito|numero[-_\s]*carta[-_\s]*credito)\b',
                # Payment variations
                r'\b(?:payment[-_\s]*details|payment[-_\s]*info|payment[-_\s]*data)\b',
                r'\b(?:zahlungs[-_\s]*details|zahlungs[-_\s]*informationen|zahlungs[-_\s]*daten)\b',
                r'\b(?:détails[-_\s]*de[-_\s]*paiement|informations[-_\s]*de[-_\s]*paiement)\b',
                r'\b(?:dettagli[-_\s]*di[-_\s]*pagamento|informazioni[-_\s]*di[-_\s]*pagamento)\b',
                # Common credit card abbreviations
                r'\b(?:cc[-_\s]*number|cc[-_\s]*no|cc[-_\s]*nr)\b',
                r'\b(?:cc[-_\s]*num|cc[-_\s]*id|cc[-_\s]*code)\b',
                r'\b(?:cc[-_\s]*ref|cc[-_\s]*details|cc[-_\s]*info)\b',
                r'\b(?:cc[-_\s]*data|cc[-_\s]*account)\b',
                # Simple credit card
                r'\b(?:credit[-_\s]*card|credit[-_\s]*card[-_\s]*number)\b',
                # Additional credit card variations
                r'\b(?:card[-_\s]*number|card[-_\s]*no|card[-_\s]*nr)\b',
                r'\b(?:card[-_\s]*num|card[-_\s]*id|card[-_\s]*code)\b',
                r'\b(?:card[-_\s]*ref|card[-_\s]*details|card[-_\s]*info)\b',
                r'\b(?:card[-_\s]*data|card[-_\s]*account)\b'
            ],
            replacement_template="[IBAN_{}]"
        ),

        # Address patterns
        Pattern(
            name="address",
            patterns=[
                # English variations
                r'\b(?:address|street[-_\s]*address|mailing[-_\s]*address)\b',
                r'\b(?:home[-_\s]*address|work[-_\s]*address|billing[-_\s]*address)\b',
                r'\b(?:.*address.*)\b',  # Match any text containing "address"
                # German variations
                r'\b(?:adresse|strassen[-_\s]*adresse|post[-_\s]*adresse)\b',
                r'\b(?:wohn[-_\s]*adresse|geschäfts[-_\s]*adresse|rechnungs[-_\s]*adresse)\b',
                r'\b(?:.*adresse.*)\b',  # Match any text containing "adresse"
                # French variations
                r'\b(?:adresse|adresse[-_\s]*postale|adresse[-_\s]*de[-_\s]*livraison)\b',
                r'\b(?:adresse[-_\s]*personnelle|adresse[-_\s]*professionnelle)\b',
                r'\b(?:.*adresse.*)\b',  # Match any text containing "adresse"
                # Italian variations
                r'\b(?:indirizzo|indirizzo[-_\s]*postale|indirizzo[-_\s]*di[-_\s]*consegna)\b',
                r'\b(?:indirizzo[-_\s]*personale|indirizzo[-_\s]*professionale)\b',
                r'\b(?:.*indirizzo.*)\b',  # Match any text containing "indirizzo"
                # Common variations
                r'\b(?:location|place|residence|domicile)\b',
                r'\b(?:standort|ort|wohnort|domizil)\b',
                r'\b(?:lieu|emplacement|résidence|domicile)\b',
                r'\b(?:luogo|posizione|residenza|domicilio)\b'
            ],
            replacement_template="[ADDRESS_{}]"
        ),

        # Date patterns
        Pattern(
            name="date",
            patterns=[
                # English variations
                r'\b(?:date|birth[-_\s]*date|date[-_\s]*of[-_\s]*birth)\b',
                r'\b(?:dob|birthday|anniversary)\b',
                # German variations
                r'\b(?:datum|geburt[-_\s]*datum|geboren[-_\s]*am)\b',
                r'\b(?:geburtstag|jubiläum|feier[-_\s]*tag)\b',
                # "geb." ends in a non-word character, so it must not be
                # followed by \b: there is no word boundary between "." and
                # end-of-string (or a space), and a bare "geb." header would
                # never match.  It gets its own alternative without the
                # trailing boundary.
                r'\b(?:geboren|geboren[-_\s]*am)\b|\bgeb\.',
                # French variations
                r'\b(?:date|date[-_\s]*de[-_\s]*naissance|né[-_\s]*le)\b',
                r'\b(?:anniversaire|date[-_\s]*anniversaire)\b',
                r'\b(?:né|née|né[-_\s]*le)\b',
                # Italian variations
                r'\b(?:data|data[-_\s]*di[-_\s]*nascita|nato[-_\s]*il)\b',
                r'\b(?:compleanno|anniversario)\b',
                r'\b(?:nato|nata|nato[-_\s]*il)\b',
                # Common variations
                r'\b(?:birth|born|geboren|né|nato)\b'
            ],
            replacement_template="[DATE_{}]"
        ),

        # SSN patterns
        Pattern(
            name="ssn",
            patterns=[
                # English variations
                r'\b(?:ssn|social[-_\s]*security[-_\s]*number|tax[-_\s]*id)\b',
                r'\b(?:tax[-_\s]*identification|national[-_\s]*id)\b',
                # German variations
                r'\b(?:ahv[-_\s]*nummer|sozial[-_\s]*versicherungs[-_\s]*nummer)\b',
                r'\b(?:steuer[-_\s]*nummer|steuer[-_\s]*id|svn)\b',
                r'\b(?:ahv[-_\s]*nr|ahv[-_\s]*no|ahv[-_\s]*num)\b',
                # French variations
                r'\b(?:numéro[-_\s]*avs|numéro[-_\s]*de[-_\s]*sécurité[-_\s]*sociale)\b',
                r'\b(?:numéro[-_\s]*fiscal|numéro[-_\s]*d\'identification)\b',
                # Italian variations
                r'\b(?:numero[-_\s]*avs|numero[-_\s]*di[-_\s]*sicurezza[-_\s]*sociale)\b',
                r'\b(?:numero[-_\s]*fiscale|codice[-_\s]*fiscale)\b',
                # Common variations
                r'\b(?:ahv|svn|nss|avs)\b',
                # Additional AHV variations
                r'\b(?:ahv_nummer|ahvnummer|ahv-nummer|ahv_number)\b',
                r'\b(?:ahv[-_\s]*nr|ahv[-_\s]*no|ahv[-_\s]*num)\b',
                r'\b(?:ahv[-_\s]*number|ahv[-_\s]*number)\b',
                r'\b(?:ahv[-_\s]*id|ahv[-_\s]*id)\b',
                r'\b(?:ahv[-_\s]*code|ahv[-_\s]*code)\b',
                r'\b(?:ahv[-_\s]*reference|ahv[-_\s]*reference)\b',
                r'\b(?:ahv[-_\s]*reference[-_\s]*number|ahv[-_\s]*reference[-_\s]*number)\b',
                r'\b(?:ahv[-_\s]*reference[-_\s]*no|ahv[-_\s]*reference[-_\s]*no)\b',
                r'\b(?:ahv[-_\s]*reference[-_\s]*nr|ahv[-_\s]*reference[-_\s]*nr)\b',
                r'\b(?:ahv[-_\s]*reference[-_\s]*num|ahv[-_\s]*reference[-_\s]*num)\b',
                r'\b(?:ahv[-_\s]*reference[-_\s]*id|ahv[-_\s]*reference[-_\s]*id)\b',
                r'\b(?:ahv[-_\s]*reference[-_\s]*code|ahv[-_\s]*reference[-_\s]*code)\b'
            ],
            replacement_template="[SSN_{}]"
        )
    ]
|
|
|
|
class DataPatterns:
    """Patterns for identifying sensitive data in content.

    ``patterns`` is an ordered list of :class:`Pattern` entries whose
    expressions match the sensitive *values* themselves (an e-mail address,
    a phone number, an IBAN, ...) rather than the labels describing them.

    Several expressions rely on explicit upper/lower-case character classes
    (for example the name and IBAN expressions), so they are meant to be
    applied without ``re.IGNORECASE`` unless the caller deliberately wants
    looser matching.
    """

    patterns = [
        # Name patterns
        Pattern(
            name="name",
            patterns=[
                # Person names with titles and academic degrees
                r'\b(?:Dr\.|Prof\.|PhD\.?|MD\.?|Herr|Frau|Mr\.|Mrs\.|Ms\.|Monsieur|Madame|Signore|Signora)\s+[A-Z][a-z]{2,}(?:\s+[A-Za-z]{2,}){1,2}\b'
            ],
            replacement_template="[NAME_{}]"
        ),

        # Email pattern for plain text
        Pattern(
            name="email",
            patterns=[
                # Basic email pattern
                r'[A-Za-z0-9._%+-]+@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)*'
            ],
            replacement_template="[EMAIL_{}]"
        ),

        # Phone patterns
        Pattern(
            name="phone",
            patterns=[
                # International format
                r'\+\d{1,3}[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}\b',
                # Swiss format
                r'\b(?:0\d{1,2}|0041\d{1,2})[-.\s]?\d{3}[-.\s]?\d{2}[-.\s]?\d{2}\b',
                # German format
                r'\b(?:0\d{1,4}|0049\d{1,4})[-.\s]?\d{3,}[-.\s]?\d{3,}\b',
                # French format
                r'\b(?:0\d{1,2}|0033\d{1,2})[-.\s]?\d{1,2}[-.\s]?\d{2}[-.\s]?\d{2}[-.\s]?\d{2}\b',
                # Italian format
                r'\b(?:0\d{1,3}|0039\d{1,3})[-.\s]?\d{3,}[-.\s]?\d{3,}\b',
                # Mobile numbers
                r'\b(?:07|00417|004917|00337|00397)\d{8,9}\b',
                # Emergency numbers
                r'\b(?:112|911|118|117|144|1414)\b'
            ],
            replacement_template="[PHONE_{}]"
        ),

        # IBAN patterns
        Pattern(
            name="iban",
            patterns=[
                # Country-accurate lengths, written compact or in the usual
                # groups of four separated by single whitespace:
                #   CH: 21 characters (2 check digits + 17 alphanumerics)
                r'\bCH\d{2}(?:\s?[A-Z0-9]{4}){4}\s?[A-Z0-9]\b',
                #   DE: 22 characters (2 check digits + 18 digits)
                r'\bDE\d{2}(?:\s?\d{4}){4}\s?\d{2}\b',
                #   FR / IT: 27 characters (2 check digits + 23 alphanumerics)
                r'\b(?:FR|IT)\d{2}(?:\s?[A-Z0-9]{4}){5}\s?[A-Z0-9]{3}\b',
                # Legacy fixed-length expressions (26 characters, digits
                # only).  No CH/DE/FR/IT IBAN has this length; they are kept
                # so that anything they matched before is still matched.
                r'\b(?:CH|DE|FR|IT)\d{2}\s?(?:\d{4}\s?){5}\d{2}\b',
                r'\b(?:CH|DE|FR|IT)\d{2}(?:\d{4}){5}\d{2}\b'
            ],
            replacement_template="[IBAN_{}]"
        ),

        # Address patterns
        Pattern(
            name="address",
            patterns=[
                # Street name ending in a street-type suffix, then a house number
                r'\b(?:[A-Za-zäöüßÄÖÜ]+(?:strasse|str\.|gasse|weg|platz|allee|boulevard|avenue|via|strada|rue|chemin|route))\s+\d{1,4}(?:[a-z])?\b',
                # Four-digit number followed by a word (postal code + town)
                r'\b\d{4}\s+[A-Za-zäöüßÄÖÜ]+\b'
            ],
            replacement_template="[ADDRESS_{}]"
        ),

        # Date patterns
        Pattern(
            name="date",
            patterns=[
                # Specific date formats with context
                r'\b(?:geboren|birth|né|nato)\s+am\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',  # Birth dates
                r'\b(?:geboren|birth|né|nato)\s+am\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',  # Birth dates
                r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',  # Contract dates
                r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',  # Contract dates
                # Specific date formats with month names
                r'\b(?:geboren|birth|né|nato)\s+am\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|januar|februar|märz|april|mai|juni|juli|august|september|oktober|november|dezember)[a-z]*\s+\d{4}\b',  # Birth dates with month
                r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|januar|februar|märz|april|mai|juni|juli|august|september|oktober|november|dezember)[a-z]*\s+\d{4}\b'  # Contract dates with month
            ],
            replacement_template="[DATE_{}]"
        ),

        # SSN patterns
        Pattern(
            name="ssn",
            patterns=[
                r'\b(?:756|757|758|759)\.\d{4}\.\d{4}\.\d{2}\b',  # Swiss AHV
                r'\b(?:CHE|DE|FR|IT)-\d{3}\.\d{3}\.\d{3}\b',  # Company IDs
                r'\b\d{3}\.\d{3}\.\d{3}\b'  # Generic SSN format
            ],
            replacement_template="[SSN_{}]"
        )
    ]
|
|
|
|
class TextTablePatterns:
    """Helpers for recognising ``key<separator>value`` lines in plain text."""

    @staticmethod
    def get_patterns() -> List[Tuple[str, str]]:
        """Return ``(regex, separator)`` pairs in the order they are tried.

        Each regex captures the key in group 1 and the value in group 2.
        """
        # Separator with optional surrounding whitespace.
        colon_loose = (r'^([^:]+):\s*(.+)$', ':')
        equals_loose = (r'^([^=]+)=\s*(.+)$', '=')
        # Separator with whitespace required on both sides.
        equals_spaced = (r'^([^=]+)\s+=\s+(.+)$', '=')
        colon_spaced = (r'^([^:]+)\s+:\s+(.+)$', ':')
        return [colon_loose, equals_loose, equals_spaced, colon_spaced]

    @staticmethod
    def is_table_line(line: str) -> bool:
        """Tell whether the stripped line matches at least one table regex."""
        for regex, _separator in TextTablePatterns.get_patterns():
            if re.match(regex, line.strip()):
                return True
        return False

    @staticmethod
    def extract_key_value(line: str) -> Optional[Tuple[str, str]]:
        """Split a table line into a stripped ``(key, value)`` pair.

        The regexes are tried in ``get_patterns()`` order and the first one
        that matches decides the split.  Returns ``None`` when none matches.
        """
        for regex, _separator in TextTablePatterns.get_patterns():
            found = re.match(regex, line.strip())
            if found is None:
                continue
            raw_key, raw_value = found.group(1), found.group(2)
            return raw_key.strip(), raw_value.strip()
        return None
|
|
|
|
def get_pattern_for_header(header: str, patterns: List[Pattern]) -> Optional[Pattern]:
    """
    Return the first pattern whose expressions match a header.

    The header is lower-cased and stripped, then every expression of every
    pattern is searched case-insensitively, in list order.

    Args:
        header: The header to check
        patterns: List of patterns to check against

    Returns:
        Optional[Pattern]: The first matching pattern, or None when the
        header is empty/None or nothing matches
    """
    if not header:
        return None

    normalized = header.lower().strip()

    # Lazy generator: evaluation stops at the first pattern that matches.
    hits = (
        candidate
        for candidate in patterns
        if any(
            re.search(expression, normalized, re.IGNORECASE)
            for expression in candidate.patterns
        )
    )
    return next(hits, None)
|
|
|
|
def find_patterns_in_text(text: str, patterns: List[Pattern], flags: int = 0) -> List[tuple]:
    """
    Find all pattern matches in text

    Every expression of every pattern is applied with ``re.finditer`` and
    each hit is reported, so overlapping or duplicate hits from different
    expressions are all included.

    Args:
        text: Text to search
        patterns: List of patterns to check
        flags: Optional ``re`` flags applied to every expression. Defaults
            to 0 so expressions match exactly as written (several rely on
            explicit upper/lower-case character classes); pass
            ``re.IGNORECASE`` for case-insensitive matching.

    Returns:
        List[tuple]: List of (pattern_name, match, start, end), sorted by
        start position. Empty when text is empty/None or nothing matches.
    """
    if not text:
        return []

    # The loop that binds `match` was missing, which raised NameError as
    # soon as any expression was processed; iterate the regex hits here.
    matches = [
        (pattern.name, match.group(0), match.start(), match.end())
        for pattern in patterns
        for expression in pattern.patterns
        for match in re.finditer(expression, text, flags)
    ]

    # sorted() is stable: hits sharing a start keep their discovery order.
    return sorted(matches, key=lambda item: item[2])  # Sort by start position