gateway/test_neutralizer/patterns.py

402 lines
No EOL
19 KiB
Python

"""
Pattern definitions for data anonymization
Separates header patterns from data patterns
"""
import logging
import re
from dataclasses import dataclass
from typing import List, Optional, Tuple
@dataclass
class Pattern:
    """One category of sensitive data together with the regexes that detect it.

    Instances are plain value objects; the dataclass decorator supplies
    ``__init__``, ``__repr__`` and ``__eq__``.
    """

    # Category identifier, e.g. "name", "email", "phone".
    name: str
    # Regular-expression sources (uncompiled strings) belonging to the category.
    patterns: List[str]
    # Placeholder text containing a ``{}`` slot, e.g. "[NAME_{}]".
    # NOTE(review): presumably the slot is filled by the caller with a running
    # index via str.format -- confirm against the code that consumes it.
    replacement_template: str
class HeaderPatterns:
    """Regex tables for recognising sensitive column/field *headers*.

    ``patterns`` is an ordered list of ``Pattern`` entries, one per category
    (name, email, phone, iban, address, date, ssn).  Each entry carries label
    spellings in English, German, French and Italian.

    Order matters for lookups that stop at the first hit (such as
    ``get_pattern_for_header`` in this module): an earlier category wins when
    a header matches several.

    Notes:
        * The "iban" category also covers credit-card and generic payment
          labels; all of them map to the ``[IBAN_{}]`` placeholder.
        * The "address" category contains deliberately broad ``.*word.*``
          regexes, so any header containing "address", "adresse" or
          "indirizzo" is classified as an address.
        * Several regexes repeat alternatives that already appear elsewhere
          in the same category; they are redundant but harmless.
    """

    patterns = [
        # Name patterns
        Pattern(
            name="name",
            patterns=[
                # Simple variations
                r'\b(?:name|first[-_\s]*name|last[-_\s]*name|full[-_\s]*name)\b',
                r'\b(?:customer[-_\s]*name|client[-_\s]*name|user[-_\s]*name)\b',
                r'\b(?:given[-_\s]*name|family[-_\s]*name|surname)\b',
                # German variations
                r'\b(?:vorname|nachname|vollständiger[-_\s]*name|name)\b',
                r'\b(?:kunden[-_\s]*name|kunde[-_\s]*name|benutzer[-_\s]*name)\b',
                # French variations
                r'\b(?:prénom|nom|nom[-_\s]*complet)\b',
                r'\b(?:nom[-_\s]*du[-_\s]*client|nom[-_\s]*d\'utilisateur)\b',
                # Italian variations
                r'\b(?:nome|cognome|nome[-_\s]*completo)\b',
                r'\b(?:nome[-_\s]*cliente|nome[-_\s]*utente)\b',
                # Common variations
                r'\b(?:nom|name|nome|naam)\b'
            ],
            replacement_template="[NAME_{}]"
        ),
        # Email patterns
        Pattern(
            name="email",
            patterns=[
                # Simple variations - only labels
                r'\b(?:email|e[-_\s]*mail|mail)\s*:?\b',
                r'\b(?:contact[-_\s]*email|user[-_\s]*email|client[-_\s]*email)\s*:?\b',
                r'\b(?:customer[-_\s]*email|customer[-_\s]*mail|customer[-_\s]*e[-_\s]*mail)\s*:?\b',
                # German variations - only labels
                r'\b(?:e[-_\s]*mail|e[-_\s]*post|mail[-_\s]*adresse)\s*:?\b',
                r'\b(?:kontakt[-_\s]*email|benutzer[-_\s]*email|kunden[-_\s]*email)\s*:?\b',
                r'\b(?:kunden[-_\s]*mail|kunden[-_\s]*e[-_\s]*mail|kunden[-_\s]*e[-_\s]*post)\s*:?\b',
                # French variations - only labels
                r'\b(?:courriel|e[-_\s]*mail|adresse[-_\s]*e[-_\s]*mail)\s*:?\b',
                r'\b(?:courriel[-_\s]*de[-_\s]*contact|e[-_\s]*mail[-_\s]*client)\s*:?\b',
                r'\b(?:courriel[-_\s]*client|courriel[-_\s]*utilisateur|mail[-_\s]*client)\s*:?\b',
                # Italian variations - only labels
                r'\b(?:posta[-_\s]*elettronica|e[-_\s]*mail|indirizzo[-_\s]*e[-_\s]*mail)\s*:?\b',
                r'\b(?:email[-_\s]*cliente|email[-_\s]*utente)\s*:?\b',
                r'\b(?:mail[-_\s]*cliente|mail[-_\s]*utente|posta[-_\s]*cliente)\s*:?\b'
            ],
            replacement_template="[EMAIL_{}]"
        ),
        # Phone patterns
        Pattern(
            name="phone",
            patterns=[
                # Simple variations
                r'\b(?:phone|tel|telephone|mobile)\b',
                r'\b(?:contact[-_\s]*number|phone[-_\s]*number|tel[-_\s]*number)\b',
                # German variations
                r'\b(?:telefon|mobil|handy|telefon[-_\s]*nummer)\b',
                r'\b(?:kontakt[-_\s]*nummer|telefon[-_\s]*nummer|tel[-_\s]*nummer)\b',
                # French variations
                r'\b(?:téléphone|portable|mobile|numéro[-_\s]*de[-_\s]*téléphone)\b',
                r'\b(?:numéro[-_\s]*de[-_\s]*contact|tél[-_\s]*fixe|tél[-_\s]*mobile)\b',
                # Italian variations
                r'\b(?:telefono|cellulare|mobile|numero[-_\s]*di[-_\s]*telefono)\b',
                r'\b(?:numero[-_\s]*di[-_\s]*contatto|tel[-_\s]*fisso|tel[-_\s]*mobile)\b'
            ],
            replacement_template="[PHONE_{}]"
        ),
        # IBAN patterns (also bank-account, credit-card and payment labels)
        Pattern(
            name="iban",
            patterns=[
                # Simple variations
                r'\b(?:iban|bank[-_\s]*account|account[-_\s]*number)\b',
                r'\b(?:bank[-_\s]*details|account[-_\s]*details|banking[-_\s]*info)\b',
                # German variations
                r'\b(?:iban|bank[-_\s]*konto|konto[-_\s]*nummer)\b',
                r'\b(?:bank[-_\s]*verbindung|konto[-_\s]*verbindung|bank[-_\s]*daten)\b',
                # French variations
                r'\b(?:iban|compte[-_\s]*bancaire|numéro[-_\s]*de[-_\s]*compte)\b',
                r'\b(?:coordonnées[-_\s]*bancaires|détails[-_\s]*bancaires)\b',
                # Credit card variations in French
                r'\b(?:carte[-_\s]*de[-_\s]*credit|carte[-_\s]*credit|numero[-_\s]*carte[-_\s]*credit)\b',
                r'\b(?:carte[-_\s]*bancaire|carte[-_\s]*de[-_\s]*paiement)\b',
                r'\b(?:carte[-_\s]*de[-_\s]*crédit|carte[-_\s]*crédit|numéro[-_\s]*carte[-_\s]*crédit)\b',
                r'\b(?:carte[-_\s]*de[-_\s]*débit|carte[-_\s]*débit|numéro[-_\s]*carte[-_\s]*débit)\b',
                # Italian variations
                r'\b(?:iban|conto[-_\s]*bancario|numero[-_\s]*di[-_\s]*conto)\b',
                r'\b(?:coordinate[-_\s]*bancarie|dettagli[-_\s]*bancari)\b',
                # Common variations
                r'\b(?:bankkonto|bank[-_\s]*konto|conto[-_\s]*di[-_\s]*banca)\b',
                # Credit card variations
                r'\b(?:credit[-_\s]*card|credit[-_\s]*card[-_\s]*number|credit[-_\s]*card[-_\s]*no)\b',
                r'\b(?:credit[-_\s]*card[-_\s]*nr|credit[-_\s]*card[-_\s]*num)\b',
                r'\b(?:credit[-_\s]*card[-_\s]*id|credit[-_\s]*card[-_\s]*code)\b',
                r'\b(?:credit[-_\s]*card[-_\s]*reference|credit[-_\s]*card[-_\s]*ref)\b',
                r'\b(?:credit[-_\s]*card[-_\s]*details|credit[-_\s]*card[-_\s]*info)\b',
                r'\b(?:credit[-_\s]*card[-_\s]*data|credit[-_\s]*card[-_\s]*account)\b',
                # Credit card variations in other languages
                r'\b(?:kredit[-_\s]*karte|kreditkarte|kredit[-_\s]*karten[-_\s]*nummer)\b',
                r'\b(?:carta[-_\s]*di[-_\s]*credito|carta[-_\s]*credito|numero[-_\s]*carta[-_\s]*credito)\b',
                # Payment variations
                r'\b(?:payment[-_\s]*details|payment[-_\s]*info|payment[-_\s]*data)\b',
                r'\b(?:zahlungs[-_\s]*details|zahlungs[-_\s]*informationen|zahlungs[-_\s]*daten)\b',
                r'\b(?:détails[-_\s]*de[-_\s]*paiement|informations[-_\s]*de[-_\s]*paiement)\b',
                r'\b(?:dettagli[-_\s]*di[-_\s]*pagamento|informazioni[-_\s]*di[-_\s]*pagamento)\b',
                # Common credit card abbreviations
                r'\b(?:cc[-_\s]*number|cc[-_\s]*no|cc[-_\s]*nr)\b',
                r'\b(?:cc[-_\s]*num|cc[-_\s]*id|cc[-_\s]*code)\b',
                r'\b(?:cc[-_\s]*ref|cc[-_\s]*details|cc[-_\s]*info)\b',
                r'\b(?:cc[-_\s]*data|cc[-_\s]*account)\b',
                # Simple credit card
                r'\b(?:credit[-_\s]*card|credit[-_\s]*card[-_\s]*number)\b',
                # Additional credit card variations
                r'\b(?:card[-_\s]*number|card[-_\s]*no|card[-_\s]*nr)\b',
                r'\b(?:card[-_\s]*num|card[-_\s]*id|card[-_\s]*code)\b',
                r'\b(?:card[-_\s]*ref|card[-_\s]*details|card[-_\s]*info)\b',
                r'\b(?:card[-_\s]*data|card[-_\s]*account)\b'
            ],
            replacement_template="[IBAN_{}]"
        ),
        # Address patterns
        Pattern(
            name="address",
            patterns=[
                # English variations
                r'\b(?:address|street[-_\s]*address|mailing[-_\s]*address)\b',
                r'\b(?:home[-_\s]*address|work[-_\s]*address|billing[-_\s]*address)\b',
                r'\b(?:.*address.*)\b',  # Match any text containing "address"
                # German variations
                r'\b(?:adresse|strassen[-_\s]*adresse|post[-_\s]*adresse)\b',
                r'\b(?:wohn[-_\s]*adresse|geschäfts[-_\s]*adresse|rechnungs[-_\s]*adresse)\b',
                r'\b(?:.*adresse.*)\b',  # Match any text containing "adresse"
                # French variations
                r'\b(?:adresse|adresse[-_\s]*postale|adresse[-_\s]*de[-_\s]*livraison)\b',
                r'\b(?:adresse[-_\s]*personnelle|adresse[-_\s]*professionnelle)\b',
                r'\b(?:.*adresse.*)\b',  # Match any text containing "adresse"
                # Italian variations
                r'\b(?:indirizzo|indirizzo[-_\s]*postale|indirizzo[-_\s]*di[-_\s]*consegna)\b',
                r'\b(?:indirizzo[-_\s]*personale|indirizzo[-_\s]*professionale)\b',
                r'\b(?:.*indirizzo.*)\b',  # Match any text containing "indirizzo"
                # Common variations
                r'\b(?:location|place|residence|domicile)\b',
                r'\b(?:standort|ort|wohnort|domizil)\b',
                r'\b(?:lieu|emplacement|résidence|domicile)\b',
                r'\b(?:luogo|posizione|residenza|domicilio)\b'
            ],
            replacement_template="[ADDRESS_{}]"
        ),
        # Date patterns
        Pattern(
            name="date",
            patterns=[
                # English variations
                r'\b(?:date|birth[-_\s]*date|date[-_\s]*of[-_\s]*birth)\b',
                r'\b(?:dob|birthday|anniversary)\b',
                # German variations
                r'\b(?:datum|geburt[-_\s]*datum|geboren[-_\s]*am)\b',
                r'\b(?:geburtstag|jubiläum|feier[-_\s]*tag)\b',
                r'\b(?:geboren|geboren[-_\s]*am)\b',
                # The abbreviation "geb." ends in a non-word character, so it
                # must not be followed by \b: a trailing \b after the dot only
                # succeeds when a word character follows, which made a bare
                # "geb." header undetectable.
                r'\bgeb\.',
                # French variations
                r'\b(?:date|date[-_\s]*de[-_\s]*naissance|né[-_\s]*le)\b',
                r'\b(?:anniversaire|date[-_\s]*anniversaire)\b',
                r'\b(?:né|née|né[-_\s]*le)\b',
                # Italian variations
                r'\b(?:data|data[-_\s]*di[-_\s]*nascita|nato[-_\s]*il)\b',
                r'\b(?:compleanno|anniversario)\b',
                r'\b(?:nato|nata|nato[-_\s]*il)\b',
                # Common variations
                r'\b(?:birth|born|geboren|né|nato)\b'
            ],
            replacement_template="[DATE_{}]"
        ),
        # SSN patterns
        Pattern(
            name="ssn",
            patterns=[
                # English variations
                r'\b(?:ssn|social[-_\s]*security[-_\s]*number|tax[-_\s]*id)\b',
                r'\b(?:tax[-_\s]*identification|national[-_\s]*id)\b',
                # German variations
                r'\b(?:ahv[-_\s]*nummer|sozial[-_\s]*versicherungs[-_\s]*nummer)\b',
                r'\b(?:steuer[-_\s]*nummer|steuer[-_\s]*id|svn)\b',
                r'\b(?:ahv[-_\s]*nr|ahv[-_\s]*no|ahv[-_\s]*num)\b',
                # French variations
                r'\b(?:numéro[-_\s]*avs|numéro[-_\s]*de[-_\s]*sécurité[-_\s]*sociale)\b',
                r'\b(?:numéro[-_\s]*fiscal|numéro[-_\s]*d\'identification)\b',
                # Italian variations
                r'\b(?:numero[-_\s]*avs|numero[-_\s]*di[-_\s]*sicurezza[-_\s]*sociale)\b',
                r'\b(?:numero[-_\s]*fiscale|codice[-_\s]*fiscale)\b',
                # Common variations
                r'\b(?:ahv|svn|nss|avs)\b',
                # Additional AHV variations
                r'\b(?:ahv_nummer|ahvnummer|ahv-nummer|ahv_number)\b',
                r'\b(?:ahv[-_\s]*nr|ahv[-_\s]*no|ahv[-_\s]*num)\b',
                r'\b(?:ahv[-_\s]*number|ahv[-_\s]*number)\b',
                r'\b(?:ahv[-_\s]*id|ahv[-_\s]*id)\b',
                r'\b(?:ahv[-_\s]*code|ahv[-_\s]*code)\b',
                r'\b(?:ahv[-_\s]*reference|ahv[-_\s]*reference)\b',
                r'\b(?:ahv[-_\s]*reference[-_\s]*number|ahv[-_\s]*reference[-_\s]*number)\b',
                r'\b(?:ahv[-_\s]*reference[-_\s]*no|ahv[-_\s]*reference[-_\s]*no)\b',
                r'\b(?:ahv[-_\s]*reference[-_\s]*nr|ahv[-_\s]*reference[-_\s]*nr)\b',
                r'\b(?:ahv[-_\s]*reference[-_\s]*num|ahv[-_\s]*reference[-_\s]*num)\b',
                r'\b(?:ahv[-_\s]*reference[-_\s]*id|ahv[-_\s]*reference[-_\s]*id)\b',
                r'\b(?:ahv[-_\s]*reference[-_\s]*code|ahv[-_\s]*reference[-_\s]*code)\b'
            ],
            replacement_template="[SSN_{}]"
        )
    ]
class DataPatterns:
    """Regex tables for recognising sensitive *values* inside free text.

    ``patterns`` is an ordered list of ``Pattern`` entries, one per category
    (name, email, phone, iban, address, date, ssn).  Unlike header patterns,
    these regexes describe the shape of the data itself (an e-mail address,
    a phone number, an IBAN, ...).

    Several regexes are intentionally loose (for example the four-digit
    postal-code pattern or the short emergency numbers) and different
    categories can match overlapping spans of the same text.
    """

    patterns = [
        # Name patterns
        Pattern(
            name="name",
            patterns=[
                # Person names with titles and academic degrees
                r'\b(?:Dr\.|Prof\.|PhD\.?|MD\.?|Herr|Frau|Mr\.|Mrs\.|Ms\.|Monsieur|Madame|Signore|Signora)\s+[A-Z][a-z]{2,}(?:\s+[A-Za-z]{2,}){1,2}\b'
            ],
            replacement_template="[NAME_{}]"
        ),
        # Email pattern for plain text
        Pattern(
            name="email",
            patterns=[
                # Basic email pattern
                r'[A-Za-z0-9._%+-]+@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)*'
            ],
            replacement_template="[EMAIL_{}]"
        ),
        # Phone patterns
        Pattern(
            name="phone",
            patterns=[
                # International format
                r'\+\d{1,3}[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}\b',
                # Swiss format
                r'\b(?:0\d{1,2}|0041\d{1,2})[-.\s]?\d{3}[-.\s]?\d{2}[-.\s]?\d{2}\b',
                # German format
                r'\b(?:0\d{1,4}|0049\d{1,4})[-.\s]?\d{3,}[-.\s]?\d{3,}\b',
                # French format
                r'\b(?:0\d{1,2}|0033\d{1,2})[-.\s]?\d{1,2}[-.\s]?\d{2}[-.\s]?\d{2}[-.\s]?\d{2}\b',
                # Italian format
                r'\b(?:0\d{1,3}|0039\d{1,3})[-.\s]?\d{3,}[-.\s]?\d{3,}\b',
                # Mobile numbers
                r'\b(?:07|00417|004917|00337|00397)\d{8,9}\b',
                # Emergency numbers
                r'\b(?:112|911|118|117|144|1414)\b'
            ],
            replacement_template="[PHONE_{}]"
        ),
        # IBAN patterns
        Pattern(
            name="iban",
            patterns=[
                # Original fixed-width patterns: country code + 24 digits
                # (26 characters).  Kept for backward compatibility; that
                # width is not the standard IBAN length of any of the four
                # countries, so real IBANs need the patterns below.
                r'\b(?:CH|DE|FR|IT)\d{2}\s?(?:\d{4}\s?){5}\d{2}\b',
                r'\b(?:CH|DE|FR|IT)\d{2}(?:\d{4}){5}\d{2}\b',
                # Country-specific layouts, written compactly or in the usual
                # print format with a space after every fourth character.
                # Switzerland, 21 chars: 5-digit bank code + 12-char account
                r'\bCH\d{2}\s?\d{4}\s?\d[A-Z0-9]{3}(?:\s?[A-Z0-9]{4}){2}\s?[A-Z0-9]\b',
                # Germany, 22 chars: 18 digits after the check digits
                r'\bDE\d{2}(?:\s?\d{4}){4}\s?\d{2}\b',
                # France, 27 chars: 10 digits, 11-char account, 2-digit key
                r'\bFR\d{2}\s?\d{4}\s?\d{4}\s?\d{2}[A-Z0-9]{2}(?:\s?[A-Z0-9]{4}){2}\s?[A-Z0-9]\d{2}\b',
                # Italy, 27 chars: check letter, 10 digits, 12-char account
                r'\bIT\d{2}\s?[A-Z]\d{3}\s?\d{4}\s?\d{3}[A-Z0-9](?:\s?[A-Z0-9]{4}){2}\s?[A-Z0-9]{3}\b'
            ],
            replacement_template="[IBAN_{}]"
        ),
        # Address patterns
        Pattern(
            name="address",
            patterns=[
                # Street name ending in a street-type suffix, then house number
                r'\b(?:[A-Za-zäöüßÄÖÜ]+(?:strasse|str\.|gasse|weg|platz|allee|boulevard|avenue|via|strada|rue|chemin|route))\s+\d{1,4}(?:[a-z])?\b',
                # Four digits followed by a word (postal code + town)
                r'\b\d{4}\s+[A-Za-zäöüßÄÖÜ]+\b'
            ],
            replacement_template="[ADDRESS_{}]"
        ),
        # Date patterns
        Pattern(
            name="date",
            patterns=[
                # Specific date formats with context
                r'\b(?:geboren|birth|né|nato)\s+am\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',  # Birth dates
                r'\b(?:geboren|birth|né|nato)\s+am\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',  # Birth dates
                r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',  # Contract dates
                r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',  # Contract dates
                # Specific date formats with month names
                r'\b(?:geboren|birth|né|nato)\s+am\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|januar|februar|märz|april|mai|juni|juli|august|september|oktober|november|dezember)[a-z]*\s+\d{4}\b',  # Birth dates with month
                r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|januar|februar|märz|april|mai|juni|juli|august|september|oktober|november|dezember)[a-z]*\s+\d{4}\b'  # Contract dates with month
            ],
            replacement_template="[DATE_{}]"
        ),
        # SSN patterns
        Pattern(
            name="ssn",
            patterns=[
                r'\b(?:756|757|758|759)\.\d{4}\.\d{4}\.\d{2}\b',  # Swiss AHV
                r'\b(?:CHE|DE|FR|IT)-\d{3}\.\d{3}\.\d{3}\b',  # Company IDs
                r'\b\d{3}\.\d{3}\.\d{3}\b'  # Generic SSN format
            ],
            replacement_template="[SSN_{}]"
        )
    ]
class TextTablePatterns:
    """Helpers for recognising ``key: value`` / ``key = value`` lines in text."""

    @staticmethod
    def get_patterns() -> List[Tuple[str, str]]:
        """Return the ``(regex, separator)`` pairs describing table-like lines.

        Every regex is anchored to the whole line and captures the key as
        group 1 and the value as group 2.  The key group excludes the
        separator character, so the separator a regex matches is always the
        first occurrence of that character in the line.

        Returns:
            A new list of four ``(regex, separator)`` tuples.
        """
        return [
            # key: value pattern (with optional whitespace)
            (r'^([^:]+):\s*(.+)$', ':'),
            # key = value pattern (with optional whitespace)
            (r'^([^=]+)=\s*(.+)$', '='),
            # key = value pattern (with required whitespace)
            (r'^([^=]+)\s+=\s+(.+)$', '='),
            # key: value pattern (with required whitespace)
            (r'^([^:]+)\s+:\s+(.+)$', ':'),
        ]

    @staticmethod
    def is_table_line(line: str) -> bool:
        """Check whether the stripped line matches any table pattern.

        Args:
            line: A single line of text; surrounding whitespace is ignored.

        Returns:
            True if at least one regex from ``get_patterns`` matches.
        """
        stripped = line.strip()
        return any(
            re.match(regex, stripped) is not None
            for regex, _separator in TextTablePatterns.get_patterns()
        )

    @staticmethod
    def extract_key_value(line: str) -> Optional[Tuple[str, str]]:
        """Split a table-like line into its key and value.

        When both ``:`` and ``=`` occur in the line, the separator that
        appears first is used.  Picking the first regex in list order instead
        would always prefer the colon and split ``time = 12:30`` into
        ``("time = 12", "30")``.

        Args:
            line: A single line of text; surrounding whitespace is ignored.

        Returns:
            ``(key, value)`` with both parts stripped, or None if the line
            matches no table pattern.
        """
        stripped = line.strip()
        best_match = None
        best_position = None
        for regex, separator in TextTablePatterns.get_patterns():
            match = re.match(regex, stripped)
            if match is None:
                continue
            # A successful match guarantees the separator is present, and the
            # key group cannot contain it, so index() is the split position.
            position = stripped.index(separator)
            if best_position is None or position < best_position:
                best_match, best_position = match, position
        if best_match is None:
            return None
        return best_match.group(1).strip(), best_match.group(2).strip()
def get_pattern_for_header(header: str, patterns: List[Pattern]) -> Optional[Pattern]:
    """
    Return the first pattern entry whose regexes recognise a header.

    The header is lower-cased and stripped before matching, and every regex
    is searched case-insensitively.  Entries are tried in list order and,
    within an entry, regexes are tried in order; the first hit wins.

    Args:
        header: The header text to classify
        patterns: Candidate pattern entries, in priority order

    Returns:
        Optional[Pattern]: The first matching entry, or None when the header
        is empty/None or nothing matches
    """
    if not header:
        return None

    normalized = header.lower().strip()
    return next(
        (
            candidate
            for candidate in patterns
            if any(
                re.search(regex, normalized, re.IGNORECASE)
                for regex in candidate.patterns
            )
        ),
        None,
    )
def find_patterns_in_text(text: str, patterns: List[Pattern]) -> List[Tuple[str, str, int, int]]:
    """
    Find all pattern matches in text

    Every regex of every pattern entry is searched case-insensitively.
    Matches from different regexes are reported independently, so overlapping
    or duplicate spans can appear in the result.

    Args:
        text: Text to search; empty or None yields an empty result
        patterns: List of patterns to check

    Returns:
        List[tuple]: List of (pattern_name, match, start, end), sorted by
        start position (ties keep discovery order)
    """
    if not text:
        return []

    logger = logging.getLogger(__name__)
    matches = []
    for pattern in patterns:
        for regex in pattern.patterns:
            for match in re.finditer(regex, text, re.IGNORECASE):
                # Log only the category and offsets.  The matched value and
                # its surrounding text are exactly the sensitive data being
                # searched for and must not be written to stdout or logs.
                logger.debug(
                    "Matched %s pattern at %d-%d",
                    pattern.name, match.start(), match.end(),
                )
                matches.append(
                    (pattern.name, match.group(0), match.start(), match.end())
                )
    return sorted(matches, key=lambda found: found[2])  # Sort by start position