fix: further improved neutraliser
This commit is contained in:
parent
205b99dfa0
commit
d3cfe8e9be
3 changed files with 190 additions and 63 deletions
|
|
@ -19,6 +19,12 @@ _NEUTRALIZATION_BLACKLIST = frozenset({
|
||||||
"Versicherte", "Versicherungsnehmer", "Versicherung", "Insurance",
|
"Versicherte", "Versicherungsnehmer", "Versicherung", "Insurance",
|
||||||
"Leistungen", "Basis", "Benefits", # Section labels
|
"Leistungen", "Basis", "Benefits", # Section labels
|
||||||
"Start", "Beginn", "Ende", "End", "trip", # Contract labels (Start of trip, End of trip, etc.)
|
"Start", "Beginn", "Ende", "End", "trip", # Contract labels (Start of trip, End of trip, etc.)
|
||||||
|
"incomplete", "Application", "Complete", "Pending", # Form/status labels, not addresses
|
||||||
|
# Ambiguous substrings – match in Zurich, CHF, UID-Nr., websites, etc.
|
||||||
|
"CH", "DE", "FR", "IT", "Nr", "Nr.", "Nr:", "No", "No.", "No:",
|
||||||
|
"www", ".ch", ".com", ".org", ".net", "CHF",
|
||||||
|
# Labels that must never be neutralized
|
||||||
|
"Kontakt", "Kanzlei", "Telefon", "Matrikel-Nr", "Matrikel-Nr.", "Student ID", "Student-ID",
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -69,6 +75,25 @@ class StringParser:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
patternMatches = [m for m in patternMatches if not is_contained(m, patternMatches)]
|
patternMatches = [m for m in patternMatches if not is_contained(m, patternMatches)]
|
||||||
|
|
||||||
|
# Deduplicate: keep one match per (start,end) – same span can match multiple patterns
|
||||||
|
seen = set()
|
||||||
|
unique_matches = []
|
||||||
|
for m in patternMatches:
|
||||||
|
key = (m[2], m[3])
|
||||||
|
if key not in seen:
|
||||||
|
seen.add(key)
|
||||||
|
unique_matches.append(m)
|
||||||
|
patternMatches = unique_matches
|
||||||
|
|
||||||
|
# Exclude address matches that overlap with date matches (e.g. "2026 den" overlaps "17.02.2026")
|
||||||
|
def overlaps(a_start, a_end, b_start, b_end):
|
||||||
|
return a_start < b_end and b_start < a_end
|
||||||
|
date_ranges = [(m[2], m[3]) for m in patternMatches if m[0] == "date"]
|
||||||
|
patternMatches = [
|
||||||
|
m for m in patternMatches
|
||||||
|
if not (m[0] == "address" and any(overlaps(m[2], m[3], ds, de) for ds, de in date_ranges))
|
||||||
|
]
|
||||||
|
|
||||||
# Process from right to left to avoid position shifts
|
# Process from right to left to avoid position shifts
|
||||||
for patternName, matchedText, start, end in reversed(patternMatches):
|
for patternName, matchedText, start, end in reversed(patternMatches):
|
||||||
|
|
@ -86,6 +111,11 @@ class StringParser:
|
||||||
# Skip if match contains any blacklisted word (e.g. "2026 Reise" or "2026 Reisebeginn" from address pattern)
|
# Skip if match contains any blacklisted word (e.g. "2026 Reise" or "2026 Reisebeginn" from address pattern)
|
||||||
if any(w in _NEUTRALIZATION_BLACKLIST for w in matchedText.split()):
|
if any(w in _NEUTRALIZATION_BLACKLIST for w in matchedText.split()):
|
||||||
continue
|
continue
|
||||||
|
# Skip phone matches that are clearly part of a price (e.g. 128 in 128.56 CHF)
|
||||||
|
if patternName == "phone" and end + 3 <= len(text):
|
||||||
|
after = text[end : end + 3]
|
||||||
|
if (after[0] in ".," and len(after) >= 3 and after[1:3].isdigit()):
|
||||||
|
continue
|
||||||
|
|
||||||
if matchedText not in self.mapping:
|
if matchedText not in self.mapping:
|
||||||
# Generate a UUID for the placeholder
|
# Generate a UUID for the placeholder
|
||||||
|
|
@ -112,36 +142,41 @@ class StringParser:
|
||||||
|
|
||||||
def _replaceCustomNames(self, text: str) -> str:
    """
    Replace custom names from the user list in text.

    Every non-empty entry of ``self.NamesToParse`` is matched case-insensitively
    on word boundaries. Ordered pairs of two different entries (e.g. "Ida Dittrich"
    and "Dittrich Ida") are matched as well, with any whitespace run (space, newline)
    between the parts, so a full name gets one UUID, not one per word.

    All candidates are combined into ONE alternation and applied in ONE
    left-to-right pass. Text that has already been replaced is therefore never
    scanned again, so a listed name can no longer match inside a freshly inserted
    ``[name.<uuid>]`` placeholder (e.g. a name "name", or a hex-only name such as
    "Abba" that happens to equal a UUID segment).

    Each distinct matched text is stored as a key of ``self.mapping`` and reuses
    its placeholder on later occurrences.

    Args:
        text: Text to process

    Returns:
        str: Text with custom names replaced
    """
    # Strip entries and drop blanks; dict.fromkeys removes duplicates, keeps list order
    names = list(dict.fromkeys(n.strip() for n in self.NamesToParse if n.strip()))
    if not names:
        return text

    # Single names plus every ordered pair of two different names
    candidates = set(names)
    candidates.update(f"{first} {second}" for first in names for second in names if first != second)

    # Longest first so "Ida Dittrich" wins over "Ida" at the same position;
    # the secondary alphabetical key makes the order of equal-length candidates deterministic
    ordered = sorted(candidates, key=lambda candidate: (-len(candidate), candidate))

    # Parts of a candidate may be separated by any whitespace run in the text
    alternatives = [
        r"\s+".join(re.escape(part) for part in candidate.split())
        for candidate in ordered
    ]
    pattern = re.compile(r"\b(?:" + "|".join(alternatives) + r")\b", re.IGNORECASE)

    def placeholderFor(match):
        # The text exactly as it appears in the document is the mapping key
        matchedText = match.group()
        if matchedText not in self.mapping:
            # Generate a UUID for the placeholder
            placeholderId = str(uuid.uuid4())
            self.mapping[matchedText] = f"[name.{placeholderId}]"
        return self.mapping[matchedText]

    # A callable replacement is inserted literally (no backslash processing)
    return pattern.sub(placeholderFor, text)
|
||||||
|
|
||||||
def processString(self, text: str) -> str:
|
def processString(self, text: str) -> str:
|
||||||
|
|
|
||||||
|
|
@ -243,16 +243,17 @@ class DataPatterns:
|
||||||
r'(?<=Leiter: )[A-Za-zäöüßÄÖÜ][a-zäöüß]+\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]+',
|
r'(?<=Leiter: )[A-Za-zäöüßÄÖÜ][a-zäöüß]+\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]+',
|
||||||
r'(?<=Kontaktperson: )[A-Za-zäöüßÄÖÜ][a-zäöüß]+\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]+',
|
r'(?<=Kontaktperson: )[A-Za-zäöüßÄÖÜ][a-zäöüß]+\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]+',
|
||||||
# Name only after Anrede (keep Frau/Herr; replace only the name) – fixed-width lookbehind
|
# Name only after Anrede (keep Frau/Herr; replace only the name) – fixed-width lookbehind
|
||||||
r'(?<=Frau )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
|
# Use [ \t]+ not \s+ so we don't match across line breaks (avoids grabbing "Es" from "Es freut uns sehr")
|
||||||
r'(?<=Herr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
|
r'(?<=Frau )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
|
||||||
r'(?<=Mr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
|
r'(?<=Herr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
|
||||||
r'(?<=Mr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
|
r'(?<=Mr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
|
||||||
r'(?<=Mrs )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
|
r'(?<=Mr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
|
||||||
r'(?<=Mrs\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
|
r'(?<=Mrs )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
|
||||||
r'(?<=Ms )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
|
r'(?<=Mrs\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
|
||||||
r'(?<=Ms\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
|
r'(?<=Ms )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
|
||||||
r'(?<=Dr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
|
r'(?<=Ms\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
|
||||||
r'(?<=Dr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
|
r'(?<=Dr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
|
||||||
|
r'(?<=Dr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
|
||||||
],
|
],
|
||||||
replacement_template="[NAME_{}]"
|
replacement_template="[NAME_{}]"
|
||||||
),
|
),
|
||||||
|
|
@ -271,6 +272,8 @@ class DataPatterns:
|
||||||
Pattern(
|
Pattern(
|
||||||
name="phone",
|
name="phone",
|
||||||
patterns=[
|
patterns=[
|
||||||
|
# Swiss full format: +41 44 315 19 19 (area + 3 + 2 + 2 digits, flexible separators)
|
||||||
|
r'\+\s*41[-.\s]?\d{2}[-.\s]?\d{3}[-.\s]?\d{2}[-.\s]?\d{2}\b',
|
||||||
# International format
|
# International format
|
||||||
r'\+\d{1,3}[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}\b',
|
r'\+\d{1,3}[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}\b',
|
||||||
# Swiss format
|
# Swiss format
|
||||||
|
|
@ -309,28 +312,44 @@ class DataPatterns:
|
||||||
r'\b(?:[A-Za-zäöüßÄÖÜ]+(?:-[A-Za-zäöüßÄÖÜ]+)*(?:strasse|str\.|gasse|weg|platz|allee|boulevard|avenue|via|strada|rue|chemin|route))\s+\d{1,4}(?:/\d{1,4})?(?:[a-z])?\b',
|
r'\b(?:[A-Za-zäöüßÄÖÜ]+(?:-[A-Za-zäöüßÄÖÜ]+)*(?:strasse|str\.|gasse|weg|platz|allee|boulevard|avenue|via|strada|rue|chemin|route))\s+\d{1,4}(?:/\d{1,4})?(?:[a-z])?\b',
|
||||||
# Postfach / PO Box (standalone)
|
# Postfach / PO Box (standalone)
|
||||||
r'\b(?:Postfach|Postbox|P\.?O\.?\s*Box|Case\s+postale|Casella\s+postale|Boîte\s+postale)\s+\d{1,6}\b',
|
r'\b(?:Postfach|Postbox|P\.?O\.?\s*Box|Case\s+postale|Casella\s+postale|Boîte\s+postale)\s+\d{1,6}\b',
|
||||||
# Postal code + city (standalone)
|
# Postal code + city (standalone); exclude year+non-city and common non-city words
|
||||||
r'\b\d{4}\s+[A-Za-zäöüßÄÖÜ]+\b(?!\s*:)'
|
# (?<!\d{2}\.\d{2}\.) = not part of date DD.MM.YYYY (e.g. 27.01.2026)
|
||||||
|
r'(?<!\d{2}\.\d{2}\.)\b\d{4}\s+(?!den|der|die|das|dem|des|und|oder|für|bei|mit|Version|Versand|Vertrag|Verfügung|Verschickung|Versicherung|erhalten|Schreiben|Jahr|Jahres|incomplete|Application|Complete|Pending|Matrikel|Student|Studien|Kontakt|Telefon|Rechnung|Invoice)[A-Za-zäöüßÄÖÜ]+\b(?!\s*:)'
|
||||||
],
|
],
|
||||||
replacement_template="[ADDRESS_{}]"
|
replacement_template="[ADDRESS_{}]"
|
||||||
),
|
),
|
||||||
|
|
||||||
# Date patterns
|
# Date patterns (all languages and formats)
|
||||||
Pattern(
|
Pattern(
|
||||||
name="date",
|
name="date",
|
||||||
patterns=[
|
patterns=[
|
||||||
# Standalone date values – require valid day (1–31) and month (1–12) to avoid decimals (e.g. 53.37 CHF)
|
# DD.MM.YYYY / DD/MM/YYYY / DD-MM-YYYY (European)
|
||||||
r'\b(0?[1-9]|[12]\d|3[01])[./-](0?[1-9]|1[0-2])[./-]\d{2,4}\b', # 17.02.2026, 29-03-2026
|
r'\b(0?[1-9]|[12]\d|3[01])[./-](0?[1-9]|1[0-2])[./-]\d{2,4}\b',
|
||||||
r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\.[\s]*\d{2,4}\b', # 17.02. 2026 (split across lines)
|
r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\.[\s]*\d{2,4}\b',
|
||||||
r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\.(?!\d)\b', # 17.02., 29.03.
|
r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\.(?!\d)\b',
|
||||||
r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\b(?!\.?\d)(?!/\d)', # 17.02, 29.03; exclude ratings (4.7/5)
|
r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\b(?!\.?\d)(?!/\d)',
|
||||||
# Context-specific date formats
|
# YYYY-MM-DD / YYYY/MM/DD / YYYY.MM.DD (ISO)
|
||||||
r'\b(?:geboren|birth|né|nato)\s+am\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
|
r'\b\d{4}[./-](0?[1-9]|1[0-2])[./-](0?[1-9]|[12]\d|3[01])\b',
|
||||||
r'\b(?:geboren|birth|né|nato)\s+am\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
|
# MM/DD/YYYY / MM-DD-YYYY (US)
|
||||||
r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
|
r'\b(0?[1-9]|1[0-2])[./-](0?[1-9]|[12]\d|3[01])[./-]\d{2,4}\b',
|
||||||
r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
|
# geboren/birth/né/nato + am/le/on/il/op (DE/EN/FR/IT/NL)
|
||||||
r'\b(?:geboren|birth|né|nato)\s+am\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|januar|februar|märz|april|mai|juni|juli|august|september|oktober|november|dezember)[a-z]*\s+\d{4}\b',
|
r'\b(?:geboren|birth|né|nato|nata)\s+(?:am|le|on|il|op)\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
|
||||||
r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|januar|februar|märz|april|mai|juni|juli|august|september|oktober|november|dezember)[a-z]*\s+\d{4}\b'
|
r'\b(?:geboren|birth|né|nato|nata)\s+(?:am|le|on|il|op)\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
|
||||||
|
r'\b(?:geboren|birth|né|nato|nata)\s+(?:am|le|on|il)\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|january|february|march|april|may|june|july|august|september|october|november|december|januar|februar|märz|juni|juli|oktober|november|dezember|gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|dicembre|janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|januari|februari|maart|april|mei|juni|juli|augustus|september|oktober|november|december|janeiro|fevereiro|março|abril|maio|junho|julho|agosto|setembro|outubro|novembro|dezembro)[a-z]*\s+\d{4}\b',
|
||||||
|
# vertrag/contract/contrat + vom/from/du/dal/van
|
||||||
|
r'\b(?:vertrag|contract|contrat|contratto)\s+(?:vom|from|du|dal|del|van)\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
|
||||||
|
r'\b(?:vertrag|contract|contrat|contratto)\s+(?:vom|from|du|dal|del|van)\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
|
||||||
|
r'\b(?:vertrag|contract|contrat|contratto)\s+(?:vom|from|du|dal|del)\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|january|february|march|april|may|june|july|august|september|october|november|december|januar|februar|märz|juni|juli|oktober|november|dezember|gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|dicembre|janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|januari|februari|maart|mei|augustus|janeiro|fevereiro|março|maio|junho|julho|setembro|outubro|novembro|dezembro)[a-z]*\s+\d{4}\b',
|
||||||
|
# datum/date/data/fecha + numeric (fixed-width lookbehind, keeps label)
|
||||||
|
r'(?<=datum: )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b', r'(?<=date: )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
|
||||||
|
r'(?<=data: )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b', r'(?<=fecha: )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
|
||||||
|
r'(?<=datum )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b', r'(?<=date )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
|
||||||
|
r'(?<=datum: )[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b', r'(?<=date: )[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
|
||||||
|
r'(?<=data: )[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b', r'(?<=fecha: )[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
|
||||||
|
# day + month name + year (17 February 2026, 17. Februar 2026)
|
||||||
|
r'\b(0?[1-9]|[12]\d|3[01])\s*(?:\.|\.\s*)?(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|january|february|march|april|may|june|july|august|september|october|november|december|januar|februar|märz|juni|juli|oktober|november|dezember|gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|dicembre|janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|januari|februari|maart|mei|augustus|janeiro|fevereiro|março|maio|junho|julho|setembro|outubro|novembro|dezembro)[a-z]*\s+\d{4}\b',
|
||||||
|
# month name + day + year (February 17, 2026)
|
||||||
|
r'\b(?:january|february|march|april|may|june|july|august|september|october|november|december|januar|februar|märz|april|mai|juni|juli|oktober|november|dezember|gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|dicembre|janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|januari|februari|maart|mei|augustus|janeiro|fevereiro|março|maio|junho|julho|setembro|outubro|novembro|dezembro)[a-z]*\s+(?:0?[1-9]|[12]\d|3[01])[,\s]+\d{4}\b',
|
||||||
],
|
],
|
||||||
replacement_template="[DATE_{}]"
|
replacement_template="[DATE_{}]"
|
||||||
),
|
),
|
||||||
|
|
@ -357,8 +376,8 @@ class DataPatterns:
|
||||||
r'(?<=Numéro de police )[\d.]+',
|
r'(?<=Numéro de police )[\d.]+',
|
||||||
r'(?<=Numero polizza: )[\d.]+',
|
r'(?<=Numero polizza: )[\d.]+',
|
||||||
r'(?<=Numero polizza )[\d.]+',
|
r'(?<=Numero polizza )[\d.]+',
|
||||||
# Standalone policy number format (e.g. 11.559.499) – require 2+ digit prefix to avoid amounts
|
# Standalone policy number format - exclude when part of UID (CHE-115.665.634)
|
||||||
r'\b\d{2,4}(?:\.\d{3}){2,}\b'
|
r'(?<!CHE-)(?<!DE-)(?<!FR-)(?<!IT-)\b\d{2,4}(?:\.\d{3}){2,}(?:/[A-Za-z0-9]+)?\b'
|
||||||
],
|
],
|
||||||
replacement_template="[POLICY_{}]"
|
replacement_template="[POLICY_{}]"
|
||||||
),
|
),
|
||||||
|
|
@ -368,9 +387,9 @@ class DataPatterns:
|
||||||
name="ssn",
|
name="ssn",
|
||||||
patterns=[
|
patterns=[
|
||||||
r'\b(?:756|757|758|759)\.\d{4}\.\d{4}\.\d{2}\b(?!,)', # Swiss AHV - exclude before decimal
|
r'\b(?:756|757|758|759)\.\d{4}\.\d{4}\.\d{2}\b(?!,)', # Swiss AHV - exclude before decimal
|
||||||
r'\b(?:CHE|DE|FR|IT)-\d{3}\.\d{3}\.\d{3}\b', # Company IDs
|
r'\b(?:CHE|DE|FR|IT)-\d{3}\.\d{3}\.\d{3}\b', # Company IDs (must be before generic)
|
||||||
# Generic SSN format - exclude when followed by comma+digit (European decimal)
|
# Generic SSN format - exclude when part of company ID or before decimal
|
||||||
r'\b\d{3}\.\d{3}\.\d{3}\b(?!,\d)'
|
r'(?<!CHE-)(?<!DE-)(?<!FR-)(?<!IT-)\b\d{3}\.\d{3}\.\d{3}\b(?!,\d)'
|
||||||
],
|
],
|
||||||
replacement_template="[SSN_{}]"
|
replacement_template="[SSN_{}]"
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,9 @@
|
||||||
PDF in-place neutralization using PyMuPDF.
|
PDF in-place neutralization using PyMuPDF.
|
||||||
Removes original text completely and inserts full UUID placeholders.
|
Removes original text completely and inserts full UUID placeholders.
|
||||||
PyMuPDF uses insert_textbox which wraps long placeholders to preserve layout.
|
PyMuPDF uses insert_textbox which wraps long placeholders to preserve layout.
|
||||||
|
|
||||||
|
NOTE: PyMuPDF search_for() matches substrings (e.g. "CH" matches inside "Zurich",
|
||||||
|
"CHE-115...", ".ch"). We skip short/ambiguous keys to avoid false redactions.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import io
|
import io
|
||||||
|
|
@ -12,6 +15,16 @@ from typing import Dict, Optional
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Minimum key length for a PDF search. PyMuPDF search_for() matches substrings,
# so a key shorter than this would also hit inside longer, unrelated words.
_MIN_SEARCH_LENGTH = 5

# Keys we never search for in PDF (substrings of many common words).
# NOTE(review): every entry below is shorter than _MIN_SEARCH_LENGTH, so the
# length check already skips all of them; this list only takes effect if that
# minimum is lowered.
_PDF_SEARCH_BLOCKLIST = frozenset({
    "CH", "DE", "FR", "IT", # Country codes - match in Zurich, CHF, Deutschland, etc.
    "Nr", "Nr.", "Nr:", "No", "No.", "No:", # Abbreviations - match in Pol-Nr., Policy No., etc.
    "www", ".ch", ".com", ".org", ".net", # Domain parts - match in URLs
})
|
||||||
|
|
||||||
|
|
||||||
def neutralize_pdf_in_place(
|
def neutralize_pdf_in_place(
|
||||||
pdf_bytes: bytes,
|
pdf_bytes: bytes,
|
||||||
|
|
@ -50,24 +63,53 @@ def neutralize_pdf_in_place(
|
||||||
fontsize = 8
|
fontsize = 8
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
font = fitz.Font(fontname)
|
||||||
for page_num in range(len(doc)):
|
for page_num in range(len(doc)):
|
||||||
page = doc[page_num]
|
page = doc[page_num]
|
||||||
|
name_inserts = []
|
||||||
|
address_inserts = []
|
||||||
|
phone_inserts = []
|
||||||
|
policy_inserts = []
|
||||||
|
date_inserts = []
|
||||||
|
ssn_inserts = [] # SSN/UID: CHE-115.665.634, long placeholder doesn't fit
|
||||||
|
|
||||||
for original_text, placeholder in sorted_items:
|
for original_text, placeholder in sorted_items:
|
||||||
if not original_text or not placeholder:
|
if not original_text or not placeholder:
|
||||||
continue
|
continue
|
||||||
|
# Skip keys that cause substring false positives (PyMuPDF search_for matches substrings)
|
||||||
|
if len(original_text) < _MIN_SEARCH_LENGTH:
|
||||||
|
logger.debug("Skipping PDF search for short key %r (would match substrings)", original_text[:20])
|
||||||
|
continue
|
||||||
|
if original_text.strip() in _PDF_SEARCH_BLOCKLIST:
|
||||||
|
logger.debug("Skipping PDF search for blocklisted key %r", original_text)
|
||||||
|
continue
|
||||||
|
|
||||||
search_text = original_text
|
search_text = original_text
|
||||||
insert_text = placeholder
|
insert_text = placeholder
|
||||||
if placeholder.startswith("[policy."):
|
is_name = placeholder.startswith("[name.")
|
||||||
# Try label+number to get wider rect; insert UUID only (label+UUID would overflow)
|
is_address = placeholder.startswith("[address.")
|
||||||
|
is_phone = placeholder.startswith("[phone.")
|
||||||
|
is_policy = placeholder.startswith("[policy.")
|
||||||
|
is_date = placeholder.startswith("[date.")
|
||||||
|
is_ssn = placeholder.startswith("[ssn.")
|
||||||
|
if is_policy:
|
||||||
for prefix in ("Police Nr. ", "Police Nr.: ", "Polizzenr. ", "Policy no. ", "Policy No. "):
|
for prefix in ("Police Nr. ", "Police Nr.: ", "Polizzenr. ", "Policy no. ", "Policy No. "):
|
||||||
candidate = prefix + original_text
|
candidate = prefix + original_text
|
||||||
try:
|
try:
|
||||||
hits = page.search_for(candidate, quads=False)
|
hits = page.search_for(candidate, quads=False)
|
||||||
if hits:
|
if hits:
|
||||||
search_text = candidate
|
search_text = candidate
|
||||||
insert_text = placeholder # UUID only so it fits in rect
|
break
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
elif is_ssn and any(original_text.startswith(p) for p in ("CHE-", "DE-", "FR-", "IT-")):
|
||||||
|
# UID/company ID: try "UID-Nr. CHE-..." or "UID-Nr.: " for wider rect
|
||||||
|
for prefix in ("UID-Nr. ", "UID-Nr.: ", "UID No. ", "UID: ", "UID-Nummer: "):
|
||||||
|
candidate = prefix + original_text
|
||||||
|
try:
|
||||||
|
hits = page.search_for(candidate, quads=False)
|
||||||
|
if hits:
|
||||||
|
search_text = candidate
|
||||||
break
|
break
|
||||||
except Exception:
|
except Exception:
|
||||||
continue
|
continue
|
||||||
|
|
@ -79,15 +121,29 @@ def neutralize_pdf_in_place(
|
||||||
|
|
||||||
for rect in instances:
|
for rect in instances:
|
||||||
try:
|
try:
|
||||||
fs = 5 if placeholder.startswith(("[policy.", "[address.")) else fontsize
|
if is_name or is_address or is_phone or is_policy or is_date or is_ssn:
|
||||||
page.add_redact_annot(
|
page.add_redact_annot(rect, fill=fill_color)
|
||||||
rect,
|
if is_name:
|
||||||
text=insert_text,
|
name_inserts.append((rect, insert_text))
|
||||||
fill=fill_color,
|
elif is_address:
|
||||||
text_color=text_color,
|
address_inserts.append((rect, insert_text))
|
||||||
fontname=fontname,
|
elif is_phone:
|
||||||
fontsize=fs,
|
phone_inserts.append((rect, insert_text))
|
||||||
)
|
elif is_policy:
|
||||||
|
policy_inserts.append((rect, insert_text))
|
||||||
|
elif is_date:
|
||||||
|
date_inserts.append((rect, insert_text))
|
||||||
|
else:
|
||||||
|
ssn_inserts.append((rect, insert_text))
|
||||||
|
else:
|
||||||
|
page.add_redact_annot(
|
||||||
|
rect,
|
||||||
|
text=insert_text,
|
||||||
|
fill=fill_color,
|
||||||
|
text_color=text_color,
|
||||||
|
fontname=fontname,
|
||||||
|
fontsize=fontsize,
|
||||||
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Redact failed for {original_text[:40]!r}: {e}")
|
logger.warning(f"Redact failed for {original_text[:40]!r}: {e}")
|
||||||
|
|
||||||
|
|
@ -96,6 +152,23 @@ def neutralize_pdf_in_place(
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.debug(f"apply_redactions page {page_num + 1}: {e}")
|
logger.debug(f"apply_redactions page {page_num + 1}: {e}")
|
||||||
|
|
||||||
|
# Insert placeholders with font size fitted to rect (avoids PyMuPDF shrinking to nothing)
|
||||||
|
for rect, text in name_inserts + address_inserts + phone_inserts + policy_inserts + date_inserts + ssn_inserts:
|
||||||
|
try:
|
||||||
|
tl = font.text_length(text, fontsize=1)
|
||||||
|
fs = max(3, min(fontsize, rect.width / tl)) if tl > 0 else 4
|
||||||
|
rc = page.insert_textbox(
|
||||||
|
rect, text, fontname=fontname, fontsize=fs,
|
||||||
|
align=0, color=text_color
|
||||||
|
)
|
||||||
|
if rc < 0:
|
||||||
|
page.insert_textbox(
|
||||||
|
rect, text, fontname=fontname, fontsize=2,
|
||||||
|
align=0, color=text_color
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Insert placeholder failed: {e}")
|
||||||
|
|
||||||
buf = io.BytesIO()
|
buf = io.BytesIO()
|
||||||
doc.save(buf, garbage=4, deflate=True)
|
doc.save(buf, garbage=4, deflate=True)
|
||||||
doc.close()
|
doc.close()
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue