fix: further improved neutraliser

This commit is contained in:
Ida Dittrich 2026-02-25 07:59:33 +01:00
parent 205b99dfa0
commit d3cfe8e9be
3 changed files with 190 additions and 63 deletions

View file

@@ -19,6 +19,12 @@ _NEUTRALIZATION_BLACKLIST = frozenset({
"Versicherte", "Versicherungsnehmer", "Versicherung", "Insurance", "Versicherte", "Versicherungsnehmer", "Versicherung", "Insurance",
"Leistungen", "Basis", "Benefits", # Section labels "Leistungen", "Basis", "Benefits", # Section labels
"Start", "Beginn", "Ende", "End", "trip", # Contract labels (Start of trip, End of trip, etc.) "Start", "Beginn", "Ende", "End", "trip", # Contract labels (Start of trip, End of trip, etc.)
"incomplete", "Application", "Complete", "Pending", # Form/status labels, not addresses
# Ambiguous substrings match in Zurich, CHF, UID-Nr., websites, etc.
"CH", "DE", "FR", "IT", "Nr", "Nr.", "Nr:", "No", "No.", "No:",
"www", ".ch", ".com", ".org", ".net", "CHF",
# Labels that must never be neutralized
"Kontakt", "Kanzlei", "Telefon", "Matrikel-Nr", "Matrikel-Nr.", "Student ID", "Student-ID",
}) })
@@ -69,6 +75,25 @@ class StringParser:
return True return True
return False return False
patternMatches = [m for m in patternMatches if not is_contained(m, patternMatches)] patternMatches = [m for m in patternMatches if not is_contained(m, patternMatches)]
# Deduplicate: keep one match per (start,end) — the same span can match multiple patterns
seen = set()
unique_matches = []
for m in patternMatches:
key = (m[2], m[3])
if key not in seen:
seen.add(key)
unique_matches.append(m)
patternMatches = unique_matches
# Exclude address matches that overlap with date matches (e.g. "2026 den" overlaps "17.02.2026")
def overlaps(a_start, a_end, b_start, b_end):
return a_start < b_end and b_start < a_end
date_ranges = [(m[2], m[3]) for m in patternMatches if m[0] == "date"]
patternMatches = [
m for m in patternMatches
if not (m[0] == "address" and any(overlaps(m[2], m[3], ds, de) for ds, de in date_ranges))
]
# Process from right to left to avoid position shifts # Process from right to left to avoid position shifts
for patternName, matchedText, start, end in reversed(patternMatches): for patternName, matchedText, start, end in reversed(patternMatches):
@@ -86,6 +111,11 @@ class StringParser:
# Skip if match contains any blacklisted word (e.g. "2026 Reise" or "2026 Reisebeginn" from address pattern) # Skip if match contains any blacklisted word (e.g. "2026 Reise" or "2026 Reisebeginn" from address pattern)
if any(w in _NEUTRALIZATION_BLACKLIST for w in matchedText.split()): if any(w in _NEUTRALIZATION_BLACKLIST for w in matchedText.split()):
continue continue
# Skip phone matches that are clearly part of a price (e.g. 128 in 128.56 CHF)
if patternName == "phone" and end + 3 <= len(text):
after = text[end : end + 3]
if (after[0] in ".," and len(after) >= 3 and after[1:3].isdigit()):
continue
if matchedText not in self.mapping: if matchedText not in self.mapping:
# Generate a UUID for the placeholder # Generate a UUID for the placeholder
@@ -112,36 +142,41 @@ class StringParser:
def _replaceCustomNames(self, text: str) -> str: def _replaceCustomNames(self, text: str) -> str:
""" """
Replace custom names from the user list in text Replace custom names from the user list in text.
Builds composite names (e.g. "Ida Dittrich") so full names get one UUID, not one per word.
Args:
text: Text to process
Returns:
str: Text with custom names replaced
""" """
for name in self.NamesToParse: names = [n.strip() for n in self.NamesToParse if n.strip()]
if not name.strip(): if not names:
continue return text
# Create case-insensitive regex pattern with word boundaries # Add composite names: "Ida Dittrich", "Dittrich Ida" when both are in list
pattern = re.compile(r'\b' + re.escape(name.strip()) + r'\b', re.IGNORECASE) expanded = set(names)
for i, n1 in enumerate(names):
# Find all matches for this name for n2 in names:
if n1 != n2:
expanded.add(f"{n1} {n2}")
expanded.add(f"{n2} {n1}")
# Process longest first so "Ida Dittrich" replaces before "Ida" or "Dittrich"
for name in sorted(expanded, key=len, reverse=True):
# Composite: flexible whitespace (space, newline); single: word boundaries
if " " in name:
parts = name.split()
pattern_str = r"\b" + r"\s+".join(re.escape(p) for p in parts) + r"\b"
else:
pattern_str = r"\b" + re.escape(name) + r"\b"
pattern = re.compile(pattern_str, re.IGNORECASE)
matches = list(pattern.finditer(text)) matches = list(pattern.finditer(text))
for match in reversed(matches):
# Replace each match with a placeholder
for match in reversed(matches): # Process from right to left to avoid position shifts
matchedText = match.group() matchedText = match.group()
if matchedText not in self.mapping: if matchedText not in self.mapping:
# Generate a UUID for the placeholder
placeholderId = str(uuid.uuid4()) placeholderId = str(uuid.uuid4())
self.mapping[matchedText] = f"[name.{placeholderId}]" self.mapping[matchedText] = f"[name.{placeholderId}]"
replacement = self.mapping[matchedText] replacement = self.mapping[matchedText]
start, end = match.span() start, end = match.span()
text = text[:start] + replacement + text[end:] text = text[:start] + replacement + text[end:]
return text return text
def processString(self, text: str) -> str: def processString(self, text: str) -> str:

View file

@@ -243,16 +243,17 @@ class DataPatterns:
r'(?<=Leiter: )[A-Za-zäöüßÄÖÜ][a-zäöüß]+\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]+', r'(?<=Leiter: )[A-Za-zäöüßÄÖÜ][a-zäöüß]+\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]+',
r'(?<=Kontaktperson: )[A-Za-zäöüßÄÖÜ][a-zäöüß]+\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]+', r'(?<=Kontaktperson: )[A-Za-zäöüßÄÖÜ][a-zäöüß]+\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]+',
# Name only after Anrede (keep Frau/Herr; replace only the name) fixed-width lookbehind # Name only after Anrede (keep Frau/Herr; replace only the name) fixed-width lookbehind
r'(?<=Frau )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+', # Use [ \t]+ not \s+ so we don't match across line breaks (avoids grabbing "Es" from "Es freut uns sehr")
r'(?<=Herr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+', r'(?<=Frau )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
r'(?<=Mr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+', r'(?<=Herr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
r'(?<=Mr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+', r'(?<=Mr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
r'(?<=Mrs )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+', r'(?<=Mr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
r'(?<=Mrs\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+', r'(?<=Mrs )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
r'(?<=Ms )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+', r'(?<=Mrs\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
r'(?<=Ms\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+', r'(?<=Ms )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
r'(?<=Dr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+', r'(?<=Ms\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
r'(?<=Dr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+', r'(?<=Dr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
r'(?<=Dr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
], ],
replacement_template="[NAME_{}]" replacement_template="[NAME_{}]"
), ),
@@ -271,6 +272,8 @@ class DataPatterns:
Pattern( Pattern(
name="phone", name="phone",
patterns=[ patterns=[
# Swiss full format: +41 44 315 19 19 (area + 3 + 2 + 2 digits, flexible separators)
r'\+\s*41[-.\s]?\d{2}[-.\s]?\d{3}[-.\s]?\d{2}[-.\s]?\d{2}\b',
# International format # International format
r'\+\d{1,3}[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}\b', r'\+\d{1,3}[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}\b',
# Swiss format # Swiss format
@@ -309,28 +312,44 @@ class DataPatterns:
r'\b(?:[A-Za-zäöüßÄÖÜ]+(?:-[A-Za-zäöüßÄÖÜ]+)*(?:strasse|str\.|gasse|weg|platz|allee|boulevard|avenue|via|strada|rue|chemin|route))\s+\d{1,4}(?:/\d{1,4})?(?:[a-z])?\b', r'\b(?:[A-Za-zäöüßÄÖÜ]+(?:-[A-Za-zäöüßÄÖÜ]+)*(?:strasse|str\.|gasse|weg|platz|allee|boulevard|avenue|via|strada|rue|chemin|route))\s+\d{1,4}(?:/\d{1,4})?(?:[a-z])?\b',
# Postfach / PO Box (standalone) # Postfach / PO Box (standalone)
r'\b(?:Postfach|Postbox|P\.?O\.?\s*Box|Case\s+postale|Casella\s+postale|Boîte\s+postale)\s+\d{1,6}\b', r'\b(?:Postfach|Postbox|P\.?O\.?\s*Box|Case\s+postale|Casella\s+postale|Boîte\s+postale)\s+\d{1,6}\b',
# Postal code + city (standalone) # Postal code + city (standalone); exclude year+non-city and common non-city words
r'\b\d{4}\s+[A-Za-zäöüßÄÖÜ]+\b(?!\s*:)' # (?<!\d{2}\.\d{2}\.) = not part of date DD.MM.YYYY (e.g. 27.01.2026)
r'(?<!\d{2}\.\d{2}\.)\b\d{4}\s+(?!den|der|die|das|dem|des|und|oder|für|bei|mit|Version|Versand|Vertrag|Verfügung|Verschickung|Versicherung|erhalten|Schreiben|Jahr|Jahres|incomplete|Application|Complete|Pending|Matrikel|Student|Studien|Kontakt|Telefon|Rechnung|Invoice)[A-Za-zäöüßÄÖÜ]+\b(?!\s*:)'
], ],
replacement_template="[ADDRESS_{}]" replacement_template="[ADDRESS_{}]"
), ),
# Date patterns # Date patterns (all languages and formats)
Pattern( Pattern(
name="date", name="date",
patterns=[ patterns=[
# Standalone date values require valid day (131) and month (112) to avoid decimals (e.g. 53.37 CHF) # DD.MM.YYYY / DD/MM/YYYY / DD-MM-YYYY (European)
r'\b(0?[1-9]|[12]\d|3[01])[./-](0?[1-9]|1[0-2])[./-]\d{2,4}\b', # 17.02.2026, 29-03-2026 r'\b(0?[1-9]|[12]\d|3[01])[./-](0?[1-9]|1[0-2])[./-]\d{2,4}\b',
r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\.[\s]*\d{2,4}\b', # 17.02. 2026 (split across lines) r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\.[\s]*\d{2,4}\b',
r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\.(?!\d)\b', # 17.02., 29.03. r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\.(?!\d)\b',
r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\b(?!\.?\d)(?!/\d)', # 17.02, 29.03; exclude ratings (4.7/5) r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\b(?!\.?\d)(?!/\d)',
# Context-specific date formats # YYYY-MM-DD / YYYY/MM/DD / YYYY.MM.DD (ISO)
r'\b(?:geboren|birth|né|nato)\s+am\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b', r'\b\d{4}[./-](0?[1-9]|1[0-2])[./-](0?[1-9]|[12]\d|3[01])\b',
r'\b(?:geboren|birth|né|nato)\s+am\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b', # MM/DD/YYYY / MM-DD-YYYY (US)
r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b', r'\b(0?[1-9]|1[0-2])[./-](0?[1-9]|[12]\d|3[01])[./-]\d{2,4}\b',
r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b', # geboren/birth/né/nato + am/le/on/il/op (DE/EN/FR/IT/NL)
r'\b(?:geboren|birth|né|nato)\s+am\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|januar|februar|märz|april|mai|juni|juli|august|september|oktober|november|dezember)[a-z]*\s+\d{4}\b', r'\b(?:geboren|birth|né|nato|nata)\s+(?:am|le|on|il|op)\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|januar|februar|märz|april|mai|juni|juli|august|september|oktober|november|dezember)[a-z]*\s+\d{4}\b' r'\b(?:geboren|birth|né|nato|nata)\s+(?:am|le|on|il|op)\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
r'\b(?:geboren|birth|né|nato|nata)\s+(?:am|le|on|il)\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|january|february|march|april|may|june|july|august|september|october|november|december|januar|februar|märz|juni|juli|oktober|november|dezember|gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|dicembre|janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|januari|februari|maart|april|mei|juni|juli|augustus|september|oktober|november|december|janeiro|fevereiro|março|abril|maio|junho|julho|agosto|setembro|outubro|novembro|dezembro)[a-z]*\s+\d{4}\b',
# vertrag/contract/contrat + vom/from/du/dal/van
r'\b(?:vertrag|contract|contrat|contratto)\s+(?:vom|from|du|dal|del|van)\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
r'\b(?:vertrag|contract|contrat|contratto)\s+(?:vom|from|du|dal|del|van)\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
r'\b(?:vertrag|contract|contrat|contratto)\s+(?:vom|from|du|dal|del)\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|january|february|march|april|may|june|july|august|september|october|november|december|januar|februar|märz|juni|juli|oktober|november|dezember|gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|dicembre|janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|januari|februari|maart|mei|augustus|janeiro|fevereiro|março|maio|junho|julho|setembro|outubro|novembro|dezembro)[a-z]*\s+\d{4}\b',
# datum/date/data/fecha + numeric (fixed-width lookbehind, keeps label)
r'(?<=datum: )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b', r'(?<=date: )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
r'(?<=data: )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b', r'(?<=fecha: )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
r'(?<=datum )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b', r'(?<=date )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
r'(?<=datum: )[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b', r'(?<=date: )[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
r'(?<=data: )[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b', r'(?<=fecha: )[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
# day + month name + year (17 February 2026, 17. Februar 2026)
r'\b(0?[1-9]|[12]\d|3[01])\s*(?:\.|\.\s*)?(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|january|february|march|april|may|june|july|august|september|october|november|december|januar|februar|märz|juni|juli|oktober|november|dezember|gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|dicembre|janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|januari|februari|maart|mei|augustus|janeiro|fevereiro|março|maio|junho|julho|setembro|outubro|novembro|dezembro)[a-z]*\s+\d{4}\b',
# month name + day + year (February 17, 2026)
r'\b(?:january|february|march|april|may|june|july|august|september|october|november|december|januar|februar|märz|april|mai|juni|juli|oktober|november|dezember|gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|dicembre|janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|januari|februari|maart|mei|augustus|janeiro|fevereiro|março|maio|junho|julho|setembro|outubro|novembro|dezembro)[a-z]*\s+(?:0?[1-9]|[12]\d|3[01])[,\s]+\d{4}\b',
], ],
replacement_template="[DATE_{}]" replacement_template="[DATE_{}]"
), ),
@@ -357,8 +376,8 @@ class DataPatterns:
r'(?<=Numéro de police )[\d.]+', r'(?<=Numéro de police )[\d.]+',
r'(?<=Numero polizza: )[\d.]+', r'(?<=Numero polizza: )[\d.]+',
r'(?<=Numero polizza )[\d.]+', r'(?<=Numero polizza )[\d.]+',
# Standalone policy number format (e.g. 11.559.499) require 2+ digit prefix to avoid amounts # Standalone policy number format - exclude when part of UID (CHE-115.665.634)
r'\b\d{2,4}(?:\.\d{3}){2,}\b' r'(?<!CHE-)(?<!DE-)(?<!FR-)(?<!IT-)\b\d{2,4}(?:\.\d{3}){2,}(?:/[A-Za-z0-9]+)?\b'
], ],
replacement_template="[POLICY_{}]" replacement_template="[POLICY_{}]"
), ),
@@ -368,9 +387,9 @@ class DataPatterns:
name="ssn", name="ssn",
patterns=[ patterns=[
r'\b(?:756|757|758|759)\.\d{4}\.\d{4}\.\d{2}\b(?!,)', # Swiss AHV - exclude before decimal r'\b(?:756|757|758|759)\.\d{4}\.\d{4}\.\d{2}\b(?!,)', # Swiss AHV - exclude before decimal
r'\b(?:CHE|DE|FR|IT)-\d{3}\.\d{3}\.\d{3}\b', # Company IDs r'\b(?:CHE|DE|FR|IT)-\d{3}\.\d{3}\.\d{3}\b', # Company IDs (must be before generic)
# Generic SSN format - exclude when followed by comma+digit (European decimal) # Generic SSN format - exclude when part of company ID or before decimal
r'\b\d{3}\.\d{3}\.\d{3}\b(?!,\d)' r'(?<!CHE-)(?<!DE-)(?<!FR-)(?<!IT-)\b\d{3}\.\d{3}\.\d{3}\b(?!,\d)'
], ],
replacement_template="[SSN_{}]" replacement_template="[SSN_{}]"
) )

View file

@@ -4,6 +4,9 @@
PDF in-place neutralization using PyMuPDF. PDF in-place neutralization using PyMuPDF.
Removes original text completely and inserts full UUID placeholders. Removes original text completely and inserts full UUID placeholders.
PyMuPDF uses insert_textbox which wraps long placeholders to preserve layout. PyMuPDF uses insert_textbox which wraps long placeholders to preserve layout.
NOTE: PyMuPDF search_for() matches substrings (e.g. "CH" matches inside "Zurich",
"CHE-115...", ".ch"). We skip short/ambiguous keys to avoid false redactions.
""" """
import io import io
@@ -12,6 +15,16 @@ from typing import Dict, Optional
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Minimum length for PDF search - shorter keys cause substring false positives
_MIN_SEARCH_LENGTH = 5
# Keys we never search for in PDF (substrings of many common words)
_PDF_SEARCH_BLOCKLIST = frozenset({
"CH", "DE", "FR", "IT", # Country codes - match in Zurich, CHF, Deutschland, etc.
"Nr", "Nr.", "Nr:", "No", "No.", "No:", # Abbreviations - match in Pol-Nr., Policy No., etc.
"www", ".ch", ".com", ".org", ".net", # Domain parts - match in URLs
})
def neutralize_pdf_in_place( def neutralize_pdf_in_place(
pdf_bytes: bytes, pdf_bytes: bytes,
@@ -50,24 +63,53 @@ def neutralize_pdf_in_place(
fontsize = 8 fontsize = 8
try: try:
font = fitz.Font(fontname)
for page_num in range(len(doc)): for page_num in range(len(doc)):
page = doc[page_num] page = doc[page_num]
name_inserts = []
address_inserts = []
phone_inserts = []
policy_inserts = []
date_inserts = []
ssn_inserts = [] # SSN/UID: CHE-115.665.634, long placeholder doesn't fit
for original_text, placeholder in sorted_items: for original_text, placeholder in sorted_items:
if not original_text or not placeholder: if not original_text or not placeholder:
continue continue
# Skip keys that cause substring false positives (PyMuPDF search_for matches substrings)
if len(original_text) < _MIN_SEARCH_LENGTH:
logger.debug("Skipping PDF search for short key %r (would match substrings)", original_text[:20])
continue
if original_text.strip() in _PDF_SEARCH_BLOCKLIST:
logger.debug("Skipping PDF search for blocklisted key %r", original_text)
continue
search_text = original_text search_text = original_text
insert_text = placeholder insert_text = placeholder
if placeholder.startswith("[policy."): is_name = placeholder.startswith("[name.")
# Try label+number to get wider rect; insert UUID only (label+UUID would overflow) is_address = placeholder.startswith("[address.")
is_phone = placeholder.startswith("[phone.")
is_policy = placeholder.startswith("[policy.")
is_date = placeholder.startswith("[date.")
is_ssn = placeholder.startswith("[ssn.")
if is_policy:
for prefix in ("Police Nr. ", "Police Nr.: ", "Polizzenr. ", "Policy no. ", "Policy No. "): for prefix in ("Police Nr. ", "Police Nr.: ", "Polizzenr. ", "Policy no. ", "Policy No. "):
candidate = prefix + original_text candidate = prefix + original_text
try: try:
hits = page.search_for(candidate, quads=False) hits = page.search_for(candidate, quads=False)
if hits: if hits:
search_text = candidate search_text = candidate
insert_text = placeholder # UUID only so it fits in rect break
except Exception:
continue
elif is_ssn and any(original_text.startswith(p) for p in ("CHE-", "DE-", "FR-", "IT-")):
# UID/company ID: try "UID-Nr. CHE-..." or "UID-Nr.: " for wider rect
for prefix in ("UID-Nr. ", "UID-Nr.: ", "UID No. ", "UID: ", "UID-Nummer: "):
candidate = prefix + original_text
try:
hits = page.search_for(candidate, quads=False)
if hits:
search_text = candidate
break break
except Exception: except Exception:
continue continue
@@ -79,15 +121,29 @@ def neutralize_pdf_in_place(
for rect in instances: for rect in instances:
try: try:
fs = 5 if placeholder.startswith(("[policy.", "[address.")) else fontsize if is_name or is_address or is_phone or is_policy or is_date or is_ssn:
page.add_redact_annot( page.add_redact_annot(rect, fill=fill_color)
rect, if is_name:
text=insert_text, name_inserts.append((rect, insert_text))
fill=fill_color, elif is_address:
text_color=text_color, address_inserts.append((rect, insert_text))
fontname=fontname, elif is_phone:
fontsize=fs, phone_inserts.append((rect, insert_text))
) elif is_policy:
policy_inserts.append((rect, insert_text))
elif is_date:
date_inserts.append((rect, insert_text))
else:
ssn_inserts.append((rect, insert_text))
else:
page.add_redact_annot(
rect,
text=insert_text,
fill=fill_color,
text_color=text_color,
fontname=fontname,
fontsize=fontsize,
)
except Exception as e: except Exception as e:
logger.warning(f"Redact failed for {original_text[:40]!r}: {e}") logger.warning(f"Redact failed for {original_text[:40]!r}: {e}")
@@ -96,6 +152,23 @@ def neutralize_pdf_in_place(
except Exception as e: except Exception as e:
logger.debug(f"apply_redactions page {page_num + 1}: {e}") logger.debug(f"apply_redactions page {page_num + 1}: {e}")
# Insert placeholders with font size fitted to rect (avoids PyMuPDF shrinking to nothing)
for rect, text in name_inserts + address_inserts + phone_inserts + policy_inserts + date_inserts + ssn_inserts:
try:
tl = font.text_length(text, fontsize=1)
fs = max(3, min(fontsize, rect.width / tl)) if tl > 0 else 4
rc = page.insert_textbox(
rect, text, fontname=fontname, fontsize=fs,
align=0, color=text_color
)
if rc < 0:
page.insert_textbox(
rect, text, fontname=fontname, fontsize=2,
align=0, color=text_color
)
except Exception as e:
logger.warning(f"Insert placeholder failed: {e}")
buf = io.BytesIO() buf = io.BytesIO()
doc.save(buf, garbage=4, deflate=True) doc.save(buf, garbage=4, deflate=True)
doc.close() doc.close()