fix: further improved neutraliser
This commit is contained in:
parent
205b99dfa0
commit
d3cfe8e9be
3 changed files with 190 additions and 63 deletions
|
|
@ -19,6 +19,12 @@ _NEUTRALIZATION_BLACKLIST = frozenset({
|
|||
"Versicherte", "Versicherungsnehmer", "Versicherung", "Insurance",
|
||||
"Leistungen", "Basis", "Benefits", # Section labels
|
||||
"Start", "Beginn", "Ende", "End", "trip", # Contract labels (Start of trip, End of trip, etc.)
|
||||
"incomplete", "Application", "Complete", "Pending", # Form/status labels, not addresses
|
||||
# Ambiguous substrings – match in Zurich, CHF, UID-Nr., websites, etc.
|
||||
"CH", "DE", "FR", "IT", "Nr", "Nr.", "Nr:", "No", "No.", "No:",
|
||||
"www", ".ch", ".com", ".org", ".net", "CHF",
|
||||
# Labels that must never be neutralized
|
||||
"Kontakt", "Kanzlei", "Telefon", "Matrikel-Nr", "Matrikel-Nr.", "Student ID", "Student-ID",
|
||||
})
|
||||
|
||||
|
||||
|
|
@ -69,6 +75,25 @@ class StringParser:
|
|||
return True
|
||||
return False
|
||||
patternMatches = [m for m in patternMatches if not is_contained(m, patternMatches)]
|
||||
|
||||
# Deduplicate: keep one match per (start,end) – same span can match multiple patterns
|
||||
seen = set()
|
||||
unique_matches = []
|
||||
for m in patternMatches:
|
||||
key = (m[2], m[3])
|
||||
if key not in seen:
|
||||
seen.add(key)
|
||||
unique_matches.append(m)
|
||||
patternMatches = unique_matches
|
||||
|
||||
# Exclude address matches that overlap with date matches (e.g. "2026 den" overlaps "17.02.2026")
|
||||
def overlaps(a_start, a_end, b_start, b_end):
|
||||
return a_start < b_end and b_start < a_end
|
||||
date_ranges = [(m[2], m[3]) for m in patternMatches if m[0] == "date"]
|
||||
patternMatches = [
|
||||
m for m in patternMatches
|
||||
if not (m[0] == "address" and any(overlaps(m[2], m[3], ds, de) for ds, de in date_ranges))
|
||||
]
|
||||
|
||||
# Process from right to left to avoid position shifts
|
||||
for patternName, matchedText, start, end in reversed(patternMatches):
|
||||
|
|
@ -86,6 +111,11 @@ class StringParser:
|
|||
# Skip if match contains any blacklisted word (e.g. "2026 Reise" or "2026 Reisebeginn" from address pattern)
|
||||
if any(w in _NEUTRALIZATION_BLACKLIST for w in matchedText.split()):
|
||||
continue
|
||||
# Skip phone matches that are clearly part of a price (e.g. 128 in 128.56 CHF)
|
||||
if patternName == "phone" and end + 3 <= len(text):
|
||||
after = text[end : end + 3]
|
||||
if (after[0] in ".," and len(after) >= 3 and after[1:3].isdigit()):
|
||||
continue
|
||||
|
||||
if matchedText not in self.mapping:
|
||||
# Generate a UUID for the placeholder
|
||||
|
|
@ -112,36 +142,41 @@ class StringParser:
|
|||
|
||||
def _replaceCustomNames(self, text: str) -> str:
    """
    Replace custom names from the user list in text.

    Every non-blank entry of ``self.NamesToParse`` is matched
    case-insensitively as a whole word. Each ordered pair of two different
    entries is additionally matched as a composite name (e.g. "Ida Dittrich",
    "Dittrich Ida") with any whitespace, including a line break, between the
    parts, so a full name gets one placeholder instead of one per word.

    All candidates are replaced in a single left-to-right pass. Placeholders
    that are already in ``text`` or are inserted during the pass are never
    scanned again, so an entry such as "name" or a hex-like name such as
    "Edda" cannot match inside a ``[name.<uuid>]`` placeholder and corrupt it.

    Args:
        text: Text to process

    Returns:
        str: Text with custom names replaced by ``[name.<uuid>]`` placeholders.
        ``self.mapping`` gains one entry per distinct matched spelling;
        spellings that are already mapped reuse their existing placeholder.
    """
    names = [n.strip() for n in self.NamesToParse if n.strip()]
    if not names:
        return text

    # Single names plus every ordered pair of two different names; the double
    # loop visits both (a, b) and (b, a), so both word orders are covered.
    candidates = set(names)
    candidates.update(
        f"{first} {second}"
        for first in names
        for second in names
        if first != second
    )

    # Longest first so "Ida Dittrich" wins over "Ida" at the same position.
    # The alphabetical tie-break keeps the pattern identical between runs
    # (plain set iteration order is not stable for strings).
    ordered = sorted(candidates, key=lambda candidate: (-len(candidate), candidate))
    alternatives = "|".join(
        r"\s+".join(re.escape(part) for part in candidate.split())
        for candidate in ordered
    )

    # The first alternative consumes complete "[type.uuid]" placeholders so the
    # name alternatives can never match inside one. The lookarounds behave like
    # \b for names that start and end with a word character, and also work for
    # entries that begin or end with punctuation.
    pattern = re.compile(
        r"(?P<placeholder>\[[a-z]+\.[0-9a-f]{8}(?:-[0-9a-f]{4}){3}-[0-9a-f]{12}\])"
        r"|(?<!\w)(?:" + alternatives + r")(?!\w)",
        re.IGNORECASE,
    )

    def substitute(match):
        matchedText = match.group()
        if match.group("placeholder") is not None:
            # Existing placeholder: leave it exactly as it is.
            return matchedText
        if matchedText not in self.mapping:
            # Generate a UUID for the placeholder
            self.mapping[matchedText] = f"[name.{uuid.uuid4()}]"
        return self.mapping[matchedText]

    return pattern.sub(substitute, text)
|
||||
|
||||
def processString(self, text: str) -> str:
|
||||
|
|
|
|||
|
|
@ -243,16 +243,17 @@ class DataPatterns:
|
|||
r'(?<=Leiter: )[A-Za-zäöüßÄÖÜ][a-zäöüß]+\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]+',
|
||||
r'(?<=Kontaktperson: )[A-Za-zäöüßÄÖÜ][a-zäöüß]+\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]+',
|
||||
# Name only after Anrede (keep Frau/Herr; replace only the name) – fixed-width lookbehind
|
||||
r'(?<=Frau )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
|
||||
r'(?<=Herr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
|
||||
r'(?<=Mr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
|
||||
r'(?<=Mr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
|
||||
r'(?<=Mrs )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
|
||||
r'(?<=Mrs\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
|
||||
r'(?<=Ms )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
|
||||
r'(?<=Ms\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
|
||||
r'(?<=Dr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
|
||||
r'(?<=Dr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
|
||||
# Use [ \t]+ not \s+ so we don't match across line breaks (avoids grabbing "Es" from "Es freut uns sehr")
|
||||
r'(?<=Frau )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
|
||||
r'(?<=Herr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
|
||||
r'(?<=Mr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
|
||||
r'(?<=Mr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
|
||||
r'(?<=Mrs )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
|
||||
r'(?<=Mrs\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
|
||||
r'(?<=Ms )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
|
||||
r'(?<=Ms\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
|
||||
r'(?<=Dr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
|
||||
r'(?<=Dr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
|
||||
],
|
||||
replacement_template="[NAME_{}]"
|
||||
),
|
||||
|
|
@ -271,6 +272,8 @@ class DataPatterns:
|
|||
Pattern(
|
||||
name="phone",
|
||||
patterns=[
|
||||
# Swiss full format: +41 44 315 19 19 (area + 3 + 2 + 2 digits, flexible separators)
|
||||
r'\+\s*41[-.\s]?\d{2}[-.\s]?\d{3}[-.\s]?\d{2}[-.\s]?\d{2}\b',
|
||||
# International format
|
||||
r'\+\d{1,3}[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}\b',
|
||||
# Swiss format
|
||||
|
|
@ -309,28 +312,44 @@ class DataPatterns:
|
|||
r'\b(?:[A-Za-zäöüßÄÖÜ]+(?:-[A-Za-zäöüßÄÖÜ]+)*(?:strasse|str\.|gasse|weg|platz|allee|boulevard|avenue|via|strada|rue|chemin|route))\s+\d{1,4}(?:/\d{1,4})?(?:[a-z])?\b',
|
||||
# Postfach / PO Box (standalone)
|
||||
r'\b(?:Postfach|Postbox|P\.?O\.?\s*Box|Case\s+postale|Casella\s+postale|Boîte\s+postale)\s+\d{1,6}\b',
|
||||
# Postal code + city (standalone)
|
||||
r'\b\d{4}\s+[A-Za-zäöüßÄÖÜ]+\b(?!\s*:)'
|
||||
# Postal code + city (standalone); exclude year+non-city and common non-city words
|
||||
# (?<!\d{2}\.\d{2}\.) = not part of date DD.MM.YYYY (e.g. 27.01.2026)
|
||||
r'(?<!\d{2}\.\d{2}\.)\b\d{4}\s+(?!den|der|die|das|dem|des|und|oder|für|bei|mit|Version|Versand|Vertrag|Verfügung|Verschickung|Versicherung|erhalten|Schreiben|Jahr|Jahres|incomplete|Application|Complete|Pending|Matrikel|Student|Studien|Kontakt|Telefon|Rechnung|Invoice)[A-Za-zäöüßÄÖÜ]+\b(?!\s*:)'
|
||||
],
|
||||
replacement_template="[ADDRESS_{}]"
|
||||
),
|
||||
|
||||
# Date patterns
|
||||
# Date patterns (all languages and formats)
|
||||
Pattern(
|
||||
name="date",
|
||||
patterns=[
|
||||
# Standalone date values – require valid day (1–31) and month (1–12) to avoid decimals (e.g. 53.37 CHF)
|
||||
r'\b(0?[1-9]|[12]\d|3[01])[./-](0?[1-9]|1[0-2])[./-]\d{2,4}\b', # 17.02.2026, 29-03-2026
|
||||
r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\.[\s]*\d{2,4}\b', # 17.02. 2026 (split across lines)
|
||||
r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\.(?!\d)\b', # 17.02., 29.03.
|
||||
r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\b(?!\.?\d)(?!/\d)', # 17.02, 29.03; exclude ratings (4.7/5)
|
||||
# Context-specific date formats
|
||||
r'\b(?:geboren|birth|né|nato)\s+am\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
|
||||
r'\b(?:geboren|birth|né|nato)\s+am\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
|
||||
r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
|
||||
r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
|
||||
r'\b(?:geboren|birth|né|nato)\s+am\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|januar|februar|märz|april|mai|juni|juli|august|september|oktober|november|dezember)[a-z]*\s+\d{4}\b',
|
||||
r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|januar|februar|märz|april|mai|juni|juli|august|september|oktober|november|dezember)[a-z]*\s+\d{4}\b'
|
||||
# DD.MM.YYYY / DD/MM/YYYY / DD-MM-YYYY (European)
|
||||
r'\b(0?[1-9]|[12]\d|3[01])[./-](0?[1-9]|1[0-2])[./-]\d{2,4}\b',
|
||||
r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\.[\s]*\d{2,4}\b',
|
||||
r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\.(?!\d)\b',
|
||||
r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\b(?!\.?\d)(?!/\d)',
|
||||
# YYYY-MM-DD / YYYY/MM/DD / YYYY.MM.DD (ISO)
|
||||
r'\b\d{4}[./-](0?[1-9]|1[0-2])[./-](0?[1-9]|[12]\d|3[01])\b',
|
||||
# MM/DD/YYYY / MM-DD-YYYY (US)
|
||||
r'\b(0?[1-9]|1[0-2])[./-](0?[1-9]|[12]\d|3[01])[./-]\d{2,4}\b',
|
||||
# geboren/birth/né/nato + am/le/on/il/op (DE/EN/FR/IT/NL)
|
||||
r'\b(?:geboren|birth|né|nato|nata)\s+(?:am|le|on|il|op)\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
|
||||
r'\b(?:geboren|birth|né|nato|nata)\s+(?:am|le|on|il|op)\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
|
||||
r'\b(?:geboren|birth|né|nato|nata)\s+(?:am|le|on|il)\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|january|february|march|april|may|june|july|august|september|october|november|december|januar|februar|märz|juni|juli|oktober|november|dezember|gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|dicembre|janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|januari|februari|maart|april|mei|juni|juli|augustus|september|oktober|november|december|janeiro|fevereiro|março|abril|maio|junho|julho|agosto|setembro|outubro|novembro|dezembro)[a-z]*\s+\d{4}\b',
|
||||
# vertrag/contract/contrat + vom/from/du/dal/van
|
||||
r'\b(?:vertrag|contract|contrat|contratto)\s+(?:vom|from|du|dal|del|van)\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
|
||||
r'\b(?:vertrag|contract|contrat|contratto)\s+(?:vom|from|du|dal|del|van)\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
|
||||
r'\b(?:vertrag|contract|contrat|contratto)\s+(?:vom|from|du|dal|del)\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|january|february|march|april|may|june|july|august|september|october|november|december|januar|februar|märz|juni|juli|oktober|november|dezember|gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|dicembre|janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|januari|februari|maart|mei|augustus|janeiro|fevereiro|março|maio|junho|julho|setembro|outubro|novembro|dezembro)[a-z]*\s+\d{4}\b',
|
||||
# datum/date/data/fecha + numeric (fixed-width lookbehind, keeps label)
|
||||
r'(?<=datum: )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b', r'(?<=date: )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
|
||||
r'(?<=data: )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b', r'(?<=fecha: )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
|
||||
r'(?<=datum )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b', r'(?<=date )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
|
||||
r'(?<=datum: )[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b', r'(?<=date: )[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
|
||||
r'(?<=data: )[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b', r'(?<=fecha: )[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
|
||||
# day + month name + year (17 February 2026, 17. Februar 2026)
|
||||
r'\b(0?[1-9]|[12]\d|3[01])\s*(?:\.|\.\s*)?(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|january|february|march|april|may|june|july|august|september|october|november|december|januar|februar|märz|juni|juli|oktober|november|dezember|gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|dicembre|janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|januari|februari|maart|mei|augustus|janeiro|fevereiro|março|maio|junho|julho|setembro|outubro|novembro|dezembro)[a-z]*\s+\d{4}\b',
|
||||
# month name + day + year (February 17, 2026)
|
||||
r'\b(?:january|february|march|april|may|june|july|august|september|october|november|december|januar|februar|märz|april|mai|juni|juli|oktober|november|dezember|gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|dicembre|janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|januari|februari|maart|mei|augustus|janeiro|fevereiro|março|maio|junho|julho|setembro|outubro|novembro|dezembro)[a-z]*\s+(?:0?[1-9]|[12]\d|3[01])[,\s]+\d{4}\b',
|
||||
],
|
||||
replacement_template="[DATE_{}]"
|
||||
),
|
||||
|
|
@ -357,8 +376,8 @@ class DataPatterns:
|
|||
r'(?<=Numéro de police )[\d.]+',
|
||||
r'(?<=Numero polizza: )[\d.]+',
|
||||
r'(?<=Numero polizza )[\d.]+',
|
||||
# Standalone policy number format (e.g. 11.559.499) – require 2+ digit prefix to avoid amounts
|
||||
r'\b\d{2,4}(?:\.\d{3}){2,}\b'
|
||||
# Standalone policy number format - exclude when part of UID (CHE-115.665.634)
|
||||
r'(?<!CHE-)(?<!DE-)(?<!FR-)(?<!IT-)\b\d{2,4}(?:\.\d{3}){2,}(?:/[A-Za-z0-9]+)?\b'
|
||||
],
|
||||
replacement_template="[POLICY_{}]"
|
||||
),
|
||||
|
|
@ -368,9 +387,9 @@ class DataPatterns:
|
|||
name="ssn",
|
||||
patterns=[
|
||||
r'\b(?:756|757|758|759)\.\d{4}\.\d{4}\.\d{2}\b(?!,)', # Swiss AHV - exclude before decimal
|
||||
r'\b(?:CHE|DE|FR|IT)-\d{3}\.\d{3}\.\d{3}\b', # Company IDs
|
||||
# Generic SSN format - exclude when followed by comma+digit (European decimal)
|
||||
r'\b\d{3}\.\d{3}\.\d{3}\b(?!,\d)'
|
||||
r'\b(?:CHE|DE|FR|IT)-\d{3}\.\d{3}\.\d{3}\b', # Company IDs (must be before generic)
|
||||
# Generic SSN format - exclude when part of company ID or before decimal
|
||||
r'(?<!CHE-)(?<!DE-)(?<!FR-)(?<!IT-)\b\d{3}\.\d{3}\.\d{3}\b(?!,\d)'
|
||||
],
|
||||
replacement_template="[SSN_{}]"
|
||||
)
|
||||
|
|
|
|||
|
|
@ -4,6 +4,9 @@
|
|||
PDF in-place neutralization using PyMuPDF.
|
||||
Removes original text completely and inserts full UUID placeholders.
|
||||
PyMuPDF uses insert_textbox which wraps long placeholders to preserve layout.
|
||||
|
||||
NOTE: PyMuPDF search_for() matches substrings (e.g. "CH" matches inside "Zurich",
|
||||
"CHE-115...", ".ch"). We skip short/ambiguous keys to avoid false redactions.
|
||||
"""
|
||||
|
||||
import io
|
||||
|
|
@ -12,6 +15,16 @@ from typing import Dict, Optional
|
|||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Minimum length for PDF search - shorter keys cause substring false positives
|
||||
_MIN_SEARCH_LENGTH = 5
|
||||
|
||||
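# Keys we never search for in the PDF because they occur as substrings of many common words.
# NOTE(review): every entry is shorter than _MIN_SEARCH_LENGTH, so a length filter that runs
# first already rejects these keys unless they carry surrounding whitespace - confirm the set is still needed.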
|
||||
_PDF_SEARCH_BLOCKLIST = frozenset({
|
||||
"CH", "DE", "FR", "IT", # Country codes - match in Zurich, CHF, Deutschland, etc.
|
||||
"Nr", "Nr.", "Nr:", "No", "No.", "No:", # Abbreviations - match in Pol-Nr., Policy No., etc.
|
||||
"www", ".ch", ".com", ".org", ".net", # Domain parts - match in URLs
|
||||
})
|
||||
|
||||
|
||||
def neutralize_pdf_in_place(
|
||||
pdf_bytes: bytes,
|
||||
|
|
@ -50,24 +63,53 @@ def neutralize_pdf_in_place(
|
|||
fontsize = 8
|
||||
|
||||
try:
|
||||
font = fitz.Font(fontname)
|
||||
for page_num in range(len(doc)):
|
||||
page = doc[page_num]
|
||||
name_inserts = []
|
||||
address_inserts = []
|
||||
phone_inserts = []
|
||||
policy_inserts = []
|
||||
date_inserts = []
|
||||
ssn_inserts = [] # SSN/UID: CHE-115.665.634, long placeholder doesn't fit
|
||||
|
||||
for original_text, placeholder in sorted_items:
|
||||
if not original_text or not placeholder:
|
||||
continue
|
||||
# Skip keys that cause substring false positives (PyMuPDF search_for matches substrings)
|
||||
if len(original_text) < _MIN_SEARCH_LENGTH:
|
||||
logger.debug("Skipping PDF search for short key %r (would match substrings)", original_text[:20])
|
||||
continue
|
||||
if original_text.strip() in _PDF_SEARCH_BLOCKLIST:
|
||||
logger.debug("Skipping PDF search for blocklisted key %r", original_text)
|
||||
continue
|
||||
|
||||
search_text = original_text
|
||||
insert_text = placeholder
|
||||
if placeholder.startswith("[policy."):
|
||||
# Try label+number to get wider rect; insert UUID only (label+UUID would overflow)
|
||||
is_name = placeholder.startswith("[name.")
|
||||
is_address = placeholder.startswith("[address.")
|
||||
is_phone = placeholder.startswith("[phone.")
|
||||
is_policy = placeholder.startswith("[policy.")
|
||||
is_date = placeholder.startswith("[date.")
|
||||
is_ssn = placeholder.startswith("[ssn.")
|
||||
if is_policy:
|
||||
for prefix in ("Police Nr. ", "Police Nr.: ", "Polizzenr. ", "Policy no. ", "Policy No. "):
|
||||
candidate = prefix + original_text
|
||||
try:
|
||||
hits = page.search_for(candidate, quads=False)
|
||||
if hits:
|
||||
search_text = candidate
|
||||
insert_text = placeholder # UUID only so it fits in rect
|
||||
break
|
||||
except Exception:
|
||||
continue
|
||||
elif is_ssn and any(original_text.startswith(p) for p in ("CHE-", "DE-", "FR-", "IT-")):
|
||||
# UID/company ID: try "UID-Nr. CHE-..." or "UID-Nr.: " for wider rect
|
||||
for prefix in ("UID-Nr. ", "UID-Nr.: ", "UID No. ", "UID: ", "UID-Nummer: "):
|
||||
candidate = prefix + original_text
|
||||
try:
|
||||
hits = page.search_for(candidate, quads=False)
|
||||
if hits:
|
||||
search_text = candidate
|
||||
break
|
||||
except Exception:
|
||||
continue
|
||||
|
|
@ -79,15 +121,29 @@ def neutralize_pdf_in_place(
|
|||
|
||||
for rect in instances:
|
||||
try:
|
||||
fs = 5 if placeholder.startswith(("[policy.", "[address.")) else fontsize
|
||||
page.add_redact_annot(
|
||||
rect,
|
||||
text=insert_text,
|
||||
fill=fill_color,
|
||||
text_color=text_color,
|
||||
fontname=fontname,
|
||||
fontsize=fs,
|
||||
)
|
||||
if is_name or is_address or is_phone or is_policy or is_date or is_ssn:
|
||||
page.add_redact_annot(rect, fill=fill_color)
|
||||
if is_name:
|
||||
name_inserts.append((rect, insert_text))
|
||||
elif is_address:
|
||||
address_inserts.append((rect, insert_text))
|
||||
elif is_phone:
|
||||
phone_inserts.append((rect, insert_text))
|
||||
elif is_policy:
|
||||
policy_inserts.append((rect, insert_text))
|
||||
elif is_date:
|
||||
date_inserts.append((rect, insert_text))
|
||||
else:
|
||||
ssn_inserts.append((rect, insert_text))
|
||||
else:
|
||||
page.add_redact_annot(
|
||||
rect,
|
||||
text=insert_text,
|
||||
fill=fill_color,
|
||||
text_color=text_color,
|
||||
fontname=fontname,
|
||||
fontsize=fontsize,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"Redact failed for {original_text[:40]!r}: {e}")
|
||||
|
||||
|
|
@ -96,6 +152,23 @@ def neutralize_pdf_in_place(
|
|||
except Exception as e:
|
||||
logger.debug(f"apply_redactions page {page_num + 1}: {e}")
|
||||
|
||||
# Insert placeholders with font size fitted to rect (avoids PyMuPDF shrinking to nothing)
|
||||
for rect, text in name_inserts + address_inserts + phone_inserts + policy_inserts + date_inserts + ssn_inserts:
|
||||
try:
|
||||
tl = font.text_length(text, fontsize=1)
|
||||
fs = max(3, min(fontsize, rect.width / tl)) if tl > 0 else 4
|
||||
rc = page.insert_textbox(
|
||||
rect, text, fontname=fontname, fontsize=fs,
|
||||
align=0, color=text_color
|
||||
)
|
||||
if rc < 0:
|
||||
page.insert_textbox(
|
||||
rect, text, fontname=fontname, fontsize=2,
|
||||
align=0, color=text_color
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"Insert placeholder failed: {e}")
|
||||
|
||||
buf = io.BytesIO()
|
||||
doc.save(buf, garbage=4, deflate=True)
|
||||
doc.close()
|
||||
|
|
|
|||
Loading…
Reference in a new issue