fix: further improved neutraliser

This commit is contained in:
Ida Dittrich 2026-02-25 07:59:33 +01:00
parent 205b99dfa0
commit d3cfe8e9be
3 changed files with 190 additions and 63 deletions

View file

@ -19,6 +19,12 @@ _NEUTRALIZATION_BLACKLIST = frozenset({
"Versicherte", "Versicherungsnehmer", "Versicherung", "Insurance",
"Leistungen", "Basis", "Benefits", # Section labels
"Start", "Beginn", "Ende", "End", "trip", # Contract labels (Start of trip, End of trip, etc.)
"incomplete", "Application", "Complete", "Pending", # Form/status labels, not addresses
# Ambiguous substrings match in Zurich, CHF, UID-Nr., websites, etc.
"CH", "DE", "FR", "IT", "Nr", "Nr.", "Nr:", "No", "No.", "No:",
"www", ".ch", ".com", ".org", ".net", "CHF",
# Labels that must never be neutralized
"Kontakt", "Kanzlei", "Telefon", "Matrikel-Nr", "Matrikel-Nr.", "Student ID", "Student-ID",
})
@ -69,6 +75,25 @@ class StringParser:
return True
return False
patternMatches = [m for m in patternMatches if not is_contained(m, patternMatches)]
# Deduplicate: keep one match per (start, end) - the same span can match multiple patterns
seen = set()
unique_matches = []
for m in patternMatches:
key = (m[2], m[3])
if key not in seen:
seen.add(key)
unique_matches.append(m)
patternMatches = unique_matches
# Exclude address matches that overlap with date matches (e.g. "2026 den" overlaps "17.02.2026")
def overlaps(a_start, a_end, b_start, b_end):
return a_start < b_end and b_start < a_end
date_ranges = [(m[2], m[3]) for m in patternMatches if m[0] == "date"]
patternMatches = [
m for m in patternMatches
if not (m[0] == "address" and any(overlaps(m[2], m[3], ds, de) for ds, de in date_ranges))
]
# Process from right to left to avoid position shifts
for patternName, matchedText, start, end in reversed(patternMatches):
@ -86,6 +111,11 @@ class StringParser:
# Skip if match contains any blacklisted word (e.g. "2026 Reise" or "2026 Reisebeginn" from address pattern)
if any(w in _NEUTRALIZATION_BLACKLIST for w in matchedText.split()):
continue
# Skip phone matches that are clearly part of a price (e.g. 128 in 128.56 CHF)
if patternName == "phone" and end + 3 <= len(text):
after = text[end : end + 3]
if (after[0] in ".," and len(after) >= 3 and after[1:3].isdigit()):
continue
if matchedText not in self.mapping:
# Generate a UUID for the placeholder
@ -112,36 +142,41 @@ class StringParser:
def _replaceCustomNames(self, text: str) -> str:
    """
    Replace custom names from the user list in text.

    Every entry of ``self.NamesToParse`` is matched case-insensitively on word
    boundaries. Pairs of listed names separated by whitespace (space, tab,
    newline) are matched first, in both orders, so a full name such as
    "Ida Dittrich" gets one placeholder instead of one per word.

    Text that already sits inside a ``[kind.<uuid>]`` placeholder is never
    re-matched, so a listed name that happens to equal a placeholder prefix
    or a UUID hex group cannot corrupt an earlier replacement.

    Args:
        text: Text to process.

    Returns:
        str: Text with each match replaced by its ``[name.<uuid>]`` placeholder.
        ``self.mapping`` gains one entry per distinct matched spelling.
    """
    def _fragment(name: str) -> str:
        # Regex for one name; whitespace inside the name matches any whitespace run.
        return r"\s+".join(re.escape(part) for part in name.split())

    # Strip, drop blanks and de-duplicate while keeping the list order.
    names = list(dict.fromkeys(n.strip() for n in self.NamesToParse if n.strip()))
    # A composite can only match when both of its parts occur in the text, so
    # pairs are built from the names present here, not from the whole list.
    names = [n for n in names if re.search(_fragment(n), text, re.IGNORECASE)]
    if not names:
        return text

    # Single names plus every ordered pair of two different names.
    candidates = set(names)
    candidates.update(
        f"{first} {second}" for first in names for second in names if first != second
    )

    placeholder_re = re.compile(
        r"\[[a-z]+\.[0-9a-f]{8}(?:-[0-9a-f]{4}){3}-[0-9a-f]{12}\]"
    )

    # Longest first so "Ida Dittrich" is replaced before "Ida" or "Dittrich";
    # ties are broken alphabetically so the result does not depend on set order.
    for name in sorted(candidates, key=lambda c: (-len(c), c)):
        pattern = re.compile(r"\b" + _fragment(name) + r"\b", re.IGNORECASE)
        protected = [m.span() for m in placeholder_re.finditer(text)]
        # Right to left: a replacement never shifts the spans still to be handled.
        for match in reversed(list(pattern.finditer(text))):
            start, end = match.span()
            if any(start < p_end and p_start < end for p_start, p_end in protected):
                continue
            matchedText = match.group()
            if matchedText not in self.mapping:
                self.mapping[matchedText] = f"[name.{uuid.uuid4()}]"
            text = text[:start] + self.mapping[matchedText] + text[end:]
    return text
def processString(self, text: str) -> str:

View file

@ -243,16 +243,17 @@ class DataPatterns:
r'(?<=Leiter: )[A-Za-zäöüßÄÖÜ][a-zäöüß]+\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]+',
r'(?<=Kontaktperson: )[A-Za-zäöüßÄÖÜ][a-zäöüß]+\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]+',
# Name only after Anrede (keep Frau/Herr; replace only the name) - fixed-width lookbehind
r'(?<=Frau )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
r'(?<=Herr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
r'(?<=Mr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
r'(?<=Mr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
r'(?<=Mrs )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
r'(?<=Mrs\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
r'(?<=Ms )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
r'(?<=Ms\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
r'(?<=Dr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
r'(?<=Dr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
# Use [ \t]+ not \s+ so we don't match across line breaks (avoids grabbing "Es" from "Es freut uns sehr")
r'(?<=Frau )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
r'(?<=Herr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
r'(?<=Mr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
r'(?<=Mr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
r'(?<=Mrs )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
r'(?<=Mrs\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
r'(?<=Ms )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
r'(?<=Ms\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
r'(?<=Dr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
r'(?<=Dr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
],
replacement_template="[NAME_{}]"
),
@ -271,6 +272,8 @@ class DataPatterns:
Pattern(
name="phone",
patterns=[
# Swiss full format: +41 44 315 19 19 (area + 3 + 2 + 2 digits, flexible separators)
r'\+\s*41[-.\s]?\d{2}[-.\s]?\d{3}[-.\s]?\d{2}[-.\s]?\d{2}\b',
# International format
r'\+\d{1,3}[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}\b',
# Swiss format
@ -309,28 +312,44 @@ class DataPatterns:
r'\b(?:[A-Za-zäöüßÄÖÜ]+(?:-[A-Za-zäöüßÄÖÜ]+)*(?:strasse|str\.|gasse|weg|platz|allee|boulevard|avenue|via|strada|rue|chemin|route))\s+\d{1,4}(?:/\d{1,4})?(?:[a-z])?\b',
# Postfach / PO Box (standalone)
r'\b(?:Postfach|Postbox|P\.?O\.?\s*Box|Case\s+postale|Casella\s+postale|Boîte\s+postale)\s+\d{1,6}\b',
# Postal code + city (standalone)
r'\b\d{4}\s+[A-Za-zäöüßÄÖÜ]+\b(?!\s*:)'
# Postal code + city (standalone); exclude year+non-city and common non-city words
# (?<!\d{2}\.\d{2}\.) = not part of date DD.MM.YYYY (e.g. 27.01.2026)
r'(?<!\d{2}\.\d{2}\.)\b\d{4}\s+(?!den|der|die|das|dem|des|und|oder|für|bei|mit|Version|Versand|Vertrag|Verfügung|Verschickung|Versicherung|erhalten|Schreiben|Jahr|Jahres|incomplete|Application|Complete|Pending|Matrikel|Student|Studien|Kontakt|Telefon|Rechnung|Invoice)[A-Za-zäöüßÄÖÜ]+\b(?!\s*:)'
],
replacement_template="[ADDRESS_{}]"
),
# Date patterns
# Date patterns (all languages and formats)
Pattern(
name="date",
patterns=[
# Standalone date values - require valid day (1-31) and month (1-12) to avoid decimals (e.g. 53.37 CHF)
r'\b(0?[1-9]|[12]\d|3[01])[./-](0?[1-9]|1[0-2])[./-]\d{2,4}\b', # 17.02.2026, 29-03-2026
r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\.[\s]*\d{2,4}\b', # 17.02. 2026 (split across lines)
r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\.(?!\d)\b', # 17.02., 29.03.
r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\b(?!\.?\d)(?!/\d)', # 17.02, 29.03; exclude ratings (4.7/5)
# Context-specific date formats
r'\b(?:geboren|birth|né|nato)\s+am\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
r'\b(?:geboren|birth|né|nato)\s+am\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
r'\b(?:geboren|birth|né|nato)\s+am\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|januar|februar|märz|april|mai|juni|juli|august|september|oktober|november|dezember)[a-z]*\s+\d{4}\b',
r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|januar|februar|märz|april|mai|juni|juli|august|september|oktober|november|dezember)[a-z]*\s+\d{4}\b'
# DD.MM.YYYY / DD/MM/YYYY / DD-MM-YYYY (European)
r'\b(0?[1-9]|[12]\d|3[01])[./-](0?[1-9]|1[0-2])[./-]\d{2,4}\b',
r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\.[\s]*\d{2,4}\b',
r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\.(?!\d)\b',
r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\b(?!\.?\d)(?!/\d)',
# YYYY-MM-DD / YYYY/MM/DD / YYYY.MM.DD (ISO)
r'\b\d{4}[./-](0?[1-9]|1[0-2])[./-](0?[1-9]|[12]\d|3[01])\b',
# MM/DD/YYYY / MM-DD-YYYY (US)
r'\b(0?[1-9]|1[0-2])[./-](0?[1-9]|[12]\d|3[01])[./-]\d{2,4}\b',
# geboren/birth/né/nato + am/le/on/il/op (DE/EN/FR/IT/NL)
r'\b(?:geboren|birth|né|nato|nata)\s+(?:am|le|on|il|op)\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
r'\b(?:geboren|birth|né|nato|nata)\s+(?:am|le|on|il|op)\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
r'\b(?:geboren|birth|né|nato|nata)\s+(?:am|le|on|il)\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|january|february|march|april|may|june|july|august|september|october|november|december|januar|februar|märz|juni|juli|oktober|november|dezember|gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|dicembre|janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|januari|februari|maart|april|mei|juni|juli|augustus|september|oktober|november|december|janeiro|fevereiro|março|abril|maio|junho|julho|agosto|setembro|outubro|novembro|dezembro)[a-z]*\s+\d{4}\b',
# vertrag/contract/contrat + vom/from/du/dal/van
r'\b(?:vertrag|contract|contrat|contratto)\s+(?:vom|from|du|dal|del|van)\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
r'\b(?:vertrag|contract|contrat|contratto)\s+(?:vom|from|du|dal|del|van)\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
r'\b(?:vertrag|contract|contrat|contratto)\s+(?:vom|from|du|dal|del)\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|january|february|march|april|may|june|july|august|september|october|november|december|januar|februar|märz|juni|juli|oktober|november|dezember|gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|dicembre|janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|januari|februari|maart|mei|augustus|janeiro|fevereiro|março|maio|junho|julho|setembro|outubro|novembro|dezembro)[a-z]*\s+\d{4}\b',
# datum/date/data/fecha + numeric (fixed-width lookbehind, keeps label)
r'(?<=datum: )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b', r'(?<=date: )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
r'(?<=data: )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b', r'(?<=fecha: )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
r'(?<=datum )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b', r'(?<=date )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
r'(?<=datum: )[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b', r'(?<=date: )[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
r'(?<=data: )[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b', r'(?<=fecha: )[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
# day + month name + year (17 February 2026, 17. Februar 2026)
r'\b(0?[1-9]|[12]\d|3[01])\s*(?:\.|\.\s*)?(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|january|february|march|april|may|june|july|august|september|october|november|december|januar|februar|märz|juni|juli|oktober|november|dezember|gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|dicembre|janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|januari|februari|maart|mei|augustus|janeiro|fevereiro|março|maio|junho|julho|setembro|outubro|novembro|dezembro)[a-z]*\s+\d{4}\b',
# month name + day + year (February 17, 2026)
r'\b(?:january|february|march|april|may|june|july|august|september|october|november|december|januar|februar|märz|april|mai|juni|juli|oktober|november|dezember|gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|dicembre|janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|januari|februari|maart|mei|augustus|janeiro|fevereiro|março|maio|junho|julho|setembro|outubro|novembro|dezembro)[a-z]*\s+(?:0?[1-9]|[12]\d|3[01])[,\s]+\d{4}\b',
],
replacement_template="[DATE_{}]"
),
@ -357,8 +376,8 @@ class DataPatterns:
r'(?<=Numéro de police )[\d.]+',
r'(?<=Numero polizza: )[\d.]+',
r'(?<=Numero polizza )[\d.]+',
# Standalone policy number format (e.g. 11.559.499) - require 2+ digit prefix to avoid amounts
r'\b\d{2,4}(?:\.\d{3}){2,}\b'
# Standalone policy number format - exclude when part of UID (CHE-115.665.634)
r'(?<!CHE-)(?<!DE-)(?<!FR-)(?<!IT-)\b\d{2,4}(?:\.\d{3}){2,}(?:/[A-Za-z0-9]+)?\b'
],
replacement_template="[POLICY_{}]"
),
@ -368,9 +387,9 @@ class DataPatterns:
name="ssn",
patterns=[
r'\b(?:756|757|758|759)\.\d{4}\.\d{4}\.\d{2}\b(?!,)', # Swiss AHV - exclude before decimal
r'\b(?:CHE|DE|FR|IT)-\d{3}\.\d{3}\.\d{3}\b', # Company IDs
# Generic SSN format - exclude when followed by comma+digit (European decimal)
r'\b\d{3}\.\d{3}\.\d{3}\b(?!,\d)'
r'\b(?:CHE|DE|FR|IT)-\d{3}\.\d{3}\.\d{3}\b', # Company IDs (must be before generic)
# Generic SSN format - exclude when part of company ID or before decimal
r'(?<!CHE-)(?<!DE-)(?<!FR-)(?<!IT-)\b\d{3}\.\d{3}\.\d{3}\b(?!,\d)'
],
replacement_template="[SSN_{}]"
)

View file

@ -4,6 +4,9 @@
PDF in-place neutralization using PyMuPDF.
Removes original text completely and inserts full UUID placeholders.
PyMuPDF uses insert_textbox which wraps long placeholders to preserve layout.
NOTE: PyMuPDF search_for() matches substrings (e.g. "CH" matches inside "Zurich",
"CHE-115...", ".ch"). We skip short/ambiguous keys to avoid false redactions.
"""
import io
@ -12,6 +15,16 @@ from typing import Dict, Optional
logger = logging.getLogger(__name__)
# Keys shorter than this are not searched for in the PDF: the search matches
# substrings, so short keys produce false positives.
_MIN_SEARCH_LENGTH = 5

# Keys that are never searched for in the PDF because they are substrings of
# many common words.
_PDF_SEARCH_BLOCKLIST = frozenset((
    # Country codes - match in Zurich, CHF, Deutschland, etc.
    "CH",
    "DE",
    "FR",
    "IT",
    # Abbreviations - match in Pol-Nr., Policy No., etc.
    "Nr",
    "Nr.",
    "Nr:",
    "No",
    "No.",
    "No:",
    # Domain parts - match in URLs
    "www",
    ".ch",
    ".com",
    ".org",
    ".net",
))
def neutralize_pdf_in_place(
pdf_bytes: bytes,
@ -50,24 +63,53 @@ def neutralize_pdf_in_place(
fontsize = 8
try:
font = fitz.Font(fontname)
for page_num in range(len(doc)):
page = doc[page_num]
name_inserts = []
address_inserts = []
phone_inserts = []
policy_inserts = []
date_inserts = []
ssn_inserts = [] # SSN/UID: CHE-115.665.634, long placeholder doesn't fit
for original_text, placeholder in sorted_items:
if not original_text or not placeholder:
continue
# Skip keys that cause substring false positives (PyMuPDF search_for matches substrings)
if len(original_text) < _MIN_SEARCH_LENGTH:
logger.debug("Skipping PDF search for short key %r (would match substrings)", original_text[:20])
continue
if original_text.strip() in _PDF_SEARCH_BLOCKLIST:
logger.debug("Skipping PDF search for blocklisted key %r", original_text)
continue
search_text = original_text
insert_text = placeholder
if placeholder.startswith("[policy."):
# Try label+number to get wider rect; insert UUID only (label+UUID would overflow)
is_name = placeholder.startswith("[name.")
is_address = placeholder.startswith("[address.")
is_phone = placeholder.startswith("[phone.")
is_policy = placeholder.startswith("[policy.")
is_date = placeholder.startswith("[date.")
is_ssn = placeholder.startswith("[ssn.")
if is_policy:
for prefix in ("Police Nr. ", "Police Nr.: ", "Polizzenr. ", "Policy no. ", "Policy No. "):
candidate = prefix + original_text
try:
hits = page.search_for(candidate, quads=False)
if hits:
search_text = candidate
insert_text = placeholder # UUID only so it fits in rect
break
except Exception:
continue
elif is_ssn and any(original_text.startswith(p) for p in ("CHE-", "DE-", "FR-", "IT-")):
# UID/company ID: try "UID-Nr. CHE-..." or "UID-Nr.: " for wider rect
for prefix in ("UID-Nr. ", "UID-Nr.: ", "UID No. ", "UID: ", "UID-Nummer: "):
candidate = prefix + original_text
try:
hits = page.search_for(candidate, quads=False)
if hits:
search_text = candidate
break
except Exception:
continue
@ -79,15 +121,29 @@ def neutralize_pdf_in_place(
for rect in instances:
try:
fs = 5 if placeholder.startswith(("[policy.", "[address.")) else fontsize
page.add_redact_annot(
rect,
text=insert_text,
fill=fill_color,
text_color=text_color,
fontname=fontname,
fontsize=fs,
)
if is_name or is_address or is_phone or is_policy or is_date or is_ssn:
page.add_redact_annot(rect, fill=fill_color)
if is_name:
name_inserts.append((rect, insert_text))
elif is_address:
address_inserts.append((rect, insert_text))
elif is_phone:
phone_inserts.append((rect, insert_text))
elif is_policy:
policy_inserts.append((rect, insert_text))
elif is_date:
date_inserts.append((rect, insert_text))
else:
ssn_inserts.append((rect, insert_text))
else:
page.add_redact_annot(
rect,
text=insert_text,
fill=fill_color,
text_color=text_color,
fontname=fontname,
fontsize=fontsize,
)
except Exception as e:
logger.warning(f"Redact failed for {original_text[:40]!r}: {e}")
@ -96,6 +152,23 @@ def neutralize_pdf_in_place(
except Exception as e:
logger.debug(f"apply_redactions page {page_num + 1}: {e}")
# Insert placeholders with font size fitted to rect (avoids PyMuPDF shrinking to nothing)
for rect, text in name_inserts + address_inserts + phone_inserts + policy_inserts + date_inserts + ssn_inserts:
try:
tl = font.text_length(text, fontsize=1)
fs = max(3, min(fontsize, rect.width / tl)) if tl > 0 else 4
rc = page.insert_textbox(
rect, text, fontname=fontname, fontsize=fs,
align=0, color=text_color
)
if rc < 0:
page.insert_textbox(
rect, text, fontname=fontname, fontsize=2,
align=0, color=text_color
)
except Exception as e:
logger.warning(f"Insert placeholder failed: {e}")
buf = io.BytesIO()
doc.save(buf, garbage=4, deflate=True)
doc.close()