fix: further improved neutraliser

2026-02-25 07:59:33 +01:00 · 2026-02-25 07:59:33 +01:00 · d3cfe8e9be
commit d3cfe8e9be
parent 205b99dfa0
3 changed files with 190 additions and 63 deletions
--- a/modules/features/neutralization/serviceNeutralization/subParseString.py
+++ b/modules/features/neutralization/serviceNeutralization/subParseString.py
@ -19,6 +19,12 @@ _NEUTRALIZATION_BLACKLIST = frozenset({
    "Versicherte", "Versicherungsnehmer", "Versicherung", "Insurance",
    "Leistungen", "Basis", "Benefits",  # Section labels
    "Start", "Beginn", "Ende", "End", "trip",  # Contract labels (Start of trip, End of trip, etc.)
+    "incomplete", "Application", "Complete", "Pending",  # Form/status labels, not addresses
+    # Ambiguous substrings – match in Zurich, CHF, UID-Nr., websites, etc.
+    "CH", "DE", "FR", "IT", "Nr", "Nr.", "Nr:", "No", "No.", "No:",
+    "www", ".ch", ".com", ".org", ".net", "CHF",
+    # Labels that must never be neutralized
+    "Kontakt", "Kanzlei", "Telefon", "Matrikel-Nr", "Matrikel-Nr.", "Student ID", "Student-ID",
 })


@ -69,6 +75,25 @@ class StringParser:
                    return True
            return False
        patternMatches = [m for m in patternMatches if not is_contained(m, patternMatches)]
+
+        # Deduplicate: keep one match per (start,end) – same span can match multiple patterns
+        seen = set()
+        unique_matches = []
+        for m in patternMatches:
+            key = (m[2], m[3])
+            if key not in seen:
+                seen.add(key)
+                unique_matches.append(m)
+        patternMatches = unique_matches
+
+        # Exclude address matches that overlap with date matches (e.g. "2026 den" overlaps "17.02.2026")
+        def overlaps(a_start, a_end, b_start, b_end):
+            return a_start < b_end and b_start < a_end
+        date_ranges = [(m[2], m[3]) for m in patternMatches if m[0] == "date"]
+        patternMatches = [
+            m for m in patternMatches
+            if not (m[0] == "address" and any(overlaps(m[2], m[3], ds, de) for ds, de in date_ranges))
+        ]
        
        # Process from right to left to avoid position shifts
        for patternName, matchedText, start, end in reversed(patternMatches):
@ -86,6 +111,11 @@ class StringParser:
            # Skip if match contains any blacklisted word (e.g. "2026 Reise" or "2026 Reisebeginn" from address pattern)
            if any(w in _NEUTRALIZATION_BLACKLIST for w in matchedText.split()):
                continue
+            # Skip phone matches that are clearly part of a price (e.g. 128 in 128.56 CHF)
+            if patternName == "phone" and end + 3 <= len(text):
+                after = text[end : end + 3]
+                if (after[0] in ".," and len(after) >= 3 and after[1:3].isdigit()):
+                    continue

            if matchedText not in self.mapping:
                # Generate a UUID for the placeholder
@ -112,36 +142,41 @@ class StringParser:
    
    def _replaceCustomNames(self, text: str) -> str:
        """
-        Replace custom names from the user list in text
-        
-        Args:
-            text: Text to process
-            
-        Returns:
-            str: Text with custom names replaced
+        Replace custom names from the user list in text.
+        Builds composite names (e.g. "Ida Dittrich") so full names get one UUID, not one per word.
        """
-        for name in self.NamesToParse:
-            if not name.strip():
-                continue
-                
-            # Create case-insensitive regex pattern with word boundaries
-            pattern = re.compile(r'\b' + re.escape(name.strip()) + r'\b', re.IGNORECASE)
-            
-            # Find all matches for this name
+        names = [n.strip() for n in self.NamesToParse if n.strip()]
+        if not names:
+            return text
+
+        # Add composite names: "Ida Dittrich", "Dittrich Ida" when both are in list
+        expanded = set(names)
+        for i, n1 in enumerate(names):
+            for n2 in names:
+                if n1 != n2:
+                    expanded.add(f"{n1} {n2}")
+                    expanded.add(f"{n2} {n1}")
+
+        # Process longest first so "Ida Dittrich" replaces before "Ida" or "Dittrich"
+        for name in sorted(expanded, key=len, reverse=True):
+            # Composite: parts joined by flexible whitespace (space, newline); both forms are anchored with word boundaries
+            if " " in name:
+                parts = name.split()
+                pattern_str = r"\b" + r"\s+".join(re.escape(p) for p in parts) + r"\b"
+            else:
+                pattern_str = r"\b" + re.escape(name) + r"\b"
+            pattern = re.compile(pattern_str, re.IGNORECASE)
+
            matches = list(pattern.finditer(text))
-            
-            # Replace each match with a placeholder
-            for match in reversed(matches):  # Process from right to left to avoid position shifts
+            for match in reversed(matches):
                matchedText = match.group()
                if matchedText not in self.mapping:
-                    # Generate a UUID for the placeholder
                    placeholderId = str(uuid.uuid4())
                    self.mapping[matchedText] = f"[name.{placeholderId}]"
-                
                replacement = self.mapping[matchedText]
                start, end = match.span()
                text = text[:start] + replacement + text[end:]
-        
+
        return text
    
    def processString(self, text: str) -> str:
--- a/modules/features/neutralization/serviceNeutralization/subPatterns.py
+++ b/modules/features/neutralization/serviceNeutralization/subPatterns.py
@ -243,16 +243,17 @@ class DataPatterns:
                r'(?<=Leiter: )[A-Za-zäöüßÄÖÜ][a-zäöüß]+\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]+',
                r'(?<=Kontaktperson: )[A-Za-zäöüßÄÖÜ][a-zäöüß]+\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]+',
                # Name only after Anrede (keep Frau/Herr; replace only the name) – fixed-width lookbehind
-                r'(?<=Frau )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
-                r'(?<=Herr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
-                r'(?<=Mr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
-                r'(?<=Mr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
-                r'(?<=Mrs )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
-                r'(?<=Mrs\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
-                r'(?<=Ms )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
-                r'(?<=Ms\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
-                r'(?<=Dr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
-                r'(?<=Dr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+',
+                # Use [ \t]+ (not \s+) so a name never spans a line break (avoids grabbing "Es" from "Es freut uns sehr"); trailing group is now * so a single-word name also matches
+                r'(?<=Frau )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
+                r'(?<=Herr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
+                r'(?<=Mr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
+                r'(?<=Mr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
+                r'(?<=Mrs )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
+                r'(?<=Mrs\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
+                r'(?<=Ms )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
+                r'(?<=Ms\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
+                r'(?<=Dr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
+                r'(?<=Dr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*',
            ],
            replacement_template="[NAME_{}]"
        ),
@ -271,6 +272,8 @@ class DataPatterns:
        Pattern(
            name="phone",
            patterns=[
+                # Swiss full format: +41 44 315 19 19 (area + 3 + 2 + 2 digits, flexible separators)
+                r'\+\s*41[-.\s]?\d{2}[-.\s]?\d{3}[-.\s]?\d{2}[-.\s]?\d{2}\b',
                # International format
                r'\+\d{1,3}[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}\b',
                # Swiss format
@ -309,28 +312,44 @@ class DataPatterns:
                r'\b(?:[A-Za-zäöüßÄÖÜ]+(?:-[A-Za-zäöüßÄÖÜ]+)*(?:strasse|str\.|gasse|weg|platz|allee|boulevard|avenue|via|strada|rue|chemin|route))\s+\d{1,4}(?:/\d{1,4})?(?:[a-z])?\b',
                # Postfach / PO Box (standalone)
                r'\b(?:Postfach|Postbox|P\.?O\.?\s*Box|Case\s+postale|Casella\s+postale|Boîte\s+postale)\s+\d{1,6}\b',
-                # Postal code + city (standalone)
-                r'\b\d{4}\s+[A-Za-zäöüßÄÖÜ]+\b(?!\s*:)'
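+                # Postal code + city (standalone); exclude year + non-city word (e.g. "2026 den") and common non-city words. NOTE(review): the lookahead alternatives have no trailing \b, so any word that merely starts with one of them is excluded too - confirm this is intended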
+                # (?<!\d{2}\.\d{2}\.) = not part of date DD.MM.YYYY (e.g. 27.01.2026)
+                r'(?<!\d{2}\.\d{2}\.)\b\d{4}\s+(?!den|der|die|das|dem|des|und|oder|für|bei|mit|Version|Versand|Vertrag|Verfügung|Verschickung|Versicherung|erhalten|Schreiben|Jahr|Jahres|incomplete|Application|Complete|Pending|Matrikel|Student|Studien|Kontakt|Telefon|Rechnung|Invoice)[A-Za-zäöüßÄÖÜ]+\b(?!\s*:)'
            ],
            replacement_template="[ADDRESS_{}]"
        ),
        
-        # Date patterns
+        # Date patterns (all languages and formats)
        Pattern(
            name="date",
            patterns=[
-                # Standalone date values – require valid day (1–31) and month (1–12) to avoid decimals (e.g. 53.37 CHF)
-                r'\b(0?[1-9]|[12]\d|3[01])[./-](0?[1-9]|1[0-2])[./-]\d{2,4}\b',  # 17.02.2026, 29-03-2026
-                r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\.[\s]*\d{2,4}\b',  # 17.02. 2026 (split across lines)
-                r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\.(?!\d)\b',  # 17.02., 29.03.
-                r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\b(?!\.?\d)(?!/\d)',  # 17.02, 29.03; exclude ratings (4.7/5)
-                # Context-specific date formats
-                r'\b(?:geboren|birth|né|nato)\s+am\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
-                r'\b(?:geboren|birth|né|nato)\s+am\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
-                r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
-                r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
-                r'\b(?:geboren|birth|né|nato)\s+am\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|januar|februar|märz|april|mai|juni|juli|august|september|oktober|november|dezember)[a-z]*\s+\d{4}\b',
-                r'\b(?:vertrag|contract|contrat|contratto)\s+vom\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|januar|februar|märz|april|mai|juni|juli|august|september|oktober|november|dezember)[a-z]*\s+\d{4}\b'
+                # DD.MM.YYYY / DD/MM/YYYY / DD-MM-YYYY (European)
+                r'\b(0?[1-9]|[12]\d|3[01])[./-](0?[1-9]|1[0-2])[./-]\d{2,4}\b',
+                r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\.[\s]*\d{2,4}\b',
+                r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\.(?!\d)\b',
+                r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[0-2])\b(?!\.?\d)(?!/\d)',
+                # YYYY-MM-DD / YYYY/MM/DD / YYYY.MM.DD (ISO)
+                r'\b\d{4}[./-](0?[1-9]|1[0-2])[./-](0?[1-9]|[12]\d|3[01])\b',
+                # MM/DD/YYYY / MM-DD-YYYY (US)
+                r'\b(0?[1-9]|1[0-2])[./-](0?[1-9]|[12]\d|3[01])[./-]\d{2,4}\b',
+                # geboren/birth/né/nato/nata + am/le/on/il/op (DE/EN/FR/IT/NL); the month-name variant omits "op"
+                r'\b(?:geboren|birth|né|nato|nata)\s+(?:am|le|on|il|op)\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
+                r'\b(?:geboren|birth|né|nato|nata)\s+(?:am|le|on|il|op)\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
+                r'\b(?:geboren|birth|né|nato|nata)\s+(?:am|le|on|il)\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|january|february|march|april|may|june|july|august|september|october|november|december|januar|februar|märz|juni|juli|oktober|november|dezember|gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|dicembre|janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|januari|februari|maart|april|mei|juni|juli|augustus|september|oktober|november|december|janeiro|fevereiro|março|abril|maio|junho|julho|agosto|setembro|outubro|novembro|dezembro)[a-z]*\s+\d{4}\b',
+                # vertrag/contract/contrat/contratto + vom/from/du/dal/del/van; the month-name variant omits "van"
+                r'\b(?:vertrag|contract|contrat|contratto)\s+(?:vom|from|du|dal|del|van)\s+[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
+                r'\b(?:vertrag|contract|contrat|contratto)\s+(?:vom|from|du|dal|del|van)\s+[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
+                r'\b(?:vertrag|contract|contrat|contratto)\s+(?:vom|from|du|dal|del)\s+(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|january|february|march|april|may|june|july|august|september|october|november|december|januar|februar|märz|juni|juli|oktober|november|dezember|gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|dicembre|janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|januari|februari|maart|mei|augustus|janeiro|fevereiro|março|maio|junho|julho|setembro|outubro|novembro|dezembro)[a-z]*\s+\d{4}\b',
+                # datum/date/data/fecha + numeric (fixed-width lookbehind, keeps label)
+                r'(?<=datum: )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b', r'(?<=date: )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
+                r'(?<=data: )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b', r'(?<=fecha: )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
+                r'(?<=datum )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b', r'(?<=date )[0-9]{2}[./-][0-9]{2}[./-][0-9]{4}\b',
+                r'(?<=datum: )[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b', r'(?<=date: )[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
+                r'(?<=data: )[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b', r'(?<=fecha: )[0-9]{4}[./-][0-9]{2}[./-][0-9]{2}\b',
+                # day + month name + year (17 February 2026, 17. Februar 2026)
+                r'\b(0?[1-9]|[12]\d|3[01])\s*(?:\.|\.\s*)?(?:jan|feb|mar|apr|mai|jun|jul|aug|sep|okt|nov|dez|january|february|march|april|may|june|july|august|september|october|november|december|januar|februar|märz|juni|juli|oktober|november|dezember|gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|dicembre|janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|januari|februari|maart|mei|augustus|janeiro|fevereiro|março|maio|junho|julho|setembro|outubro|novembro|dezembro)[a-z]*\s+\d{4}\b',
+                # month name + day + year (February 17, 2026)
+                r'\b(?:january|february|march|april|may|june|july|august|september|october|november|december|januar|februar|märz|april|mai|juni|juli|oktober|november|dezember|gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|dicembre|janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|januari|februari|maart|mei|augustus|janeiro|fevereiro|março|maio|junho|julho|setembro|outubro|novembro|dezembro)[a-z]*\s+(?:0?[1-9]|[12]\d|3[01])[,\s]+\d{4}\b',
            ],
            replacement_template="[DATE_{}]"
        ),
@ -357,8 +376,8 @@ class DataPatterns:
                r'(?<=Numéro de police )[\d.]+',
                r'(?<=Numero polizza: )[\d.]+',
                r'(?<=Numero polizza )[\d.]+',
-                # Standalone policy number format (e.g. 11.559.499) – require 2+ digit prefix to avoid amounts
-                r'\b\d{2,4}(?:\.\d{3}){2,}\b'
+                # Standalone policy number format - exclude when part of UID (CHE-115.665.634)
+                r'(?<!CHE-)(?<!DE-)(?<!FR-)(?<!IT-)\b\d{2,4}(?:\.\d{3}){2,}(?:/[A-Za-z0-9]+)?\b'
            ],
            replacement_template="[POLICY_{}]"
        ),
@ -368,9 +387,9 @@ class DataPatterns:
            name="ssn",
            patterns=[
                r'\b(?:756|757|758|759)\.\d{4}\.\d{4}\.\d{2}\b(?!,)',  # Swiss AHV - exclude before decimal
-                r'\b(?:CHE|DE|FR|IT)-\d{3}\.\d{3}\.\d{3}\b',  # Company IDs
-                # Generic SSN format - exclude when followed by comma+digit (European decimal)
-                r'\b\d{3}\.\d{3}\.\d{3}\b(?!,\d)'
+                r'\b(?:CHE|DE|FR|IT)-\d{3}\.\d{3}\.\d{3}\b',  # Company IDs (must be before generic)
+                # Generic SSN format - exclude when part of company ID or before decimal
+                r'(?<!CHE-)(?<!DE-)(?<!FR-)(?<!IT-)\b\d{3}\.\d{3}\.\d{3}\b(?!,\d)'
            ],
            replacement_template="[SSN_{}]"
        )
--- a/modules/features/neutralization/serviceNeutralization/subProcessPdfInPlace.py
+++ b/modules/features/neutralization/serviceNeutralization/subProcessPdfInPlace.py
@ -4,6 +4,9 @@
 PDF in-place neutralization using PyMuPDF.
 Removes original text completely and inserts full UUID placeholders.
 PyMuPDF uses insert_textbox which wraps long placeholders to preserve layout.
+
+NOTE: PyMuPDF search_for() matches substrings (e.g. "CH" matches inside "Zurich",
+"CHE-115...", ".ch"). We skip short/ambiguous keys to avoid false redactions.
 """

 import io
@ -12,6 +15,16 @@ from typing import Dict, Optional

 logger = logging.getLogger(__name__)

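+# Minimum key length for PDF search - shorter keys cause substring false positives. NOTE(review): applies to every key type, so e.g. a 3-4 letter name is skipped and not redacted in the PDF unless a longer key covers it - confirm acceptable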
+_MIN_SEARCH_LENGTH = 5
+
+# Keys we never search for in PDF (substrings of many common words)
+_PDF_SEARCH_BLOCKLIST = frozenset({
+    "CH", "DE", "FR", "IT",  # Country codes - match in Zurich, CHF, Deutschland, etc.
+    "Nr", "Nr.", "Nr:", "No", "No.", "No:",  # Abbreviations - match in Pol-Nr., Policy No., etc.
+    "www", ".ch", ".com", ".org", ".net",  # Domain parts - match in URLs
+})
+

 def neutralize_pdf_in_place(
    pdf_bytes: bytes,
@ -50,24 +63,53 @@ def neutralize_pdf_in_place(
    fontsize = 8

    try:
+        font = fitz.Font(fontname)
        for page_num in range(len(doc)):
            page = doc[page_num]
+            name_inserts = []
+            address_inserts = []
+            phone_inserts = []
+            policy_inserts = []
+            date_inserts = []
+            ssn_inserts = []  # SSN/UID: CHE-115.665.634, long placeholder doesn't fit

            for original_text, placeholder in sorted_items:
                if not original_text or not placeholder:
                    continue
+                # Skip keys that cause substring false positives (PyMuPDF search_for matches substrings)
+                if len(original_text) < _MIN_SEARCH_LENGTH:
+                    logger.debug("Skipping PDF search for short key %r (would match substrings)", original_text[:20])
+                    continue
+                if original_text.strip() in _PDF_SEARCH_BLOCKLIST:
+                    logger.debug("Skipping PDF search for blocklisted key %r", original_text)
+                    continue

                search_text = original_text
                insert_text = placeholder
-                if placeholder.startswith("[policy."):
-                    # Try label+number to get wider rect; insert UUID only (label+UUID would overflow)
+                is_name = placeholder.startswith("[name.")
+                is_address = placeholder.startswith("[address.")
+                is_phone = placeholder.startswith("[phone.")
+                is_policy = placeholder.startswith("[policy.")
+                is_date = placeholder.startswith("[date.")
+                is_ssn = placeholder.startswith("[ssn.")
+                if is_policy:
                    for prefix in ("Police Nr. ", "Police Nr.: ", "Polizzenr. ", "Policy no. ", "Policy No. "):
                        candidate = prefix + original_text
                        try:
                            hits = page.search_for(candidate, quads=False)
                            if hits:
                                search_text = candidate
-                                insert_text = placeholder  # UUID only so it fits in rect
+                                break
+                        except Exception:
+                            continue
+                elif is_ssn and any(original_text.startswith(p) for p in ("CHE-", "DE-", "FR-", "IT-")):
+                    # UID/company ID: try "UID-Nr. CHE-..." or "UID-Nr.: " for wider rect
+                    for prefix in ("UID-Nr. ", "UID-Nr.: ", "UID No. ", "UID: ", "UID-Nummer: "):
+                        candidate = prefix + original_text
+                        try:
+                            hits = page.search_for(candidate, quads=False)
+                            if hits:
+                                search_text = candidate
                                break
                        except Exception:
                            continue
@ -79,15 +121,29 @@ def neutralize_pdf_in_place(

                for rect in instances:
                    try:
-                        fs = 5 if placeholder.startswith(("[policy.", "[address.")) else fontsize
-                        page.add_redact_annot(
-                            rect,
-                            text=insert_text,
-                            fill=fill_color,
-                            text_color=text_color,
-                            fontname=fontname,
-                            fontsize=fs,
-                        )
+                        if is_name or is_address or is_phone or is_policy or is_date or is_ssn:
+                            page.add_redact_annot(rect, fill=fill_color)
+                            if is_name:
+                                name_inserts.append((rect, insert_text))
+                            elif is_address:
+                                address_inserts.append((rect, insert_text))
+                            elif is_phone:
+                                phone_inserts.append((rect, insert_text))
+                            elif is_policy:
+                                policy_inserts.append((rect, insert_text))
+                            elif is_date:
+                                date_inserts.append((rect, insert_text))
+                            else:
+                                ssn_inserts.append((rect, insert_text))
+                        else:
+                            page.add_redact_annot(
+                                rect,
+                                text=insert_text,
+                                fill=fill_color,
+                                text_color=text_color,
+                                fontname=fontname,
+                                fontsize=fontsize,
+                            )
                    except Exception as e:
                        logger.warning(f"Redact failed for {original_text[:40]!r}: {e}")

@ -96,6 +152,23 @@ def neutralize_pdf_in_place(
            except Exception as e:
                logger.debug(f"apply_redactions page {page_num + 1}: {e}")

+            # Insert placeholders with font size fitted to rect (avoids PyMuPDF shrinking to nothing)
+            for rect, text in name_inserts + address_inserts + phone_inserts + policy_inserts + date_inserts + ssn_inserts:
+                try:
+                    tl = font.text_length(text, fontsize=1)
+                    fs = max(3, min(fontsize, rect.width / tl)) if tl > 0 else 4
+                    rc = page.insert_textbox(
+                        rect, text, fontname=fontname, fontsize=fs,
+                        align=0, color=text_color
+                    )
+                    if rc < 0:
+                        page.insert_textbox(
+                            rect, text, fontname=fontname, fontsize=2,
+                            align=0, color=text_color
+                        )
+                except Exception as e:
+                    logger.warning(f"Insert placeholder failed: {e}")
+
        buf = io.BytesIO()
        doc.save(buf, garbage=4, deflate=True)
        doc.close()