fix: neutralization too strong

2026-03-13 11:28:45 +01:00 · 2026-03-13 11:28:45 +01:00 · 7c2192fb3e
commit 7c2192fb3e
parent 6919a23d4f
3 changed files with 91 additions and 16 deletions
--- a/modules/features/neutralization/serviceNeutralization/subParseString.py
+++ b/modules/features/neutralization/serviceNeutralization/subParseString.py
@ -20,6 +20,8 @@ _NEUTRALIZATION_BLACKLIST = frozenset({
    "Leistungen", "Basis", "Benefits",  # Section labels
    "Start", "Beginn", "Ende", "End", "trip",  # Contract labels (Start of trip, End of trip, etc.)
    "incomplete", "Application", "Complete", "Pending",  # Form/status labels, not addresses
    "Marketing", "Verkaufsstrategien", "Qualitätsmanagement",  # Business terms, not addresses
    "Ausbildungsstätte", "Realschule",  # Institution types, not city names
    # Ambiguous substrings – match in Zurich, CHF, UID-Nr., websites, etc.
    "CH", "DE", "FR", "IT", "Nr", "Nr.", "Nr:", "No", "No.", "No:",
    "www", ".ch", ".com", ".org", ".net", "CHF",
@ -94,6 +96,19 @@ class StringParser:
            m for m in patternMatches
            if not (m[0] == "address" and any(overlaps(m[2], m[3], ds, de) for ds, de in date_ranges))
        ]
        # For name matches: resolve overlaps – keep only longest to avoid multiple placeholders for one name
        # (e.g. "Ida", "Dittrich", "Ida Dittrich" → keep only "Ida Dittrich" with one UUID)
        name_matches = [(m, m[3] - m[2]) for m in patternMatches if m[0] == "name"]
        name_spans = [(m[2], m[3]) for m, _ in name_matches]
        patternMatches = [
            m for m in patternMatches
            if m[0] != "name"
            or not any(
                overlaps(m[2], m[3], ns, ne) and (ne - ns) > (m[3] - m[2])
                for (ns, ne) in name_spans
            )
        ]
        # Process from right to left to avoid position shifts
        for patternName, matchedText, start, end in reversed(patternMatches):
@ -157,25 +172,73 @@ class StringParser:
                    expanded.add(f"{n1} {n2}")
                    expanded.add(f"{n2} {n1}")
-        # Process longest first so "Ida Dittrich" replaces before "Ida" or "Dittrich"
+        # One UUID per person: composites and their parts share same UUID
        # Also align with DataPatterns mapping (step 1 may have already replaced "Ida Dittrich")
        name_to_uuid: Dict[str, str] = {}
        for composite in sorted(expanded, key=len, reverse=True):
            if " " not in composite:
                continue
            parts = composite.split()
            parts_set = frozenset(parts)
            existing_uuid = next((name_to_uuid[p] for p in parts_set if p in name_to_uuid), None)
            if existing_uuid is None:
                existing_uuid = next(
                    (self.mapping[k] for k in (composite, *parts_set) if k in self.mapping),
                    None
                )
            if existing_uuid is None:
                existing_uuid = f"[name.{uuid.uuid4()}]"
            for p in parts_set:
                name_to_uuid[p] = existing_uuid
            name_to_uuid[composite] = existing_uuid
            if len(parts) == 2:
                name_to_uuid[f"{parts[1]} {parts[0]}"] = existing_uuid
        for n in names:
            if n not in name_to_uuid:
                name_to_uuid[n] = self.mapping.get(n) or f"[name.{uuid.uuid4()}]"
        self.mapping.update({k: v for k, v in name_to_uuid.items() if k not in self.mapping})
        # Collect ALL matches from all name patterns, then keep only longest per span to avoid
        # triple replacement ("Ida" + "Dittrich" + "Ida Dittrich" -> only "Ida Dittrich")
        all_matches: List[Tuple[str, int, int]] = []
        for name in sorted(expanded, key=len, reverse=True):
            # Composite: flexible whitespace (space, newline); single: word boundaries
            if " " in name:
                parts = name.split()
                pattern_str = r"\b" + r"\s+".join(re.escape(p) for p in parts) + r"\b"
            else:
                pattern_str = r"\b" + re.escape(name) + r"\b"
            pattern = re.compile(pattern_str, re.IGNORECASE)
            for m in pattern.finditer(text):
                all_matches.append((m.group(), m.start(), m.end()))
-            matches = list(pattern.finditer(text))
+        # Remove matches that overlap with a longer match (keep longest per span)
-            for match in reversed(matches):
+        def _overlaps(s1, e1, s2, e2):
-                matchedText = match.group()
+            return s1 < e2 and s2 < e1
-                if matchedText not in self.mapping:
+
-                    placeholderId = str(uuid.uuid4())
+        def _contained_in_longer(matched_text: str, start: int, end: int) -> bool:
-                    self.mapping[matchedText] = f"[name.{placeholderId}]"
+            for other_text, os, oe in all_matches:
-                replacement = self.mapping[matchedText]
+                if (os, oe) == (start, end):
-                start, end = match.span()
+                    continue
-                text = text[:start] + replacement + text[end:]
+                if _overlaps(start, end, os, oe) and (oe - os) > (end - start):
                    return True
            return False
        to_replace = [(t, s, e) for t, s, e in all_matches if not _contained_in_longer(t, s, e)]
        to_replace = list({(s, e): (t, s, e) for t, s, e in to_replace}.values())
        # Replace from right to left to avoid position shift
        for matched_text, start, end in sorted(to_replace, key=lambda x: -x[1]):
            normalized = " ".join(matched_text.split())
            replacement = (
                self.mapping.get(matched_text)
                or self.mapping.get(normalized)
                or next((v for k, v in self.mapping.items() if " ".join(k.split()) == normalized), None)
                or next((v for k, v in self.mapping.items() if k.lower() == matched_text.lower()), None)
            )
            if not replacement:
                replacement = f"[name.{uuid.uuid4()}]"
                self.mapping[matched_text] = replacement
            text = text[:start] + replacement + text[end:]
        return text
--- a/modules/features/neutralization/serviceNeutralization/subPatterns.py
+++ b/modules/features/neutralization/serviceNeutralization/subPatterns.py
@ -307,14 +307,17 @@ class DataPatterns:
            name="address",
            patterns=[
                # Full address block: company, street, postfach, postal+city (stop before domain like , AXA.ch)
-                r'\b[^,\n]+(?:,\s*[^,\n]+)*,\s*\d{4}\s+[A-Za-zäöüßÄÖÜ]+\s*(?=,\s*[a-zA-Z0-9.-]+\.(?:ch|com|org|net)\b|$)',
+                # Supports Swiss PLZ (4 digits) and German PLZ (5 digits)
-                # Street + house number (standalone)
+                r'\b[^,\n]+(?:,\s*[^,\n]+)*,\s*\d{4,5}\s+[A-Za-zäöüßÄÖÜ]+\s*(?=,\s*[a-zA-Z0-9.-]+\.(?:ch|com|org|net)\b|$)',
-                r'\b(?:[A-Za-zäöüßÄÖÜ]+(?:-[A-Za-zäöüßÄÖÜ]+)*(?:strasse|str\.|gasse|weg|platz|allee|boulevard|avenue|via|strada|rue|chemin|route))\s+\d{1,4}(?:/\d{1,4})?(?:[a-z])?\b',
+                # Street + house number (standalone); includes "straße" for German
                r'\b(?:[A-Za-zäöüßÄÖÜ]+(?:-[A-Za-zäöüßÄÖÜ]+)*(?:straße|strasse|str\.|gasse|weg|platz|allee|boulevard|avenue|via|strada|rue|chemin|route))\s+\d{1,4}(?:/\d{1,4})?(?:[a-z])?\b',
                # Postfach / PO Box (standalone)
                r'\b(?:Postfach|Postbox|P\.?O\.?\s*Box|Case\s+postale|Casella\s+postale|Boîte\s+postale)\s+\d{1,6}\b',
                # Postal code + city (standalone); exclude year+non-city and common non-city words
                # (?<!\d{2}\.\d{2}\.) = not part of date DD.MM.YYYY (e.g. 27.01.2026)
-                r'(?<!\d{2}\.\d{2}\.)\b\d{4}\s+(?!den|der|die|das|dem|des|und|oder|für|bei|mit|Version|Versand|Vertrag|Verfügung|Verschickung|Versicherung|erhalten|Schreiben|Jahr|Jahres|incomplete|Application|Complete|Pending|Matrikel|Student|Studien|Kontakt|Telefon|Rechnung|Invoice)[A-Za-zäöüßÄÖÜ]+\b(?!\s*:)'
+                # Exclude business terms (Marketing, Qualitätsmanagement, etc.) – often follow years
                # Swiss PLZ (4 digits) and German PLZ (5 digits)
                r'(?<!\d{2}\.\d{2}\.)\b\d{4,5}\s+(?!den|der|die|das|dem|des|und|oder|für|bei|mit|Version|Versand|Vertrag|Verfügung|Verschickung|Versicherung|erhalten|Schreiben|Jahr|Jahres|incomplete|Application|Complete|Pending|Matrikel|Student|Studien|Kontakt|Telefon|Rechnung|Invoice|Marketing|Verkaufsstrategien|Qualitätsmanagement|Management|Strategien|Projektmanagement|Vertrieb|Vertriebsstrategien|Ausbildungsstätte|Realschule)[A-Za-zäöüßÄÖÜ]+\b(?!\s*:)'
            ],
            replacement_template="[ADDRESS_{}]"
        ),
--- a/modules/features/neutralization/serviceNeutralization/subProcessPdfInPlace.py
+++ b/modules/features/neutralization/serviceNeutralization/subProcessPdfInPlace.py
@ -56,7 +56,16 @@ def neutralize_pdf_in_place(
        logger.error(f"Failed to open PDF: {e}")
        return None
-    sorted_items = sorted(mapping.items(), key=lambda x: -len(x[0]))
+    # For same placeholder: only search longest original_text to avoid triple overlay
    # (e.g. "Ida Dittrich", "Ida", "Dittrich" all map to [name.x] → only search "Ida Dittrich")
    placeholder_to_longest: Dict[str, str] = {}
    for orig, ph in mapping.items():
        if not orig or not ph:
            continue
        if ph not in placeholder_to_longest or len(orig) > len(placeholder_to_longest[ph]):
            placeholder_to_longest[ph] = orig
    filtered = [(orig, ph) for ph, orig in placeholder_to_longest.items()]
    sorted_items = sorted(filtered, key=lambda x: -len(x[0]))
    fill_color = (1, 1, 1)
    text_color = (0, 0, 0)
    fontname = "helv"