fix: neutralization too strong

This commit is contained in:
Ida Dittrich 2026-03-13 11:28:45 +01:00
parent 6919a23d4f
commit 7c2192fb3e
3 changed files with 91 additions and 16 deletions

View file

@ -20,6 +20,8 @@ _NEUTRALIZATION_BLACKLIST = frozenset({
"Leistungen", "Basis", "Benefits", # Section labels
"Start", "Beginn", "Ende", "End", "trip", # Contract labels (Start of trip, End of trip, etc.)
"incomplete", "Application", "Complete", "Pending", # Form/status labels, not addresses
"Marketing", "Verkaufsstrategien", "Qualitätsmanagement", # Business terms, not addresses
"Ausbildungsstätte", "Realschule", # Institution types, not city names
# Ambiguous substrings match in Zurich, CHF, UID-Nr., websites, etc.
"CH", "DE", "FR", "IT", "Nr", "Nr.", "Nr:", "No", "No.", "No:",
"www", ".ch", ".com", ".org", ".net", "CHF",
@ -94,6 +96,19 @@ class StringParser:
m for m in patternMatches
if not (m[0] == "address" and any(overlaps(m[2], m[3], ds, de) for ds, de in date_ranges))
]
# For name matches: resolve overlaps by keeping only the longest match, to avoid multiple placeholders for one name
# (e.g. "Ida", "Dittrich", "Ida Dittrich" → keep only "Ida Dittrich" with one UUID)
name_matches = [(m, m[3] - m[2]) for m in patternMatches if m[0] == "name"]
name_spans = [(m[2], m[3]) for m, _ in name_matches]
patternMatches = [
m for m in patternMatches
if m[0] != "name"
or not any(
overlaps(m[2], m[3], ns, ne) and (ne - ns) > (m[3] - m[2])
for (ns, ne) in name_spans
)
]
# Process from right to left to avoid position shifts
for patternName, matchedText, start, end in reversed(patternMatches):
@ -157,25 +172,73 @@ class StringParser:
expanded.add(f"{n1} {n2}")
expanded.add(f"{n2} {n1}")
# Process longest first so "Ida Dittrich" replaces before "Ida" or "Dittrich"
# One UUID per person: composites and their parts share same UUID
# Also align with DataPatterns mapping (step 1 may have already replaced "Ida Dittrich")
name_to_uuid: Dict[str, str] = {}
for composite in sorted(expanded, key=len, reverse=True):
if " " not in composite:
continue
parts = composite.split()
parts_set = frozenset(parts)
existing_uuid = next((name_to_uuid[p] for p in parts_set if p in name_to_uuid), None)
if existing_uuid is None:
existing_uuid = next(
(self.mapping[k] for k in (composite, *parts_set) if k in self.mapping),
None
)
if existing_uuid is None:
existing_uuid = f"[name.{uuid.uuid4()}]"
for p in parts_set:
name_to_uuid[p] = existing_uuid
name_to_uuid[composite] = existing_uuid
if len(parts) == 2:
name_to_uuid[f"{parts[1]} {parts[0]}"] = existing_uuid
for n in names:
if n not in name_to_uuid:
name_to_uuid[n] = self.mapping.get(n) or f"[name.{uuid.uuid4()}]"
self.mapping.update({k: v for k, v in name_to_uuid.items() if k not in self.mapping})
# Collect ALL matches from all name patterns, then keep only longest per span to avoid
# triple replacement ("Ida" + "Dittrich" + "Ida Dittrich" -> only "Ida Dittrich")
all_matches: List[Tuple[str, int, int]] = []
for name in sorted(expanded, key=len, reverse=True):
# Composite: flexible whitespace (space, newline); single: word boundaries
if " " in name:
parts = name.split()
pattern_str = r"\b" + r"\s+".join(re.escape(p) for p in parts) + r"\b"
else:
pattern_str = r"\b" + re.escape(name) + r"\b"
pattern = re.compile(pattern_str, re.IGNORECASE)
for m in pattern.finditer(text):
all_matches.append((m.group(), m.start(), m.end()))
matches = list(pattern.finditer(text))
for match in reversed(matches):
matchedText = match.group()
if matchedText not in self.mapping:
placeholderId = str(uuid.uuid4())
self.mapping[matchedText] = f"[name.{placeholderId}]"
replacement = self.mapping[matchedText]
start, end = match.span()
text = text[:start] + replacement + text[end:]
# Remove matches that overlap with a longer match (keep longest per span)
def _overlaps(s1, e1, s2, e2):
return s1 < e2 and s2 < e1
def _contained_in_longer(matched_text: str, start: int, end: int) -> bool:
for other_text, os, oe in all_matches:
if (os, oe) == (start, end):
continue
if _overlaps(start, end, os, oe) and (oe - os) > (end - start):
return True
return False
to_replace = [(t, s, e) for t, s, e in all_matches if not _contained_in_longer(t, s, e)]
to_replace = list({(s, e): (t, s, e) for t, s, e in to_replace}.values())
# Replace from right to left to avoid position shift
for matched_text, start, end in sorted(to_replace, key=lambda x: -x[1]):
normalized = " ".join(matched_text.split())
replacement = (
self.mapping.get(matched_text)
or self.mapping.get(normalized)
or next((v for k, v in self.mapping.items() if " ".join(k.split()) == normalized), None)
or next((v for k, v in self.mapping.items() if k.lower() == matched_text.lower()), None)
)
if not replacement:
replacement = f"[name.{uuid.uuid4()}]"
self.mapping[matched_text] = replacement
text = text[:start] + replacement + text[end:]
return text

View file

@ -307,14 +307,17 @@ class DataPatterns:
name="address",
patterns=[
# Full address block: company, street, postfach, postal+city (stop before domain like , AXA.ch)
r'\b[^,\n]+(?:,\s*[^,\n]+)*,\s*\d{4}\s+[A-Za-zäöüßÄÖÜ]+\s*(?=,\s*[a-zA-Z0-9.-]+\.(?:ch|com|org|net)\b|$)',
# Street + house number (standalone)
r'\b(?:[A-Za-zäöüßÄÖÜ]+(?:-[A-Za-zäöüßÄÖÜ]+)*(?:strasse|str\.|gasse|weg|platz|allee|boulevard|avenue|via|strada|rue|chemin|route))\s+\d{1,4}(?:/\d{1,4})?(?:[a-z])?\b',
# Supports Swiss PLZ (4 digits) and German PLZ (5 digits)
r'\b[^,\n]+(?:,\s*[^,\n]+)*,\s*\d{4,5}\s+[A-Za-zäöüßÄÖÜ]+\s*(?=,\s*[a-zA-Z0-9.-]+\.(?:ch|com|org|net)\b|$)',
# Street + house number (standalone); includes "straße" for German
r'\b(?:[A-Za-zäöüßÄÖÜ]+(?:-[A-Za-zäöüßÄÖÜ]+)*(?:straße|strasse|str\.|gasse|weg|platz|allee|boulevard|avenue|via|strada|rue|chemin|route))\s+\d{1,4}(?:/\d{1,4})?(?:[a-z])?\b',
# Postfach / PO Box (standalone)
r'\b(?:Postfach|Postbox|P\.?O\.?\s*Box|Case\s+postale|Casella\s+postale|Boîte\s+postale)\s+\d{1,6}\b',
# Postal code + city (standalone); exclude year+non-city and common non-city words
# (?<!\d{2}\.\d{2}\.) = not part of date DD.MM.YYYY (e.g. 27.01.2026)
r'(?<!\d{2}\.\d{2}\.)\b\d{4}\s+(?!den|der|die|das|dem|des|und|oder|für|bei|mit|Version|Versand|Vertrag|Verfügung|Verschickung|Versicherung|erhalten|Schreiben|Jahr|Jahres|incomplete|Application|Complete|Pending|Matrikel|Student|Studien|Kontakt|Telefon|Rechnung|Invoice)[A-Za-zäöüßÄÖÜ]+\b(?!\s*:)'
# Exclude business terms (Marketing, Qualitätsmanagement, etc.) that often follow years
# Swiss PLZ (4 digits) and German PLZ (5 digits)
r'(?<!\d{2}\.\d{2}\.)\b\d{4,5}\s+(?!den|der|die|das|dem|des|und|oder|für|bei|mit|Version|Versand|Vertrag|Verfügung|Verschickung|Versicherung|erhalten|Schreiben|Jahr|Jahres|incomplete|Application|Complete|Pending|Matrikel|Student|Studien|Kontakt|Telefon|Rechnung|Invoice|Marketing|Verkaufsstrategien|Qualitätsmanagement|Management|Strategien|Projektmanagement|Vertrieb|Vertriebsstrategien|Ausbildungsstätte|Realschule)[A-Za-zäöüßÄÖÜ]+\b(?!\s*:)'
],
replacement_template="[ADDRESS_{}]"
),

View file

@ -56,7 +56,16 @@ def neutralize_pdf_in_place(
logger.error(f"Failed to open PDF: {e}")
return None
sorted_items = sorted(mapping.items(), key=lambda x: -len(x[0]))
# For same placeholder: only search longest original_text to avoid triple overlay
# (e.g. "Ida Dittrich", "Ida", "Dittrich" all map to [name.x] → only search "Ida Dittrich")
placeholder_to_longest: Dict[str, str] = {}
for orig, ph in mapping.items():
if not orig or not ph:
continue
if ph not in placeholder_to_longest or len(orig) > len(placeholder_to_longest[ph]):
placeholder_to_longest[ph] = orig
filtered = [(orig, ph) for ph, orig in placeholder_to_longest.items()]
sorted_items = sorted(filtered, key=lambda x: -len(x[0]))
fill_color = (1, 1, 1)
text_color = (0, 0, 0)
fontname = "helv"