diff --git a/modules/features/neutralization/serviceNeutralization/subParseString.py b/modules/features/neutralization/serviceNeutralization/subParseString.py index 32de8688..86ef2f16 100644 --- a/modules/features/neutralization/serviceNeutralization/subParseString.py +++ b/modules/features/neutralization/serviceNeutralization/subParseString.py @@ -20,6 +20,8 @@ _NEUTRALIZATION_BLACKLIST = frozenset({ "Leistungen", "Basis", "Benefits", # Section labels "Start", "Beginn", "Ende", "End", "trip", # Contract labels (Start of trip, End of trip, etc.) "incomplete", "Application", "Complete", "Pending", # Form/status labels, not addresses + "Marketing", "Verkaufsstrategien", "Qualitätsmanagement", # Business terms, not addresses + "Ausbildungsstätte", "Realschule", # Institution types, not city names # Ambiguous substrings – match in Zurich, CHF, UID-Nr., websites, etc. "CH", "DE", "FR", "IT", "Nr", "Nr.", "Nr:", "No", "No.", "No:", "www", ".ch", ".com", ".org", ".net", "CHF", @@ -94,6 +96,19 @@ class StringParser: m for m in patternMatches if not (m[0] == "address" and any(overlaps(m[2], m[3], ds, de) for ds, de in date_ranges)) ] + + # For name matches: resolve overlaps – keep only longest to avoid multiple placeholders for one name + # (e.g. "Ida", "Dittrich", "Ida Dittrich" → keep only "Ida Dittrich" with one UUID) + name_matches = [(m, m[3] - m[2]) for m in patternMatches if m[0] == "name"] + name_spans = [(m[2], m[3]) for m, _ in name_matches] + patternMatches = [ + m for m in patternMatches + if m[0] != "name" + or not any( + overlaps(m[2], m[3], ns, ne) and (ne - ns) > (m[3] - m[2]) + for (ns, ne) in name_spans + ) + ] # Process from right to left to avoid position shifts for patternName, matchedText, start, end in reversed(patternMatches): @@ -157,25 +172,73 @@ class StringParser: expanded.add(f"{n1} {n2}") expanded.add(f"{n2} {n1}") - # Process longest first so "Ida Dittrich" replaces before "Ida" or "Dittrich" + # One UUID per person: composites and their parts share same UUID + # Also align with DataPatterns mapping (step 1 may have already replaced "Ida Dittrich") + name_to_uuid: Dict[str, str] = {} + for composite in sorted(expanded, key=len, reverse=True): + if " " not in composite: + continue + parts = composite.split() + parts_set = frozenset(parts) + existing_uuid = next((name_to_uuid[p] for p in parts_set if p in name_to_uuid), None) + if existing_uuid is None: + existing_uuid = next( + (self.mapping[k] for k in (composite, *parts_set) if k in self.mapping), + None + ) + if existing_uuid is None: + existing_uuid = f"[name.{uuid.uuid4()}]" + for p in parts_set: + name_to_uuid[p] = existing_uuid + name_to_uuid[composite] = existing_uuid + if len(parts) == 2: + name_to_uuid[f"{parts[1]} {parts[0]}"] = existing_uuid + for n in names: + if n not in name_to_uuid: + name_to_uuid[n] = self.mapping.get(n) or f"[name.{uuid.uuid4()}]" + self.mapping.update({k: v for k, v in name_to_uuid.items() if k not in self.mapping}) + + # Collect ALL matches from all name patterns, then keep only longest per span to avoid + # triple replacement ("Ida" + "Dittrich" + "Ida Dittrich" -> only "Ida Dittrich") + all_matches: List[Tuple[str, int, int]] = [] for name in sorted(expanded, key=len, reverse=True): - # Composite: flexible whitespace (space, newline); single: word boundaries if " " in name: parts = name.split() pattern_str = r"\b" + r"\s+".join(re.escape(p) for p in parts) + r"\b" else: pattern_str = r"\b" + re.escape(name) + r"\b" pattern = re.compile(pattern_str, re.IGNORECASE) + for m in pattern.finditer(text): + all_matches.append((m.group(), m.start(), m.end())) - matches = list(pattern.finditer(text)) - for match in reversed(matches): - matchedText = match.group() - if matchedText not in self.mapping: - placeholderId = str(uuid.uuid4()) - self.mapping[matchedText] = f"[name.{placeholderId}]" - replacement = self.mapping[matchedText] - start, end = match.span() - text = text[:start] + replacement + text[end:] + # Remove matches that overlap with a longer match (keep longest per span) + def _overlaps(s1, e1, s2, e2): + return s1 < e2 and s2 < e1 + + def _contained_in_longer(matched_text: str, start: int, end: int) -> bool: + for other_text, os, oe in all_matches: + if (os, oe) == (start, end): + continue + if _overlaps(start, end, os, oe) and (oe - os) > (end - start): + return True + return False + + to_replace = [(t, s, e) for t, s, e in all_matches if not _contained_in_longer(t, s, e)] + to_replace = list({(s, e): (t, s, e) for t, s, e in to_replace}.values()) + + # Replace from right to left to avoid position shift + for matched_text, start, end in sorted(to_replace, key=lambda x: -x[1]): + normalized = " ".join(matched_text.split()) + replacement = ( + self.mapping.get(matched_text) + or self.mapping.get(normalized) + or next((v for k, v in self.mapping.items() if " ".join(k.split()) == normalized), None) + or next((v for k, v in self.mapping.items() if k.lower() == matched_text.lower()), None) + ) + if not replacement: + replacement = f"[name.{uuid.uuid4()}]" + self.mapping[matched_text] = replacement + text = text[:start] + replacement + text[end:] return text diff --git a/modules/features/neutralization/serviceNeutralization/subPatterns.py b/modules/features/neutralization/serviceNeutralization/subPatterns.py index 43f91dc5..f83c817e 100644 --- a/modules/features/neutralization/serviceNeutralization/subPatterns.py +++ b/modules/features/neutralization/serviceNeutralization/subPatterns.py @@ -307,14 +307,17 @@ class DataPatterns: name="address", patterns=[ # Full address block: company, street, postfach, postal+city (stop before domain like , AXA.ch) - r'\b[^,\n]+(?:,\s*[^,\n]+)*,\s*\d{4}\s+[A-Za-zäöüßÄÖÜ]+\s*(?=,\s*[a-zA-Z0-9.-]+\.(?:ch|com|org|net)\b|$)', - # Street + house number (standalone) - r'\b(?:[A-Za-zäöüßÄÖÜ]+(?:-[A-Za-zäöüßÄÖÜ]+)*(?:strasse|str\.|gasse|weg|platz|allee|boulevard|avenue|via|strada|rue|chemin|route))\s+\d{1,4}(?:/\d{1,4})?(?:[a-z])?\b', + # Supports Swiss PLZ (4 digits) and German PLZ (5 digits) + r'\b[^,\n]+(?:,\s*[^,\n]+)*,\s*\d{4,5}\s+[A-Za-zäöüßÄÖÜ]+\s*(?=,\s*[a-zA-Z0-9.-]+\.(?:ch|com|org|net)\b|$)', + # Street + house number (standalone); includes "straße" for German + r'\b(?:[A-Za-zäöüßÄÖÜ]+(?:-[A-Za-zäöüßÄÖÜ]+)*(?:straße|strasse|str\.|gasse|weg|platz|allee|boulevard|avenue|via|strada|rue|chemin|route))\s+\d{1,4}(?:/\d{1,4})?(?:[a-z])?\b', # Postfach / PO Box (standalone) r'\b(?:Postfach|Postbox|P\.?O\.?\s*Box|Case\s+postale|Casella\s+postale|Boîte\s+postale)\s+\d{1,6}\b', # Postal code + city (standalone); exclude year+non-city and common non-city words # (? len(placeholder_to_longest[ph]): + placeholder_to_longest[ph] = orig + filtered = [(orig, ph) for ph, orig in placeholder_to_longest.items()] + sorted_items = sorted(filtered, key=lambda x: -len(x[0])) fill_color = (1, 1, 1) text_color = (0, 0, 0) fontname = "helv"