From d3cfe8e9be4417a104767e8baeaa54b6ec26d522 Mon Sep 17 00:00:00 2001 From: Ida Dittrich Date: Wed, 25 Feb 2026 07:59:33 +0100 Subject: [PATCH] fix: further improved neutraliser --- .../serviceNeutralization/subParseString.py | 77 +++++++++++---- .../serviceNeutralization/subPatterns.py | 79 +++++++++------ .../subProcessPdfInPlace.py | 97 ++++++++++++++++--- 3 files changed, 190 insertions(+), 63 deletions(-) diff --git a/modules/features/neutralization/serviceNeutralization/subParseString.py b/modules/features/neutralization/serviceNeutralization/subParseString.py index d80e1a04..32de8688 100644 --- a/modules/features/neutralization/serviceNeutralization/subParseString.py +++ b/modules/features/neutralization/serviceNeutralization/subParseString.py @@ -19,6 +19,12 @@ _NEUTRALIZATION_BLACKLIST = frozenset({ "Versicherte", "Versicherungsnehmer", "Versicherung", "Insurance", "Leistungen", "Basis", "Benefits", # Section labels "Start", "Beginn", "Ende", "End", "trip", # Contract labels (Start of trip, End of trip, etc.) + "incomplete", "Application", "Complete", "Pending", # Form/status labels, not addresses + # Ambiguous substrings – match in Zurich, CHF, UID-Nr., websites, etc. + "CH", "DE", "FR", "IT", "Nr", "Nr.", "Nr:", "No", "No.", "No:", + "www", ".ch", ".com", ".org", ".net", "CHF", + # Labels that must never be neutralized + "Kontakt", "Kanzlei", "Telefon", "Matrikel-Nr", "Matrikel-Nr.", "Student ID", "Student-ID", }) @@ -69,6 +75,25 @@ class StringParser: return True return False patternMatches = [m for m in patternMatches if not is_contained(m, patternMatches)] + + # Deduplicate: keep one match per (start,end) – same span can match multiple patterns + seen = set() + unique_matches = [] + for m in patternMatches: + key = (m[2], m[3]) + if key not in seen: + seen.add(key) + unique_matches.append(m) + patternMatches = unique_matches + + # Exclude address matches that overlap with date matches (e.g. "2026 den" overlaps "17.02.2026") + def overlaps(a_start, a_end, b_start, b_end): + return a_start < b_end and b_start < a_end + date_ranges = [(m[2], m[3]) for m in patternMatches if m[0] == "date"] + patternMatches = [ + m for m in patternMatches + if not (m[0] == "address" and any(overlaps(m[2], m[3], ds, de) for ds, de in date_ranges)) + ] # Process from right to left to avoid position shifts for patternName, matchedText, start, end in reversed(patternMatches): @@ -86,6 +111,11 @@ class StringParser: # Skip if match contains any blacklisted word (e.g. "2026 Reise" or "2026 Reisebeginn" from address pattern) if any(w in _NEUTRALIZATION_BLACKLIST for w in matchedText.split()): continue + # Skip phone matches that are clearly part of a price (e.g. 128 in 128.56 CHF) + if patternName == "phone" and end + 3 <= len(text): + after = text[end : end + 3] + if (after[0] in ".," and len(after) >= 3 and after[1:3].isdigit()): + continue if matchedText not in self.mapping: # Generate a UUID for the placeholder @@ -112,36 +142,41 @@ class StringParser: def _replaceCustomNames(self, text: str) -> str: """ - Replace custom names from the user list in text - - Args: - text: Text to process - - Returns: - str: Text with custom names replaced + Replace custom names from the user list in text. + Builds composite names (e.g. "Ida Dittrich") so full names get one UUID, not one per word. """ - for name in self.NamesToParse: - if not name.strip(): - continue - - # Create case-insensitive regex pattern with word boundaries - pattern = re.compile(r'\b' + re.escape(name.strip()) + r'\b', re.IGNORECASE) - - # Find all matches for this name + names = [n.strip() for n in self.NamesToParse if n.strip()] + if not names: + return text + + # Add composite names: "Ida Dittrich", "Dittrich Ida" when both are in list + expanded = set(names) + for i, n1 in enumerate(names): + for n2 in names: + if n1 != n2: + expanded.add(f"{n1} {n2}") + expanded.add(f"{n2} {n1}") + + # Process longest first so "Ida Dittrich" replaces before "Ida" or "Dittrich" + for name in sorted(expanded, key=len, reverse=True): + # Composite: flexible whitespace (space, newline); single: word boundaries + if " " in name: + parts = name.split() + pattern_str = r"\b" + r"\s+".join(re.escape(p) for p in parts) + r"\b" + else: + pattern_str = r"\b" + re.escape(name) + r"\b" + pattern = re.compile(pattern_str, re.IGNORECASE) + matches = list(pattern.finditer(text)) - - # Replace each match with a placeholder - for match in reversed(matches): # Process from right to left to avoid position shifts + for match in reversed(matches): matchedText = match.group() if matchedText not in self.mapping: - # Generate a UUID for the placeholder placeholderId = str(uuid.uuid4()) self.mapping[matchedText] = f"[name.{placeholderId}]" - replacement = self.mapping[matchedText] start, end = match.span() text = text[:start] + replacement + text[end:] - + return text def processString(self, text: str) -> str: diff --git a/modules/features/neutralization/serviceNeutralization/subPatterns.py b/modules/features/neutralization/serviceNeutralization/subPatterns.py index d5a5d570..43f91dc5 100644 --- a/modules/features/neutralization/serviceNeutralization/subPatterns.py +++ b/modules/features/neutralization/serviceNeutralization/subPatterns.py @@ -243,16 +243,17 @@ class DataPatterns: r'(?<=Leiter: )[A-Za-zäöüßÄÖÜ][a-zäöüß]+\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]+', r'(?<=Kontaktperson: )[A-Za-zäöüßÄÖÜ][a-zäöüß]+\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]+', # Name only after Anrede (keep Frau/Herr; replace only the name) – fixed-width lookbehind - r'(?<=Frau )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+', - r'(?<=Herr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+', - r'(?<=Mr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+', - r'(?<=Mr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+', - r'(?<=Mrs )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+', - r'(?<=Mrs\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+', - r'(?<=Ms )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+', - r'(?<=Ms\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+', - r'(?<=Dr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+', - r'(?<=Dr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:\s+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)+', + # Use [ \t]+ not \s+ so we don't match across line breaks (avoids grabbing "Es" from "Es freut uns sehr") + r'(?<=Frau )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*', + r'(?<=Herr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*', + r'(?<=Mr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*', + r'(?<=Mr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*', + r'(?<=Mrs )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*', + r'(?<=Mrs\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*', + r'(?<=Ms )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*', + r'(?<=Ms\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*', + r'(?<=Dr )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*', + r'(?<=Dr\. )[A-Za-zäöüßÄÖÜ][a-zäöüß]*(?:[ \t]+[A-Za-zäöüßÄÖÜ][a-zäöüß]*)*', ], replacement_template="[NAME_{}]" ), @@ -271,6 +272,8 @@ class DataPatterns: Pattern( name="phone", patterns=[ + # Swiss full format: +41 44 315 19 19 (area + 3 + 2 + 2 digits, flexible separators) + r'\+\s*41[-.\s]?\d{2}[-.\s]?\d{3}[-.\s]?\d{2}[-.\s]?\d{2}\b', # International format r'\+\d{1,3}[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}\b', # Swiss format @@ -309,28 +312,44 @@ class DataPatterns: r'\b(?:[A-Za-zäöüßÄÖÜ]+(?:-[A-Za-zäöüßÄÖÜ]+)*(?:strasse|str\.|gasse|weg|platz|allee|boulevard|avenue|via|strada|rue|chemin|route))\s+\d{1,4}(?:/\d{1,4})?(?:[a-z])?\b', # Postfach / PO Box (standalone) r'\b(?:Postfach|Postbox|P\.?O\.?\s*Box|Case\s+postale|Casella\s+postale|Boîte\s+postale)\s+\d{1,6}\b', - # Postal code + city (standalone) - r'\b\d{4}\s+[A-Za-zäöüßÄÖÜ]+\b(?!\s*:)' + # Postal code + city (standalone); exclude year+non-city and common non-city words + # (? 0 else 4 + rc = page.insert_textbox( + rect, text, fontname=fontname, fontsize=fs, + align=0, color=text_color + ) + if rc < 0: + page.insert_textbox( + rect, text, fontname=fontname, fontsize=2, + align=0, color=text_color + ) + except Exception as e: + logger.warning(f"Insert placeholder failed: {e}") + buf = io.BytesIO() doc.save(buf, garbage=4, deflate=True) doc.close()