fix: neutralization too strong

This commit is contained in:
Ida Dittrich 2026-03-13 11:28:45 +01:00
parent 6919a23d4f
commit 7c2192fb3e
3 changed files with 91 additions and 16 deletions

View file

@ -20,6 +20,8 @@ _NEUTRALIZATION_BLACKLIST = frozenset({
"Leistungen", "Basis", "Benefits", # Section labels
"Start", "Beginn", "Ende", "End", "trip", # Contract labels (Start of trip, End of trip, etc.)
"incomplete", "Application", "Complete", "Pending", # Form/status labels, not addresses
"Marketing", "Verkaufsstrategien", "Qualitätsmanagement", # Business terms, not addresses
"Ausbildungsstätte", "Realschule", # Institution types, not city names
# Ambiguous substrings match in Zurich, CHF, UID-Nr., websites, etc.
"CH", "DE", "FR", "IT", "Nr", "Nr.", "Nr:", "No", "No.", "No:",
"www", ".ch", ".com", ".org", ".net", "CHF",
@ -94,6 +96,19 @@ class StringParser:
m for m in patternMatches
if not (m[0] == "address" and any(overlaps(m[2], m[3], ds, de) for ds, de in date_ranges))
]
# For name matches: resolve overlaps by keeping only the longest match, to avoid multiple placeholders for one name
# (e.g. "Ida", "Dittrich", "Ida Dittrich" → keep only "Ida Dittrich" with one UUID)
name_matches = [(m, m[3] - m[2]) for m in patternMatches if m[0] == "name"]
name_spans = [(m[2], m[3]) for m, _ in name_matches]
patternMatches = [
m for m in patternMatches
if m[0] != "name"
or not any(
overlaps(m[2], m[3], ns, ne) and (ne - ns) > (m[3] - m[2])
for (ns, ne) in name_spans
)
]
# Process from right to left to avoid position shifts
for patternName, matchedText, start, end in reversed(patternMatches):
@ -157,25 +172,73 @@ class StringParser:
expanded.add(f"{n1} {n2}")
expanded.add(f"{n2} {n1}")
# Process longest first so "Ida Dittrich" replaces before "Ida" or "Dittrich"
# One UUID per person: composites and their parts share same UUID
# Also align with DataPatterns mapping (step 1 may have already replaced "Ida Dittrich")
name_to_uuid: Dict[str, str] = {}
for composite in sorted(expanded, key=len, reverse=True):
if " " not in composite:
continue
parts = composite.split()
parts_set = frozenset(parts)
existing_uuid = next((name_to_uuid[p] for p in parts_set if p in name_to_uuid), None)
if existing_uuid is None:
existing_uuid = next(
(self.mapping[k] for k in (composite, *parts_set) if k in self.mapping),
None
)
if existing_uuid is None:
existing_uuid = f"[name.{uuid.uuid4()}]"
for p in parts_set:
name_to_uuid[p] = existing_uuid
name_to_uuid[composite] = existing_uuid
if len(parts) == 2:
name_to_uuid[f"{parts[1]} {parts[0]}"] = existing_uuid
for n in names:
if n not in name_to_uuid:
name_to_uuid[n] = self.mapping.get(n) or f"[name.{uuid.uuid4()}]"
self.mapping.update({k: v for k, v in name_to_uuid.items() if k not in self.mapping})
# Collect ALL matches from all name patterns, then keep only longest per span to avoid
# triple replacement ("Ida" + "Dittrich" + "Ida Dittrich" -> only "Ida Dittrich")
all_matches: List[Tuple[str, int, int]] = []
for name in sorted(expanded, key=len, reverse=True):
# Composite: flexible whitespace (space, newline); single: word boundaries
if " " in name:
parts = name.split()
pattern_str = r"\b" + r"\s+".join(re.escape(p) for p in parts) + r"\b"
else:
pattern_str = r"\b" + re.escape(name) + r"\b"
pattern = re.compile(pattern_str, re.IGNORECASE)
for m in pattern.finditer(text):
all_matches.append((m.group(), m.start(), m.end()))
matches = list(pattern.finditer(text))
for match in reversed(matches):
matchedText = match.group()
if matchedText not in self.mapping:
placeholderId = str(uuid.uuid4())
self.mapping[matchedText] = f"[name.{placeholderId}]"
replacement = self.mapping[matchedText]
start, end = match.span()
text = text[:start] + replacement + text[end:]
# Remove matches that overlap with a longer match (keep longest per span)
def _overlaps(s1, e1, s2, e2):
return s1 < e2 and s2 < e1
def _contained_in_longer(matched_text: str, start: int, end: int) -> bool:
for other_text, os, oe in all_matches:
if (os, oe) == (start, end):
continue
if _overlaps(start, end, os, oe) and (oe - os) > (end - start):
return True
return False
to_replace = [(t, s, e) for t, s, e in all_matches if not _contained_in_longer(t, s, e)]
to_replace = list({(s, e): (t, s, e) for t, s, e in to_replace}.values())
# Replace from right to left to avoid position shift
for matched_text, start, end in sorted(to_replace, key=lambda x: -x[1]):
normalized = " ".join(matched_text.split())
replacement = (
self.mapping.get(matched_text)
or self.mapping.get(normalized)
or next((v for k, v in self.mapping.items() if " ".join(k.split()) == normalized), None)
or next((v for k, v in self.mapping.items() if k.lower() == matched_text.lower()), None)
)
if not replacement:
replacement = f"[name.{uuid.uuid4()}]"
self.mapping[matched_text] = replacement
text = text[:start] + replacement + text[end:]
return text

View file

@ -307,14 +307,17 @@ class DataPatterns:
name="address",
patterns=[
# Full address block: company, street, postfach, postal+city (stop before domain like , AXA.ch)
r'\b[^,\n]+(?:,\s*[^,\n]+)*,\s*\d{4}\s+[A-Za-zäöüßÄÖÜ]+\s*(?=,\s*[a-zA-Z0-9.-]+\.(?:ch|com|org|net)\b|$)',
# Street + house number (standalone)
r'\b(?:[A-Za-zäöüßÄÖÜ]+(?:-[A-Za-zäöüßÄÖÜ]+)*(?:strasse|str\.|gasse|weg|platz|allee|boulevard|avenue|via|strada|rue|chemin|route))\s+\d{1,4}(?:/\d{1,4})?(?:[a-z])?\b',
# Supports Swiss PLZ (4 digits) and German PLZ (5 digits)
r'\b[^,\n]+(?:,\s*[^,\n]+)*,\s*\d{4,5}\s+[A-Za-zäöüßÄÖÜ]+\s*(?=,\s*[a-zA-Z0-9.-]+\.(?:ch|com|org|net)\b|$)',
# Street + house number (standalone); includes "straße" for German
r'\b(?:[A-Za-zäöüßÄÖÜ]+(?:-[A-Za-zäöüßÄÖÜ]+)*(?:straße|strasse|str\.|gasse|weg|platz|allee|boulevard|avenue|via|strada|rue|chemin|route))\s+\d{1,4}(?:/\d{1,4})?(?:[a-z])?\b',
# Postfach / PO Box (standalone)
r'\b(?:Postfach|Postbox|P\.?O\.?\s*Box|Case\s+postale|Casella\s+postale|Boîte\s+postale)\s+\d{1,6}\b',
# Postal code + city (standalone); exclude year+non-city and common non-city words
# (?<!\d{2}\.\d{2}\.) = not part of date DD.MM.YYYY (e.g. 27.01.2026)
r'(?<!\d{2}\.\d{2}\.)\b\d{4}\s+(?!den|der|die|das|dem|des|und|oder|für|bei|mit|Version|Versand|Vertrag|Verfügung|Verschickung|Versicherung|erhalten|Schreiben|Jahr|Jahres|incomplete|Application|Complete|Pending|Matrikel|Student|Studien|Kontakt|Telefon|Rechnung|Invoice)[A-Za-zäöüßÄÖÜ]+\b(?!\s*:)'
# Exclude business terms (Marketing, Qualitätsmanagement, etc.) that often follow years
# Swiss PLZ (4 digits) and German PLZ (5 digits)
r'(?<!\d{2}\.\d{2}\.)\b\d{4,5}\s+(?!den|der|die|das|dem|des|und|oder|für|bei|mit|Version|Versand|Vertrag|Verfügung|Verschickung|Versicherung|erhalten|Schreiben|Jahr|Jahres|incomplete|Application|Complete|Pending|Matrikel|Student|Studien|Kontakt|Telefon|Rechnung|Invoice|Marketing|Verkaufsstrategien|Qualitätsmanagement|Management|Strategien|Projektmanagement|Vertrieb|Vertriebsstrategien|Ausbildungsstätte|Realschule)[A-Za-zäöüßÄÖÜ]+\b(?!\s*:)'
],
replacement_template="[ADDRESS_{}]"
),

View file

@ -56,7 +56,16 @@ def neutralize_pdf_in_place(
logger.error(f"Failed to open PDF: {e}")
return None
sorted_items = sorted(mapping.items(), key=lambda x: -len(x[0]))
# For same placeholder: only search longest original_text to avoid triple overlay
# (e.g. "Ida Dittrich", "Ida", "Dittrich" all map to [name.x] → only search "Ida Dittrich")
placeholder_to_longest: Dict[str, str] = {}
for orig, ph in mapping.items():
if not orig or not ph:
continue
if ph not in placeholder_to_longest or len(orig) > len(placeholder_to_longest[ph]):
placeholder_to_longest[ph] = orig
filtered = [(orig, ph) for ph, orig in placeholder_to_longest.items()]
sorted_items = sorted(filtered, key=lambda x: -len(x[0]))
fill_color = (1, 1, 1)
text_color = (0, 0, 0)
fontname = "helv"