fix: neutralization too strong
This commit is contained in:
parent
6919a23d4f
commit
7c2192fb3e
3 changed files with 91 additions and 16 deletions
|
|
@ -20,6 +20,8 @@ _NEUTRALIZATION_BLACKLIST = frozenset({
|
|||
"Leistungen", "Basis", "Benefits", # Section labels
|
||||
"Start", "Beginn", "Ende", "End", "trip", # Contract labels (Start of trip, End of trip, etc.)
|
||||
"incomplete", "Application", "Complete", "Pending", # Form/status labels, not addresses
|
||||
"Marketing", "Verkaufsstrategien", "Qualitätsmanagement", # Business terms, not addresses
|
||||
"Ausbildungsstätte", "Realschule", # Institution types, not city names
|
||||
# Ambiguous substrings – match in Zurich, CHF, UID-Nr., websites, etc.
|
||||
"CH", "DE", "FR", "IT", "Nr", "Nr.", "Nr:", "No", "No.", "No:",
|
||||
"www", ".ch", ".com", ".org", ".net", "CHF",
|
||||
|
|
@ -95,6 +97,19 @@ class StringParser:
|
|||
if not (m[0] == "address" and any(overlaps(m[2], m[3], ds, de) for ds, de in date_ranges))
|
||||
]
|
||||
|
||||
# For name matches: resolve overlaps – keep only longest to avoid multiple placeholders for one name
|
||||
# (e.g. "Ida", "Dittrich", "Ida Dittrich" → keep only "Ida Dittrich" with one UUID)
|
||||
name_matches = [(m, m[3] - m[2]) for m in patternMatches if m[0] == "name"]
|
||||
name_spans = [(m[2], m[3]) for m, _ in name_matches]
|
||||
patternMatches = [
|
||||
m for m in patternMatches
|
||||
if m[0] != "name"
|
||||
or not any(
|
||||
overlaps(m[2], m[3], ns, ne) and (ne - ns) > (m[3] - m[2])
|
||||
for (ns, ne) in name_spans
|
||||
)
|
||||
]
|
||||
|
||||
# Process from right to left to avoid position shifts
|
||||
for patternName, matchedText, start, end in reversed(patternMatches):
|
||||
# Skip if already a placeholder
|
||||
|
|
@ -157,25 +172,73 @@ class StringParser:
|
|||
expanded.add(f"{n1} {n2}")
|
||||
expanded.add(f"{n2} {n1}")
|
||||
|
||||
# Process longest first so "Ida Dittrich" replaces before "Ida" or "Dittrich"
|
||||
# One UUID per person: composites and their parts share same UUID
|
||||
# Also align with DataPatterns mapping (step 1 may have already replaced "Ida Dittrich")
|
||||
name_to_uuid: Dict[str, str] = {}
|
||||
for composite in sorted(expanded, key=len, reverse=True):
|
||||
if " " not in composite:
|
||||
continue
|
||||
parts = composite.split()
|
||||
parts_set = frozenset(parts)
|
||||
existing_uuid = next((name_to_uuid[p] for p in parts_set if p in name_to_uuid), None)
|
||||
if existing_uuid is None:
|
||||
existing_uuid = next(
|
||||
(self.mapping[k] for k in (composite, *parts_set) if k in self.mapping),
|
||||
None
|
||||
)
|
||||
if existing_uuid is None:
|
||||
existing_uuid = f"[name.{uuid.uuid4()}]"
|
||||
for p in parts_set:
|
||||
name_to_uuid[p] = existing_uuid
|
||||
name_to_uuid[composite] = existing_uuid
|
||||
if len(parts) == 2:
|
||||
name_to_uuid[f"{parts[1]} {parts[0]}"] = existing_uuid
|
||||
for n in names:
|
||||
if n not in name_to_uuid:
|
||||
name_to_uuid[n] = self.mapping.get(n) or f"[name.{uuid.uuid4()}]"
|
||||
self.mapping.update({k: v for k, v in name_to_uuid.items() if k not in self.mapping})
|
||||
|
||||
# Collect ALL matches from all name patterns, then keep only longest per span to avoid
|
||||
# triple replacement ("Ida" + "Dittrich" + "Ida Dittrich" -> only "Ida Dittrich")
|
||||
all_matches: List[Tuple[str, int, int]] = []
|
||||
for name in sorted(expanded, key=len, reverse=True):
|
||||
# Composite: flexible whitespace (space, newline); single: word boundaries
|
||||
if " " in name:
|
||||
parts = name.split()
|
||||
pattern_str = r"\b" + r"\s+".join(re.escape(p) for p in parts) + r"\b"
|
||||
else:
|
||||
pattern_str = r"\b" + re.escape(name) + r"\b"
|
||||
pattern = re.compile(pattern_str, re.IGNORECASE)
|
||||
for m in pattern.finditer(text):
|
||||
all_matches.append((m.group(), m.start(), m.end()))
|
||||
|
||||
matches = list(pattern.finditer(text))
|
||||
for match in reversed(matches):
|
||||
matchedText = match.group()
|
||||
if matchedText not in self.mapping:
|
||||
placeholderId = str(uuid.uuid4())
|
||||
self.mapping[matchedText] = f"[name.{placeholderId}]"
|
||||
replacement = self.mapping[matchedText]
|
||||
start, end = match.span()
|
||||
text = text[:start] + replacement + text[end:]
|
||||
# Remove matches that overlap with a longer match (keep longest per span)
|
||||
def _overlaps(s1, e1, s2, e2):
|
||||
return s1 < e2 and s2 < e1
|
||||
|
||||
def _contained_in_longer(matched_text: str, start: int, end: int) -> bool:
|
||||
for other_text, os, oe in all_matches:
|
||||
if (os, oe) == (start, end):
|
||||
continue
|
||||
if _overlaps(start, end, os, oe) and (oe - os) > (end - start):
|
||||
return True
|
||||
return False
|
||||
|
||||
to_replace = [(t, s, e) for t, s, e in all_matches if not _contained_in_longer(t, s, e)]
|
||||
to_replace = list({(s, e): (t, s, e) for t, s, e in to_replace}.values())
|
||||
|
||||
# Replace from right to left to avoid position shift
|
||||
for matched_text, start, end in sorted(to_replace, key=lambda x: -x[1]):
|
||||
normalized = " ".join(matched_text.split())
|
||||
replacement = (
|
||||
self.mapping.get(matched_text)
|
||||
or self.mapping.get(normalized)
|
||||
or next((v for k, v in self.mapping.items() if " ".join(k.split()) == normalized), None)
|
||||
or next((v for k, v in self.mapping.items() if k.lower() == matched_text.lower()), None)
|
||||
)
|
||||
if not replacement:
|
||||
replacement = f"[name.{uuid.uuid4()}]"
|
||||
self.mapping[matched_text] = replacement
|
||||
text = text[:start] + replacement + text[end:]
|
||||
|
||||
return text
|
||||
|
||||
|
|
|
|||
|
|
@ -307,14 +307,17 @@ class DataPatterns:
|
|||
name="address",
|
||||
patterns=[
|
||||
# Full address block: company, street, postfach, postal+city (stop before domain like , AXA.ch)
|
||||
r'\b[^,\n]+(?:,\s*[^,\n]+)*,\s*\d{4}\s+[A-Za-zäöüßÄÖÜ]+\s*(?=,\s*[a-zA-Z0-9.-]+\.(?:ch|com|org|net)\b|$)',
|
||||
# Street + house number (standalone)
|
||||
r'\b(?:[A-Za-zäöüßÄÖÜ]+(?:-[A-Za-zäöüßÄÖÜ]+)*(?:strasse|str\.|gasse|weg|platz|allee|boulevard|avenue|via|strada|rue|chemin|route))\s+\d{1,4}(?:/\d{1,4})?(?:[a-z])?\b',
|
||||
# Supports Swiss PLZ (4 digits) and German PLZ (5 digits)
|
||||
r'\b[^,\n]+(?:,\s*[^,\n]+)*,\s*\d{4,5}\s+[A-Za-zäöüßÄÖÜ]+\s*(?=,\s*[a-zA-Z0-9.-]+\.(?:ch|com|org|net)\b|$)',
|
||||
# Street + house number (standalone); includes "straße" for German
|
||||
r'\b(?:[A-Za-zäöüßÄÖÜ]+(?:-[A-Za-zäöüßÄÖÜ]+)*(?:straße|strasse|str\.|gasse|weg|platz|allee|boulevard|avenue|via|strada|rue|chemin|route))\s+\d{1,4}(?:/\d{1,4})?(?:[a-z])?\b',
|
||||
# Postfach / PO Box (standalone)
|
||||
r'\b(?:Postfach|Postbox|P\.?O\.?\s*Box|Case\s+postale|Casella\s+postale|Boîte\s+postale)\s+\d{1,6}\b',
|
||||
# Postal code + city (standalone); exclude year+non-city and common non-city words
|
||||
# (?<!\d{2}\.\d{2}\.) = not part of date DD.MM.YYYY (e.g. 27.01.2026)
|
||||
r'(?<!\d{2}\.\d{2}\.)\b\d{4}\s+(?!den|der|die|das|dem|des|und|oder|für|bei|mit|Version|Versand|Vertrag|Verfügung|Verschickung|Versicherung|erhalten|Schreiben|Jahr|Jahres|incomplete|Application|Complete|Pending|Matrikel|Student|Studien|Kontakt|Telefon|Rechnung|Invoice)[A-Za-zäöüßÄÖÜ]+\b(?!\s*:)'
|
||||
# Exclude business terms (Marketing, Qualitätsmanagement, etc.) – often follow years
|
||||
# Swiss PLZ (4 digits) and German PLZ (5 digits)
|
||||
r'(?<!\d{2}\.\d{2}\.)\b\d{4,5}\s+(?!den|der|die|das|dem|des|und|oder|für|bei|mit|Version|Versand|Vertrag|Verfügung|Verschickung|Versicherung|erhalten|Schreiben|Jahr|Jahres|incomplete|Application|Complete|Pending|Matrikel|Student|Studien|Kontakt|Telefon|Rechnung|Invoice|Marketing|Verkaufsstrategien|Qualitätsmanagement|Management|Strategien|Projektmanagement|Vertrieb|Vertriebsstrategien|Ausbildungsstätte|Realschule)[A-Za-zäöüßÄÖÜ]+\b(?!\s*:)'
|
||||
],
|
||||
replacement_template="[ADDRESS_{}]"
|
||||
),
|
||||
|
|
|
|||
|
|
@ -56,7 +56,16 @@ def neutralize_pdf_in_place(
|
|||
logger.error(f"Failed to open PDF: {e}")
|
||||
return None
|
||||
|
||||
sorted_items = sorted(mapping.items(), key=lambda x: -len(x[0]))
|
||||
# For same placeholder: only search longest original_text to avoid triple overlay
|
||||
# (e.g. "Ida Dittrich", "Ida", "Dittrich" all map to [name.x] → only search "Ida Dittrich")
|
||||
placeholder_to_longest: Dict[str, str] = {}
|
||||
for orig, ph in mapping.items():
|
||||
if not orig or not ph:
|
||||
continue
|
||||
if ph not in placeholder_to_longest or len(orig) > len(placeholder_to_longest[ph]):
|
||||
placeholder_to_longest[ph] = orig
|
||||
filtered = [(orig, ph) for ph, orig in placeholder_to_longest.items()]
|
||||
sorted_items = sorted(filtered, key=lambda x: -len(x[0]))
|
||||
fill_color = (1, 1, 1)
|
||||
text_color = (0, 0, 0)
|
||||
fontname = "helv"
|
||||
|
|
|
|||
Loading…
Reference in a new issue