fix: neutralization too strong
This commit is contained in:
parent
6919a23d4f
commit
7c2192fb3e
3 changed files with 91 additions and 16 deletions
|
|
@ -20,6 +20,8 @@ _NEUTRALIZATION_BLACKLIST = frozenset({
|
||||||
"Leistungen", "Basis", "Benefits", # Section labels
|
"Leistungen", "Basis", "Benefits", # Section labels
|
||||||
"Start", "Beginn", "Ende", "End", "trip", # Contract labels (Start of trip, End of trip, etc.)
|
"Start", "Beginn", "Ende", "End", "trip", # Contract labels (Start of trip, End of trip, etc.)
|
||||||
"incomplete", "Application", "Complete", "Pending", # Form/status labels, not addresses
|
"incomplete", "Application", "Complete", "Pending", # Form/status labels, not addresses
|
||||||
|
"Marketing", "Verkaufsstrategien", "Qualitätsmanagement", # Business terms, not addresses
|
||||||
|
"Ausbildungsstätte", "Realschule", # Institution types, not city names
|
||||||
# Ambiguous substrings – match in Zurich, CHF, UID-Nr., websites, etc.
|
# Ambiguous substrings – match in Zurich, CHF, UID-Nr., websites, etc.
|
||||||
"CH", "DE", "FR", "IT", "Nr", "Nr.", "Nr:", "No", "No.", "No:",
|
"CH", "DE", "FR", "IT", "Nr", "Nr.", "Nr:", "No", "No.", "No:",
|
||||||
"www", ".ch", ".com", ".org", ".net", "CHF",
|
"www", ".ch", ".com", ".org", ".net", "CHF",
|
||||||
|
|
@ -94,6 +96,19 @@ class StringParser:
|
||||||
m for m in patternMatches
|
m for m in patternMatches
|
||||||
if not (m[0] == "address" and any(overlaps(m[2], m[3], ds, de) for ds, de in date_ranges))
|
if not (m[0] == "address" and any(overlaps(m[2], m[3], ds, de) for ds, de in date_ranges))
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# For name matches: resolve overlaps – keep only longest to avoid multiple placeholders for one name
|
||||||
|
# (e.g. "Ida", "Dittrich", "Ida Dittrich" → keep only "Ida Dittrich" with one UUID)
|
||||||
|
name_matches = [(m, m[3] - m[2]) for m in patternMatches if m[0] == "name"]
|
||||||
|
name_spans = [(m[2], m[3]) for m, _ in name_matches]
|
||||||
|
patternMatches = [
|
||||||
|
m for m in patternMatches
|
||||||
|
if m[0] != "name"
|
||||||
|
or not any(
|
||||||
|
overlaps(m[2], m[3], ns, ne) and (ne - ns) > (m[3] - m[2])
|
||||||
|
for (ns, ne) in name_spans
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
# Process from right to left to avoid position shifts
|
# Process from right to left to avoid position shifts
|
||||||
for patternName, matchedText, start, end in reversed(patternMatches):
|
for patternName, matchedText, start, end in reversed(patternMatches):
|
||||||
|
|
@ -157,25 +172,73 @@ class StringParser:
|
||||||
expanded.add(f"{n1} {n2}")
|
expanded.add(f"{n1} {n2}")
|
||||||
expanded.add(f"{n2} {n1}")
|
expanded.add(f"{n2} {n1}")
|
||||||
|
|
||||||
# Process longest first so "Ida Dittrich" replaces before "Ida" or "Dittrich"
|
# One UUID per person: composites and their parts share same UUID
|
||||||
|
# Also align with DataPatterns mapping (step 1 may have already replaced "Ida Dittrich")
|
||||||
|
name_to_uuid: Dict[str, str] = {}
|
||||||
|
for composite in sorted(expanded, key=len, reverse=True):
|
||||||
|
if " " not in composite:
|
||||||
|
continue
|
||||||
|
parts = composite.split()
|
||||||
|
parts_set = frozenset(parts)
|
||||||
|
existing_uuid = next((name_to_uuid[p] for p in parts_set if p in name_to_uuid), None)
|
||||||
|
if existing_uuid is None:
|
||||||
|
existing_uuid = next(
|
||||||
|
(self.mapping[k] for k in (composite, *parts_set) if k in self.mapping),
|
||||||
|
None
|
||||||
|
)
|
||||||
|
if existing_uuid is None:
|
||||||
|
existing_uuid = f"[name.{uuid.uuid4()}]"
|
||||||
|
for p in parts_set:
|
||||||
|
name_to_uuid[p] = existing_uuid
|
||||||
|
name_to_uuid[composite] = existing_uuid
|
||||||
|
if len(parts) == 2:
|
||||||
|
name_to_uuid[f"{parts[1]} {parts[0]}"] = existing_uuid
|
||||||
|
for n in names:
|
||||||
|
if n not in name_to_uuid:
|
||||||
|
name_to_uuid[n] = self.mapping.get(n) or f"[name.{uuid.uuid4()}]"
|
||||||
|
self.mapping.update({k: v for k, v in name_to_uuid.items() if k not in self.mapping})
|
||||||
|
|
||||||
|
# Collect ALL matches from all name patterns, then keep only longest per span to avoid
|
||||||
|
# triple replacement ("Ida" + "Dittrich" + "Ida Dittrich" -> only "Ida Dittrich")
|
||||||
|
all_matches: List[Tuple[str, int, int]] = []
|
||||||
for name in sorted(expanded, key=len, reverse=True):
|
for name in sorted(expanded, key=len, reverse=True):
|
||||||
# Composite: flexible whitespace (space, newline); single: word boundaries
|
|
||||||
if " " in name:
|
if " " in name:
|
||||||
parts = name.split()
|
parts = name.split()
|
||||||
pattern_str = r"\b" + r"\s+".join(re.escape(p) for p in parts) + r"\b"
|
pattern_str = r"\b" + r"\s+".join(re.escape(p) for p in parts) + r"\b"
|
||||||
else:
|
else:
|
||||||
pattern_str = r"\b" + re.escape(name) + r"\b"
|
pattern_str = r"\b" + re.escape(name) + r"\b"
|
||||||
pattern = re.compile(pattern_str, re.IGNORECASE)
|
pattern = re.compile(pattern_str, re.IGNORECASE)
|
||||||
|
for m in pattern.finditer(text):
|
||||||
|
all_matches.append((m.group(), m.start(), m.end()))
|
||||||
|
|
||||||
matches = list(pattern.finditer(text))
|
# Remove matches that overlap with a longer match (keep longest per span)
|
||||||
for match in reversed(matches):
|
def _overlaps(s1, e1, s2, e2):
|
||||||
matchedText = match.group()
|
return s1 < e2 and s2 < e1
|
||||||
if matchedText not in self.mapping:
|
|
||||||
placeholderId = str(uuid.uuid4())
|
def _contained_in_longer(matched_text: str, start: int, end: int) -> bool:
|
||||||
self.mapping[matchedText] = f"[name.{placeholderId}]"
|
for other_text, os, oe in all_matches:
|
||||||
replacement = self.mapping[matchedText]
|
if (os, oe) == (start, end):
|
||||||
start, end = match.span()
|
continue
|
||||||
text = text[:start] + replacement + text[end:]
|
if _overlaps(start, end, os, oe) and (oe - os) > (end - start):
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
to_replace = [(t, s, e) for t, s, e in all_matches if not _contained_in_longer(t, s, e)]
|
||||||
|
to_replace = list({(s, e): (t, s, e) for t, s, e in to_replace}.values())
|
||||||
|
|
||||||
|
# Replace from right to left to avoid position shift
|
||||||
|
for matched_text, start, end in sorted(to_replace, key=lambda x: -x[1]):
|
||||||
|
normalized = " ".join(matched_text.split())
|
||||||
|
replacement = (
|
||||||
|
self.mapping.get(matched_text)
|
||||||
|
or self.mapping.get(normalized)
|
||||||
|
or next((v for k, v in self.mapping.items() if " ".join(k.split()) == normalized), None)
|
||||||
|
or next((v for k, v in self.mapping.items() if k.lower() == matched_text.lower()), None)
|
||||||
|
)
|
||||||
|
if not replacement:
|
||||||
|
replacement = f"[name.{uuid.uuid4()}]"
|
||||||
|
self.mapping[matched_text] = replacement
|
||||||
|
text = text[:start] + replacement + text[end:]
|
||||||
|
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -307,14 +307,17 @@ class DataPatterns:
|
||||||
name="address",
|
name="address",
|
||||||
patterns=[
|
patterns=[
|
||||||
# Full address block: company, street, postfach, postal+city (stop before domain like , AXA.ch)
|
# Full address block: company, street, postfach, postal+city (stop before domain like , AXA.ch)
|
||||||
r'\b[^,\n]+(?:,\s*[^,\n]+)*,\s*\d{4}\s+[A-Za-zäöüßÄÖÜ]+\s*(?=,\s*[a-zA-Z0-9.-]+\.(?:ch|com|org|net)\b|$)',
|
# Supports Swiss PLZ (4 digits) and German PLZ (5 digits)
|
||||||
# Street + house number (standalone)
|
r'\b[^,\n]+(?:,\s*[^,\n]+)*,\s*\d{4,5}\s+[A-Za-zäöüßÄÖÜ]+\s*(?=,\s*[a-zA-Z0-9.-]+\.(?:ch|com|org|net)\b|$)',
|
||||||
r'\b(?:[A-Za-zäöüßÄÖÜ]+(?:-[A-Za-zäöüßÄÖÜ]+)*(?:strasse|str\.|gasse|weg|platz|allee|boulevard|avenue|via|strada|rue|chemin|route))\s+\d{1,4}(?:/\d{1,4})?(?:[a-z])?\b',
|
# Street + house number (standalone); includes "straße" for German
|
||||||
|
r'\b(?:[A-Za-zäöüßÄÖÜ]+(?:-[A-Za-zäöüßÄÖÜ]+)*(?:straße|strasse|str\.|gasse|weg|platz|allee|boulevard|avenue|via|strada|rue|chemin|route))\s+\d{1,4}(?:/\d{1,4})?(?:[a-z])?\b',
|
||||||
# Postfach / PO Box (standalone)
|
# Postfach / PO Box (standalone)
|
||||||
r'\b(?:Postfach|Postbox|P\.?O\.?\s*Box|Case\s+postale|Casella\s+postale|Boîte\s+postale)\s+\d{1,6}\b',
|
r'\b(?:Postfach|Postbox|P\.?O\.?\s*Box|Case\s+postale|Casella\s+postale|Boîte\s+postale)\s+\d{1,6}\b',
|
||||||
# Postal code + city (standalone); exclude year+non-city and common non-city words
|
# Postal code + city (standalone); exclude year+non-city and common non-city words
|
||||||
# (?<!\d{2}\.\d{2}\.) = not part of date DD.MM.YYYY (e.g. 27.01.2026)
|
# (?<!\d{2}\.\d{2}\.) = not part of date DD.MM.YYYY (e.g. 27.01.2026)
|
||||||
r'(?<!\d{2}\.\d{2}\.)\b\d{4}\s+(?!den|der|die|das|dem|des|und|oder|für|bei|mit|Version|Versand|Vertrag|Verfügung|Verschickung|Versicherung|erhalten|Schreiben|Jahr|Jahres|incomplete|Application|Complete|Pending|Matrikel|Student|Studien|Kontakt|Telefon|Rechnung|Invoice)[A-Za-zäöüßÄÖÜ]+\b(?!\s*:)'
|
# Exclude business terms (Marketing, Qualitätsmanagement, etc.) – often follow years
|
||||||
|
# Swiss PLZ (4 digits) and German PLZ (5 digits)
|
||||||
|
r'(?<!\d{2}\.\d{2}\.)\b\d{4,5}\s+(?!den|der|die|das|dem|des|und|oder|für|bei|mit|Version|Versand|Vertrag|Verfügung|Verschickung|Versicherung|erhalten|Schreiben|Jahr|Jahres|incomplete|Application|Complete|Pending|Matrikel|Student|Studien|Kontakt|Telefon|Rechnung|Invoice|Marketing|Verkaufsstrategien|Qualitätsmanagement|Management|Strategien|Projektmanagement|Vertrieb|Vertriebsstrategien|Ausbildungsstätte|Realschule)[A-Za-zäöüßÄÖÜ]+\b(?!\s*:)'
|
||||||
],
|
],
|
||||||
replacement_template="[ADDRESS_{}]"
|
replacement_template="[ADDRESS_{}]"
|
||||||
),
|
),
|
||||||
|
|
|
||||||
|
|
@ -56,7 +56,16 @@ def neutralize_pdf_in_place(
|
||||||
logger.error(f"Failed to open PDF: {e}")
|
logger.error(f"Failed to open PDF: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
sorted_items = sorted(mapping.items(), key=lambda x: -len(x[0]))
|
# For same placeholder: only search longest original_text to avoid triple overlay
|
||||||
|
# (e.g. "Ida Dittrich", "Ida", "Dittrich" all map to [name.x] → only search "Ida Dittrich")
|
||||||
|
placeholder_to_longest: Dict[str, str] = {}
|
||||||
|
for orig, ph in mapping.items():
|
||||||
|
if not orig or not ph:
|
||||||
|
continue
|
||||||
|
if ph not in placeholder_to_longest or len(orig) > len(placeholder_to_longest[ph]):
|
||||||
|
placeholder_to_longest[ph] = orig
|
||||||
|
filtered = [(orig, ph) for ph, orig in placeholder_to_longest.items()]
|
||||||
|
sorted_items = sorted(filtered, key=lambda x: -len(x[0]))
|
||||||
fill_color = (1, 1, 1)
|
fill_color = (1, 1, 1)
|
||||||
text_color = (0, 0, 0)
|
text_color = (0, 0, 0)
|
||||||
fontname = "helv"
|
fontname = "helv"
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue