# Copyright (c) 2025 Patrick Motsch # All rights reserved. """Text normalisation utilities used by knowledge ingestion. The email body cleaning logic is intentionally regex-based and works on plain text after an HTML→text pass so we never store unsanitised HTML/JS in the knowledge store and retrieval stays robust (no extraneous markup tokens eating embedding budget). """ from __future__ import annotations import re from typing import Optional DEFAULT_MAX_CHARS = 8000 _QUOTE_MARKER_PATTERNS = [ re.compile(r"^\s*(?:On\s.+?\swrote:)\s*$", re.MULTILINE | re.IGNORECASE), re.compile(r"^\s*(?:Am\s.+?\sschrieb.+?:)\s*$", re.MULTILINE | re.IGNORECASE), re.compile(r"^\s*-{2,}\s*Original\s*Message\s*-{2,}\s*$", re.MULTILINE | re.IGNORECASE), re.compile(r"^\s*-{2,}\s*Urspr.+Nachricht\s*-{2,}\s*$", re.MULTILINE | re.IGNORECASE), re.compile(r"^\s*From:\s+.+$", re.MULTILINE | re.IGNORECASE), re.compile(r"^\s*Von:\s+.+$", re.MULTILINE | re.IGNORECASE), re.compile(r"^\s*Sent:\s+.+$", re.MULTILINE | re.IGNORECASE), re.compile(r"^\s*Gesendet:\s+.+$", re.MULTILINE | re.IGNORECASE), ] _SIGNATURE_MARKERS = [ re.compile(r"^\s*-{2,}\s*$", re.MULTILINE), re.compile(r"^\s*—\s*$", re.MULTILINE), re.compile(r"^\s*Best regards\b.*$", re.MULTILINE | re.IGNORECASE), re.compile(r"^\s*Kind regards\b.*$", re.MULTILINE | re.IGNORECASE), re.compile(r"^\s*Mit freundlichen Gr[üu]ßen\b.*$", re.MULTILINE | re.IGNORECASE), re.compile(r"^\s*Viele Gr[üu]ße\b.*$", re.MULTILINE | re.IGNORECASE), re.compile(r"^\s*Best,\s*$", re.MULTILINE | re.IGNORECASE), ] def _htmlToText(html: str) -> str: """Prefer BeautifulSoup when available, fall back to regex.""" try: from bs4 import BeautifulSoup # type: ignore soup = BeautifulSoup(html, "html.parser") for tag in soup(["script", "style", "head"]): tag.decompose() for br in soup.find_all(["br"]): br.replace_with("\n") for p in soup.find_all(["p", "div", "li", "tr"]): p.append("\n") text = soup.get_text() except Exception: # Minimal fallback: strip tags crudely. text = re.sub(r"", "\n", html, flags=re.IGNORECASE) text = re.sub(r"", "\n", text, flags=re.IGNORECASE) text = re.sub(r"<[^>]+>", "", text) # Collapse non-breaking + zero-width whitespace. text = text.replace("\u00a0", " ").replace("\u200b", "") return text def _stripQuotedThread(text: str) -> str: """Remove reply-chain content so only the author's own contribution remains.""" earliest = len(text) for pattern in _QUOTE_MARKER_PATTERNS: match = pattern.search(text) if match and match.start() < earliest: earliest = match.start() # Drop any block starting with "> " quoted lines (often Gmail/Thunderbird). quotedBlock = re.search(r"^(?:\s*>.*\n?)+", text, re.MULTILINE) if quotedBlock and quotedBlock.start() < earliest: earliest = quotedBlock.start() return text[:earliest].rstrip() def _stripSignature(text: str) -> str: earliest = len(text) for pattern in _SIGNATURE_MARKERS: match = pattern.search(text) if match and match.start() < earliest: earliest = match.start() return text[:earliest].rstrip() def _collapseWhitespace(text: str) -> str: text = re.sub(r"[ \t]+", " ", text) text = re.sub(r"\n{3,}", "\n\n", text) return text.strip() def cleanEmailBody(html: str, maxChars: Optional[int] = DEFAULT_MAX_CHARS) -> str: """Return a compact plain-text view of an email body suitable for embedding. Steps: HTML → text, remove quoted reply chain, remove signature, collapse whitespace, truncate to maxChars. Always returns a string (possibly empty). """ if not html: return "" text = _htmlToText(html) if "<" in html and ">" in html else html text = _stripQuotedThread(text) text = _stripSignature(text) text = _collapseWhitespace(text) if maxChars and len(text) > maxChars: text = text[:maxChars].rstrip() + "…" return text