# gateway/modules/serviceCenter/services/serviceKnowledge/subTextClean.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Text normalisation utilities used by knowledge ingestion.

The email body cleaning logic is intentionally regex-based and works on plain
text after an HTML→text pass so we never store unsanitised HTML/JS in the
knowledge store and retrieval stays robust (no extraneous markup tokens
eating embedding budget).
"""

from __future__ import annotations

import re
from typing import Optional

# Default character limit for a cleaned body; used as the default value of
# ``maxChars`` in ``cleanEmailBody``.
DEFAULT_MAX_CHARS = 8000


_QUOTE_MARKER_PATTERNS = [
    re.compile(r"^\s*(?:On\s.+?\swrote:)\s*$", re.MULTILINE | re.IGNORECASE),
    re.compile(r"^\s*(?:Am\s.+?\sschrieb.+?:)\s*$", re.MULTILINE | re.IGNORECASE),
    re.compile(r"^\s*-{2,}\s*Original\s*Message\s*-{2,}\s*$", re.MULTILINE | re.IGNORECASE),
    re.compile(r"^\s*-{2,}\s*Urspr.+Nachricht\s*-{2,}\s*$", re.MULTILINE | re.IGNORECASE),
    re.compile(r"^\s*From:\s+.+$", re.MULTILINE | re.IGNORECASE),
    re.compile(r"^\s*Von:\s+.+$", re.MULTILINE | re.IGNORECASE),
    re.compile(r"^\s*Sent:\s+.+$", re.MULTILINE | re.IGNORECASE),
    re.compile(r"^\s*Gesendet:\s+.+$", re.MULTILINE | re.IGNORECASE),
]

# Line patterns that mark the start of a signature or sign-off;
# ``_stripSignature`` drops everything from the earliest match onwards.
# The German closings accept "ü"/"ue"/"u" and "ß"/"ss" so that Swiss spelling
# ("Grüssen") and ASCII transliterations ("Gruessen") are recognised as well;
# re.IGNORECASE alone does not treat "ß" and "ss" as equivalent.
_SIGNATURE_MARKERS = [
    re.compile(r"^\s*-{2,}\s*$", re.MULTILINE),
    re.compile(r"^\s*—\s*$", re.MULTILINE),
    re.compile(r"^\s*Best regards\b.*$", re.MULTILINE | re.IGNORECASE),
    re.compile(r"^\s*Kind regards\b.*$", re.MULTILINE | re.IGNORECASE),
    re.compile(r"^\s*Mit freundlichen Gr(?:ü|ue|u)(?:ß|ss)en\b.*$", re.MULTILINE | re.IGNORECASE),
    re.compile(r"^\s*Viele Gr(?:ü|ue|u)(?:ß|ss)e\b.*$", re.MULTILINE | re.IGNORECASE),
    re.compile(r"^\s*Best,\s*$", re.MULTILINE | re.IGNORECASE),
]


def _htmlToText(html: str) -> str:
    """Convert HTML markup to plain text.

    BeautifulSoup is preferred when it can be imported and parses the input;
    otherwise a regex-based fallback is used. Both paths discard the content
    of ``<script>``, ``<style>`` and ``<head>`` elements, turn ``<br>`` and the
    end of ``<p>``/``<div>``/``<li>``/``<tr>`` into newlines, and decode
    character references.

    Args:
        html: Markup to convert.

    Returns:
        The extracted text, with non-breaking spaces replaced by regular
        spaces and zero-width spaces removed.
    """
    try:
        from bs4 import BeautifulSoup  # type: ignore

        soup = BeautifulSoup(html, "html.parser")
        for tag in soup(["script", "style", "head"]):
            tag.decompose()
        for br in soup.find_all(["br"]):
            br.replace_with("\n")
        for p in soup.find_all(["p", "div", "li", "tr"]):
            p.append("\n")
        text = soup.get_text()
    except Exception:
        # Deliberately broad (best effort): covers both a missing bs4 install
        # and any parser failure. The ``html`` parameter shadows the stdlib
        # module name, hence the explicit ``from html import``.
        from html import unescape

        # Comments first: their bodies may contain ">" (e.g. Outlook
        # conditional comments), which the generic tag regex cannot handle.
        text = re.sub(r"<!--.*?-->", "", html, flags=re.DOTALL)
        # Remove non-content elements together with their bodies so that
        # JS/CSS source never ends up in the extracted text.
        text = re.sub(
            r"<(script|style|head)\b[^>]*>.*?</\1\s*>",
            "",
            text,
            flags=re.IGNORECASE | re.DOTALL,
        )
        text = re.sub(r"<br\s*/?>", "\n", text, flags=re.IGNORECASE)
        text = re.sub(r"</(?:p|div|li|tr)>", "\n", text, flags=re.IGNORECASE)
        text = re.sub(r"<[^>]+>", "", text)
        # Decode entities only after tag stripping so that an escaped
        # "&lt;b&gt;" survives as literal text instead of being removed.
        text = unescape(text)
    # Collapse non-breaking + zero-width whitespace.
    text = text.replace("\u00a0", " ").replace("\u200b", "")
    return text


def _stripQuotedThread(text: str) -> str:
    """Cut off the quoted reply chain and return what precedes it.

    The cut position is the smallest offset among: the first match of each
    pattern in ``_QUOTE_MARKER_PATTERNS``, the start of the first block of
    ``>``-prefixed lines, and the end of the text (nothing found). Trailing
    whitespace of the remaining part is removed.
    """
    # The end of the text is always a candidate, so "no match" keeps everything.
    cutCandidates = [len(text)]
    cutCandidates.extend(
        hit.start()
        for hit in (marker.search(text) for marker in _QUOTE_MARKER_PATTERNS)
        if hit
    )
    # Blocks of "> " quoted lines (often Gmail/Thunderbird).
    firstQuoted = re.search(r"^(?:\s*>.*\n?)+", text, re.MULTILINE)
    if firstQuoted:
        cutCandidates.append(firstQuoted.start())
    return text[: min(cutCandidates)].rstrip()


def _stripSignature(text: str) -> str:
    """Cut the text at the first signature marker and trim trailing whitespace.

    Every pattern in ``_SIGNATURE_MARKERS`` is searched and the smallest match
    offset wins. When nothing matches, the text is only right-stripped.
    """
    hits = (marker.search(text) for marker in _SIGNATURE_MARKERS)
    cutAt = min((hit.start() for hit in hits if hit), default=len(text))
    return text[:cutAt].rstrip()


def _collapseWhitespace(text: str) -> str:
    """Normalise whitespace so the text is compact.

    Line endings are unified to ``\\n``, runs of spaces/tabs become a single
    space, trailing blanks before a line break are removed, and three or more
    consecutive line breaks are reduced to two (one blank line). Leading and
    trailing whitespace of the whole text is stripped.

    Args:
        text: Plain text to normalise.

    Returns:
        The normalised text (possibly empty).
    """
    # CRLF / lone CR would otherwise sit between the "\n" characters and keep
    # the blank-line collapse below from matching.
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    text = re.sub(r"[ \t]+", " ", text)
    # After the step above any blank run is exactly one space; dropping it in
    # front of a line break turns whitespace-only lines into truly empty ones.
    text = re.sub(r" \n", "\n", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


# Heuristic for "this body contains real markup": an HTML comment, a doctype,
# or something shaped like a tag (name directly after "<", then whitespace,
# "/" or ">"). Plain-text constructs such as "> quoted", "Name <a@b.c>",
# "<https://…>" or "a < b and c > d" do not match.
_HTML_MARKUP_HINT = re.compile(
    r"<!--|<!DOCTYPE\b|</?[A-Za-z][A-Za-z0-9:-]*(?:\s[^<>]*)?/?>",
    re.IGNORECASE,
)


def cleanEmailBody(html: str, maxChars: Optional[int] = DEFAULT_MAX_CHARS) -> str:
    """Return a compact plain-text view of an email body suitable for embedding.

    Steps: HTML → text (only when the body actually contains markup), remove
    quoted reply chain, remove signature, collapse whitespace, truncate to
    maxChars.

    Args:
        html: Email body, either HTML or plain text. Falsy input yields "".
        maxChars: Maximum length of the result, including the trailing "…"
            that marks a truncation. ``None``, ``0`` or a negative value
            disables truncation.

    Returns:
        The cleaned text. Always a string (possibly empty).
    """
    if not html:
        return ""
    if _HTML_MARKUP_HINT.search(html):
        text = _htmlToText(html)
    else:
        # Plain text must not go through tag stripping (it would eat text
        # between "<" and ">"), but gets the same non-breaking / zero-width
        # normalisation as the HTML path.
        text = html.replace("\u00a0", " ").replace("\u200b", "")
    text = _stripQuotedThread(text)
    text = _stripSignature(text)
    text = _collapseWhitespace(text)
    if maxChars is not None and maxChars > 0 and len(text) > maxChars:
        # Reserve one character for the ellipsis so the result never exceeds
        # maxChars.
        text = text[: maxChars - 1].rstrip() + "…"
    return text