gateway/modules/serviceCenter/services/serviceKnowledge/subTextClean.py
Ida 6a5ff1ff7c feat(rag): P1 user-connection hooks + retrieval threshold fix
- connection.established/revoked callbacks from OAuth routes and
  connection management endpoints
- KnowledgeIngestionConsumer dispatches bootstrap job (established)
  and synchronous purge (revoked)
- FileContentIndex: add connectionId + sourceKind columns
- SharePoint bootstrap with @odata.nextLink pagination and eTag-based
  idempotency
- Outlook bootstrap treats messages as virtual documents with
  cleanEmailBody for HTML/quote/signature stripping
- fix(rag): lower buildAgentContext minScore thresholds from
  0.55/0.65/0.70 to 0.35 — previous values blocked all real matches
  from text-embedding-3-small
- 24 new unit tests covering purge, consumer dispatch, email cleaning
  and both bootstrap paths
2026-04-29 14:39:40 +02:00

107 lines
4.1 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Text normalisation utilities used by knowledge ingestion.
The email body cleaning logic is intentionally regex-based and works on plain
text after an HTML→text pass so we never store unsanitised HTML/JS in the
knowledge store and retrieval stays robust (no extraneous markup tokens
eating embedding budget).
"""
from __future__ import annotations
import re
from typing import Optional
# Default value of cleanEmailBody's ``maxChars`` truncation limit.
DEFAULT_MAX_CHARS = 8000

# Line-anchored markers that introduce a quoted reply chain. Every entry is
# compiled case-insensitively and in MULTILINE mode so ``^``/``$`` bind to
# individual lines of the body.
_QUOTE_MARKER_PATTERNS = [
    re.compile(expression, re.MULTILINE | re.IGNORECASE)
    for expression in (
        # Attribution lines: "On <date> <person> wrote:" / "Am <date> schrieb <person>:"
        r"^\s*(?:On\s.+?\swrote:)\s*$",
        r"^\s*(?:Am\s.+?\sschrieb.+?:)\s*$",
        # Forward/reply separators: "-----Original Message-----" and the German variant.
        r"^\s*-{2,}\s*Original\s*Message\s*-{2,}\s*$",
        r"^\s*-{2,}\s*Urspr.+Nachricht\s*-{2,}\s*$",
        # Header lines of an embedded message (English and German labels).
        r"^\s*From:\s+.+$",
        r"^\s*Von:\s+.+$",
        r"^\s*Sent:\s+.+$",
        r"^\s*Gesendet:\s+.+$",
    )
]

# Line-anchored markers that introduce a signature block. The two delimiter
# patterns are case-sensitive; the closing phrases are matched
# case-insensitively.
_SIGNATURE_MARKERS = [
    # A line consisting only of two or more dashes, or of a single em dash.
    re.compile(r"^\s*-{2,}\s*$", re.MULTILINE),
    re.compile(r"^\s*—\s*$", re.MULTILINE),
] + [
    re.compile(expression, re.MULTILINE | re.IGNORECASE)
    for expression in (
        r"^\s*Best regards\b.*$",
        r"^\s*Kind regards\b.*$",
        r"^\s*Mit freundlichen Gr[üu]ßen\b.*$",
        r"^\s*Viele Gr[üu]ße\b.*$",
        r"^\s*Best,\s*$",
    )
]
def _htmlToText(html: str) -> str:
    """Convert an HTML fragment to plain text.

    BeautifulSoup is used when it can be imported and parses the input;
    any failure on that path (including a missing ``bs4``) falls back to a
    regex-based strip. Both paths drop ``<script>``, ``<style>`` and
    ``<head>`` content, turn ``<br>`` and the end of ``p``/``div``/``li``/``tr``
    elements into newlines, and yield decoded character entities.

    Args:
        html: Raw HTML markup.

    Returns:
        The extracted text with non-breaking spaces replaced by regular
        spaces and zero-width spaces removed.
    """
    try:
        from bs4 import BeautifulSoup  # type: ignore

        soup = BeautifulSoup(html, "html.parser")
        for tag in soup(["script", "style", "head"]):
            tag.decompose()
        for br in soup.find_all(["br"]):
            br.replace_with("\n")
        for p in soup.find_all(["p", "div", "li", "tr"]):
            p.append("\n")
        text = soup.get_text()
    except Exception:
        # Deliberate best-effort fallback: a missing or failing parser must
        # not break ingestion.
        # Imported by name because the ``html`` parameter shadows the module.
        from html import unescape

        # Comments may contain ">" (e.g. conditional comments), so remove
        # them as a unit before the generic tag strip.
        text = re.sub(r"<!--.*?-->", "", html, flags=re.DOTALL)
        # Remove non-content elements together with their bodies; stripping
        # only the tags would leave JS/CSS source in the output.
        text = re.sub(
            r"<(script|style|head)\b[^>]*>.*?</\1\s*>",
            "",
            text,
            flags=re.IGNORECASE | re.DOTALL,
        )
        text = re.sub(r"<br\s*/?>", "\n", text, flags=re.IGNORECASE)
        text = re.sub(r"</(?:p|div|li|tr)>", "\n", text, flags=re.IGNORECASE)
        text = re.sub(r"<[^>]+>", "", text)
        # Decode entities last so escaped markup such as "&lt;b&gt;" stays
        # literal text instead of being stripped as a tag.
        text = unescape(text)
    # Collapse non-breaking + zero-width whitespace.
    text = text.replace("\u00a0", " ").replace("\u200b", "")
    return text
def _stripQuotedThread(text: str) -> str:
    """Cut the text at the first sign of a quoted reply chain.

    The cut position is the smallest start offset among all matches of
    ``_QUOTE_MARKER_PATTERNS`` and the first run of ``>``-prefixed lines.
    Everything from that offset onward is discarded and trailing whitespace
    is trimmed. When nothing matches, the text is returned right-stripped.
    If the text itself begins with a marker or a quoted line, the result is
    an empty string.
    """
    markerHits = (marker.search(text) for marker in _QUOTE_MARKER_PATTERNS)
    cutCandidates = [hit.start() for hit in markerHits if hit]

    # Drop any block starting with "> " quoted lines (often Gmail/Thunderbird).
    quotedRun = re.search(r"^(?:\s*>.*\n?)+", text, re.MULTILINE)
    if quotedRun:
        cutCandidates.append(quotedRun.start())

    cutAt = min(cutCandidates, default=len(text))
    return text[:cutAt].rstrip()
def _stripSignature(text: str) -> str:
    """Cut the text at the first signature marker.

    Every pattern in ``_SIGNATURE_MARKERS`` is searched; the text is cut at
    the smallest match offset and right-stripped. Without any match the
    whole text is returned right-stripped.
    """
    cutAt = len(text)
    for hit in filter(None, (marker.search(text) for marker in _SIGNATURE_MARKERS)):
        cutAt = min(cutAt, hit.start())
    return text[:cutAt].rstrip()
def _collapseWhitespace(text: str) -> str:
    """Normalise whitespace so the text is compact.

    Steps, in order:
      1. Convert CRLF / lone CR line endings to LF.
      2. Collapse each run of spaces/tabs to a single space.
      3. Drop the space left at the end of a line, so lines that held only
         spaces/tabs become truly empty.
      4. Collapse three or more consecutive newlines to exactly two.
      5. Strip leading and trailing whitespace.

    Steps 1 and 3 matter because a ``\\r`` or a stray space between newlines
    would otherwise hide a run of blank lines from step 4.

    Args:
        text: Plain text to normalise.

    Returns:
        The normalised text (possibly empty).
    """
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    text = re.sub(r"[ \t]+", " ", text)
    # After step 2 any trailing horizontal whitespace is exactly one space.
    text = text.replace(" \n", "\n")
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
def cleanEmailBody(html: str, maxChars: Optional[int] = DEFAULT_MAX_CHARS) -> str:
    """Return a compact plain-text view of an email body suitable for embedding.

    Steps: HTML → text (only when the input contains both ``<`` and ``>``),
    remove quoted reply chain, remove signature, collapse whitespace,
    truncate to ``maxChars``.

    Args:
        html: Email body, either HTML markup or plain text.
        maxChars: Maximum length of the result. ``None`` or any non-positive
            value disables truncation.

    Returns:
        The cleaned text. Always a string; empty when ``html`` is falsy or
        nothing remains after cleaning.
    """
    if not html:
        return ""

    # Cheap heuristic: bodies without any angle-bracket pair are treated as
    # plain text and skip the HTML pass.
    looksLikeHtml = "<" in html and ">" in html
    text = _htmlToText(html) if looksLikeHtml else html

    text = _stripQuotedThread(text)
    text = _stripSignature(text)
    text = _collapseWhitespace(text)

    # Guard against negative limits: ``text[:negative]`` would silently chop
    # characters off the end instead of enforcing a maximum length.
    if maxChars is not None and 0 < maxChars < len(text):
        text = text[:maxChars].rstrip()
    return text