- connection.established/revoked callbacks from OAuth routes and connection management endpoints - KnowledgeIngestionConsumer dispatches bootstrap job (established) and synchronous purge (revoked) - FileContentIndex: add connectionId + sourceKind columns - SharePoint bootstrap with @odata.nextLink pagination and eTag-based idempotency - Outlook bootstrap treats messages as virtual documents with cleanEmailBody for HTML/quote/signature stripping - fix(rag): lower buildAgentContext minScore thresholds from 0.55/0.65/0.70 to 0.35 — previous values blocked all real matches from text-embedding-3-small - 24 new unit tests covering purge, consumer dispatch, email cleaning and both bootstrap paths
107 lines
4.1 KiB
Python
107 lines
4.1 KiB
Python
# Copyright (c) 2025 Patrick Motsch
|
|
# All rights reserved.
|
|
"""Text normalisation utilities used by knowledge ingestion.
|
|
|
|
The email body cleaning logic is intentionally regex-based and works on plain
|
|
text after an HTML→text pass so we never store unsanitised HTML/JS in the
|
|
knowledge store and retrieval stays robust (no extraneous markup tokens
|
|
eating embedding budget).
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from typing import Optional
|
|
|
|
# Default truncation threshold (in characters) applied by ``cleanEmailBody``
# when the caller does not pass its own ``maxChars``.
DEFAULT_MAX_CHARS = 8_000
|
|
|
|
|
|
# Line-anchored markers (English and German) that introduce quoted reply or
# forwarded content. Every pattern is compiled with MULTILINE | IGNORECASE so
# it can match on any line of the body regardless of letter case.
_QUOTE_MARKER_PATTERNS = [
    re.compile(expression, re.MULTILINE | re.IGNORECASE)
    for expression in (
        # Attribution lines: "On <...> wrote:" / "Am <...> schrieb <...>:"
        r"^\s*(?:On\s.+?\swrote:)\s*$",
        r"^\s*(?:Am\s.+?\sschrieb.+?:)\s*$",
        # Separator banners: "--- Original Message ---" and its German form
        r"^\s*-{2,}\s*Original\s*Message\s*-{2,}\s*$",
        r"^\s*-{2,}\s*Urspr.+Nachricht\s*-{2,}\s*$",
        # Header lines of an inlined message: From/Von, Sent/Gesendet
        r"^\s*From:\s+.+$",
        r"^\s*Von:\s+.+$",
        r"^\s*Sent:\s+.+$",
        r"^\s*Gesendet:\s+.+$",
    )
]
|
|
|
|
# Line-anchored markers that start an email signature. The first two are plain
# delimiter lines ("--" and an em dash); the rest are common English and German
# closing phrases, matched case-insensitively.
#
# The German closings accept the spelling variants of "Grüße(n)": "ü" may be
# written "u" or "ue", and "ß" may be written "ss" (standard Swiss orthography
# and the usual ASCII transliteration), e.g. "Mit freundlichen Grüssen".
_SIGNATURE_MARKERS = [
    re.compile(r"^\s*-{2,}\s*$", re.MULTILINE),
    re.compile(r"^\s*—\s*$", re.MULTILINE),
    re.compile(r"^\s*Best regards\b.*$", re.MULTILINE | re.IGNORECASE),
    re.compile(r"^\s*Kind regards\b.*$", re.MULTILINE | re.IGNORECASE),
    re.compile(
        r"^\s*Mit freundlichen Gr(?:ü|ue?)(?:ß|ss)en\b.*$",
        re.MULTILINE | re.IGNORECASE,
    ),
    re.compile(
        r"^\s*Viele Gr(?:ü|ue?)(?:ß|ss)e\b.*$",
        re.MULTILINE | re.IGNORECASE,
    ),
    re.compile(r"^\s*Best,\s*$", re.MULTILINE | re.IGNORECASE),
]
|
|
|
|
|
|
def _htmlToText(html: str) -> str:
    """Convert HTML markup to plain text, keeping block boundaries as newlines.

    BeautifulSoup is preferred; if it cannot be imported or fails on the
    input, a regex-based fallback is used instead. Both paths discard the
    content of ``<script>``, ``<style>`` and ``<head>`` elements, turn ``<br>``
    and the end of ``p``/``div``/``li``/``tr`` elements into newlines and
    remove all remaining tags. The fallback additionally drops HTML comments
    and decodes character references so its output matches the parser path.

    Args:
        html: Markup to convert.

    Returns:
        The extracted text with non-breaking spaces replaced by regular
        spaces and zero-width spaces removed. No other whitespace
        normalisation is performed here.
    """
    try:
        from bs4 import BeautifulSoup  # type: ignore

        soup = BeautifulSoup(html, "html.parser")
        for tag in soup(["script", "style", "head"]):
            tag.decompose()
        for br in soup.find_all(["br"]):
            br.replace_with("\n")
        for p in soup.find_all(["p", "div", "li", "tr"]):
            p.append("\n")
        text = soup.get_text()
    except Exception:
        # Deliberately broad: covers both a missing bs4 installation and any
        # parser failure, so cleaning stays best-effort instead of raising.
        # Imported by name because the parameter shadows the ``html`` module.
        from html import unescape

        # Remove non-content elements together with their bodies; stripping
        # only the tags would leave JavaScript/CSS source in the output.
        text = re.sub(
            r"<(script|style|head)\b[^>]*>.*?</\1\s*>",
            "",
            html,
            flags=re.IGNORECASE | re.DOTALL,
        )
        # Comments may contain ">" and markup, so drop them as a whole before
        # the generic tag stripper runs.
        text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
        text = re.sub(r"<br\s*/?>", "\n", text, flags=re.IGNORECASE)
        text = re.sub(r"</(?:p|div|li|tr)>", "\n", text, flags=re.IGNORECASE)
        text = re.sub(r"<[^>]+>", "", text)
        # Decode entities last so an escaped "&lt;b&gt;" is kept as literal
        # text rather than being stripped as a tag.
        text = unescape(text)
    # Collapse non-breaking + zero-width whitespace.
    text = text.replace("\u00a0", " ").replace("\u200b", "")
    return text
|
|
|
|
|
|
def _stripQuotedThread(text: str) -> str:
    """Keep only the part of ``text`` that precedes any quoted reply content.

    The cut position is the smallest start offset among the first match of
    each pattern in ``_QUOTE_MARKER_PATTERNS`` and the first run of
    ``>``-prefixed lines. When nothing matches, the whole text is kept.
    Trailing whitespace is removed from the result in either case.
    """
    markerHits = (marker.search(text) for marker in _QUOTE_MARKER_PATTERNS)
    cutCandidates = [hit.start() for hit in markerHits if hit is not None]

    # A block of "> " prefixed lines (as produced by e.g. Gmail/Thunderbird)
    # also counts as the beginning of the quoted thread.
    angleQuoted = re.search(r"^(?:\s*>.*\n?)+", text, re.MULTILINE)
    if angleQuoted is not None:
        cutCandidates.append(angleQuoted.start())

    cutAt = min(cutCandidates, default=len(text))
    return text[:cutAt].rstrip()
|
|
|
|
|
|
def _stripSignature(text: str) -> str:
    """Cut ``text`` at the first signature marker and trim trailing whitespace.

    Each pattern in ``_SIGNATURE_MARKERS`` is searched once; the smallest
    match offset is used as the cut position. If no pattern matches, the text
    is returned with only its trailing whitespace removed.
    """
    offsets = [
        found.start()
        for found in (marker.search(text) for marker in _SIGNATURE_MARKERS)
        if found is not None
    ]
    cutAt = min(offsets, default=len(text))
    return text[:cutAt].rstrip()
|
|
|
|
|
|
def _collapseWhitespace(text: str) -> str:
    """Normalise horizontal and vertical whitespace in ``text``.

    Line endings are unified to ``\\n``, runs of spaces/tabs become a single
    space, whitespace at the end of a line is dropped, and three or more
    consecutive newlines are reduced to one blank line. Leading and trailing
    whitespace of the whole text is stripped.

    Args:
        text: Plain text to normalise.

    Returns:
        The normalised text (possibly empty).
    """
    # Unify CRLF / lone CR first; otherwise "\r\n\r\n\r\n" is never seen as a
    # run of newlines by the blank-line collapse below.
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    text = re.sub(r"[ \t]+", " ", text)
    # After the previous step at most one space can precede a newline. Drop it
    # so whitespace-only lines count as blank lines.
    text = text.replace(" \n", "\n")
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
|
|
|
|
|
|
def cleanEmailBody(html: str, maxChars: Optional[int] = DEFAULT_MAX_CHARS) -> str:
    """Reduce an email body to compact plain text suitable for embedding.

    Pipeline: convert HTML to text (only when the input contains both ``<``
    and ``>``), cut off the quoted reply chain, cut off the signature,
    collapse whitespace, and finally shorten over-long results.

    Args:
        html: Raw email body, either HTML or plain text.
        maxChars: Length threshold; when truthy and exceeded, the text is cut
            to this many characters, right-stripped, and an ellipsis
            character is appended. A falsy value disables truncation.

    Returns:
        The cleaned text; an empty string when ``html`` is empty or falsy.
    """
    if not html:
        return ""

    # Cheap heuristic: without both angle brackets there is no tag to parse.
    looksLikeMarkup = "<" in html and ">" in html
    body = _htmlToText(html) if looksLikeMarkup else html

    for stage in (_stripQuotedThread, _stripSignature, _collapseWhitespace):
        body = stage(body)

    if not maxChars or len(body) <= maxChars:
        return body
    return body[:maxChars].rstrip() + "…"
|