- connection.established/revoked callbacks from OAuth routes and connection management endpoints - KnowledgeIngestionConsumer dispatches bootstrap job (established) and synchronous purge (revoked) - FileContentIndex: add connectionId + sourceKind columns - SharePoint bootstrap with @odata.nextLink pagination and eTag-based idempotency - Outlook bootstrap treats messages as virtual documents with cleanEmailBody for HTML/quote/signature stripping - fix(rag): lower buildAgentContext minScore thresholds from 0.55/0.65/0.70 to 0.35 — previous values blocked all real matches from text-embedding-3-small - 24 new unit tests covering purge, consumer dispatch, email cleaning and both bootstrap paths
110 lines
3 KiB
Python
110 lines
3 KiB
Python
#!/usr/bin/env python3
|
|
# Copyright (c) 2025 Patrick Motsch
|
|
# All rights reserved.
|
|
"""Unit tests for cleanEmailBody.
|
|
|
|
Covers: HTML→text normalisation, quoted-reply removal, signature removal,
|
|
whitespace collapse and truncation. The utility is used during Outlook
|
|
bootstrap; buggy cleaning would leak quoted threads / signatures into every
|
|
embedding.
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
|
|
|
|
from modules.serviceCenter.services.serviceKnowledge.subTextClean import (
|
|
cleanEmailBody,
|
|
)
|
|
|
|
|
|
def test_strips_html_tags_and_scripts():
    """Visible text survives; markup and the script payload do not.

    Feeds a small HTML document containing <style> and <script> elements
    and checks that the words "Hello" and "world" remain while no "<"
    character and no "alert" token are left in the result.
    """
    fragments = [
        "<html><head><style>body{}</style></head>",
        "<body><p>Hello <b>world</b></p>",
        "<script>alert('x')</script></body></html>",
    ]
    result = cleanEmailBody("".join(fragments))

    # Text content of the paragraph must be preserved.
    for expected in ("Hello", "world"):
        assert expected in result

    # No tag residue and no script body may leak through.
    for forbidden in ("<", "alert"):
        assert forbidden not in result
|
|
|
|
|
|
def test_strips_quoted_reply_english():
    """An English "On ... wrote:" header and the quoted lines below it are dropped."""
    own_text = "Actual answer from me.\n\n"
    quoted_thread = "".join(
        [
            "On Mon, 1 Jan 2024 at 10:00, Someone <s@x.com> wrote:\n",
            "> Original question?\n",
            "> Second line.\n",
        ]
    )
    result = cleanEmailBody(own_text + quoted_thread)

    # The author's own reply stays ...
    assert "Actual answer" in result
    # ... while neither the quoted content nor the attribution line remains.
    for leaked in ("Original question", "wrote:"):
        assert leaked not in result
|
|
|
|
|
|
def test_strips_quoted_reply_german():
    """A German "Am ... schrieb ...:" header and its quoted line are dropped."""
    own_text = "Meine Antwort.\n\n"
    quoted_thread = (
        "Am 1. Januar 2024 um 10:00 schrieb Max Muster <m@x.com>:\n"
        + "> Ursprüngliche Frage?\n"
    )
    result = cleanEmailBody(own_text + quoted_thread)

    assert "Meine Antwort" in result
    assert "Ursprüngliche Frage" not in result
|
|
|
|
|
|
def test_strips_signature_after_dashes():
    """Text following a "--" delimiter line is treated as signature and removed."""
    message_lines = [
        "Kurze Nachricht.\n",
        "\n",
        "--\n",
        "Max Muster\n",
        "Vorstand, Beispiel GmbH\n",
    ]
    result = cleanEmailBody("".join(message_lines))

    # Message text before the delimiter is kept.
    assert "Kurze Nachricht" in result
    # Signature content after the delimiter is gone.
    assert "Beispiel GmbH" not in result
|
|
|
|
|
|
def test_strips_signature_salutation_de():
    """A German closing salutation and the name after it are removed."""
    content = "Die eigentliche Information steht hier.\n\n"
    closing = "Mit freundlichen Grüßen\n" + "Max Muster"
    result = cleanEmailBody(content + closing)

    assert "eigentliche Information" in result
    assert "Max Muster" not in result
|
|
|
|
|
|
def test_truncate_to_max_chars():
    """Output length is bounded by maxChars plus one character for the ellipsis."""
    limit = 200
    oversized = "abc " * 5000
    result = cleanEmailBody(oversized, maxChars=limit)

    # One extra character is tolerated for the trailing ellipsis.
    assert len(result) <= limit + 1
|
|
|
|
|
|
def test_empty_input_returns_empty_string():
    """Both the empty string and None yield an empty string."""
    for blank in ("", None):
        assert cleanEmailBody(blank) == ""  # type: ignore[arg-type]
|
|
|
|
|
|
def test_collapses_whitespace():
    """Runs of spaces and runs of blank lines are collapsed.

    The input deliberately contains several consecutive spaces between
    words and four consecutive newlines. The assertions check that no
    double space and no triple newline remain. Checking for a single
    space instead would fail on any output that still separates words,
    and an input without space runs would not exercise the collapse.
    """
    body = "A   lot   of    spaces\n\n\n\nand blank lines"
    cleaned = cleanEmailBody(body)

    # Two adjacent spaces would mean a run was not collapsed.
    assert "  " not in cleaned
    # Three adjacent newlines would mean more than one blank line survived.
    assert "\n\n\n" not in cleaned
|
|
|
|
|
|
if __name__ == "__main__":
    # Direct execution without a test runner: invoke every test in
    # definition order; the first failing assertion aborts the run.
    for check in (
        test_strips_html_tags_and_scripts,
        test_strips_quoted_reply_english,
        test_strips_quoted_reply_german,
        test_strips_signature_after_dashes,
        test_strips_signature_salutation_de,
        test_truncate_to_max_chars,
        test_empty_input_returns_empty_string,
        test_collapses_whitespace,
    ):
        check()
    print("OK — cleanEmailBody tests passed")
|