gateway/tests/unit/services/test_clean_email_body.py
Ida 6a5ff1ff7c feat(rag): P1 user-connection hooks + retrieval threshold fix
- connection.established/revoked callbacks from OAuth routes and
  connection management endpoints
- KnowledgeIngestionConsumer dispatches bootstrap job (established)
  and synchronous purge (revoked)
- FileContentIndex: add connectionId + sourceKind columns
- SharePoint bootstrap with @odata.nextLink pagination and eTag-based
  idempotency
- Outlook bootstrap treats messages as virtual documents with
  cleanEmailBody for HTML/quote/signature stripping
- fix(rag): lower buildAgentContext minScore thresholds from
  0.55/0.65/0.70 to 0.35 — previous values blocked all real matches
  from text-embedding-3-small
- 24 new unit tests covering purge, consumer dispatch, email cleaning
  and both bootstrap paths
2026-04-29 14:39:40 +02:00

110 lines
3 KiB
Python

#!/usr/bin/env python3
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Unit tests for cleanEmailBody.
Covers: HTML→text normalisation, quoted-reply removal, signature removal,
whitespace collapse and truncation. The utility is used during Outlook
bootstrap; buggy cleaning would leak quoted threads / signatures into every
embedding.
"""
import os
import sys
# Prepend the directory three levels above this file's folder to sys.path so
# the absolute `modules....` import below resolves when the file is run
# directly (presumably that directory is the gateway root — confirm against
# the repository layout).
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
from modules.serviceCenter.services.serviceKnowledge.subTextClean import (
cleanEmailBody,
)
def test_strips_html_tags_and_scripts():
    """Visible text survives cleaning; markup and script content do not."""
    markup = "".join(
        [
            "<html><head><style>body{}</style></head>",
            "<body><p>Hello <b>world</b></p>",
            "<script>alert('x')</script></body></html>",
        ]
    )
    result = cleanEmailBody(markup)

    # The words shown to the reader must be kept.
    for keptWord in ("Hello", "world"):
        assert keptWord in result

    # No tag opener and no script payload may remain.
    for banned in ("<", "alert"):
        assert banned not in result
def test_strips_quoted_reply_english():
    """The English "On ... wrote:" header and its quoted lines are dropped."""
    ownText = "Actual answer from me.\n\n"
    quotedThread = (
        "On Mon, 1 Jan 2024 at 10:00, Someone <s@x.com> wrote:\n"
        + "> Original question?\n"
        + "> Second line.\n"
    )
    result = cleanEmailBody(ownText + quotedThread)

    # The author's own reply stays ...
    assert "Actual answer" in result
    # ... while neither the quoted text nor the attribution header remains.
    for removed in ("Original question", "wrote:"):
        assert removed not in result
def test_strips_quoted_reply_german():
    """The German "Am ... schrieb ...:" header and its quoted lines are dropped."""
    ownText = "Meine Antwort.\n\n"
    quotedThread = (
        "Am 1. Januar 2024 um 10:00 schrieb Max Muster <m@x.com>:\n"
        + "> Ursprüngliche Frage?\n"
    )
    result = cleanEmailBody(ownText + quotedThread)

    # Only the author's own answer may be left in the cleaned text.
    assert "Meine Antwort" in result
    assert "Ursprüngliche Frage" not in result
def test_strips_signature_after_dashes():
    """Everything following a "--" separator line is treated as signature."""
    parts = [
        "Kurze Nachricht.\n",
        "\n",
        "--\n",
        "Max Muster\n",
        "Vorstand, Beispiel GmbH\n",
    ]
    result = cleanEmailBody("".join(parts))

    # The message itself is kept, the signature block is gone.
    assert "Kurze Nachricht" in result
    assert "Beispiel GmbH" not in result
def test_strips_signature_salutation_de():
    """A German closing salutation and the name after it are removed."""
    content = "Die eigentliche Information steht hier.\n\n"
    closing = "Mit freundlichen Grüßen\n" + "Max Muster"
    result = cleanEmailBody(content + closing)

    # The informative sentence stays; the sender name after the salutation goes.
    assert "eigentliche Information" in result
    assert "Max Muster" not in result
def test_truncate_to_max_chars():
    """Output length is bounded by maxChars plus one trailing ellipsis character."""
    limit = 200
    oversized = "abc " * 5000
    result = cleanEmailBody(oversized, maxChars=limit)

    # One character of slack is allowed for the trailing ellipsis.
    assert len(result) <= limit + 1
def test_empty_input_returns_empty_string():
    """Both an empty string and None are cleaned to an empty string."""
    # None is passed deliberately, although the parameter is typed as a string.
    for blank in ("", None):
        assert cleanEmailBody(blank) == ""  # type: ignore[arg-type]
def test_collapses_whitespace():
    """Runs of spaces and runs of blank lines are collapsed.

    The input contains several multi-space runs and four consecutive
    newlines. After cleaning, no double space and no triple newline may
    remain, while the words themselves must still be present.
    """
    # The runs of spaces are intentional: they are what the cleaner must collapse.
    body = "A   lot    of   spaces\n\n\n\nand blank lines"
    cleaned = cleanEmailBody(body)

    # Check for a *double* space. Single spaces between words are legitimate,
    # so asserting on " " would reject every sensible result.
    assert "  " not in cleaned
    assert "\n\n\n" not in cleaned

    # Guard against a cleaner that "collapses" whitespace by dropping the text.
    assert "spaces" in cleaned
    assert "blank lines" in cleaned
if __name__ == "__main__":
    # Allow running the file directly without pytest: execute every test in
    # definition order; the first failing assertion aborts the run.
    for runTest in (
        test_strips_html_tags_and_scripts,
        test_strips_quoted_reply_english,
        test_strips_quoted_reply_german,
        test_strips_signature_after_dashes,
        test_strips_signature_salutation_de,
        test_truncate_to_max_chars,
        test_empty_input_returns_empty_string,
        test_collapses_whitespace,
    ):
        runTest()
    print("OK — cleanEmailBody tests passed")