# Unit tests for the unified extractContent handover: joined text, content filters, and image sidecar documents.
|
|
|
|
import base64
|
|
|
|
from modules.workflows.methods.methodContext.actions.extractContent import (
|
|
HANDOVER_KIND,
|
|
_apply_content_filter,
|
|
_joined_text_from_handover_payload,
|
|
_split_images_to_sidecar_documents,
|
|
)
|
|
|
|
|
|
def test_joined_text_orders_text_table_and_skips_container():
    """Joined text keeps part order and leaves out container parts.

    The fixture holds two text parts with an empty container part between
    them. The assertion expects the two text values, trimmed of surrounding
    whitespace, separated by a single blank line.

    NOTE(review): the test name mentions tables, but this fixture contains
    no table part -- confirm whether one was intended.
    """
    first_text = {"typeGroup": "text", "data": " A\n", "id": "x"}
    container = {"typeGroup": "container", "data": "", "id": "c"}
    second_text = {"typeGroup": "text", "data": "B", "id": "y"}
    payload = {
        "kind": HANDOVER_KIND,
        "fileOrder": ["f1"],
        "files": {"f1": {"parts": [first_text, container, second_text]}},
    }

    joined = _joined_text_from_handover_payload(payload)

    assert joined == "A\n\nB"
|
|
|
|
|
|
def test_joined_text_includes_csv_table_parts():
    """A CSV table part's data is expected unchanged in the joined text.

    Note that this fixture carries no "kind" key; it only provides
    "fileOrder" and "files".
    """
    csv_part = {
        "typeGroup": "table",
        "mimeType": "text/csv",
        "data": "a,b\n1,2",
        "id": "t",
    }
    payload = {
        "fileOrder": ["f1"],
        "files": {"f1": {"parts": [csv_part]}},
    }

    joined = _joined_text_from_handover_payload(payload)

    assert joined == "a,b\n1,2"
|
|
|
|
|
|
def test_split_images_moves_pixels_to_blob_docs():
    """Image bytes move into a sidecar document; the payload keeps a stub.

    For a payload with one text part and one base64-encoded PNG part, the
    assertions check that:

    * exactly one sidecar is returned, carrying the decoded bytes, the
      "image/png" MIME type, a name that starts with "extract_media_" and
      ends with ".png", and the "extractedMedia" handover role in its
      validation metadata;
    * the image part is still present in the stripped payload, with empty
      data and a reference to the sidecar's document name;
    * "image" is present in the file entry's "byTypeGroup".
    """
    pixel_bytes = b"fake-binary-image"
    text_part = {"typeGroup": "text", "data": "x", "id": "t1"}
    image_part = {
        "typeGroup": "image",
        "mimeType": "image/png",
        "data": base64.b64encode(pixel_bytes).decode("ascii"),
        "id": "p1-img",
        "metadata": {},
    }
    payload = {
        "kind": HANDOVER_KIND,
        "schemaVersion": 1,
        "fileOrder": ["f1"],
        "files": {"f1": {"parts": [text_part, image_part]}},
    }

    stripped, blobs = _split_images_to_sidecar_documents(
        payload, document_name_stem="abc"
    )

    # Sidecar side: one document holding the raw (decoded) image bytes.
    assert len(blobs) == 1
    sidecar = blobs[0]
    assert sidecar.mimeType == "image/png"
    assert sidecar.documentData == pixel_bytes
    assert sidecar.documentName.endswith(".png")
    assert sidecar.documentName.startswith("extract_media_")
    sidecar_meta = sidecar.validationMetadata or {}
    assert sidecar_meta.get("handoverRole") == "extractedMedia"

    # Payload side: the image part remains, but only as a reference.
    file_entry = stripped["files"]["f1"]
    remaining_images = [
        part
        for part in file_entry["parts"]
        if isinstance(part, dict) and (part.get("typeGroup") or "") == "image"
    ]
    assert len(remaining_images) == 1
    image_stub = remaining_images[0]
    assert image_stub["data"] == ""
    assert image_stub["handoverMediaDocumentName"] == sidecar.documentName
    assert "image" in file_entry["byTypeGroup"]
|
|
|
|
|
|
def _mixed_payload():
    """Build a single-file handover payload with one part per type group.

    File "f1" contains, in order, a text part, a CSV table part, a PNG
    image part and an HTML structure part. A new dict is built on every
    call, so each test gets its own copy.
    """
    parts = [
        {"typeGroup": "text", "data": "hello", "id": "t1"},
        {"typeGroup": "table", "mimeType": "text/csv", "data": "a,b", "id": "tb1"},
        {"typeGroup": "image", "mimeType": "image/png", "data": "abc=", "id": "i1"},
        {"typeGroup": "structure", "mimeType": "text/html", "data": "<p/>", "id": "s1"},
    ]
    return {
        "kind": HANDOVER_KIND,
        "schemaVersion": 1,
        "fileOrder": ["f1"],
        "files": {"f1": {"parts": parts}},
    }
|
|
|
|
|
|
def test_content_filter_all_is_noop():
    """The "all" filter must hand back the very same payload object."""
    original = _mixed_payload()

    filtered = _apply_content_filter(original, "all")

    # Identity rather than equality: the payload must not be copied.
    assert filtered is original
|
|
|
|
|
|
def test_content_filter_text_only_keeps_text_table_structure():
    """The "textOnly" filter keeps exactly the text, table and structure parts."""
    filtered = _apply_content_filter(_mixed_payload(), "textOnly")

    remaining_groups = {
        part["typeGroup"] for part in filtered["files"]["f1"]["parts"]
    }

    assert remaining_groups == {"text", "table", "structure"}
    assert "image" not in remaining_groups
|
|
|
|
|
|
def test_content_filter_images_only():
    """The "imagesOnly" filter leaves exactly one part, and it is the image."""
    filtered = _apply_content_filter(_mixed_payload(), "imagesOnly")

    remaining_groups = [
        part["typeGroup"] for part in filtered["files"]["f1"]["parts"]
    ]

    # One list comparison covers both "only images" and "exactly one part".
    assert remaining_groups == ["image"]
|
|
|
|
|
|
def test_content_filter_no_images_removes_only_images():
    """The "noImages" filter drops the image part and keeps every other group."""
    filtered = _apply_content_filter(_mixed_payload(), "noImages")

    remaining_groups = {
        part["typeGroup"] for part in filtered["files"]["f1"]["parts"]
    }

    assert "image" not in remaining_groups
    # Text, table and structure parts must all survive the filter.
    assert remaining_groups == {"text", "table", "structure"}
|
|
|
|
|
|
def test_content_filter_text_only_joined_text_has_no_image_data():
    """Joined text after "textOnly" contains the text but no image data."""
    text_only = _apply_content_filter(_mixed_payload(), "textOnly")

    joined = _joined_text_from_handover_payload(text_only)

    assert "hello" in joined
    # "abc=" is the fixture's base64 image data; it must not leak into text.
    assert "abc=" not in joined
|
|
|
|
|
|
def test_content_filter_text_only_no_sidecars():
    """After "textOnly" no image parts remain, so splitting yields no sidecars."""
    text_only = _apply_content_filter(_mixed_payload(), "textOnly")

    # Only the sidecar list matters here; the stripped payload is not checked.
    _stripped, sidecars = _split_images_to_sidecar_documents(
        text_only, document_name_stem="test"
    )

    assert sidecars == []
|