# Unit tests: unified extractContent handover (text vs image sidecars).
import base64

from modules.workflows.methods.methodContext.actions.extractContent import (
    HANDOVER_KIND,
    _apply_content_filter,
    _joined_text_from_handover_payload,
    _split_images_to_sidecar_documents,
)


def test_joined_text_orders_text_table_and_skips_container():
    """Text parts are joined in order; the empty container part adds nothing.

    The expected value shows the first part's surrounding whitespace removed
    and the two text parts separated by a blank line.
    """
    payload = {
        "kind": HANDOVER_KIND,
        "fileOrder": ["f1"],
        "files": {
            "f1": {
                "parts": [
                    {"typeGroup": "text", "data": " A\n", "id": "x"},
                    {"typeGroup": "container", "data": "", "id": "c"},
                    {"typeGroup": "text", "data": "B", "id": "y"},
                ]
            }
        },
    }
    assert _joined_text_from_handover_payload(payload) == "A\n\nB"


def test_joined_text_includes_csv_table_parts():
    """A CSV table part is returned verbatim as joined text.

    This payload deliberately carries no "kind" key.
    """
    payload = {
        "fileOrder": ["f1"],
        "files": {
            "f1": {
                "parts": [
                    {
                        "typeGroup": "table",
                        "mimeType": "text/csv",
                        "data": "a,b\n1,2",
                        "id": "t",
                    },
                ]
            }
        },
    }
    assert _joined_text_from_handover_payload(payload) == "a,b\n1,2"


def test_split_images_moves_pixels_to_blob_docs():
    """Image bytes move to one sidecar document; the part keeps a reference."""
    raw = b"fake-binary-image"
    b64 = base64.b64encode(raw).decode("ascii")
    payload = {
        "kind": HANDOVER_KIND,
        "schemaVersion": 1,
        "fileOrder": ["f1"],
        "files": {
            "f1": {
                "parts": [
                    {"typeGroup": "text", "data": "x", "id": "t1"},
                    {
                        "typeGroup": "image",
                        "mimeType": "image/png",
                        "data": b64,
                        "id": "p1-img",
                        "metadata": {},
                    },
                ]
            }
        },
    }

    stripped, blobs = _split_images_to_sidecar_documents(
        payload, document_name_stem="abc"
    )

    # Exactly one sidecar document carrying the decoded image bytes.
    assert len(blobs) == 1
    assert blobs[0].mimeType == "image/png"
    assert blobs[0].documentData == raw
    assert blobs[0].documentName.endswith(".png")
    assert blobs[0].documentName.startswith("extract_media_")
    meta = blobs[0].validationMetadata or {}
    assert meta.get("handoverRole") == "extractedMedia"

    # The image part stays in the payload, emptied and pointing at the sidecar.
    img_parts = [
        p
        for p in stripped["files"]["f1"]["parts"]
        if isinstance(p, dict) and (p.get("typeGroup") or "") == "image"
    ]
    assert len(img_parts) == 1
    assert img_parts[0]["data"] == ""
    assert img_parts[0]["handoverMediaDocumentName"] == blobs[0].documentName
    assert "image" in stripped["files"]["f1"]["byTypeGroup"]


def _mixed_payload():
    """Return a fresh one-file payload with text, table, image and structure parts."""
    return {
        "kind": HANDOVER_KIND,
        "schemaVersion": 1,
        "fileOrder": ["f1"],
        "files": {
            "f1": {
                "parts": [
                    {"typeGroup": "text", "data": "hello", "id": "t1"},
                    {
                        "typeGroup": "table",
                        "mimeType": "text/csv",
                        "data": "a,b",
                        "id": "tb1",
                    },
                    {
                        "typeGroup": "image",
                        "mimeType": "image/png",
                        "data": "abc=",
                        "id": "i1",
                    },
                    # NOTE(review): this literal previously spanned raw newlines
                    # (a syntax error); the HTML markup it once held appears to
                    # have been lost. Kept as escaped newlines -- confirm the
                    # intended fixture value.
                    {
                        "typeGroup": "structure",
                        "mimeType": "text/html",
                        "data": "\n\n",
                        "id": "s1",
                    },
                ],
            }
        },
    }


def test_content_filter_all_is_noop():
    """Filter "all" returns the very same payload object."""
    payload = _mixed_payload()
    result = _apply_content_filter(payload, "all")
    assert result is payload  # same object, no copy


def test_content_filter_text_only_keeps_text_table_structure():
    """Filter "textOnly" keeps text, table and structure parts and drops images."""
    result = _apply_content_filter(_mixed_payload(), "textOnly")
    parts = result["files"]["f1"]["parts"]
    type_groups = {p["typeGroup"] for p in parts}
    assert type_groups == {"text", "table", "structure"}
    assert "image" not in type_groups


def test_content_filter_images_only():
    """Filter "imagesOnly" leaves only the single image part."""
    result = _apply_content_filter(_mixed_payload(), "imagesOnly")
    parts = result["files"]["f1"]["parts"]
    assert all(p["typeGroup"] == "image" for p in parts)
    assert len(parts) == 1


def test_content_filter_no_images_removes_only_images():
    """Filter "noImages" removes image parts and nothing else."""
    result = _apply_content_filter(_mixed_payload(), "noImages")
    parts = result["files"]["f1"]["parts"]
    type_groups = {p["typeGroup"] for p in parts}
    assert "image" not in type_groups
    # text, table, structure all remain
    assert {"text", "table", "structure"} == type_groups


def test_content_filter_text_only_joined_text_has_no_image_data():
    """Joined text of a "textOnly" payload contains text but no image data."""
    result = _apply_content_filter(_mixed_payload(), "textOnly")
    text = _joined_text_from_handover_payload(result)
    assert "hello" in text
    assert "abc=" not in text  # base64 image data must not appear


def test_content_filter_text_only_no_sidecars():
    """textOnly: no image parts → _split produces zero sidecars."""
    result = _apply_content_filter(_mixed_payload(), "textOnly")
    _stripped, blobs = _split_images_to_sidecar_documents(
        result, document_name_stem="test"
    )
    assert blobs == []