65 lines
1.8 KiB
Python
65 lines
1.8 KiB
Python
# Phase 7: tests for the UDM tools (getUdmStructure, walkUdmBlocks, filterUdmByType).
|
|
|
|
from modules.serviceCenter.services.serviceAgent.coreTools._documentTools import (
|
|
_filterUdmByTypeImpl,
|
|
_getUdmStructureText,
|
|
_parseUdmJson,
|
|
_walkUdmBlocksImpl,
|
|
)
|
|
|
|
|
|
def test_parseUdmJson_dict():
    """Assert that _parseUdmJson, given a dict, returns a value equal to it."""
    document = {"id": "1", "role": "document", "children": []}

    parsed = _parseUdmJson(document)

    assert parsed == document
|
|
|
|
|
|
def test_parseUdmJson_string():
    """Assert that _parseUdmJson, given a JSON string, exposes its "id" value."""
    jsonText = '{"id":"x","role":"document","children":[]}'

    parsed = _parseUdmJson(jsonText)

    assert parsed["id"] == "x"
|
|
|
|
|
|
def test_getUdmStructure_text():
    """Check the text produced by _getUdmStructureText for a small document.

    The fixture is a document with sourceType "pdf" holding one page, which in
    turn holds one text content block. The resulting text must contain the
    substrings "pdf" and "contentBlocks=1".
    """
    # Build the tree bottom-up; key order matches the original literal.
    contentBlock = {"id": "c1", "contentType": "text", "raw": "hi"}
    pageNode = {"id": "p1", "role": "page", "index": 0, "label": "P1", "children": [contentBlock]}
    document = {
        "id": "d1",
        "role": "document",
        "sourceType": "pdf",
        "children": [pageNode],
    }

    summary = _getUdmStructureText(document)

    assert "pdf" in summary
    assert "contentBlocks=1" in summary
|
|
|
|
|
|
def test_walkUdm_blocks():
    """Check that _walkUdmBlocksImpl fills the list it is handed.

    The fixture is a document with one page containing a text block and an
    image block. After the call, the list passed as the second argument must
    hold two entries whose contentType values are "text" and "image".
    """
    pageNode = {
        "id": "p1",
        "role": "page",
        "children": [
            {"id": "t1", "contentType": "text", "raw": "a"},
            {"id": "i1", "contentType": "image", "raw": ""},
        ],
    }
    document = {"id": "d1", "role": "document", "children": [pageNode]}

    collected = []
    _walkUdmBlocksImpl(document, collected, "document")

    assert len(collected) == 2
    seenTypes = {block["contentType"] for block in collected}
    assert seenTypes == {"text", "image"}
|
|
|
|
|
|
def test_filter_udm_by_type():
    """Check the result of _filterUdmByTypeImpl when filtering for "table".

    The fixture is a document with one page containing a text block and a
    table block. The result must report a "count" of 1, and the first entry
    of its "nodes" must have id "x1".
    """
    pageNode = {
        "id": "p1",
        "role": "page",
        "children": [
            {"id": "t1", "contentType": "text"},
            {"id": "x1", "contentType": "table"},
        ],
    }
    document = {"id": "d1", "role": "document", "children": [pageNode]}

    result = _filterUdmByTypeImpl(document, "table")

    assert result["count"] == 1
    firstMatch = result["nodes"][0]
    assert firstMatch["id"] == "x1"
|