gateway/tests/integration/extraction/test_extract_udm_pipeline.py
2026-04-16 23:13:05 +02:00

37 lines
1.3 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from modules.datamodels.datamodelExtraction import ExtractionOptions, MergeStrategy
from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
from modules.serviceCenter.services.serviceExtraction.subRegistry import ChunkerRegistry, getExtractorRegistry
def test_run_extraction_html_with_udm():
    """Extract a small HTML document with both parts and UDM requested.

    Calls ``runExtraction`` with ``outputFormat="both"`` and
    ``outputDetail="full"`` and checks that the result has non-empty
    ``parts`` and a ``udm`` whose ``sourceType`` is ``"html"`` and whose
    ``children`` are non-empty.
    """
    extractors = getExtractorRegistry()
    chunker_registry = ChunkerRegistry()
    payload = b"<html><body><p>Hello</p></body></html>"
    options = ExtractionOptions(
        mergeStrategy=MergeStrategy(),
        outputFormat="both",
        outputDetail="full",
    )

    result = runExtraction(
        extractors,
        chunker_registry,
        payload,
        "t.html",
        "text/html",
        options,
    )

    # Parts must be present alongside the UDM when both are requested.
    assert result.parts

    udm = result.udm
    assert udm is not None
    assert udm.sourceType == "html"
    assert udm.children
def test_run_extraction_parts_only_no_udm():
    """Extract an empty HTML document with only parts requested.

    Calls ``runExtraction`` with ``outputFormat="parts"`` and checks that
    the result has non-empty ``parts`` and that ``udm`` is ``None``.
    """
    extractors = getExtractorRegistry()
    chunker_registry = ChunkerRegistry()
    payload = b"<html></html>"
    options = ExtractionOptions(
        mergeStrategy=MergeStrategy(),
        outputFormat="parts",
    )

    result = runExtraction(
        extractors,
        chunker_registry,
        payload,
        "t.html",
        "text/html",
        options,
    )

    assert result.parts
    # With parts-only output, no UDM should be attached to the result.
    assert result.udm is None
def test_get_extractor_registry_singleton():
    """Check that two calls to ``getExtractorRegistry`` return the same object."""
    first, second = getExtractorRegistry(), getExtractorRegistry()
    assert first is second