37 lines
1.3 KiB
Python
37 lines
1.3 KiB
Python
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
|
|
from modules.datamodels.datamodelExtraction import ExtractionOptions, MergeStrategy
|
|
from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
|
|
from modules.serviceCenter.services.serviceExtraction.subRegistry import ChunkerRegistry, getExtractorRegistry
|
|
|
|
|
|
def test_run_extraction_html_with_udm():
    """Run extraction on a small HTML document with ``outputFormat="both"``.

    Asserts that the result has non-empty ``parts`` and a ``udm`` that is
    not ``None``, reports ``sourceType == "html"`` and has non-empty
    ``children``.
    """
    extractor_registry = getExtractorRegistry()
    chunker_registry = ChunkerRegistry()
    payload = b"<html><body><p>Hello</p></body></html>"
    options = ExtractionOptions(mergeStrategy=MergeStrategy(), outputFormat="both", outputDetail="full")

    result = runExtraction(
        extractor_registry,
        chunker_registry,
        payload,
        "t.html",
        "text/html",
        options,
    )

    assert result.parts
    assert result.udm is not None
    assert result.udm.sourceType == "html"
    assert result.udm.children
|
|
|
|
|
|
def test_run_extraction_parts_only_no_udm():
    """Run extraction on an empty HTML shell with ``outputFormat="parts"``.

    Asserts that ``parts`` is non-empty while ``udm`` is ``None``.
    """
    extractor_registry = getExtractorRegistry()
    chunker_registry = ChunkerRegistry()
    payload = b"<html></html>"
    options = ExtractionOptions(
        mergeStrategy=MergeStrategy(),
        outputFormat="parts",
    )

    result = runExtraction(
        extractor_registry,
        chunker_registry,
        payload,
        "t.html",
        "text/html",
        options,
    )

    assert result.parts
    assert result.udm is None
|
|
|
|
|
|
def test_get_extractor_registry_singleton():
    """Two consecutive ``getExtractorRegistry()`` calls return the same object."""
    first_lookup = getExtractorRegistry()
    second_lookup = getExtractorRegistry()

    # Identity, not equality: both calls must yield the very same instance.
    assert first_lookup is second_lookup
|