gateway/modules/serviceCenter/services/serviceExtraction/subPipeline.py
2026-04-26 08:31:35 +02:00

63 lines
2.3 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import List
import logging
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, ExtractionOptions, MergeStrategy
from modules.datamodels.datamodelUdm import applyUdmOutputDetail
from .subUtils import makeId
from .subRegistry import ExtractorRegistry, ChunkerRegistry
logger = logging.getLogger(__name__)
# REMOVED: _mergeParts function - unused, functionality replaced by applyMerging in interfaceAiObjects.py
def runExtraction(
    extractorRegistry: ExtractorRegistry,
    chunkerRegistry: ChunkerRegistry,
    documentBytes: bytes,
    fileName: str,
    mimeType: str,
    options: ExtractionOptions,
) -> ContentExtracted:
    """Run the extractor matching a document and wrap its parts in a ContentExtracted.

    Steps:
      1. Resolve an extractor from ``extractorRegistry`` by MIME type and file name.
      2. If none is registered, return a single placeholder "binary" part that
         carries a warning in its metadata.
      3. Otherwise extract the parts, apply ``options.mergeStrategy`` when it is
         set, and attach a UDM when ``options.outputFormat`` is "udm" or "both".

    Chunking is not performed here; per the notes in this module it is handled
    in the AI call phase.

    Args:
        extractorRegistry: Registry queried via ``resolve(mimeType, fileName)``.
        chunkerRegistry: Not used by this function; kept in the signature.
        documentBytes: Raw document content handed to the extractor.
        fileName: Name of the source file, forwarded in the extraction context.
        mimeType: MIME type of the document; the fallback part uses
            "application/octet-stream" when this is empty.
        options: Extraction options; this function reads ``lazyContainer``,
            ``mergeStrategy``, ``outputFormat`` and ``outputDetail``.

    Returns:
        The ContentExtracted holding the resulting parts and, if requested,
        the ``udm`` attribute.
    """
    resolvedExtractor = extractorRegistry.resolve(mimeType, fileName)

    if resolvedExtractor is None:
        # Nothing can handle this document: emit one empty binary part so the
        # caller still receives a well-formed result with a warning attached.
        placeholderFields = {
            "id": makeId(),
            "parentId": None,
            "label": "file",
            "typeGroup": "binary",
            "mimeType": mimeType if mimeType else "application/octet-stream",
            "data": "",
            "metadata": {"warning": "No extractor registered"},
        }
        placeholderPart = ContentPart(**placeholderFields)
        return ContentExtracted(id=makeId(), parts=[placeholderPart])

    # Context shared by the plain extraction and the optional UDM extraction.
    extractCtx = {
        "fileName": fileName,
        "mimeType": mimeType,
        "lazyContainer": options.lazyContainer,
    }
    extractedParts = resolvedExtractor.extract(documentBytes, extractCtx)

    if options.mergeStrategy:
        # Imported at call time rather than at module level
        # (presumably to avoid a circular import - TODO confirm).
        from .mainServiceExtraction import applyMerging

        extractedParts = applyMerging(extractedParts, options.mergeStrategy)

    extractionId = makeId()
    result = ContentExtracted(id=extractionId, parts=extractedParts)

    # Plain output requested: no UDM needs to be built.
    if options.outputFormat not in ("udm", "both"):
        return result

    # The UDM extraction gets the same context plus the id of this extraction,
    # and reuses the (possibly merged) parts computed above.
    udmCtx = dict(extractCtx)
    udmCtx["extractionId"] = extractionId
    rawUdm = resolvedExtractor.extractToUdm(
        documentBytes,
        udmCtx,
        precomputedParts=extractedParts,
    )
    result.udm = applyUdmOutputDetail(rawUdm, options.outputDetail)
    return result
# REMOVED: poolAndLimit function - chunking now handled in AI call phase
# REMOVED: applyMerging function - moved to interfaceAiObjects.py for proper interface-level access
# NOTE: runExtraction in this module imports applyMerging from .mainServiceExtraction