# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import List
import logging

from modules.datamodels.datamodelExtraction import (
    ContentExtracted,
    ContentPart,
    ExtractionOptions,
    MergeStrategy,
)
from modules.datamodels.datamodelUdm import applyUdmOutputDetail

from .subUtils import makeId
from .subRegistry import ExtractorRegistry, ChunkerRegistry

logger = logging.getLogger(__name__)

# REMOVED: _mergeParts function - unused, functionality replaced by applyMerging in interfaceAiObjects.py


def runExtraction(
    extractorRegistry: ExtractorRegistry,
    chunkerRegistry: ChunkerRegistry,
    documentBytes: bytes,
    fileName: str,
    mimeType: str,
    options: ExtractionOptions,
) -> ContentExtracted:
    """Extract content parts from a document using the registered extractor.

    Args:
        extractorRegistry: Registry queried with ``mimeType`` and ``fileName``
            to find the extractor for this document.
        chunkerRegistry: Not used by this function; chunking was moved out of
            the extraction step. Kept so existing callers keep working.
        documentBytes: Raw bytes of the document.
        fileName: Name of the document, passed to the extractor via its context.
        mimeType: MIME type of the document; may be falsy, in which case the
            fallback part is labelled ``application/octet-stream``.
        options: Extraction options. This function reads ``lazyContainer``,
            ``mergeStrategy``, ``outputFormat`` and ``outputDetail``.

    Returns:
        A ``ContentExtracted`` holding the extracted parts. When no extractor
        is registered, it holds a single empty ``binary`` part carrying a
        warning in its metadata. When ``options.outputFormat`` is ``"udm"`` or
        ``"both"``, its ``udm`` attribute is populated as well.
    """
    handler = extractorRegistry.resolve(mimeType, fileName)

    # Guard clause: without an extractor, hand back one empty binary placeholder.
    if handler is None:
        placeholder = ContentPart(
            id=makeId(),
            parentId=None,
            label="file",
            typeGroup="binary",
            mimeType=mimeType or "application/octet-stream",
            data="",
            metadata={"warning": "No extractor registered"},
        )
        return ContentExtracted(id=makeId(), parts=[placeholder])

    context = {
        "fileName": fileName,
        "mimeType": mimeType,
        "lazyContainer": options.lazyContainer,
    }
    extractedParts = handler.extract(documentBytes, context)

    # REMOVED: poolAndLimit(parts, chunkerRegistry, options)
    # REMOVED: Chunking logic - now handled in AI call phase

    # Merge only when a strategy was supplied.
    if options.mergeStrategy:
        # Imported at call time rather than at module level.
        # NOTE(review): presumably to avoid a circular import - confirm.
        # NOTE(review): the "REMOVED" comments in this file say applyMerging
        # lives in interfaceAiObjects.py, but it is imported from
        # mainServiceExtraction here - confirm which is current.
        from .mainServiceExtraction import applyMerging

        extractedParts = applyMerging(extractedParts, options.mergeStrategy)

    extractionId = makeId()
    result = ContentExtracted(id=extractionId, parts=extractedParts)

    if options.outputFormat in ("udm", "both"):
        # The UDM pass receives the same context plus the extraction id, and
        # is handed the parts computed above via ``precomputedParts``.
        udmContext = {**context, "extractionId": extractionId}
        udmDocument = handler.extractToUdm(
            documentBytes,
            udmContext,
            precomputedParts=extractedParts,
        )
        result.udm = applyUdmOutputDetail(udmDocument, options.outputDetail)

    return result


# REMOVED: poolAndLimit function - chunking now handled in AI call phase
# REMOVED: applyMerging function - moved to interfaceAiObjects.py for proper interface-level access