# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import List
import logging

from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, ExtractionOptions, MergeStrategy

from .subUtils import makeId
from .subRegistry import ExtractorRegistry, ChunkerRegistry

logger = logging.getLogger(__name__)

# REMOVED: _mergeParts function - unused, functionality replaced by
# applyMerging in interfaceAiObjects.py


def _buildUnhandledPart(mimeType: str) -> ContentPart:
    """Build the placeholder part used when no extractor matches the input.

    The part carries no payload (``data`` is an empty string), is tagged with
    the ``"binary"`` type group, and records a warning in its metadata. A
    falsy ``mimeType`` is replaced by ``"application/octet-stream"``.
    """
    effectiveMimeType = mimeType or "application/octet-stream"
    return ContentPart(
        id=makeId(),
        parentId=None,
        label="file",
        typeGroup="binary",
        mimeType=effectiveMimeType,
        data="",
        metadata={"warning": "No extractor registered"},
    )


def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: ChunkerRegistry, documentBytes: bytes, fileName: str, mimeType: str, options: ExtractionOptions) -> ContentExtracted:
    """Extract content parts from a document and optionally merge them.

    An extractor is looked up via ``extractorRegistry.resolve(mimeType,
    fileName)``. When none is found, the result contains a single placeholder
    part (see ``_buildUnhandledPart``). Otherwise the extractor is invoked with
    the raw bytes plus a context dict holding ``fileName`` and ``mimeType``,
    and the resulting parts are passed through ``applyMerging`` when
    ``options.mergeStrategy`` is truthy.

    Args:
        extractorRegistry: Registry queried for a matching extractor.
        chunkerRegistry: Accepted for interface compatibility; not used in
            this function (chunking is handled in the AI call phase).
        documentBytes: Raw document content handed to the extractor.
        fileName: File name used for extractor resolution and passed to the
            extractor in its context dict.
        mimeType: MIME type used for extractor resolution and passed to the
            extractor in its context dict.
        options: Extraction options; only ``mergeStrategy`` is read here.

    Returns:
        A ``ContentExtracted`` with a freshly generated id and the resulting
        parts.
    """
    resolvedExtractor = extractorRegistry.resolve(mimeType, fileName)

    # Fallback: no extractor available -> single binary placeholder part.
    if resolvedExtractor is None:
        placeholderPart = _buildUnhandledPart(mimeType)
        return ContentExtracted(id=makeId(), parts=[placeholderPart])

    extractionContext = {"fileName": fileName, "mimeType": mimeType}
    extractedParts = resolvedExtractor.extract(documentBytes, extractionContext)

    # REMOVED: poolAndLimit(parts, chunkerRegistry, options)
    # REMOVED: Chunking logic - now handled in AI call phase

    # Apply the merging strategy if one is provided (preserves existing logic).
    mergeStrategy = options.mergeStrategy
    if mergeStrategy:
        # Function-scope import of the module-level applyMerging function.
        # NOTE(review): presumably deferred to avoid a circular import - confirm.
        # NOTE(review): a note at the end of this file says applyMerging moved
        # to interfaceAiObjects.py, yet the import targets
        # .mainServiceExtraction - verify which location is current.
        from .mainServiceExtraction import applyMerging

        extractedParts = applyMerging(extractedParts, mergeStrategy)

    return ContentExtracted(id=makeId(), parts=extractedParts)


# REMOVED: poolAndLimit function - chunking now handled in AI call phase
# REMOVED: applyMerging function - moved to interfaceAiObjects.py for proper
# interface-level access