from typing import Any, Dict, List, Optional, Union
import uuid

from .subRegistry import ExtractorRegistry, ChunkerRegistry
from .subPipeline import runExtraction, poolAndLimit, applyAiIfRequested
from modules.datamodels.datamodelExtraction import ExtractedContent, ContentPart


class ExtractionService:
    """Run the extraction pipeline over a batch of documents.

    Holds one ExtractorRegistry and one ChunkerRegistry that are passed to
    ``runExtraction`` for every document processed by this instance.
    """

    def __init__(self, services: Optional[Any] = None):
        """Store the optional ``services`` handle and create the registries."""
        self.services = services
        self._extractorRegistry = ExtractorRegistry()
        self._chunkerRegistry = ChunkerRegistry()

    def extractContent(
        self,
        documentList: List[Dict[str, Any]],
        options: Dict[str, Any],
    ) -> List[ExtractedContent]:
        """Extract content from each document, in input order.

        Args:
            documentList: Dicts read via the keys ``id``, ``bytes``,
                ``fileName`` and ``mimeType``.
            options: Extraction options, forwarded unchanged to
                ``runExtraction`` and ``applyAiIfRequested``.

        Returns:
            One ExtractedContent per input document, in the same order.
        """
        results: List[ExtractedContent] = []
        for doc in documentList:
            extracted = runExtraction(
                extractorRegistry=self._extractorRegistry,
                chunkerRegistry=self._chunkerRegistry,
                documentBytes=doc.get("bytes"),
                fileName=doc.get("fileName"),
                mimeType=doc.get("mimeType"),
                options=options,
            )

            # Resolve the id once per document so that every part of the same
            # document shares it; generating the fallback UUID per part would
            # give sibling parts unrelated ids.
            documentId = doc.get("id") or str(uuid.uuid4())
            for part in extracted.parts:
                # Only fill in the id where the pipeline did not set one.
                if "documentId" not in part.metadata:
                    part.metadata["documentId"] = documentId

            extracted = applyAiIfRequested(extracted, options)
            results.append(extracted)
        return results

    async def extractContentFromDocument(
        self,
        prompt: str,
        documents: List[Dict[str, Any]],
        options: Optional[Dict[str, Any]] = None,
    ) -> List[ExtractedContent]:
        """Batch extract content from multiple documents.

        Args:
            prompt: Instructional prompt for optional AI post-processing/selection.
                When non-empty it is stored under ``options["ai"]["prompt"]``.
            documents: List of dicts with keys: id, bytes, fileName, mimeType.
            options: Optional extraction options. "ai" config may be provided.
                The caller's dict and its nested "ai" dict are not modified.

        Returns:
            List[ExtractedContent]: one per input document in order.
        """
        # Shallow-copy the top level so the "ai" key can be rebound safely.
        effectiveOptions: Dict[str, Any] = options.copy() if options else {}
        aiCfg = effectiveOptions.get("ai") or {}
        if prompt:
            # Build a new dict instead of assigning into aiCfg: after a shallow
            # copy the nested "ai" dict is still the caller's object.
            aiCfg = {**aiCfg, "prompt": prompt}
        effectiveOptions["ai"] = aiCfg

        # Delegate to the synchronous pipeline; this call is not awaited and
        # runs to completion inside the coroutine.
        return self.extractContent(documents, effectiveOptions)