from typing import Any, Dict, List, Optional
import uuid

from .subRegistry import ExtractorRegistry, ChunkerRegistry
from .subPipeline import runExtraction, poolAndLimit, applyAiIfRequested
from modules.datamodels.datamodelExtraction import ExtractedContent, ContentPart


class ExtractionService:
    """Run document extraction either per document or pooled across documents.

    The service owns one extractor registry and one chunker registry and hands
    them to the pipeline helpers on every call.
    """

    def __init__(self):
        self._extractorRegistry = ExtractorRegistry()
        self._chunkerRegistry = ChunkerRegistry()

    def _extractOne(self, doc: Dict[str, Any], options: Dict[str, Any]) -> ExtractedContent:
        """Run ``runExtraction`` for one document dict using this service's registries."""
        return runExtraction(
            extractorRegistry=self._extractorRegistry,
            chunkerRegistry=self._chunkerRegistry,
            documentBytes=doc.get("bytes"),
            fileName=doc.get("fileName"),
            mimeType=doc.get("mimeType"),
            options=options,
        )

    def extractDocuments(self, documentList: List[Dict[str, Any]], options: Dict[str, Any]) -> Any:
        """Extract content from every document in ``documentList``.

        Args:
            documentList: Document dicts. The keys read here are ``"bytes"``,
                ``"fileName"``, ``"mimeType"`` and, in pooled mode, ``"id"``.
            options: Extraction options. ``"processDocumentsIndividually"``
                (default ``True``) selects the mode; the whole dict is also
                forwarded unchanged to the pipeline helpers.

        Returns:
            Individual mode: a list with one result per document, each being
            whatever ``applyAiIfRequested`` returns for that document's
            extraction.

            Pooled mode: a dict with
              - ``"parts"``: the output of ``poolAndLimit`` over all parts,
              - ``"summary"``: ``{"documents": <number of input documents>}``,
              - ``"ai"``: the ``summary`` of the ``applyAiIfRequested`` result
                when that result is an ``ExtractedContent``, otherwise the raw
                result.
        """
        if options.get("processDocumentsIndividually", True):
            # Each document is extracted and post-processed before the next one starts.
            return [
                applyAiIfRequested(self._extractOne(doc, options), options)
                for doc in documentList
            ]

        allParts: List[ContentPart] = []
        for doc in documentList:
            ec = self._extractOne(doc, options)
            # Resolve the fallback id at most once per document so that all
            # parts of one document share the same documentId. Generating a
            # fresh UUID per part would make parts of an id-less document
            # impossible to group again after pooling.
            fallbackId: Optional[str] = None
            for part in ec.parts:
                if "documentId" not in part.metadata:
                    if fallbackId is None:
                        fallbackId = doc.get("id") or str(uuid.uuid4())
                    part.metadata["documentId"] = fallbackId
            allParts.extend(ec.parts)

        pooled = poolAndLimit(allParts, self._chunkerRegistry, options)

        # In pooled mode we return a dict containing pooled parts and an optional AI output
        pooledResult: Dict[str, Any] = {
            "parts": pooled,
            "summary": {"documents": len(documentList)},
        }
        aiOut = applyAiIfRequested(
            ExtractedContent(id=str(uuid.uuid4()), parts=pooled, summary=None),
            options,
        )
        pooledResult["ai"] = aiOut.summary if isinstance(aiOut, ExtractedContent) else aiOut
        return pooledResult