gateway/modules/services/serviceExtraction/mainServiceExtraction.py

57 lines
2.4 KiB
Python

from typing import Any, Dict, List, Optional
import uuid
from .subRegistry import ExtractorRegistry, ChunkerRegistry
from .subPipeline import runExtraction, poolAndLimit, applyAiIfRequested
from modules.datamodels.datamodelExtraction import ExtractedContent, ContentPart
class ExtractionService:
    """Run document extraction either per document or pooled across documents.

    The service owns one ``ExtractorRegistry`` and one ``ChunkerRegistry``
    and passes them to every pipeline call it makes.
    """

    def __init__(self):
        """Create the extractor and chunker registries used by this service."""
        self._extractorRegistry = ExtractorRegistry()
        self._chunkerRegistry = ChunkerRegistry()

    def extractDocuments(self, documentList: List[Dict[str, Any]], options: Dict[str, Any]) -> Any:
        """Extract content from every document in ``documentList``.

        Args:
            documentList: Document dicts. The keys read here are ``"bytes"``,
                ``"fileName"``, ``"mimeType"`` and, in pooled mode, ``"id"``.
            options: Extraction options. ``"processDocumentsIndividually"``
                (default ``True``) selects the mode; the whole dict is also
                forwarded unchanged to the pipeline helpers.

        Returns:
            Individual mode: a list with one entry per document, each being
            the value ``applyAiIfRequested`` returned for that document.
            Pooled mode: a dict with the keys ``"parts"`` (the output of
            ``poolAndLimit``), ``"summary"`` (``{"documents": <count>}``)
            and ``"ai"``.
        """
        if options.get("processDocumentsIndividually", True):
            # Per document: extract first, then hand the result to the AI step.
            results: List[ExtractedContent] = [
                applyAiIfRequested(self._runSingleExtraction(doc, options), options)
                for doc in documentList
            ]
            return results
        return self._extractPooled(documentList, options)

    def _runSingleExtraction(self, doc: Dict[str, Any], options: Dict[str, Any]) -> Any:
        """Call ``runExtraction`` for one document dict with this service's registries."""
        return runExtraction(
            extractorRegistry=self._extractorRegistry,
            chunkerRegistry=self._chunkerRegistry,
            documentBytes=doc.get("bytes"),
            fileName=doc.get("fileName"),
            mimeType=doc.get("mimeType"),
            options=options,
        )

    def _extractPooled(self, documentList: List[Dict[str, Any]], options: Dict[str, Any]) -> Dict[str, Any]:
        """Extract all documents, pool their parts and build the pooled result dict."""
        allParts: List[ContentPart] = []
        for doc in documentList:
            ec = self._runSingleExtraction(doc, options)
            # Resolve the fallback id ONCE per document. Evaluating it per part
            # would hand every part of an id-less document a different random
            # UUID, so the parts could no longer be traced to one document.
            fallbackDocumentId = doc.get("id") or str(uuid.uuid4())
            for part in ec.parts:
                # A documentId already present on the part is left untouched.
                if "documentId" not in part.metadata:
                    part.metadata["documentId"] = fallbackDocumentId
            allParts.extend(ec.parts)

        pooled = poolAndLimit(allParts, self._chunkerRegistry, options)
        # In pooled mode we return a dict containing pooled parts and an optional AI output
        pooledResult: Dict[str, Any] = {
            "parts": pooled,
            "summary": {"documents": len(documentList)},
        }
        aiOut = applyAiIfRequested(
            ExtractedContent(id=str(uuid.uuid4()), parts=pooled, summary=None),
            options,
        )
        # Expose only the summary when the AI step returned an ExtractedContent;
        # any other return value is passed through as-is.
        pooledResult["ai"] = aiOut.summary if isinstance(aiOut, ExtractedContent) else aiOut
        return pooledResult