57 lines
2.4 KiB
Python
57 lines
2.4 KiB
Python
from typing import Any, Dict, List, Optional
|
|
import uuid
|
|
|
|
from .subRegistry import ExtractorRegistry, ChunkerRegistry
|
|
from .subPipeline import runExtraction, poolAndLimit, applyAiIfRequested
|
|
from modules.datamodels.datamodelExtraction import ExtractedContent, ContentPart
|
|
|
|
|
|
class ExtractionService:
    """Extract content from a list of documents, individually or pooled.

    The service owns one ``ExtractorRegistry`` and one ``ChunkerRegistry`` and
    passes them to the pipeline helpers (``runExtraction``, ``poolAndLimit``)
    on every call.
    """

    def __init__(self):
        """Create the extractor and chunker registries used by every call."""
        self._extractorRegistry = ExtractorRegistry()
        self._chunkerRegistry = ChunkerRegistry()

    def _extractDocument(self, doc: Dict[str, Any], options: Dict[str, Any]) -> "ExtractedContent":
        """Run ``runExtraction`` for a single document descriptor.

        Args:
            doc: Document descriptor; the keys ``"bytes"``, ``"fileName"`` and
                ``"mimeType"`` are read with ``dict.get`` (missing keys are
                forwarded as ``None``).
            options: Extraction options, forwarded unchanged.

        Returns:
            Whatever ``runExtraction`` returns for this document.
        """
        return runExtraction(
            extractorRegistry=self._extractorRegistry,
            chunkerRegistry=self._chunkerRegistry,
            documentBytes=doc.get("bytes"),
            fileName=doc.get("fileName"),
            mimeType=doc.get("mimeType"),
            options=options,
        )

    def extractDocuments(self, documentList: List[Dict[str, Any]], options: Dict[str, Any]) -> Any:
        """Extract all documents in ``documentList``.

        The mode is selected by ``options["processDocumentsIndividually"]``
        (defaults to ``True`` when the key is absent).

        Args:
            documentList: Document descriptors (dicts). Keys read here:
                ``"bytes"``, ``"fileName"``, ``"mimeType"`` and, in pooled
                mode, ``"id"``.
            options: Options dict; also forwarded unchanged to
                ``runExtraction``, ``poolAndLimit`` and ``applyAiIfRequested``.

        Returns:
            Individual mode: a list with one entry per document, each being the
            result of ``applyAiIfRequested`` applied to that document's
            extraction.

            Pooled mode: a dict with the keys
            ``"parts"`` (result of ``poolAndLimit`` over all parts),
            ``"summary"`` (``{"documents": <number of input documents>}``) and
            ``"ai"`` (the ``summary`` attribute of the AI result when that
            result is an ``ExtractedContent``, otherwise the AI result itself).
        """
        if options.get("processDocumentsIndividually", True):
            return [
                applyAiIfRequested(self._extractDocument(doc, options), options)
                for doc in documentList
            ]

        allParts: List[ContentPart] = []
        for doc in documentList:
            extracted = self._extractDocument(doc, options)
            # Resolve the fallback ID once per document. Generating a UUID per
            # part would give every part of an ID-less document a different
            # "documentId", so they could no longer be traced to one source.
            documentId = doc.get("id") or str(uuid.uuid4())
            for part in extracted.parts:
                # Parts that already carry a documentId keep it.
                if "documentId" not in part.metadata:
                    part.metadata["documentId"] = documentId
            allParts.extend(extracted.parts)

        pooled = poolAndLimit(allParts, self._chunkerRegistry, options)

        # In pooled mode we return a dict containing pooled parts and an optional AI output
        pooledResult: Dict[str, Any] = {
            "parts": pooled,
            "summary": {"documents": len(documentList)},
        }
        aiOut = applyAiIfRequested(
            ExtractedContent(id=str(uuid.uuid4()), parts=pooled, summary=None),
            options,
        )
        pooledResult["ai"] = aiOut.summary if isinstance(aiOut, ExtractedContent) else aiOut
        return pooledResult
|
|
|
|
|