56 lines
2.3 KiB
Python
56 lines
2.3 KiB
Python
from typing import Any, Dict, List, Optional, Union
|
|
import uuid
|
|
|
|
from .subRegistry import ExtractorRegistry, ChunkerRegistry
|
|
from .subPipeline import runExtraction, poolAndLimit, applyAiIfRequested
|
|
from modules.datamodels.datamodelExtraction import ExtractedContent, ContentPart
|
|
|
|
|
|
class ExtractionService:
    """Extract content from documents through the shared extraction pipeline.

    Each instance owns one ``ExtractorRegistry`` and one ``ChunkerRegistry``;
    both are handed to ``runExtraction`` for every document processed.
    """

    def __init__(self, services: Optional[Any] = None):
        """Store the optional service container and create the registries.

        Args:
            services: Optional object kept on ``self.services``. None of the
                methods of this class read it.
        """
        self.services = services
        self._extractorRegistry = ExtractorRegistry()
        self._chunkerRegistry = ChunkerRegistry()

    @staticmethod
    def _attachDocumentId(parts: List[Any], documentId: Optional[str] = None) -> None:
        """Tag every part that lacks a ``documentId`` with one shared id.

        Args:
            parts: Objects exposing a mapping-like ``metadata`` attribute.
            documentId: Id to attach. When falsy, a single random UUID4 string
                is generated and used for all parts of this call.
        """
        # Resolve the fallback once: generating it per part would give the
        # parts of one id-less document different, unrelated document ids.
        resolvedId = documentId or str(uuid.uuid4())
        for part in parts:
            # An id already present on a part is never overwritten.
            if "documentId" not in part.metadata:
                part.metadata["documentId"] = resolvedId

    def extractContent(self, documentList: List[Dict[str, Any]], options: Dict[str, Any]) -> "List[ExtractedContent]":
        """Run the extraction pipeline on each document.

        Args:
            documentList: Dicts read through the keys ``bytes``, ``fileName``,
                ``mimeType`` and ``id`` (missing keys are passed on as None).
            options: Passed unchanged to ``runExtraction`` and
                ``applyAiIfRequested``.

        Returns:
            One result per input document, in input order. An empty
            ``documentList`` yields an empty list.
        """
        results = []
        for doc in documentList:
            extracted = runExtraction(
                extractorRegistry=self._extractorRegistry,
                chunkerRegistry=self._chunkerRegistry,
                documentBytes=doc.get("bytes"),
                fileName=doc.get("fileName"),
                mimeType=doc.get("mimeType"),
                options=options,
            )
            # Attach the document id before the AI step, as the parts are
            # handed to applyAiIfRequested afterwards.
            self._attachDocumentId(extracted.parts, doc.get("id"))
            results.append(applyAiIfRequested(extracted, options))
        return results

    async def extractContentFromDocument(self, prompt: str, documents: List[Dict[str, Any]], options: Optional[Dict[str, Any]] = None) -> "List[ExtractedContent]":
        """Batch extract content from multiple documents.

        Args:
            prompt: Instructional prompt stored under ``options["ai"]["prompt"]``
                for the downstream pipeline. Ignored when empty.
            documents: List of dicts with keys: id, bytes, fileName, mimeType.
            options: Optional extraction options. "ai" config may be provided.
                Neither this dict nor its "ai" sub-dict is modified.

        Returns:
            List[ExtractedContent]: one per input document in order.
        """
        # NOTE(review): the pipeline below is called synchronously, so this
        # coroutine does not yield to the event loop while extracting.
        effectiveOptions: Dict[str, Any] = options.copy() if options else {}
        aiCfg = effectiveOptions.get("ai") or {}
        if prompt:
            # options.copy() is shallow, so the "ai" dict is still the caller's
            # object; build a new dict instead of writing the prompt into it.
            aiCfg = {**aiCfg, "prompt": prompt}
        effectiveOptions["ai"] = aiCfg

        # Delegate to the existing synchronous pipeline
        return self.extractContent(documents, effectiveOptions)
|
|
|
|
|