gateway/modules/services/serviceExtraction/mainServiceExtraction.py
2025-09-30 18:30:33 +02:00

56 lines
2.3 KiB
Python

from typing import Any, Dict, List, Optional, Union
import uuid
from .subRegistry import ExtractorRegistry, ChunkerRegistry
from .subPipeline import runExtraction, poolAndLimit, applyAiIfRequested
from modules.datamodels.datamodelExtraction import ExtractedContent, ContentPart
class ExtractionService:
    """Facade that runs the extraction pipeline over a batch of documents.

    Holds one extractor registry and one chunker registry, created at
    construction time and handed to ``runExtraction`` for every document.
    """

    def __init__(self, services: Optional[Any] = None):
        """Store the optional service container and build the registries.

        Args:
            services: Opaque handle kept on ``self.services``; it is not
                used by the methods of this class.
        """
        self.services = services
        self._extractorRegistry = ExtractorRegistry()
        self._chunkerRegistry = ChunkerRegistry()

    def extractContent(self, documentList: List[Dict[str, Any]], options: Dict[str, Any]) -> List[ExtractedContent]:
        """Extract content from each document, in input order.

        Args:
            documentList: Dicts read via the keys ``bytes``, ``fileName``,
                ``mimeType`` and (optionally) ``id``.
            options: Extraction options, passed unchanged to
                ``runExtraction`` and ``applyAiIfRequested``.

        Returns:
            One ``ExtractedContent`` per input document, in the same order.
        """
        results: List[ExtractedContent] = []
        for doc in documentList:
            ec = runExtraction(
                extractorRegistry=self._extractorRegistry,
                chunkerRegistry=self._chunkerRegistry,
                documentBytes=doc.get("bytes"),
                fileName=doc.get("fileName"),
                mimeType=doc.get("mimeType"),
                options=options,
            )
            # Resolve the id once per document so that every part of the
            # same document shares it, even when a random fallback id has
            # to be generated because the caller supplied none.
            documentId = doc.get("id") or str(uuid.uuid4())
            for part in ec.parts:
                # Only fill in the id where the pipeline has not set one.
                if "documentId" not in part.metadata:
                    part.metadata["documentId"] = documentId
            ec = applyAiIfRequested(ec, options)
            results.append(ec)
        return results

    async def extractContentFromDocument(self, prompt: str, documents: List[Dict[str, Any]], options: Optional[Dict[str, Any]] = None) -> List[ExtractedContent]:
        """Batch extract content from multiple documents.

        The caller's ``options`` (including its nested ``"ai"`` dict) is
        never modified; the prompt is injected into private copies.

        NOTE(review): this coroutine calls the synchronous pipeline inline,
        so it does not yield to the event loop while extracting.

        Args:
            prompt: Instructional prompt for optional AI post-processing/selection.
            documents: List of dicts with keys: id, bytes, fileName, mimeType.
            options: Optional extraction options. "ai" config may be provided.

        Returns:
            List[ExtractedContent]: one per input document in order.
        """
        effectiveOptions: Dict[str, Any] = options.copy() if options else {}
        # ``options.copy()`` is shallow, so the nested "ai" dict must be
        # copied too before the prompt is written into it.
        aiCfg: Dict[str, Any] = dict(effectiveOptions.get("ai") or {})
        if prompt:
            aiCfg["prompt"] = prompt
        effectiveOptions["ai"] = aiCfg
        # Delegate to the existing synchronous pipeline.
        return self.extractContent(documents, effectiveOptions)