379 lines
15 KiB
Python
379 lines
15 KiB
Python
from typing import Any, Dict, List, Optional, Union
|
|
import uuid
|
|
import logging
|
|
import time
|
|
|
|
from .subRegistry import ExtractorRegistry, ChunkerRegistry
|
|
from .subPipeline import runExtraction, poolAndLimit, applyAiIfRequested
|
|
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, MergeStrategy
|
|
from modules.datamodels.datamodelChat import ChatDocument
|
|
from modules.datamodels.datamodelAi import AiCallResponse
|
|
from modules.interfaces.interfaceAiObjects import aiModels
|
|
|
|
|
|
# Module-level logger named after this module, per standard logging convention
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class ExtractionService:
    """Service that extracts content from chat documents and merges AI results.

    Holds the extractor and chunker registries used by the extraction
    pipeline (see `extractContent`) and the merge helpers used to fold
    per-chunk AI responses back into a single document (`mergeAiResults`).
    """

    def __init__(self, services: Optional[Any] = None):
        """
        Args:
            services: Optional application services container. When present,
                `extractContent` uses `services.workflow` / `services.currentWorkflow`
                to emit per-document workflow statistics.
        """
        self.services = services
        # Registries handed to runExtraction for each document
        self._extractorRegistry = ExtractorRegistry()
        self._chunkerRegistry = ChunkerRegistry()
|
|
|
|
def extractContent(self, documents: List[ChatDocument], options: Dict[str, Any]) -> List[ContentExtracted]:
|
|
"""
|
|
Extract content from a list of ChatDocument objects.
|
|
|
|
Args:
|
|
documents: List of ChatDocument objects to extract content from
|
|
options: Extraction options including maxSize, chunkAllowed, mergeStrategy, etc.
|
|
|
|
Returns:
|
|
List of ContentExtracted objects, one per input document
|
|
"""
|
|
results: List[ContentExtracted] = []
|
|
|
|
# Lazy import to avoid circular deps and heavy init at module import
|
|
from modules.interfaces.interfaceDbComponentObjects import getInterface
|
|
dbInterface = getInterface()
|
|
|
|
for i, doc in enumerate(documents):
|
|
logger.info(f"=== DOCUMENT {i}: {doc.fileName} ===")
|
|
logger.info(f"Initial MIME type: {doc.mimeType}")
|
|
|
|
# Start timing for this document
|
|
startTime = time.time()
|
|
|
|
# Resolve raw bytes for this document using interface
|
|
documentBytes = dbInterface.getFileData(doc.fileId)
|
|
if not documentBytes:
|
|
raise ValueError(f"No file data found for fileId={doc.fileId}")
|
|
|
|
# Convert ChatDocument to the format expected by runExtraction
|
|
documentData = {
|
|
"id": doc.id,
|
|
"bytes": documentBytes,
|
|
"fileName": doc.fileName,
|
|
"mimeType": doc.mimeType
|
|
}
|
|
|
|
ec = runExtraction(
|
|
extractorRegistry=self._extractorRegistry,
|
|
chunkerRegistry=self._chunkerRegistry,
|
|
documentBytes=documentData["bytes"],
|
|
fileName=documentData["fileName"],
|
|
mimeType=documentData["mimeType"],
|
|
options=options
|
|
)
|
|
|
|
# Log content parts metadata
|
|
logger.debug(f"Content parts: {len(ec.parts)}")
|
|
for j, part in enumerate(ec.parts):
|
|
logger.debug(f" Part {j}: {part.typeGroup} ({part.mimeType}) - {len(part.data) if part.data else 0} chars")
|
|
if part.metadata:
|
|
logger.debug(f" Metadata: {part.metadata}")
|
|
|
|
# Attach document id and MIME type to parts if missing
|
|
for p in ec.parts:
|
|
if "documentId" not in p.metadata:
|
|
p.metadata["documentId"] = documentData["id"] or str(uuid.uuid4())
|
|
if "documentMimeType" not in p.metadata:
|
|
p.metadata["documentMimeType"] = documentData["mimeType"]
|
|
|
|
# Log chunking information
|
|
chunked_parts = [p for p in ec.parts if p.metadata.get("chunk", False)]
|
|
if chunked_parts:
|
|
logger.debug(f"=== CHUNKING RESULTS ===")
|
|
logger.debug(f"Total parts: {len(ec.parts)}")
|
|
logger.debug(f"Chunked parts: {len(chunked_parts)}")
|
|
for chunk in chunked_parts:
|
|
logger.debug(f" Chunk: {chunk.label} - {len(chunk.data)} chars (parent: {chunk.parentId})")
|
|
else:
|
|
logger.debug(f"No chunking needed - {len(ec.parts)} parts fit within size limits")
|
|
|
|
ec = applyAiIfRequested(ec, options)
|
|
|
|
# Calculate timing and emit stats
|
|
endTime = time.time()
|
|
processingTime = endTime - startTime
|
|
bytesSent = len(documentBytes)
|
|
bytesReceived = sum(len(part.data) if part.data else 0 for part in ec.parts)
|
|
|
|
# Emit stats for extraction operation
|
|
|
|
# Use internal extraction model for pricing
|
|
modelName = "internal_extraction"
|
|
priceUsd = aiModels[modelName]["calculatePriceUsd"](processingTime, bytesSent, bytesReceived)
|
|
|
|
# Create AiCallResponse with real calculation
|
|
aiResponse = AiCallResponse(
|
|
content="", # No content for extraction stats needed
|
|
modelName=modelName,
|
|
priceUsd=priceUsd,
|
|
processingTime=processingTime,
|
|
bytesSent=bytesSent,
|
|
bytesReceived=bytesReceived,
|
|
errorCount=0
|
|
)
|
|
|
|
self.services.workflow.storeWorkflowStat(
|
|
self.services.currentWorkflow,
|
|
aiResponse,
|
|
f"extraction.process.{doc.mimeType}"
|
|
)
|
|
|
|
results.append(ec)
|
|
|
|
return results
|
|
|
|
def mergeAiResults(
|
|
self,
|
|
extractedContent: List[ContentExtracted],
|
|
aiResults: List[str],
|
|
strategy: MergeStrategy
|
|
) -> ContentExtracted:
|
|
"""
|
|
Merge AI results from chunked content back into a single ContentExtracted.
|
|
|
|
Args:
|
|
extractedContent: List of ContentExtracted objects that were processed
|
|
aiResults: List of AI response strings, one per chunk
|
|
strategy: Merge strategy configuration (dict or MergeStrategy object)
|
|
|
|
Returns:
|
|
Single ContentExtracted with merged AI results
|
|
"""
|
|
logger.debug(f"=== MERGING AI RESULTS ===")
|
|
logger.debug(f"Extracted content: {len(extractedContent)} documents")
|
|
logger.debug(f"AI results: {len(aiResults)} responses")
|
|
logger.debug(f"Merge strategy: {strategy.mergeType}")
|
|
|
|
mergeStrategy = strategy
|
|
|
|
# Collect all parts from all extracted content
|
|
allParts: List[ContentPart] = []
|
|
for ec in extractedContent:
|
|
allParts.extend(ec.parts)
|
|
|
|
logger.debug(f"Total original parts: {len(allParts)}")
|
|
|
|
# Create AI result parts
|
|
aiResultParts: List[ContentPart] = []
|
|
for i, aiResult in enumerate(aiResults):
|
|
aiPart = ContentPart(
|
|
id=f"ai_result_{i}",
|
|
parentId=None, # Will be set based on strategy
|
|
label="ai_result",
|
|
typeGroup="text",
|
|
mimeType="text/plain",
|
|
data=aiResult,
|
|
metadata={
|
|
"aiResult": True,
|
|
"order": i,
|
|
"size": len(aiResult.encode('utf-8'))
|
|
}
|
|
)
|
|
aiResultParts.append(aiPart)
|
|
|
|
logger.debug(f"Created {len(aiResultParts)} AI result parts")
|
|
|
|
# Apply merging strategy
|
|
if mergeStrategy.mergeType == "concatenate":
|
|
mergedParts = self._mergeConcatenate(allParts, aiResultParts, mergeStrategy)
|
|
elif mergeStrategy.mergeType == "hierarchical":
|
|
mergedParts = self._mergeHierarchical(allParts, aiResultParts, mergeStrategy)
|
|
elif mergeStrategy.mergeType == "intelligent":
|
|
mergedParts = self._mergeIntelligent(allParts, aiResultParts, mergeStrategy)
|
|
else:
|
|
# Default to concatenate
|
|
mergedParts = self._mergeConcatenate(allParts, aiResultParts, mergeStrategy)
|
|
|
|
# Create final ContentExtracted
|
|
mergedContent = ContentExtracted(
|
|
id=f"merged_{uuid.uuid4()}",
|
|
parts=mergedParts
|
|
)
|
|
|
|
logger.debug(f"=== MERGE COMPLETED ===")
|
|
logger.debug(f"Final merged parts: {len(mergedParts)}")
|
|
logger.debug(f"Merged content ID: {mergedContent.id}")
|
|
|
|
return mergedContent
|
|
|
|
def _mergeConcatenate(
|
|
self,
|
|
originalParts: List[ContentPart],
|
|
aiResultParts: List[ContentPart],
|
|
strategy: MergeStrategy
|
|
) -> List[ContentPart]:
|
|
"""Merge parts by simple concatenation."""
|
|
mergedParts = []
|
|
|
|
# Add original parts (filtered if needed)
|
|
for part in originalParts:
|
|
if strategy.preserveChunks or not part.metadata.get("chunk", False):
|
|
mergedParts.append(part)
|
|
|
|
# Add AI results
|
|
if aiResultParts:
|
|
# Group AI results by parentId if available
|
|
aiResultsByParent = {}
|
|
for aiPart in aiResultParts:
|
|
parentId = aiPart.parentId or "root"
|
|
if parentId not in aiResultsByParent:
|
|
aiResultsByParent[parentId] = []
|
|
aiResultsByParent[parentId].append(aiPart)
|
|
|
|
# Merge AI results for each parent
|
|
for parentId, aiParts in aiResultsByParent.items():
|
|
if len(aiParts) == 1:
|
|
mergedParts.append(aiParts[0])
|
|
else:
|
|
# Concatenate multiple AI results for same parent
|
|
combinedData = strategy.chunkSeparator.join([p.data for p in aiParts])
|
|
combinedPart = ContentPart(
|
|
id=f"merged_ai_{parentId}",
|
|
parentId=parentId if parentId != "root" else None,
|
|
label="merged_ai_result",
|
|
typeGroup="text",
|
|
mimeType="text/plain",
|
|
data=combinedData,
|
|
metadata={
|
|
"aiResult": True,
|
|
"merged": True,
|
|
"sourceCount": len(aiParts),
|
|
"size": len(combinedData.encode('utf-8'))
|
|
}
|
|
)
|
|
mergedParts.append(combinedPart)
|
|
|
|
return mergedParts
|
|
|
|
def _mergeHierarchical(
|
|
self,
|
|
originalParts: List[ContentPart],
|
|
aiResultParts: List[ContentPart],
|
|
strategy: MergeStrategy
|
|
) -> List[ContentPart]:
|
|
"""Merge parts hierarchically based on parentId relationships."""
|
|
# Group parts by parentId
|
|
partsByParent = {}
|
|
for part in originalParts:
|
|
parentId = part.parentId or "root"
|
|
if parentId not in partsByParent:
|
|
partsByParent[parentId] = []
|
|
partsByParent[parentId].append(part)
|
|
|
|
# Group AI results by parentId
|
|
aiResultsByParent = {}
|
|
for aiPart in aiResultParts:
|
|
parentId = aiPart.parentId or "root"
|
|
if parentId not in aiResultsByParent:
|
|
aiResultsByParent[parentId] = []
|
|
aiResultsByParent[parentId].append(aiPart)
|
|
|
|
mergedParts = []
|
|
|
|
# Process each parent group
|
|
for parentId in set(list(partsByParent.keys()) + list(aiResultsByParent.keys())):
|
|
originalGroup = partsByParent.get(parentId, [])
|
|
aiGroup = aiResultsByParent.get(parentId, [])
|
|
|
|
# Add original parts
|
|
mergedParts.extend(originalGroup)
|
|
|
|
# Add AI results for this parent
|
|
if aiGroup:
|
|
if len(aiGroup) == 1:
|
|
mergedParts.append(aiGroup[0])
|
|
else:
|
|
# Merge multiple AI results
|
|
combinedData = strategy.chunkSeparator.join([p.data for p in aiGroup])
|
|
combinedPart = ContentPart(
|
|
id=f"hierarchical_ai_{parentId}",
|
|
parentId=parentId if parentId != "root" else None,
|
|
label="hierarchical_ai_result",
|
|
typeGroup="text",
|
|
mimeType="text/plain",
|
|
data=combinedData,
|
|
metadata={
|
|
"aiResult": True,
|
|
"hierarchical": True,
|
|
"sourceCount": len(aiGroup),
|
|
"size": len(combinedData.encode('utf-8'))
|
|
}
|
|
)
|
|
mergedParts.append(combinedPart)
|
|
|
|
return mergedParts
|
|
|
|
def _mergeIntelligent(
|
|
self,
|
|
originalParts: List[ContentPart],
|
|
aiResultParts: List[ContentPart],
|
|
strategy: MergeStrategy
|
|
) -> List[ContentPart]:
|
|
"""Merge parts using intelligent strategies based on content type."""
|
|
mergedParts = []
|
|
|
|
# Group by typeGroup for intelligent merging
|
|
partsByType = {}
|
|
for part in originalParts:
|
|
typeGroup = part.typeGroup
|
|
if typeGroup not in partsByType:
|
|
partsByType[typeGroup] = []
|
|
partsByType[typeGroup].append(part)
|
|
|
|
# Process each type group
|
|
for typeGroup, parts in partsByType.items():
|
|
if typeGroup == "text":
|
|
mergedParts.extend(self._mergeTextIntelligent(parts, aiResultParts, strategy))
|
|
elif typeGroup == "table":
|
|
mergedParts.extend(self._mergeTableIntelligent(parts, aiResultParts, strategy))
|
|
elif typeGroup == "structure":
|
|
mergedParts.extend(self._mergeStructureIntelligent(parts, aiResultParts, strategy))
|
|
else:
|
|
# Default handling for other types
|
|
mergedParts.extend(parts)
|
|
|
|
# Add any remaining AI results that weren't merged
|
|
for aiPart in aiResultParts:
|
|
if not any(p.id == aiPart.id for p in mergedParts):
|
|
mergedParts.append(aiPart)
|
|
|
|
return mergedParts
|
|
|
|
def _mergeTextIntelligent(
|
|
self,
|
|
textParts: List[ContentPart],
|
|
aiResultParts: List[ContentPart],
|
|
strategy: MergeStrategy
|
|
) -> List[ContentPart]:
|
|
"""Intelligent merging for text content."""
|
|
# For now, use concatenate strategy
|
|
# This could be enhanced with semantic analysis, summarization, etc.
|
|
return self._mergeConcatenate(textParts, aiResultParts, strategy)
|
|
|
|
def _mergeTableIntelligent(
|
|
self,
|
|
tableParts: List[ContentPart],
|
|
aiResultParts: List[ContentPart],
|
|
strategy: MergeStrategy
|
|
) -> List[ContentPart]:
|
|
"""Intelligent merging for table content."""
|
|
# For now, use concatenate strategy
|
|
# This could be enhanced with table merging logic
|
|
return self._mergeConcatenate(tableParts, aiResultParts, strategy)
|
|
|
|
def _mergeStructureIntelligent(
|
|
self,
|
|
structureParts: List[ContentPart],
|
|
aiResultParts: List[ContentPart],
|
|
strategy: MergeStrategy
|
|
) -> List[ContentPart]:
|
|
"""Intelligent merging for structured content."""
|
|
# For now, use concatenate strategy
|
|
# This could be enhanced with structure-aware merging
|
|
return self._mergeConcatenate(structureParts, aiResultParts, strategy)
|
|
|
|
|