# serviceExtraction — extraction service module (original file: 1188 lines, 54 KiB, Python)
from typing import Any, Dict, List, Optional, Union
|
|
import uuid
|
|
import logging
|
|
import time
|
|
import asyncio
|
|
import base64
|
|
|
|
from .subRegistry import ExtractorRegistry, ChunkerRegistry
|
|
from .subPipeline import runExtraction
|
|
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, MergeStrategy, ExtractionOptions, PartResult
|
|
from modules.datamodels.datamodelChat import ChatDocument
|
|
from modules.datamodels.datamodelAi import AiCallResponse, AiCallRequest, AiCallOptions, OperationTypeEnum, AiModelCall
|
|
from modules.aicore.aicoreModelRegistry import modelRegistry
|
|
from modules.aicore.aicoreModelSelector import modelSelector
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class ExtractionService:
|
|
def __init__(self, services: Optional[Any] = None):
    """Initialize the extraction service.

    Sets up the extractor/chunker registries, registers every discoverable
    AI connector, and verifies that the internal pricing model exists.

    Args:
        services: Optional service container (workflow, chat, utils, ...).

    Raises:
        RuntimeError: If the required internal pricing model is unavailable.
    """
    self.services = services
    self._extractorRegistry = ExtractorRegistry()
    self._chunkerRegistry = ChunkerRegistry()

    # Make sure every discoverable connector is registered before use
    for connector in modelRegistry.discoverConnectors():
        modelRegistry.registerConnector(connector)

    # Verify required internal model is available (used for pricing in extractContent)
    modelDisplayName = "Internal Document Extractor"
    pricingModel = modelRegistry.getModel(modelDisplayName)
    if pricingModel is None or pricingModel.calculatePriceUsd is None:
        raise RuntimeError(f"FATAL: Required internal model '{modelDisplayName}' is not available. Check connector registration.")
|
def extractContent(
    self,
    documents: List[ChatDocument],
    options: ExtractionOptions,
    operationId: Optional[str] = None,
    parentOperationId: Optional[str] = None
) -> List[ContentExtracted]:
    """
    Extract content from a list of ChatDocument objects.

    Documents are processed sequentially; a failure on one document is logged
    and skipped so the remaining documents are still processed.

    Args:
        documents: List of ChatDocument objects to extract content from
        options: Extraction options including maxSize, chunkAllowed, mergeStrategy, etc.
        operationId: Optional operation ID for progress logging (parent operation)
        parentOperationId: Optional parent operation ID for hierarchical logging
            (currently unused here; per-document child operations are parented
            to operationId directly)

    Returns:
        List of ContentExtracted objects, one per successfully processed document.
        May be shorter than `documents` when some documents fail.

    Raises:
        RuntimeError: If the internal pricing model is unavailable mid-run.
    """

    results: List[ContentExtracted] = []

    # Lazy import to avoid circular deps and heavy init at module import
    from modules.interfaces.interfaceDbComponentObjects import getInterface
    dbInterface = getInterface()

    totalDocs = len(documents)

    for i, doc in enumerate(documents):
        logger.info(f"=== DOCUMENT {i + 1}/{totalDocs}: {doc.fileName} ===")
        logger.info(f"Initial MIME type: {doc.mimeType}")

        # Create child operation for this document if parent operationId is provided
        docOperationId = None
        if operationId:
            docOperationId = f"{operationId}_doc_{i}"
            self.services.chat.progressLogStart(
                docOperationId,
                "Extracting Document",
                f"Document {i + 1}/{totalDocs}",
                doc.fileName[:50] + "..." if len(doc.fileName) > 50 else doc.fileName,
                parentOperationId=operationId  # Use operationId as parent (not parentOperationId)
            )

        # Start timing for this document
        startTime = time.time()

        try:
            if docOperationId:
                self.services.chat.progressLogUpdate(docOperationId, 0.1, "Loading document data")

            # Resolve raw bytes for this document using interface
            documentBytes = dbInterface.getFileData(doc.fileId)
            if not documentBytes:
                if docOperationId:
                    self.services.chat.progressLogFinish(docOperationId, False)
                raise ValueError(f"No file data found for fileId={doc.fileId}")

            if docOperationId:
                self.services.chat.progressLogUpdate(docOperationId, 0.2, "Running extraction pipeline")

            # Convert ChatDocument to the format expected by runExtraction
            documentData = {
                "id": doc.id,
                "bytes": documentBytes,
                "fileName": doc.fileName,
                "mimeType": doc.mimeType
            }

            ec = runExtraction(
                extractorRegistry=self._extractorRegistry,
                chunkerRegistry=self._chunkerRegistry,
                documentBytes=documentData["bytes"],
                fileName=documentData["fileName"],
                mimeType=documentData["mimeType"],
                options=options
            )

            if docOperationId:
                self.services.chat.progressLogUpdate(docOperationId, 0.7, f"Extracted {len(ec.parts)} parts")

            # Log content parts metadata
            logger.debug(f"Content parts: {len(ec.parts)}")
            for j, part in enumerate(ec.parts):
                logger.debug(f" Part {j + 1}/{len(ec.parts)}: {part.typeGroup} ({part.mimeType}) - {len(part.data) if part.data else 0} chars")
                if part.metadata:
                    logger.debug(f" Metadata: {part.metadata}")

            # Attach document id and MIME type to parts if missing
            for p in ec.parts:
                if "documentId" not in p.metadata:
                    p.metadata["documentId"] = documentData["id"] or str(uuid.uuid4())
                if "documentMimeType" not in p.metadata:
                    p.metadata["documentMimeType"] = documentData["mimeType"]

            # Log chunking information
            chunkedParts = [p for p in ec.parts if p.metadata.get("chunk", False)]
            if chunkedParts:
                logger.debug(f"=== CHUNKING RESULTS ===")
                logger.debug(f"Total parts: {len(ec.parts)}")
                logger.debug(f"Chunked parts: {len(chunkedParts)}")
                for chunk in chunkedParts:
                    logger.debug(f" Chunk: {chunk.label} - {len(chunk.data)} chars (parent: {chunk.parentId})")
            else:
                logger.debug(f"No chunking needed - {len(ec.parts)} parts fit within size limits")

            if docOperationId:
                self.services.chat.progressLogUpdate(docOperationId, 0.9, f"Processing complete: {len(ec.parts)} parts extracted")

            # Calculate timing and emit stats
            endTime = time.time()
            processingTime = endTime - startTime
            bytesSent = len(documentBytes)
            bytesReceived = sum(len(part.data) if part.data else 0 for part in ec.parts)

            # Use internal extraction model for pricing
            modelDisplayName = "Internal Document Extractor"
            model = modelRegistry.getModel(modelDisplayName)
            # Hard fail if model is missing; caller must ensure connectors are registered
            if model is None or model.calculatePriceUsd is None:
                if docOperationId:
                    self.services.chat.progressLogFinish(docOperationId, False)
                raise RuntimeError(f"Pricing model not available: {modelDisplayName}")
            priceUsd = model.calculatePriceUsd(processingTime, bytesSent, bytesReceived)

            # Create AiCallResponse with real calculation
            # Use model.name for the response (API identifier), not displayName
            aiResponse = AiCallResponse(
                content="",  # No content for extraction stats needed
                modelName=model.name,
                priceUsd=priceUsd,
                processingTime=processingTime,
                bytesSent=bytesSent,
                bytesReceived=bytesReceived,
                errorCount=0
            )

            self.services.chat.storeWorkflowStat(
                self.services.workflow,
                aiResponse,
                f"extraction.process.{doc.mimeType}"
            )

            # Write extraction results to debug file (best-effort; never fails extraction)
            try:
                from modules.shared.debugLogger import writeDebugFile
                import json
                # Create summary of extraction results for debug
                extractionSummary = {
                    "documentName": doc.fileName,
                    "documentMimeType": doc.mimeType,
                    "partsCount": len(ec.parts),
                    "parts": []
                }
                for part in ec.parts:
                    partSummary = {
                        "typeGroup": part.typeGroup,
                        "mimeType": part.mimeType,
                        "label": part.label,
                        "dataLength": len(part.data) if part.data else 0,
                        "metadata": part.metadata
                    }
                    # Include data preview for small parts (first 500 chars)
                    if part.data and len(part.data) <= 500:
                        partSummary["dataPreview"] = part.data[:500]
                    elif part.data:
                        partSummary["dataPreview"] = f"[Large data: {len(part.data)} chars - truncated]"
                    extractionSummary["parts"].append(partSummary)

                writeDebugFile(json.dumps(extractionSummary, indent=2, ensure_ascii=False), f"extraction_result_{doc.fileName}")
            except Exception as e:
                logger.debug(f"Failed to write extraction debug file: {str(e)}")

            results.append(ec)

            # Finish document operation successfully
            if docOperationId:
                self.services.chat.progressLogFinish(docOperationId, True)

        except Exception as e:
            logger.error(f"Error extracting content from document {i + 1}/{totalDocs} ({doc.fileName}): {str(e)}")
            if docOperationId:
                try:
                    self.services.chat.progressLogFinish(docOperationId, False)
                # BUG FIX: was a bare `except:` which also swallowed
                # SystemExit/KeyboardInterrupt; narrow to Exception.
                except Exception:
                    pass  # Don't fail on progress logging errors
            # Continue with next document instead of failing completely
            # This allows parallel processing to continue even if one document fails
            continue

    return results
|
def mergeAiResults(
    self,
    extractedContent: List[ContentExtracted],
    aiResults: List[str],
    strategy: MergeStrategy
) -> ContentExtracted:
    """
    Merge AI results from chunked content back into a single ContentExtracted.

    Args:
        extractedContent: List of ContentExtracted objects that were processed
        aiResults: List of AI response strings, one per chunk
        strategy: Merge strategy configuration

    Returns:
        Single ContentExtracted with merged AI results
    """
    logger.debug(f"=== MERGING AI RESULTS ===")
    logger.debug(f"Extracted content: {len(extractedContent)} documents")
    logger.debug(f"AI results: {len(aiResults)} responses")
    logger.debug(f"Merge strategy: {strategy.mergeType}")

    # Flatten every part from every extracted document into one working list
    allParts: List[ContentPart] = [part for ec in extractedContent for part in ec.parts]
    logger.debug(f"Total original parts: {len(allParts)}")

    # Wrap each raw AI response string in a ContentPart
    aiResultParts: List[ContentPart] = []
    for index, responseText in enumerate(aiResults):
        aiResultParts.append(ContentPart(
            id=f"ai_result_{index}",
            parentId=None,  # Will be set based on strategy
            label="ai_result",
            typeGroup="text",
            mimeType="text/plain",
            data=responseText,
            metadata={
                "aiResult": True,
                "order": index,
                "size": len(responseText.encode('utf-8'))
            }
        ))

    logger.debug(f"Created {len(aiResultParts)} AI result parts")

    # Dispatch to the configured merge implementation; concatenate is the fallback
    mergeHandlers = {
        "concatenate": self._mergeConcatenate,
        "hierarchical": self._mergeHierarchical,
        "intelligent": self._mergeIntelligent,
    }
    handler = mergeHandlers.get(strategy.mergeType, self._mergeConcatenate)
    mergedParts = handler(allParts, aiResultParts, strategy)

    # Create final ContentExtracted
    mergedContent = ContentExtracted(
        id=f"merged_{uuid.uuid4()}",
        parts=mergedParts
    )

    logger.debug(f"=== MERGE COMPLETED ===")
    logger.debug(f"Final merged parts: {len(mergedParts)}")
    logger.debug(f"Merged content ID: {mergedContent.id}")

    return mergedContent
|
def _mergeConcatenate(
    self,
    originalParts: List[ContentPart],
    aiResultParts: List[ContentPart],
    strategy: MergeStrategy
) -> List[ContentPart]:
    """Concatenate-style merge.

    Keeps eligible original parts (chunks are dropped unless
    strategy.preserveChunks is set), then appends AI results — single results
    as-is, multiple results for the same parent joined into one combined part.
    """
    # Original parts, filtered by the chunk-preservation flag
    merged: List[ContentPart] = [
        part for part in originalParts
        if strategy.preserveChunks or not part.metadata.get("chunk", False)
    ]

    if not aiResultParts:
        return merged

    # Group AI results by parentId ("root" stands in for no parent)
    resultsByParent: Dict[str, List[ContentPart]] = {}
    for aiPart in aiResultParts:
        resultsByParent.setdefault(aiPart.parentId or "root", []).append(aiPart)

    # Emit one part per parent: pass-through for singletons, joined otherwise
    for parentId, group in resultsByParent.items():
        if len(group) == 1:
            merged.append(group[0])
            continue
        combinedData = strategy.chunkSeparator.join(p.data for p in group)
        merged.append(ContentPart(
            id=f"merged_ai_{parentId}",
            parentId=None if parentId == "root" else parentId,
            label="merged_ai_result",
            typeGroup="text",
            mimeType="text/plain",
            data=combinedData,
            metadata={
                "aiResult": True,
                "merged": True,
                "sourceCount": len(group),
                "size": len(combinedData.encode('utf-8'))
            }
        ))

    return merged
|
def _mergeHierarchical(
    self,
    originalParts: List[ContentPart],
    aiResultParts: List[ContentPart],
    strategy: MergeStrategy
) -> List[ContentPart]:
    """Merge parts hierarchically based on parentId relationships.

    For each parent (including the synthetic "root" for parentless parts) the
    original parts are emitted first, then the AI results for that parent —
    joined into a single combined part when there is more than one.
    """
    # Bucket originals and AI results by parent id ("root" = no parent)
    partsByParent: Dict[str, List[ContentPart]] = {}
    for part in originalParts:
        partsByParent.setdefault(part.parentId or "root", []).append(part)

    aiByParent: Dict[str, List[ContentPart]] = {}
    for aiPart in aiResultParts:
        aiByParent.setdefault(aiPart.parentId or "root", []).append(aiPart)

    mergedParts: List[ContentPart] = []

    # Walk the union of all parent ids seen on either side
    for parentId in set(partsByParent) | set(aiByParent):
        mergedParts.extend(partsByParent.get(parentId, []))

        aiGroup = aiByParent.get(parentId, [])
        if not aiGroup:
            continue
        if len(aiGroup) == 1:
            mergedParts.append(aiGroup[0])
            continue

        # Multiple AI results for one parent: join into a single part
        combinedData = strategy.chunkSeparator.join(p.data for p in aiGroup)
        mergedParts.append(ContentPart(
            id=f"hierarchical_ai_{parentId}",
            parentId=None if parentId == "root" else parentId,
            label="hierarchical_ai_result",
            typeGroup="text",
            mimeType="text/plain",
            data=combinedData,
            metadata={
                "aiResult": True,
                "hierarchical": True,
                "sourceCount": len(aiGroup),
                "size": len(combinedData.encode('utf-8'))
            }
        ))

    return mergedParts
|
def _mergeIntelligent(
    self,
    originalParts: List[ContentPart],
    aiResultParts: List[ContentPart],
    strategy: MergeStrategy
) -> List[ContentPart]:
    """Merge parts using intelligent strategies based on content type.

    Original parts are grouped by typeGroup and merged per-type; AI result
    parts not consumed by a per-type merge are appended at the end.

    BUG FIX: every per-type merge receives the full aiResultParts list, so with
    more than one type group the same AI parts were appended once per group
    (the old `any(p.id == ...)` guard only prevented one additional copy).
    The output is now de-duplicated by part id, first occurrence wins.
    """
    mergedParts: List[ContentPart] = []

    # Group by typeGroup for intelligent merging
    partsByType: Dict[str, List[ContentPart]] = {}
    for part in originalParts:
        partsByType.setdefault(part.typeGroup, []).append(part)

    # Process each type group with its specialized merge
    for typeGroup, parts in partsByType.items():
        if typeGroup == "text":
            mergedParts.extend(self._mergeTextIntelligent(parts, aiResultParts, strategy))
        elif typeGroup == "table":
            mergedParts.extend(self._mergeTableIntelligent(parts, aiResultParts, strategy))
        elif typeGroup == "structure":
            mergedParts.extend(self._mergeStructureIntelligent(parts, aiResultParts, strategy))
        else:
            # Default handling for other types: pass originals through unchanged
            mergedParts.extend(parts)

    # De-duplicate by id: per-type merges may have added the same AI part repeatedly
    seenIds = set()
    dedupedParts: List[ContentPart] = []
    for part in mergedParts:
        if part.id not in seenIds:
            seenIds.add(part.id)
            dedupedParts.append(part)

    # Add any remaining AI results that weren't merged by a type group
    for aiPart in aiResultParts:
        if aiPart.id not in seenIds:
            seenIds.add(aiPart.id)
            dedupedParts.append(aiPart)

    return dedupedParts
|
def _mergeTextIntelligent(
    self,
    textParts: List[ContentPart],
    aiResultParts: List[ContentPart],
    strategy: MergeStrategy
) -> List[ContentPart]:
    """Intelligent merging for text content.

    Currently delegates to the concatenate strategy; a future version could
    add semantic analysis or summarization.
    """
    merged = self._mergeConcatenate(textParts, aiResultParts, strategy)
    return merged
|
def _mergeTableIntelligent(
    self,
    tableParts: List[ContentPart],
    aiResultParts: List[ContentPart],
    strategy: MergeStrategy
) -> List[ContentPart]:
    """Intelligent merging for table content.

    Currently delegates to the concatenate strategy; a future version could
    add table-aware merging logic.
    """
    merged = self._mergeConcatenate(tableParts, aiResultParts, strategy)
    return merged
|
def _mergeStructureIntelligent(
    self,
    structureParts: List[ContentPart],
    aiResultParts: List[ContentPart],
    strategy: MergeStrategy
) -> List[ContentPart]:
    """Intelligent merging for structured content.

    Currently delegates to the concatenate strategy; a future version could
    add structure-aware merging.
    """
    merged = self._mergeConcatenate(structureParts, aiResultParts, strategy)
    return merged
|
async def processDocumentsPerChunk(
    self,
    documents: List[ChatDocument],
    prompt: str,
    aiObjects: Any,
    options: Optional[AiCallOptions] = None,
    operationId: Optional[str] = None
) -> str:
    """
    Process documents with model-aware chunking and merge results.
    NEW: Uses model-aware chunking in AI call phase instead of extraction phase.

    Args:
        documents: List of ChatDocument objects to process
        prompt: AI prompt for processing
        aiObjects: AiObjects instance for making AI calls
        options: AI call options
        operationId: Optional operation ID for progress tracking

    Returns:
        Merged AI results as string with preserved document structure

    Raises:
        Exception: Re-raises any processing error after finishing the progress log.
    """
    if not documents:
        return ""

    # Create operationId if not provided
    if not operationId:
        workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
        operationId = f"ai_text_extract_{workflowId}_{int(time.time())}"
        self.services.chat.progressLogStart(
            operationId,
            "AI Text Extract",
            "Document Processing",
            f"Processing {len(documents)} documents"
        )

    try:
        # Build extraction options using Pydantic model
        mergeStrategy = MergeStrategy(
            useIntelligentMerging=True,
            prompt=prompt,
            groupBy="typeGroup",
            orderBy="id",
            mergeType="concatenate"
        )

        extractionOptions = ExtractionOptions(
            prompt=prompt,
            processDocumentsIndividually=True,
            mergeStrategy=mergeStrategy
        )

        logger.debug(f"Per-chunk extraction options: prompt length={len(extractionOptions.prompt)} chars")

        # Extract content WITHOUT chunking
        if operationId:
            self.services.chat.progressLogUpdate(operationId, 0.1, f"Extracting content from {len(documents)} documents")
        # BUG FIX: the original passed the not-yet-defined name `parentOperationId`
        # here (NameError on every call); this operation is itself the parent,
        # so pass operationId for hierarchical logging.
        extractionResult = self.extractContent(documents, extractionOptions, operationId=operationId, parentOperationId=operationId)

        if not isinstance(extractionResult, list):
            if operationId:
                self.services.chat.progressLogFinish(operationId, False)
            return "[Error: No extraction results]"

        # Process parts (not chunks) with model-aware AI calls
        if operationId:
            self.services.chat.progressLogUpdate(operationId, 0.3, f"Processing {len(extractionResult)} extracted content parts")
        # Use parent operation ID directly (parentId should be operationId, not log entry ID)
        parentOperationId = operationId
        partResults = await self._processPartsWithMapping(extractionResult, prompt, aiObjects, options, operationId, parentOperationId)

        # Merge results using existing merging system
        if operationId:
            self.services.chat.progressLogUpdate(operationId, 0.9, f"Merging {len(partResults)} part results")
        mergedContent = self.mergePartResults(partResults, options)

        # Save merged extraction content to debug
        self.services.utils.writeDebugFile(mergedContent or '', "extraction_merged_text")

        if operationId:
            self.services.chat.progressLogFinish(operationId, True)

        return mergedContent
    except Exception as e:
        logger.error(f"Error in processDocumentsPerChunk: {str(e)}")
        if operationId:
            self.services.chat.progressLogFinish(operationId, False)
        raise
|
async def _processPartsWithMapping(
    self,
    extractionResult: List[ContentExtracted],
    prompt: str,
    aiObjects: Any,
    options: Optional[AiCallOptions] = None,
    operationId: Optional[str] = None,
    parentOperationId: Optional[str] = None
) -> List[PartResult]:
    """Process content parts with model-aware chunking and proper mapping.

    Eligible parts are processed in parallel (bounded by a semaphore); a
    failure on one part becomes an error PartResult instead of aborting the
    batch. Dead locals from the original (`totalParts`, `processedCount`,
    unused `workflowId`) have been removed — no behavior change.

    Args:
        extractionResult: Extracted documents whose parts should be processed
        prompt: AI prompt applied to every part
        aiObjects: AiObjects instance used to make the AI calls
        options: Optional AI call options (maxConcurrentParts is honored)
        operationId: Parent operation ID used to derive per-part operation IDs
        parentOperationId: Parent for the per-part progress-log entries

    Returns:
        One PartResult per processed part (successful or failed).
    """

    # Collect all parts that need processing
    partsToProcess = []
    partIndex = 0

    for ec in extractionResult:
        for part in ec.parts:
            if part.typeGroup in ("text", "table", "structure", "image", "container", "binary"):
                # Skip empty container parts
                if part.typeGroup == "container" and (not part.data or len(part.data.strip()) == 0):
                    logger.debug(f"Skipping empty container part: mimeType={part.mimeType}")
                    continue

                partsToProcess.append({
                    'part': part,
                    'part_index': partIndex,
                    'document_id': ec.id
                })
                partIndex += 1

    logger.info(f"Processing {len(partsToProcess)} parts with model-aware chunking")

    async def processSinglePart(partInfo: Dict) -> PartResult:
        # Worker: run the AI call for one part and wrap the outcome in a PartResult.
        part = partInfo['part']
        part_index = partInfo['part_index']
        documentId = partInfo['document_id']

        start_time = time.time()

        # Create separate operation for each part with parent reference
        partOperationId = None
        if operationId:
            partOperationId = f"{operationId}_part_{part_index}"
            self.services.chat.progressLogStart(
                partOperationId,
                "Content Processing",
                f"Part {part_index + 1}",
                f"Type: {part.typeGroup}",
                parentOperationId=parentOperationId
            )

        try:
            # Create AI call request with content part
            request = AiCallRequest(
                prompt=prompt,
                context="",  # Context is in the content part
                options=options,
                contentParts=[part]  # Pass as list for unified processing
            )

            # Update progress - initiating
            if partOperationId:
                self.services.chat.progressLogUpdate(partOperationId, 0.3, "Initiating")

            # Call AI with model-aware chunking (no progress callback - handled by parent operation)
            response = await aiObjects.call(request)

            # Update progress - completed
            if partOperationId:
                self.services.chat.progressLogUpdate(partOperationId, 0.9, "Completed")
                self.services.chat.progressLogFinish(partOperationId, True)

            processing_time = time.time() - start_time

            return PartResult(
                originalPart=part,
                aiResult=response.content,
                partIndex=part_index,
                documentId=documentId,
                processingTime=processing_time,
                metadata={
                    "success": True,
                    "partSize": len(part.data) if part.data else 0,
                    "resultSize": len(response.content),
                    "typeGroup": part.typeGroup,
                    "modelName": response.modelName,
                    "priceUsd": response.priceUsd
                }
            )

        except Exception as e:
            processing_time = time.time() - start_time
            logger.warning(f"Error processing part {part_index}: {str(e)}")

            return PartResult(
                originalPart=part,
                aiResult=f"[Error processing part: {str(e)}]",
                partIndex=part_index,
                documentId=documentId,
                processingTime=processing_time,
                metadata={
                    "success": False,
                    "error": str(e),
                    "partSize": len(part.data) if part.data else 0,
                    "typeGroup": part.typeGroup
                }
            )

    # Process parts with concurrency control
    maxConcurrent = 5
    if options and hasattr(options, 'maxConcurrentParts'):
        maxConcurrent = options.maxConcurrentParts

    semaphore = asyncio.Semaphore(maxConcurrent)

    async def processWithSemaphore(partInfo):
        async with semaphore:
            return await processSinglePart(partInfo)

    tasks = [processWithSemaphore(part_info) for part_info in partsToProcess]
    partResults = await asyncio.gather(*tasks, return_exceptions=True)

    # Handle exceptions surfaced by gather (rare: workers catch their own errors)
    processedResults = []
    for i, result in enumerate(partResults):
        if isinstance(result, Exception):
            part_info = partsToProcess[i]
            processedResults.append(PartResult(
                originalPart=part_info['part'],
                aiResult=f"[Error in parallel processing: {str(result)}]",
                partIndex=part_info['part_index'],
                documentId=part_info['document_id'],
                processingTime=0.0,
                metadata={"success": False, "error": str(result)}
            ))
        elif result is not None:
            processedResults.append(result)

    logger.info(f"Completed processing {len(processedResults)} parts")
    return processedResults
|
def _convertToContentParts(
    self, partResults: Union[List[PartResult], List[AiCallResponse]]
) -> List[ContentPart]:
    """Convert part results to ContentParts (internal helper for consolidation).

    Handles both PartResult (from extraction workflow) and AiCallResponse
    (from content parts processing); the type of the first element decides
    which conversion applies.
    """
    if not partResults:
        return []

    converted: List[ContentPart] = []
    first = partResults[0]

    if isinstance(first, PartResult):
        # PartResult path (processDocumentsPerChunk): keep the original part's
        # identity and typing, substitute the AI result as its data.
        for pr in partResults:
            source = pr.originalPart
            converted.append(ContentPart(
                id=source.id,
                parentId=source.parentId,
                label=source.label,
                typeGroup=source.typeGroup,  # Use original typeGroup
                mimeType=source.mimeType,
                data=pr.aiResult,  # Use AI result as data
                metadata={
                    **source.metadata,
                    "aiResult": True,
                    "partIndex": pr.partIndex,
                    "documentId": pr.documentId,
                    "processingTime": pr.processingTime,
                    "success": pr.metadata.get("success", False)
                }
            ))
    elif isinstance(first, AiCallResponse):
        # AiCallResponse path (content parts processing): synthesize fresh
        # text parts, skipping empty responses.
        for position, response in enumerate(partResults):
            if not response.content:
                continue
            converted.append(ContentPart(
                id=str(uuid.uuid4()),
                parentId=None,
                label=f"ai_result_{position}",
                typeGroup="text",  # Default to text for AI results
                mimeType="text/plain",
                data=response.content,
                metadata={
                    "aiResult": True,
                    "modelName": response.modelName,
                    "priceUsd": response.priceUsd,
                    "processingTime": response.processingTime,
                    "bytesSent": response.bytesSent,
                    "bytesReceived": response.bytesReceived
                }
            ))

    return converted
|
def mergePartResults(
    self,
    partResults: Union[List[PartResult], List[AiCallResponse]],
    options: Optional[AiCallOptions] = None
) -> str:
    """Unified merge for both PartResult and AiCallResponse.

    Consolidated from both interfaceAiObjects.py and existing serviceExtraction method.

    Converts the results to ContentParts, applies a merge strategy chosen by
    the input element type, and joins the merged parts' data with blank lines.

    Args:
        partResults: Homogeneous list of PartResult or AiCallResponse objects.
        options: Accepted for interface compatibility; not read in this method.

    Returns:
        The merged, stripped text ("" when partResults is empty).
    """
    if not partResults:
        return ""

    # Convert to ContentParts using unified helper
    content_parts = self._convertToContentParts(partResults)

    # Determine merge strategy based on input type
    if isinstance(partResults[0], PartResult):
        # Use strategy for extraction workflow (group by document, order by part index)
        merge_strategy = MergeStrategy(
            useIntelligentMerging=True,
            groupBy="documentId",  # Group by document
            orderBy="partIndex",  # Order by part index
            mergeType="concatenate"
        )
    else:
        # Default strategy for content parts workflow
        merge_strategy = MergeStrategy(
            useIntelligentMerging=True,
            groupBy="typeGroup",
            orderBy="id",
            mergeType="concatenate"
        )

    # NOTE(review): `applyMerging` is not imported in this file's visible
    # header — confirm it is defined or imported elsewhere in this module,
    # otherwise this line raises NameError at runtime.
    # Apply merging
    merged_parts = applyMerging(content_parts, merge_strategy)

    # Convert back to string
    final_content = "\n\n".join([part.data for part in merged_parts])

    logger.info(f"Merged {len(partResults)} parts using unified merging system")
    return final_content.strip()
|
async def chunkContentPartForAi(self, contentPart, model, options, prompt: str = "") -> List[Dict[str, Any]]:
    """Chunk a content part based on model capabilities, accounting for prompt, system message overhead, and maxTokens output.

    Moved from interfaceAiObjects.py - model-aware chunking for AI processing.
    Complementary to existing size-based chunking in extraction pipeline.
    """
    contextTokens = model.contextLength     # total context window, in tokens
    maxOutputTokens = model.maxTokens       # tokens the model reserves for its reply

    # Token budget consumed before any document content fits:
    # user prompt (~4 bytes per token), system wrapper, JSON/message
    # overhead, and the full output reservation.
    promptTokens = len(prompt.encode('utf-8')) / 4 if prompt else 0
    systemMessageTokens = 10   # ~40 bytes = 10 tokens
    messageOverheadTokens = 100
    reservedTokens = promptTokens + systemMessageTokens + messageOverheadTokens + maxOutputTokens

    # 80% of what remains is usable content space (safety margin)
    availableContentTokens = int((contextTokens - reservedTokens) * 0.8)

    # Ensure we have at least some space
    if availableContentTokens < 100:
        logger.warning(f"Very limited space for content: {availableContentTokens} tokens available. Model: {model.name}, contextLength: {contextTokens}, maxTokens: {maxOutputTokens}, prompt: {promptTokens:.0f} tokens")
        availableContentTokens = max(100, int(contextTokens * 0.1))  # Fallback to 10% of context

    # Convert tokens to bytes (1 token ≈ 4 bytes)
    availableContentBytes = availableContentTokens * 4

    logger.debug(f"Chunking calculation for {model.name}: contextLength={contextTokens} tokens, maxTokens={maxOutputTokens} tokens, prompt={promptTokens:.0f} tokens, reserved={reservedTokens:.0f} tokens, available={availableContentTokens} tokens ({availableContentBytes} bytes)")

    # Build chunking options (70% of available bytes for text, 80% for images)
    chunkingOptions = {
        "textChunkSize": int(availableContentBytes * 0.7),
        "imageChunkSize": int(availableContentBytes * 0.8),
        "maxSize": availableContentBytes,
        "chunkAllowed": True
    }

    # Get appropriate chunker from the existing registry
    chunker = self._chunkerRegistry.resolve(contentPart.typeGroup)
    if not chunker:
        logger.warning(f"No chunker found for typeGroup: {contentPart.typeGroup}")
        return []

    # Chunk the content part; a chunker failure yields an empty list
    try:
        chunks = chunker.chunk(contentPart, chunkingOptions)
        logger.debug(f"Created {len(chunks)} chunks for {contentPart.typeGroup} part")
        return chunks
    except Exception as e:
        logger.error(f"Chunking failed for {contentPart.typeGroup}: {str(e)}")
        return []
|
async def processContentPartWithFallback(self, contentPart, prompt: str, options, failoverModelList, aiObjects, progressCallback=None) -> AiCallResponse:
    """Process a single content part with model-aware chunking and fallback.

    Moved from interfaceAiObjects.py - orchestrates chunking and merging.
    Calls aiObjects._callWithModel() for actual AI calls.

    Args:
        contentPart: The extracted content part to process (has typeGroup,
            mimeType and data attributes; data may be str or bytes).
        prompt: User prompt sent alongside the content.
        options: Call options; options.operationType is read to pick the
            operation type (presumably AiCallOptions - confirm at call site).
        failoverModelList: Ordered list of candidate models; each is tried
            in turn until one succeeds. Replaced by a vision-capable list
            for image parts.
        aiObjects: Interface object providing _callWithModel() for the
            actual AI invocation.
        progressCallback: Optional callable(fraction, message) invoked
            before/after each chunk when chunking is needed.

    Returns:
        AiCallResponse from the first model that succeeds, or an error
        response (errorCount=1) if every model in the list fails.
    """
    lastError = None

    # Images need vision-capable models; detect via typeGroup or MIME prefix.
    isImage = (contentPart.typeGroup == "image") or (contentPart.mimeType and contentPart.mimeType.startswith("image/"))

    # Determine the correct operation type based on content type.
    actualOperationType = options.operationType
    if isImage:
        actualOperationType = OperationTypeEnum.IMAGE_ANALYSE
        # Re-select the failover chain restricted to vision-capable models;
        # fall back to the caller-supplied list if none are found.
        availableModels = modelRegistry.getAvailableModels()
        visionFailoverList = modelSelector.getFailoverModelList(prompt, "", AiCallOptions(operationType=actualOperationType), availableModels)
        if visionFailoverList:
            logger.debug(f"Using {len(visionFailoverList)} vision-capable models for image processing")
            failoverModelList = visionFailoverList

    for attempt, model in enumerate(failoverModelList):
        try:
            logger.info(f"Processing content part with model: {model.name} (attempt {attempt + 1}/{len(failoverModelList)})")

            # --- Image path: build a multimodal message and call the model directly. ---
            # Only taken when the model exposes functionCall; otherwise the image
            # falls through to the text path below (may not be intended for
            # non-functionCall models - TODO confirm).
            if isImage and hasattr(model, 'functionCall'):
                try:
                    if not contentPart.data:
                        raise ValueError("Image content part has no data")

                    mimeType = contentPart.mimeType or "image/jpeg"
                    if not mimeType.startswith("image/"):
                        raise ValueError(f"Invalid mimeType for image: {mimeType}")

                    # Prepare base64 data: str payloads are validated as base64,
                    # bytes payloads are encoded here.
                    if isinstance(contentPart.data, str):
                        try:
                            # validate=True rejects non-base64 characters early.
                            base64.b64decode(contentPart.data, validate=True)
                            base64Data = contentPart.data
                        except Exception as e:
                            raise ValueError(f"Invalid base64 data in contentPart: {str(e)}")
                    elif isinstance(contentPart.data, bytes):
                        base64Data = base64.b64encode(contentPart.data).decode('utf-8')
                    else:
                        raise ValueError(f"Unsupported data type for image: {type(contentPart.data)}")

                    # data: URL form expected by the image_url message content type.
                    imageDataUrl = f"data:{mimeType};base64,{base64Data}"

                    modelCall = AiModelCall(
                        messages=[
                            {
                                "role": "user",
                                "content": [
                                    {"type": "text", "text": prompt or ""},
                                    {
                                        "type": "image_url",
                                        "image_url": {"url": imageDataUrl}
                                    }
                                ]
                            }
                        ],
                        model=model,
                        options=AiCallOptions(operationType=actualOperationType)
                    )

                    modelResponse = await model.functionCall(modelCall)

                    if not modelResponse.success:
                        raise ValueError(f"Model call failed: {modelResponse.error}")

                    logger.info(f"✅ Image content part processed successfully with model: {model.name}")

                    # processingTime may be absent or falsy on the raw model response.
                    processingTime = getattr(modelResponse, 'processingTime', None) or 0.0

                    # NOTE(review): priceUsd/bytesSent/bytesReceived are hardcoded
                    # to 0 on the image path, so image calls are not accounted -
                    # confirm whether billing/metrics expect real values here.
                    return AiCallResponse(
                        content=modelResponse.content,
                        modelName=model.name,
                        priceUsd=0.0,
                        processingTime=processingTime,
                        bytesSent=0,
                        bytesReceived=0,
                        errorCount=0
                    )
                except Exception as e:
                    lastError = e
                    logger.warning(f"❌ Image processing failed with model {model.name}: {str(e)}")

                    # continue advances the model loop; on the last attempt we
                    # re-raise so the outer except logs and breaks the loop.
                    if attempt < len(failoverModelList) - 1:
                        logger.info(f"🔄 Trying next fallback model for image processing...")
                        continue
                    else:
                        logger.error(f"💥 All {len(failoverModelList)} models failed for image processing")
                        raise

            # --- Text path: check whether the part fits the model context. ---
            # assumes contentPart.data is a str on this path (encode would fail
            # on bytes) - TODO confirm upstream guarantees.
            partSize = len(contentPart.data.encode('utf-8')) if contentPart.data else 0

            modelContextTokens = model.contextLength
            modelMaxOutputTokens = model.maxTokens

            # Rough token budget: ~4 bytes per token for the prompt, fixed
            # overheads for system message and chat framing, plus the full
            # output allowance reserved up front.
            promptTokens = len(prompt.encode('utf-8')) / 4 if prompt else 0
            systemMessageTokens = 10
            outputTokens = modelMaxOutputTokens
            messageOverheadTokens = 100
            totalReservedTokens = promptTokens + systemMessageTokens + messageOverheadTokens + outputTokens

            # Keep a 20% safety margin; if almost nothing is left, fall back
            # to 10% of the raw context (floor of 100 tokens).
            availableContentTokens = int((modelContextTokens - totalReservedTokens) * 0.8)
            if availableContentTokens < 100:
                availableContentTokens = max(100, int(modelContextTokens * 0.1))

            # Convert tokens back to a byte budget (1 token ≈ 4 bytes).
            availableContentBytes = availableContentTokens * 4

            logger.debug(f"Size check for {model.name}: partSize={partSize} bytes, availableContentBytes={availableContentBytes} bytes")

            if partSize <= availableContentBytes:
                # Part fits - call AI directly via aiObjects interface.
                response = await aiObjects._callWithModel(model, prompt, contentPart.data, options)
                logger.info(f"✅ Content part processed successfully with model: {model.name}")
                return response
            else:
                # Part too large - chunk it for this specific model's budget.
                chunks = await self.chunkContentPartForAi(contentPart, model, options, prompt)
                if not chunks:
                    raise ValueError(f"Failed to chunk content part for model {model.name}")

                logger.info(f"Starting to process {len(chunks)} chunks with model {model.name}")

                if progressCallback:
                    progressCallback(0.0, f"Starting to process {len(chunks)} chunks")

                # Process chunks sequentially; any chunk failure re-raises so
                # the outer except can fail over to the next model.
                chunkResults = []
                for idx, chunk in enumerate(chunks):
                    chunkNum = idx + 1
                    chunkData = chunk.get('data', '')
                    logger.info(f"Processing chunk {chunkNum}/{len(chunks)} with model {model.name}")

                    if progressCallback:
                        progressCallback(chunkNum / len(chunks), f"Processing chunk {chunkNum}/{len(chunks)}")

                    try:
                        chunkResponse = await aiObjects._callWithModel(model, prompt, chunkData, options)
                        chunkResults.append(chunkResponse)
                        logger.info(f"✅ Chunk {chunkNum}/{len(chunks)} processed successfully")

                        if progressCallback:
                            progressCallback(chunkNum / len(chunks), f"Chunk {chunkNum}/{len(chunks)} processed")
                    except Exception as e:
                        logger.error(f"❌ Error processing chunk {chunkNum}/{len(chunks)}: {str(e)}")
                        raise

                # Merge chunk results into one content payload.
                mergedContent = self.mergeChunkResults(chunkResults)

                logger.info(f"✅ Content part chunked and processed with model: {model.name} ({len(chunks)} chunks)")
                # Aggregate accounting across all chunk calls.
                return AiCallResponse(
                    content=mergedContent,
                    modelName=model.name,
                    priceUsd=sum(r.priceUsd for r in chunkResults),
                    processingTime=sum(r.processingTime for r in chunkResults),
                    bytesSent=sum(r.bytesSent for r in chunkResults),
                    bytesReceived=sum(r.bytesReceived for r in chunkResults),
                    errorCount=sum(r.errorCount for r in chunkResults)
                )

        except Exception as e:
            # Outer failover handler: any failure of the current model (image
            # path re-raise, size path, chunk failure) lands here.
            lastError = e
            error_msg = str(e) if str(e) else f"{type(e).__name__}"
            logger.warning(f"❌ Model {model.name} failed for content part: {error_msg}", exc_info=True)

            if attempt < len(failoverModelList) - 1:
                logger.info(f"🔄 Trying next failover model...")
                continue
            else:
                logger.error(f"💥 All {len(failoverModelList)} models failed for content part")
                break

    # All models failed - return an error response instead of raising.
    return self._createErrorResponse(f"All models failed: {str(lastError)}", 0, 0)
|
def _createErrorResponse(self, errorMsg: str, inputBytes: int, outputBytes: int) -> AiCallResponse:
    """Build a sentinel AiCallResponse describing a failed call.

    Args:
        errorMsg: Human-readable failure description, placed in content.
        inputBytes: Bytes that were sent before the failure.
        outputBytes: Bytes that were received before the failure.

    Returns:
        AiCallResponse with modelName "error", zero price/time and
        errorCount set to 1.
    """
    failurePayload = {
        "content": errorMsg,
        "modelName": "error",
        "priceUsd": 0.0,
        "processingTime": 0.0,
        "bytesSent": inputBytes,
        "bytesReceived": outputBytes,
        "errorCount": 1,
    }
    return AiCallResponse(**failurePayload)
|
async def processContentPartsWithAi(
    self,
    request: AiCallRequest,
    aiObjects,  # Pass interface for AI calls
    progressCallback=None
) -> AiCallResponse:
    """Process content parts with model-aware chunking and AI calls.

    Moved from interfaceAiObjects.callWithContentParts() - entry point for
    content parts processing.

    Args:
        request: Carries prompt, options and the contentParts to process.
        aiObjects: Interface object used for the actual AI calls.
        progressCallback: Optional progress hook forwarded to per-part
            processing.

    Returns:
        AiCallResponse with all part results merged; accounting fields are
        summed across parts and modelName is "multiple". Returns an error
        response if no suitable model is available.
    """
    # Resolve the failover chain once; every part reuses the same chain.
    candidateModels = modelRegistry.getAvailableModels()
    failoverChain = modelSelector.getFailoverModelList(request.prompt, "", request.options, candidateModels)

    if not failoverChain:
        return self._createErrorResponse("No suitable models found", 0, 0)

    # Parts are processed sequentially; each gets its own chunking + fallback.
    partResponses = [
        await self.processContentPartWithFallback(
            part, request.prompt, request.options, failoverChain, aiObjects, progressCallback
        )
        for part in request.contentParts
    ]

    # Merge all results using unified mergePartResults
    combinedContent = self.mergePartResults(partResponses)

    def _total(attr: str):
        # Sum one accounting attribute across every part response.
        return sum(getattr(r, attr) for r in partResponses)

    return AiCallResponse(
        content=combinedContent,
        modelName="multiple",
        priceUsd=_total("priceUsd"),
        processingTime=_total("processingTime"),
        bytesSent=_total("bytesSent"),
        bytesReceived=_total("bytesReceived"),
        errorCount=_total("errorCount"),
    )
|
|
|
|
|
|
# Module-level function for use by subPipeline and ExtractionService
|
|
def applyMerging(parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
    """Apply merging strategy to parts with intelligent token-aware merging.

    Moved from interfaceAiObjects.py to resolve dependency violations.
    Can be used as module-level function or called from ExtractionService methods.

    Args:
        parts: Content parts to merge.
        strategy: Merge strategy; useIntelligentMerging selects the
            token-aware merger, otherwise parts are merged per typeGroup.

    Returns:
        The merged list of content parts.
    """
    logger.debug(f"applyMerging called with {len(parts)} parts")

    # Import merging dependencies (now local imports ✅)
    from .merging.mergerText import TextMerger
    from .merging.mergerTable import TableMerger
    from .merging.mergerDefault import DefaultMerger
    from .subMerger import IntelligentTokenAwareMerger

    # Token-aware path: one merger handles every part regardless of type.
    if strategy.useIntelligentMerging:
        capabilities = strategy.capabilities or {}
        tokenAwareMerger = IntelligentTokenAwareMerger(capabilities)

        optimized = tokenAwareMerger.mergeChunksIntelligently(parts, strategy.prompt or "")

        # Log how many AI calls the optimization saved.
        stats = tokenAwareMerger.calculateOptimizationStats(parts, optimized)
        logger.info(f"🧠 Intelligent merging stats: {stats}")
        logger.debug(f"Intelligent merging: {stats['original_ai_calls']} → {stats['optimized_ai_calls']} calls ({stats['reduction_percent']}% reduction)")

        return optimized

    # Traditional path: bucket parts by typeGroup in a single pass, then
    # delegate each bucket to its dedicated merger.
    textParts: List[ContentPart] = []
    tableParts: List[ContentPart] = []
    structureParts: List[ContentPart] = []
    otherParts: List[ContentPart] = []
    for part in parts:
        if part.typeGroup == "text":
            textParts.append(part)
        elif part.typeGroup == "table":
            tableParts.append(part)
        elif part.typeGroup == "structure":
            structureParts.append(part)
        else:
            otherParts.append(part)

    logger.debug(f"Grouped - text: {len(textParts)}, table: {len(tableParts)}, structure: {len(structureParts)}, other: {len(otherParts)}")

    textMerger = TextMerger()
    tableMerger = TableMerger()
    defaultMerger = DefaultMerger()

    combined: List[ContentPart] = []

    if textParts:
        textMerged = textMerger.merge(textParts, strategy)
        logger.debug(f"TextMerger merged {len(textParts)} parts into {len(textMerged)} parts")
        combined.extend(textMerged)
    if tableParts:
        tableMerged = tableMerger.merge(tableParts, strategy)
        logger.debug(f"TableMerger merged {len(tableParts)} parts into {len(tableMerged)} parts")
        combined.extend(tableMerged)
    if structureParts:
        # For now, treat structure like text
        structureMerged = textMerger.merge(structureParts, strategy)
        logger.debug(f"StructureMerger merged {len(structureParts)} parts into {len(structureMerged)} parts")
        combined.extend(structureMerged)
    if otherParts:
        otherMerged = defaultMerger.merge(otherParts, strategy)
        logger.debug(f"DefaultMerger merged {len(otherParts)} parts into {len(otherMerged)} parts")
        combined.extend(otherMerged)

    logger.debug(f"applyMerging returning {len(combined)} parts")
    return combined
|
|
|
|
|