from typing import Any, Dict, List, Optional

import uuid
import logging
import time
import asyncio

from .subRegistry import ExtractorRegistry, ChunkerRegistry
from .subPipeline import runExtraction, _applyMerging
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, MergeStrategy, ExtractionOptions, PartResult
from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelAi import AiCallResponse, AiCallRequest, AiCallOptions, OperationTypeEnum
from modules.aicore.aicoreModelRegistry import modelRegistry

logger = logging.getLogger(__name__)


class ExtractionService:

    def __init__(self, services: Optional[Any] = None):
        self.services = services
        self._extractorRegistry = ExtractorRegistry()
        self._chunkerRegistry = ChunkerRegistry()
        # Ensure AI connectors are discovered so pricing models are available
        try:
            # If the internal model is missing, trigger discovery and registration
            if modelRegistry.getModel("internal-extractor") is None:
                discovered = modelRegistry.discoverConnectors()
                for connector in discovered:
                    modelRegistry.registerConnector(connector)
        except Exception:
            # Swallow discovery errors here so __init__ stays fast and side-effect
            # free; real failures surface later when the model is actually used.
            pass

    def extractContent(self, documents: List[ChatDocument], options: ExtractionOptions) -> List[ContentExtracted]:
        """
        Extract content from a list of ChatDocument objects.

        Args:
            documents: List of ChatDocument objects to extract content from
            options: Extraction options including maxSize, chunkAllowed, mergeStrategy, etc.

        Returns:
            List of ContentExtracted objects, one per input document
        """
        results: List[ContentExtracted] = []

        # Lazy import to avoid circular deps and heavy init at module import
        from modules.interfaces.interfaceDbComponentObjects import getInterface
        dbInterface = getInterface()

        for i, doc in enumerate(documents):
            logger.info(f"=== DOCUMENT {i}: {doc.fileName} ===")
            logger.info(f"Initial MIME type: {doc.mimeType}")

            # Start timing for this document
            startTime = time.time()

            # Resolve raw bytes for this document via the DB interface
            documentBytes = dbInterface.getFileData(doc.fileId)
            if not documentBytes:
                raise ValueError(f"No file data found for fileId={doc.fileId}")

            # Convert ChatDocument to the format expected by runExtraction
            documentData = {
                "id": doc.id,
                "bytes": documentBytes,
                "fileName": doc.fileName,
                "mimeType": doc.mimeType
            }

            ec = runExtraction(
                extractorRegistry=self._extractorRegistry,
                chunkerRegistry=self._chunkerRegistry,
                documentBytes=documentData["bytes"],
                fileName=documentData["fileName"],
                mimeType=documentData["mimeType"],
                options=options
            )

            # Log content part metadata
            logger.debug(f"Content parts: {len(ec.parts)}")
            for j, part in enumerate(ec.parts):
                logger.debug(f"  Part {j}: {part.typeGroup} ({part.mimeType}) - {len(part.data) if part.data else 0} chars")
                if part.metadata:
                    logger.debug(f"    Metadata: {part.metadata}")

            # Attach document id and MIME type to parts if missing
            for p in ec.parts:
                if "documentId" not in p.metadata:
                    p.metadata["documentId"] = documentData["id"] or str(uuid.uuid4())
                if "documentMimeType" not in p.metadata:
                    p.metadata["documentMimeType"] = documentData["mimeType"]

            # Log chunking information
            chunkedParts = [p for p in ec.parts if p.metadata.get("chunk", False)]
            if chunkedParts:
                logger.debug("=== CHUNKING RESULTS ===")
                logger.debug(f"Total parts: {len(ec.parts)}")
                logger.debug(f"Chunked parts: {len(chunkedParts)}")
                for chunk in chunkedParts:
                    logger.debug(f"  Chunk: {chunk.label} - {len(chunk.data)} chars (parent: {chunk.parentId})")
            else:
                logger.debug(f"No chunking needed - {len(ec.parts)} parts fit within size limits")

            # Calculate timing and traffic for the stats record
            endTime = time.time()
            processingTime = endTime - startTime
            bytesSent = len(documentBytes)
            bytesReceived = sum(len(part.data) if part.data else 0 for part in ec.parts)

            # Emit stats for the extraction operation, priced via the internal
            # extraction model
            modelName = "internal-extractor"
            model = modelRegistry.getModel(modelName)
            # Hard fail if the model is missing; the caller must ensure connectors are registered
            if model is None or model.calculatePriceUsd is None:
                raise RuntimeError(f"Pricing model not available: {modelName}")
            priceUsd = model.calculatePriceUsd(processingTime, bytesSent, bytesReceived)

            # Create AiCallResponse with the real price calculation
            aiResponse = AiCallResponse(
                content="",  # No content needed for extraction stats
                modelName=modelName,
                priceUsd=priceUsd,
                processingTime=processingTime,
                bytesSent=bytesSent,
                bytesReceived=bytesReceived,
                errorCount=0
            )

            self.services.workflow.storeWorkflowStat(
                self.services.currentWorkflow,
                aiResponse,
                f"extraction.process.{doc.mimeType}"
            )

            results.append(ec)

        return results

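    # A minimal usage sketch for extractContent. The wiring below is hypothetical:
    # `services` and `docs` are assumed to exist, and only ExtractionOptions fields
    # actually referenced in this module are shown.
    #
    #     service = ExtractionService(services=services)
    #     opts = ExtractionOptions(
    #         prompt="Summarize each document",
    #         operationType=OperationTypeEnum.DATA_EXTRACT,
    #         processDocumentsIndividually=True,
    #         mergeStrategy=MergeStrategy(mergeType="concatenate"),
    #     )
    #     extracted = service.extractContent(docs, opts)  # one ContentExtracted per doc
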
    def mergeAiResults(
        self,
        extractedContent: List[ContentExtracted],
        aiResults: List[str],
        strategy: MergeStrategy
    ) -> ContentExtracted:
        """
        Merge AI results from chunked content back into a single ContentExtracted.

        Args:
            extractedContent: List of ContentExtracted objects that were processed
            aiResults: List of AI response strings, one per chunk
            strategy: MergeStrategy describing how results should be combined

        Returns:
            Single ContentExtracted with merged AI results
        """
        logger.debug("=== MERGING AI RESULTS ===")
        logger.debug(f"Extracted content: {len(extractedContent)} documents")
        logger.debug(f"AI results: {len(aiResults)} responses")
        logger.debug(f"Merge strategy: {strategy.mergeType}")

        # Collect all parts from all extracted content
        allParts: List[ContentPart] = []
        for ec in extractedContent:
            allParts.extend(ec.parts)

        logger.debug(f"Total original parts: {len(allParts)}")

        # Create AI result parts
        aiResultParts: List[ContentPart] = []
        for i, aiResult in enumerate(aiResults):
            aiPart = ContentPart(
                id=f"ai_result_{i}",
                parentId=None,  # Will be set based on strategy
                label="ai_result",
                typeGroup="text",
                mimeType="text/plain",
                data=aiResult,
                metadata={
                    "aiResult": True,
                    "order": i,
                    "size": len(aiResult.encode('utf-8'))
                }
            )
            aiResultParts.append(aiPart)

        logger.debug(f"Created {len(aiResultParts)} AI result parts")

        # Apply the merging strategy; unknown types fall back to concatenation
        if strategy.mergeType == "concatenate":
            mergedParts = self._mergeConcatenate(allParts, aiResultParts, strategy)
        elif strategy.mergeType == "hierarchical":
            mergedParts = self._mergeHierarchical(allParts, aiResultParts, strategy)
        elif strategy.mergeType == "intelligent":
            mergedParts = self._mergeIntelligent(allParts, aiResultParts, strategy)
        else:
            mergedParts = self._mergeConcatenate(allParts, aiResultParts, strategy)

        # Create the final ContentExtracted
        mergedContent = ContentExtracted(
            id=f"merged_{uuid.uuid4()}",
            parts=mergedParts
        )

        logger.debug("=== MERGE COMPLETED ===")
        logger.debug(f"Final merged parts: {len(mergedParts)}")
        logger.debug(f"Merged content ID: {mergedContent.id}")

        return mergedContent

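    # A minimal sketch of mergeAiResults, continuing the extractContent sketch
    # above and assuming MergeStrategy can be built with just the field shown
    # (an assumption; other fields may be required):
    #
    #     strategy = MergeStrategy(mergeType="hierarchical")
    #     merged = service.mergeAiResults(extracted, ["summary A", "summary B"], strategy)
    #     # merged.parts holds the original parts plus the AI result parts
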
    def _mergeConcatenate(
        self,
        originalParts: List[ContentPart],
        aiResultParts: List[ContentPart],
        strategy: MergeStrategy
    ) -> List[ContentPart]:
        """Merge parts by simple concatenation."""
        mergedParts = []

        # Add original parts (dropping chunk parts unless they should be preserved)
        for part in originalParts:
            if strategy.preserveChunks or not part.metadata.get("chunk", False):
                mergedParts.append(part)

        # Add AI results
        if aiResultParts:
            # Group AI results by parentId if available
            aiResultsByParent = {}
            for aiPart in aiResultParts:
                parentId = aiPart.parentId or "root"
                if parentId not in aiResultsByParent:
                    aiResultsByParent[parentId] = []
                aiResultsByParent[parentId].append(aiPart)

            # Merge AI results for each parent
            for parentId, aiParts in aiResultsByParent.items():
                if len(aiParts) == 1:
                    mergedParts.append(aiParts[0])
                else:
                    # Concatenate multiple AI results for the same parent
                    combinedData = strategy.chunkSeparator.join([p.data for p in aiParts])
                    combinedPart = ContentPart(
                        id=f"merged_ai_{parentId}",
                        parentId=parentId if parentId != "root" else None,
                        label="merged_ai_result",
                        typeGroup="text",
                        mimeType="text/plain",
                        data=combinedData,
                        metadata={
                            "aiResult": True,
                            "merged": True,
                            "sourceCount": len(aiParts),
                            "size": len(combinedData.encode('utf-8'))
                        }
                    )
                    mergedParts.append(combinedPart)

        return mergedParts

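    # Worked sketch of the grouping above (hypothetical ids and data):
    #
    #     a = ContentPart(id="r0", parentId="p1", label="ai_result", typeGroup="text",
    #                     mimeType="text/plain", data="first", metadata={})
    #     b = ContentPart(id="r1", parentId="p1", label="ai_result", typeGroup="text",
    #                     mimeType="text/plain", data="second", metadata={})
    #     merged = service._mergeConcatenate([], [a, b], strategy)
    #     # -> a single part "merged_ai_p1" whose data is
    #     #    "first" + strategy.chunkSeparator + "second"
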
    def _mergeHierarchical(
        self,
        originalParts: List[ContentPart],
        aiResultParts: List[ContentPart],
        strategy: MergeStrategy
    ) -> List[ContentPart]:
        """Merge parts hierarchically based on parentId relationships."""
        # Group original parts by parentId
        partsByParent = {}
        for part in originalParts:
            parentId = part.parentId or "root"
            if parentId not in partsByParent:
                partsByParent[parentId] = []
            partsByParent[parentId].append(part)

        # Group AI results by parentId
        aiResultsByParent = {}
        for aiPart in aiResultParts:
            parentId = aiPart.parentId or "root"
            if parentId not in aiResultsByParent:
                aiResultsByParent[parentId] = []
            aiResultsByParent[parentId].append(aiPart)

        mergedParts = []

        # Process each parent group
        for parentId in set(list(partsByParent.keys()) + list(aiResultsByParent.keys())):
            originalGroup = partsByParent.get(parentId, [])
            aiGroup = aiResultsByParent.get(parentId, [])

            # Add original parts
            mergedParts.extend(originalGroup)

            # Add AI results for this parent
            if aiGroup:
                if len(aiGroup) == 1:
                    mergedParts.append(aiGroup[0])
                else:
                    # Merge multiple AI results
                    combinedData = strategy.chunkSeparator.join([p.data for p in aiGroup])
                    combinedPart = ContentPart(
                        id=f"hierarchical_ai_{parentId}",
                        parentId=parentId if parentId != "root" else None,
                        label="hierarchical_ai_result",
                        typeGroup="text",
                        mimeType="text/plain",
                        data=combinedData,
                        metadata={
                            "aiResult": True,
                            "hierarchical": True,
                            "sourceCount": len(aiGroup),
                            "size": len(combinedData.encode('utf-8'))
                        }
                    )
                    mergedParts.append(combinedPart)

        return mergedParts

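    # Sketch of the hierarchical pass (hypothetical parent id "sec1"): originals
    # and AI results are bucketed by parentId, so a parent holding two AI results
    # yields its original parts followed by one combined part:
    #
    #     merged = service._mergeHierarchical(origParts, aiParts, strategy)
    #     # parts under "sec1" -> [*original sec1 parts,
    #     #                        ContentPart(id="hierarchical_ai_sec1", ...)]
    #     # parts with parentId=None are grouped under the synthetic "root" key
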
    def _mergeIntelligent(
        self,
        originalParts: List[ContentPart],
        aiResultParts: List[ContentPart],
        strategy: MergeStrategy
    ) -> List[ContentPart]:
        """Merge parts using intelligent strategies based on content type."""
        mergedParts = []

        # Group by typeGroup for intelligent merging
        partsByType = {}
        for part in originalParts:
            typeGroup = part.typeGroup
            if typeGroup not in partsByType:
                partsByType[typeGroup] = []
            partsByType[typeGroup].append(part)

        # Process each type group
        for typeGroup, parts in partsByType.items():
            if typeGroup == "text":
                mergedParts.extend(self._mergeTextIntelligent(parts, aiResultParts, strategy))
            elif typeGroup == "table":
                mergedParts.extend(self._mergeTableIntelligent(parts, aiResultParts, strategy))
            elif typeGroup == "structure":
                mergedParts.extend(self._mergeStructureIntelligent(parts, aiResultParts, strategy))
            else:
                # Default handling for other types
                mergedParts.extend(parts)

        # Add any remaining AI results that weren't merged
        for aiPart in aiResultParts:
            if not any(p.id == aiPart.id for p in mergedParts):
                mergedParts.append(aiPart)

        return mergedParts

    def _mergeTextIntelligent(
        self,
        textParts: List[ContentPart],
        aiResultParts: List[ContentPart],
        strategy: MergeStrategy
    ) -> List[ContentPart]:
        """Intelligent merging for text content."""
        # For now, fall back to the concatenate strategy; this could be enhanced
        # with semantic analysis, summarization, etc.
        return self._mergeConcatenate(textParts, aiResultParts, strategy)

    def _mergeTableIntelligent(
        self,
        tableParts: List[ContentPart],
        aiResultParts: List[ContentPart],
        strategy: MergeStrategy
    ) -> List[ContentPart]:
        """Intelligent merging for table content."""
        # For now, fall back to the concatenate strategy; this could be enhanced
        # with table-aware merging logic.
        return self._mergeConcatenate(tableParts, aiResultParts, strategy)

    def _mergeStructureIntelligent(
        self,
        structureParts: List[ContentPart],
        aiResultParts: List[ContentPart],
        strategy: MergeStrategy
    ) -> List[ContentPart]:
        """Intelligent merging for structured content."""
        # For now, fall back to the concatenate strategy; this could be enhanced
        # with structure-aware merging.
        return self._mergeConcatenate(structureParts, aiResultParts, strategy)

    async def processDocumentsPerChunk(
        self,
        documents: List[ChatDocument],
        prompt: str,
        aiObjects: Any,
        options: Optional[AiCallOptions] = None,
        operationId: Optional[str] = None
    ) -> str:
        """
        Process documents with model-aware chunking and merge the results.

        NEW: Chunking is model-aware and happens in the AI call phase instead of
        the extraction phase.

        Args:
            documents: List of ChatDocument objects to process
            prompt: AI prompt for processing
            aiObjects: AiObjects instance for making AI calls
            options: AI call options
            operationId: Optional operation ID for progress tracking

        Returns:
            Merged AI results as a string with preserved document structure
        """
        if not documents:
            return ""

        # Create an operationId if not provided
        if not operationId:
            workflowId = self.services.currentWorkflow.id if self.services.currentWorkflow else f"no-workflow-{int(time.time())}"
            operationId = f"ai_text_extract_{workflowId}_{int(time.time())}"
        self.services.workflow.progressLogStart(
            operationId,
            "AI Text Extract",
            "Document Processing",
            f"Processing {len(documents)} documents"
        )

        try:
            # Build extraction options using the Pydantic models
            mergeStrategy = MergeStrategy(
                useIntelligentMerging=True,
                prompt=prompt,
                groupBy="typeGroup",
                orderBy="id",
                mergeType="concatenate"
            )

            extractionOptions = ExtractionOptions(
                prompt=prompt,
                operationType=options.operationType if options else OperationTypeEnum.DATA_EXTRACT,
                processDocumentsIndividually=True,
                mergeStrategy=mergeStrategy
            )

            logger.debug(f"Per-chunk extraction options: prompt length={len(extractionOptions.prompt)} chars, operationType={extractionOptions.operationType}")

            # Extract content WITHOUT chunking
            if operationId:
                self.services.workflow.progressLogUpdate(operationId, 0.1, f"Extracting content from {len(documents)} documents")
            extractionResult = self.extractContent(documents, extractionOptions)

            if not isinstance(extractionResult, list):
                if operationId:
                    self.services.workflow.progressLogFinish(operationId, False)
                return "[Error: No extraction results]"

            # Process parts (not chunks) with model-aware AI calls
            if operationId:
                self.services.workflow.progressLogUpdate(operationId, 0.3, f"Processing parts from {len(extractionResult)} extracted documents")
            partResults = await self._processPartsWithMapping(extractionResult, prompt, aiObjects, options, operationId)

            # Merge results using the existing merging system
            if operationId:
                self.services.workflow.progressLogUpdate(operationId, 0.9, f"Merging {len(partResults)} part results")
            mergedContent = self._mergePartResults(partResults, options)

            # Save the merged extraction content for debugging
            self.services.utils.writeDebugFile(mergedContent or '', "extraction_merged_text")

            if operationId:
                self.services.workflow.progressLogFinish(operationId, True)

            return mergedContent
        except Exception as e:
            logger.error(f"Error in processDocumentsPerChunk: {str(e)}")
            if operationId:
                self.services.workflow.progressLogFinish(operationId, False)
            raise

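    # A minimal usage sketch for processDocumentsPerChunk (hypothetical wiring:
    # `services`, `docs`, and `aiObjects` are assumed to exist; options and
    # operationId fall back to their defaults):
    #
    #     service = ExtractionService(services=services)
    #     merged = asyncio.run(
    #         service.processDocumentsPerChunk(
    #             docs,
    #             prompt="Extract all invoice line items",
    #             aiObjects=aiObjects,
    #         )
    #     )
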
    async def _processPartsWithMapping(
        self,
        extractionResult: List[ContentExtracted],
        prompt: str,
        aiObjects: Any,
        options: Optional[AiCallOptions] = None,
        operationId: Optional[str] = None
    ) -> List[PartResult]:
        """Process content parts with model-aware chunking and proper mapping."""

        # Collect all parts that need processing
        partsToProcess = []
        partIndex = 0

        for ec in extractionResult:
            for part in ec.parts:
                if part.typeGroup in ("text", "table", "structure", "image", "container", "binary"):
                    # Skip empty container parts
                    if part.typeGroup == "container" and (not part.data or len(part.data.strip()) == 0):
                        logger.debug(f"Skipping empty container part: mimeType={part.mimeType}")
                        continue

                    partsToProcess.append({
                        'part': part,
                        'partIndex': partIndex,
                        'documentId': ec.id
                    })
                    partIndex += 1

        logger.info(f"Processing {len(partsToProcess)} parts with model-aware chunking")

        totalParts = len(partsToProcess)

        # Track progress across parallel tasks
        processedCount = [0]  # Use a list so the nested function can mutate it

        async def processSinglePart(partInfo: Dict) -> PartResult:
            part = partInfo['part']
            partIdx = partInfo['partIndex']
            documentId = partInfo['documentId']

            startTime = time.time()

            try:
                # Create an AI call request carrying the content part
                request = AiCallRequest(
                    prompt=prompt,
                    context="",  # Context travels in the content part
                    options=options,
                    contentParts=[part]  # Pass as a list for unified processing
                )

                # Update progress before the AI call
                if operationId and totalParts > 0:
                    processedCount[0] += 1
                    progress = 0.3 + (processedCount[0] / totalParts * 0.6)  # Progress from 0.3 to 0.9
                    self.services.workflow.progressLogUpdate(operationId, progress, f"Processing part {processedCount[0]}/{totalParts}")

                # Call AI with model-aware chunking
                response = await aiObjects.call(request)

                processingTime = time.time() - startTime

                return PartResult(
                    originalPart=part,
                    aiResult=response.content,
                    partIndex=partIdx,
                    documentId=documentId,
                    processingTime=processingTime,
                    metadata={
                        "success": True,
                        "partSize": len(part.data) if part.data else 0,
                        "resultSize": len(response.content),
                        "typeGroup": part.typeGroup,
                        "modelName": response.modelName,
                        "priceUsd": response.priceUsd
                    }
                )

            except Exception as e:
                processingTime = time.time() - startTime
                logger.warning(f"Error processing part {partIdx}: {str(e)}")

                return PartResult(
                    originalPart=part,
                    aiResult=f"[Error processing part: {str(e)}]",
                    partIndex=partIdx,
                    documentId=documentId,
                    processingTime=processingTime,
                    metadata={
                        "success": False,
                        "error": str(e),
                        "partSize": len(part.data) if part.data else 0,
                        "typeGroup": part.typeGroup
                    }
                )

        # Process parts with concurrency control
        maxConcurrent = 5
        if options and hasattr(options, 'maxConcurrentParts'):
            maxConcurrent = options.maxConcurrentParts

        semaphore = asyncio.Semaphore(maxConcurrent)

        async def processWithSemaphore(partInfo):
            async with semaphore:
                return await processSinglePart(partInfo)

        tasks = [processWithSemaphore(partInfo) for partInfo in partsToProcess]
        partResults = await asyncio.gather(*tasks, return_exceptions=True)

        # Turn raised exceptions into error PartResults
        processedResults = []
        for i, result in enumerate(partResults):
            if isinstance(result, Exception):
                partInfo = partsToProcess[i]
                processedResults.append(PartResult(
                    originalPart=partInfo['part'],
                    aiResult=f"[Error in parallel processing: {str(result)}]",
                    partIndex=partInfo['partIndex'],
                    documentId=partInfo['documentId'],
                    processingTime=0.0,
                    metadata={"success": False, "error": str(result)}
                ))
            elif result is not None:
                processedResults.append(result)

        logger.info(f"Completed processing {len(processedResults)} parts")
        return processedResults

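    # Concurrency sketch: the semaphore above caps in-flight AI calls at 5 unless
    # the caller's AiCallOptions exposes maxConcurrentParts (treated as optional,
    # hence the hasattr check). A hypothetical override:
    #
    #     options.maxConcurrentParts = 10  # allow 10 parts in flight at once
    #     results = await self._processPartsWithMapping(extracted, prompt, aiObjects, options)
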
    def _mergePartResults(
        self,
        partResults: List[PartResult],
        options: Optional[AiCallOptions] = None
    ) -> str:
        """Merge part results using the existing merging pipeline."""
        if not partResults:
            return ""

        # Convert PartResults back to ContentParts for the existing merger system
        contentParts = []
        for partResult in partResults:
            # Create a ContentPart from the PartResult, keeping the original typeGroup
            contentPart = ContentPart(
                id=partResult.originalPart.id,
                parentId=partResult.originalPart.parentId,
                label=partResult.originalPart.label,
                typeGroup=partResult.originalPart.typeGroup,
                mimeType=partResult.originalPart.mimeType,
                data=partResult.aiResult,  # Use the AI result as data
                metadata={
                    **partResult.originalPart.metadata,
                    "aiResult": True,
                    "partIndex": partResult.partIndex,
                    "documentId": partResult.documentId,
                    "processingTime": partResult.processingTime,
                    "success": partResult.metadata.get("success", False)
                }
            )
            contentParts.append(contentPart)

        # Merge grouped by document and ordered by part index
        mergeStrategy = MergeStrategy(
            useIntelligentMerging=True,
            groupBy="documentId",
            orderBy="partIndex",
            mergeType="concatenate"
        )

        # Apply the existing merging logic
        mergedParts = _applyMerging(contentParts, mergeStrategy)

        # Convert the merged parts back to a final string
        finalContent = "\n\n".join([part.data for part in mergedParts])

        logger.info(f"Merged {len(partResults)} parts using the existing merging system")
        return finalContent.strip()
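
    # Sketch of the final assembly performed by _mergePartResults (assuming
    # _applyMerging returned parts p1..p3; the names _applyMerging and .data are
    # from this module, the values are illustrative):
    #
    #     mergedParts = [p1, p2, p3]
    #     finalContent = "\n\n".join(part.data for part in mergedParts)
    #     # -> the three AI results separated by blank lines, with outer
    #     #    whitespace removed by .strip()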