1696 lines
89 KiB
Python
1696 lines
89 KiB
Python
# Copyright (c) 2025 Patrick Motsch
|
|
# All rights reserved.
|
|
from typing import Any, Dict, List, Optional, Union
|
|
import uuid
|
|
import logging
|
|
import time
|
|
import asyncio
|
|
import base64
|
|
import json
|
|
|
|
from .subRegistry import ExtractorRegistry, ChunkerRegistry
|
|
from .subPipeline import runExtraction
|
|
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, MergeStrategy, ExtractionOptions, PartResult, DocumentIntent
|
|
from modules.datamodels.datamodelChat import ChatDocument
|
|
from modules.datamodels.datamodelAi import AiCallResponse, AiCallRequest, AiCallOptions, OperationTypeEnum, AiModelCall
|
|
from modules.aicore.aicoreModelRegistry import modelRegistry
|
|
from modules.aicore.aicoreModelSelector import modelSelector
|
|
from modules.shared.jsonUtils import stripCodeFences
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class ExtractionService:
|
|
def __init__(self, services: Optional[Any] = None):
|
|
self.services = services
|
|
self._extractorRegistry = ExtractorRegistry()
|
|
self._chunkerRegistry = ChunkerRegistry()
|
|
|
|
# Ensure connectors are registered
|
|
discovered = modelRegistry.discoverConnectors()
|
|
for connector in discovered:
|
|
modelRegistry.registerConnector(connector)
|
|
|
|
# Verify required internal model is available (used for pricing in extractContent)
|
|
modelDisplayName = "Internal Document Extractor"
|
|
model = modelRegistry.getModel(modelDisplayName)
|
|
if model is None or model.calculatePriceUsd is None:
|
|
raise RuntimeError(f"FATAL: Required internal model '{modelDisplayName}' is not available. Check connector registration.")
|
|
|
|
def extractContent(
|
|
self,
|
|
documents: List[ChatDocument],
|
|
options: ExtractionOptions,
|
|
operationId: Optional[str] = None,
|
|
parentOperationId: Optional[str] = None
|
|
) -> List[ContentExtracted]:
|
|
"""
|
|
Extract content from a list of ChatDocument objects.
|
|
|
|
Args:
|
|
documents: List of ChatDocument objects to extract content from
|
|
options: Extraction options including maxSize, chunkAllowed, mergeStrategy, etc.
|
|
operationId: Optional operation ID for progress logging (parent operation)
|
|
parentOperationId: Optional parent operation ID for hierarchical logging
|
|
|
|
Returns:
|
|
List of ContentExtracted objects, one per input document
|
|
"""
|
|
|
|
results: List[ContentExtracted] = []
|
|
|
|
# Lazy import to avoid circular deps and heavy init at module import
|
|
from modules.interfaces.interfaceDbComponentObjects import getInterface
|
|
dbInterface = getInterface()
|
|
|
|
totalDocs = len(documents)
|
|
|
|
for i, doc in enumerate(documents):
|
|
logger.info(f"=== DOCUMENT {i + 1}/{totalDocs}: {doc.fileName} ===")
|
|
logger.info(f"Initial MIME type: {doc.mimeType}")
|
|
|
|
# Create child operation for this document if parent operationId is provided
|
|
docOperationId = None
|
|
if operationId:
|
|
workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
|
|
docOperationId = f"{operationId}_doc_{i}"
|
|
# Use parentOperationId if provided, otherwise use operationId as parent
|
|
parentId = parentOperationId if parentOperationId else operationId
|
|
self.services.chat.progressLogStart(
|
|
docOperationId,
|
|
"Extracting Document",
|
|
f"Document {i + 1}/{totalDocs}",
|
|
doc.fileName[:50] + "..." if len(doc.fileName) > 50 else doc.fileName,
|
|
parentOperationId=parentId # Correct parent reference for ChatLog hierarchy
|
|
)
|
|
|
|
# Start timing for this document
|
|
startTime = time.time()
|
|
|
|
try:
|
|
if docOperationId:
|
|
self.services.chat.progressLogUpdate(docOperationId, 0.1, "Loading document data")
|
|
|
|
# Resolve raw bytes for this document using interface
|
|
documentBytes = dbInterface.getFileData(doc.fileId)
|
|
if not documentBytes:
|
|
if docOperationId:
|
|
self.services.chat.progressLogFinish(docOperationId, False)
|
|
raise ValueError(f"No file data found for fileId={doc.fileId}")
|
|
|
|
if docOperationId:
|
|
self.services.chat.progressLogUpdate(docOperationId, 0.2, "Running extraction pipeline")
|
|
|
|
# Convert ChatDocument to the format expected by runExtraction
|
|
documentData = {
|
|
"id": doc.id,
|
|
"bytes": documentBytes,
|
|
"fileName": doc.fileName,
|
|
"mimeType": doc.mimeType
|
|
}
|
|
|
|
ec = runExtraction(
|
|
extractorRegistry=self._extractorRegistry,
|
|
chunkerRegistry=self._chunkerRegistry,
|
|
documentBytes=documentData["bytes"],
|
|
fileName=documentData["fileName"],
|
|
mimeType=documentData["mimeType"],
|
|
options=options
|
|
)
|
|
|
|
if docOperationId:
|
|
self.services.chat.progressLogUpdate(docOperationId, 0.7, f"Extracted {len(ec.parts)} parts")
|
|
|
|
# Log content parts metadata
|
|
logger.debug(f"Content parts: {len(ec.parts)}")
|
|
for j, part in enumerate(ec.parts):
|
|
logger.debug(f" Part {j + 1}/{len(ec.parts)}: {part.typeGroup} ({part.mimeType}) - {len(part.data) if part.data else 0} chars")
|
|
if part.metadata:
|
|
logger.debug(f" Metadata: {part.metadata}")
|
|
|
|
# Attach complete metadata to parts according to ContentPart Metadaten-Schema
|
|
for p in ec.parts:
|
|
# Ensure metadata dict exists
|
|
if not p.metadata:
|
|
p.metadata = {}
|
|
|
|
# Required metadata fields (from concept)
|
|
if "documentId" not in p.metadata:
|
|
p.metadata["documentId"] = documentData["id"] or str(uuid.uuid4())
|
|
if "documentMimeType" not in p.metadata:
|
|
p.metadata["documentMimeType"] = documentData["mimeType"]
|
|
if "originalFileName" not in p.metadata:
|
|
p.metadata["originalFileName"] = documentData["fileName"]
|
|
|
|
# ContentFormat: Set based on typeGroup and mimeType
|
|
# Default to "extracted" for text content, but can be overridden by caller
|
|
if "contentFormat" not in p.metadata:
|
|
# Default: extracted text content
|
|
p.metadata["contentFormat"] = "extracted"
|
|
|
|
# Intent: Default to "extract" for extracted content
|
|
if "intent" not in p.metadata:
|
|
p.metadata["intent"] = "extract"
|
|
|
|
# ExtractionPrompt: Use from options if available
|
|
if "extractionPrompt" not in p.metadata and options and options.prompt:
|
|
p.metadata["extractionPrompt"] = options.prompt
|
|
|
|
# UsageHint: Provide default hint
|
|
if "usageHint" not in p.metadata:
|
|
p.metadata["usageHint"] = f"Use extracted content from {documentData['fileName']}"
|
|
|
|
# SourceAction: Mark as from extraction service
|
|
if "sourceAction" not in p.metadata:
|
|
p.metadata["sourceAction"] = "extraction.extractContent"
|
|
|
|
# Write debug file for each text part extracted (without AI)
|
|
for j, part in enumerate(ec.parts):
|
|
if part.typeGroup == "text" and part.data and self.services and hasattr(self.services, 'utils') and hasattr(self.services.utils, 'writeDebugFile'):
|
|
try:
|
|
debug_content = {
|
|
"partIndex": j + 1,
|
|
"partId": part.id,
|
|
"typeGroup": part.typeGroup,
|
|
"mimeType": part.mimeType or "text/plain",
|
|
"label": part.label,
|
|
"dataLength": len(part.data),
|
|
"metadata": part.metadata.copy() if part.metadata else {},
|
|
"data": part.data # Full extracted text
|
|
}
|
|
debug_json = json.dumps(debug_content, indent=2, ensure_ascii=False)
|
|
# Use document name and part index for filename
|
|
doc_name_safe = documentData["fileName"].replace(" ", "_").replace("/", "_").replace("\\", "_")[:50]
|
|
debug_filename = f"extraction_text_part_{j+1}_{doc_name_safe}.txt"
|
|
self.services.utils.writeDebugFile(debug_json, debug_filename)
|
|
logger.info(f"Wrote debug file for extracted text part {j+1}/{len(ec.parts)}: {debug_filename}")
|
|
except Exception as e:
|
|
logger.warning(f"Failed to write debug file for text part {j+1}: {str(e)}")
|
|
|
|
# Log chunking information
|
|
chunkedParts = [p for p in ec.parts if p.metadata.get("chunk", False)]
|
|
if chunkedParts:
|
|
logger.debug(f"=== CHUNKING RESULTS ===")
|
|
logger.debug(f"Total parts: {len(ec.parts)}")
|
|
logger.debug(f"Chunked parts: {len(chunkedParts)}")
|
|
for chunk in chunkedParts:
|
|
logger.debug(f" Chunk: {chunk.label} - {len(chunk.data)} chars (parent: {chunk.parentId})")
|
|
else:
|
|
logger.debug(f"No chunking needed - {len(ec.parts)} parts fit within size limits")
|
|
|
|
if docOperationId:
|
|
self.services.chat.progressLogUpdate(docOperationId, 0.9, f"Processing complete: {len(ec.parts)} parts extracted")
|
|
|
|
# Calculate timing and emit stats
|
|
endTime = time.time()
|
|
processingTime = endTime - startTime
|
|
bytesSent = len(documentBytes)
|
|
bytesReceived = sum(len(part.data) if part.data else 0 for part in ec.parts)
|
|
|
|
# Emit stats for extraction operation
|
|
|
|
# Use internal extraction model for pricing
|
|
modelDisplayName = "Internal Document Extractor"
|
|
model = modelRegistry.getModel(modelDisplayName)
|
|
# Hard fail if model is missing; caller must ensure connectors are registered
|
|
if model is None or model.calculatePriceUsd is None:
|
|
if docOperationId:
|
|
self.services.chat.progressLogFinish(docOperationId, False)
|
|
raise RuntimeError(f"Pricing model not available: {modelDisplayName}")
|
|
priceUsd = model.calculatePriceUsd(processingTime, bytesSent, bytesReceived)
|
|
|
|
# Create AiCallResponse with real calculation
|
|
# Use model.name for the response (API identifier), not displayName
|
|
aiResponse = AiCallResponse(
|
|
content="", # No content for extraction stats needed
|
|
modelName=model.name,
|
|
priceUsd=priceUsd,
|
|
processingTime=processingTime,
|
|
bytesSent=bytesSent,
|
|
bytesReceived=bytesReceived,
|
|
errorCount=0
|
|
)
|
|
|
|
self.services.chat.storeWorkflowStat(
|
|
self.services.workflow,
|
|
aiResponse,
|
|
f"extraction.process.{doc.mimeType}"
|
|
)
|
|
|
|
# Write extraction results to debug file
|
|
try:
|
|
from modules.shared.debugLogger import writeDebugFile
|
|
# json is already imported at module level
|
|
# Create summary of extraction results for debug
|
|
extractionSummary = {
|
|
"documentName": doc.fileName,
|
|
"documentMimeType": doc.mimeType,
|
|
"partsCount": len(ec.parts),
|
|
"parts": []
|
|
}
|
|
for part in ec.parts:
|
|
partSummary = {
|
|
"typeGroup": part.typeGroup,
|
|
"mimeType": part.mimeType,
|
|
"label": part.label,
|
|
"dataLength": len(part.data) if part.data else 0,
|
|
"metadata": part.metadata
|
|
}
|
|
# Include data preview for small parts (first 500 chars)
|
|
if part.data and len(part.data) <= 500:
|
|
partSummary["dataPreview"] = part.data[:500]
|
|
elif part.data:
|
|
partSummary["dataPreview"] = f"[Large data: {len(part.data)} chars - truncated]"
|
|
extractionSummary["parts"].append(partSummary)
|
|
|
|
writeDebugFile(json.dumps(extractionSummary, indent=2, ensure_ascii=False), f"extraction_result_{doc.fileName}.txt")
|
|
except Exception as e:
|
|
logger.debug(f"Failed to write extraction debug file: {str(e)}")
|
|
|
|
results.append(ec)
|
|
|
|
# Finish document operation successfully
|
|
if docOperationId:
|
|
self.services.chat.progressLogFinish(docOperationId, True)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error extracting content from document {i + 1}/{totalDocs} ({doc.fileName}): {str(e)}")
|
|
if docOperationId:
|
|
try:
|
|
self.services.chat.progressLogFinish(docOperationId, False)
|
|
except:
|
|
pass # Don't fail on progress logging errors
|
|
# Continue with next document instead of failing completely
|
|
# This allows parallel processing to continue even if one document fails
|
|
continue
|
|
|
|
return results
|
|
|
|
async def processDocumentsPerChunk(
|
|
self,
|
|
documents: List[ChatDocument],
|
|
prompt: str,
|
|
aiObjects: Any,
|
|
options: Optional[AiCallOptions] = None,
|
|
operationId: Optional[str] = None,
|
|
parentOperationId: Optional[str] = None
|
|
) -> str:
|
|
"""
|
|
Process documents with model-aware chunking and merge results.
|
|
NEW: Uses model-aware chunking in AI call phase instead of extraction phase.
|
|
|
|
Args:
|
|
documents: List of ChatDocument objects to process
|
|
prompt: AI prompt for processing
|
|
aiObjects: AiObjects instance for making AI calls
|
|
options: AI call options
|
|
operationId: Optional operation ID for progress tracking
|
|
parentOperationId: Optional parent operation ID for hierarchical logging
|
|
|
|
Returns:
|
|
Merged AI results as string with preserved document structure
|
|
"""
|
|
if not documents:
|
|
return ""
|
|
|
|
# Create operationId if not provided
|
|
if not operationId:
|
|
workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
|
|
operationId = f"ai_text_extract_{workflowId}_{int(time.time())}"
|
|
self.services.chat.progressLogStart(
|
|
operationId,
|
|
"AI Text Extract",
|
|
"Document Processing",
|
|
f"Processing {len(documents)} documents",
|
|
parentOperationId=parentOperationId # Use parentOperationId if provided
|
|
)
|
|
|
|
try:
|
|
# Build extraction options using Pydantic model
|
|
mergeStrategy = MergeStrategy(
|
|
useIntelligentMerging=True,
|
|
prompt=prompt,
|
|
groupBy="typeGroup",
|
|
orderBy="id",
|
|
mergeType="concatenate"
|
|
)
|
|
|
|
extractionOptions = ExtractionOptions(
|
|
prompt=prompt,
|
|
processDocumentsIndividually=True,
|
|
mergeStrategy=mergeStrategy
|
|
)
|
|
|
|
logger.debug(f"Per-chunk extraction options: prompt length={len(extractionOptions.prompt)} chars")
|
|
|
|
# Extract content WITHOUT chunking
|
|
if operationId:
|
|
self.services.chat.progressLogUpdate(operationId, 0.1, f"Extracting content from {len(documents)} documents")
|
|
# Pass operationId as parentOperationId for hierarchical logging
|
|
# Correct hierarchy: parentOperationId -> operationId -> docOperationId
|
|
extractionResult = self.extractContent(documents, extractionOptions, operationId=operationId, parentOperationId=operationId)
|
|
|
|
if not isinstance(extractionResult, list):
|
|
if operationId:
|
|
self.services.chat.progressLogFinish(operationId, False)
|
|
return "[Error: No extraction results]"
|
|
|
|
# Process parts (not chunks) with model-aware AI calls
|
|
if operationId:
|
|
self.services.chat.progressLogUpdate(operationId, 0.3, f"Processing {len(extractionResult)} extracted content parts")
|
|
# Use operationId as parentOperationId for child operations
|
|
# Correct hierarchy: parentOperationId -> operationId -> partOperationId
|
|
processParentOperationId = operationId
|
|
partResults = await self._processPartsWithMapping(extractionResult, prompt, aiObjects, options, operationId, processParentOperationId)
|
|
|
|
# Merge results using existing merging system
|
|
if operationId:
|
|
self.services.chat.progressLogUpdate(operationId, 0.9, f"Merging {len(partResults)} part results")
|
|
mergedContent = self.mergePartResults(partResults, options)
|
|
|
|
# Save merged extraction content to debug
|
|
self.services.utils.writeDebugFile(mergedContent or '', "extraction_merged_text")
|
|
|
|
if operationId:
|
|
self.services.chat.progressLogFinish(operationId, True)
|
|
|
|
return mergedContent
|
|
except Exception as e:
|
|
logger.error(f"Error in processDocumentsPerChunk: {str(e)}")
|
|
if operationId:
|
|
self.services.chat.progressLogFinish(operationId, False)
|
|
raise
|
|
|
|
async def _processPartsWithMapping(
|
|
self,
|
|
extractionResult: List[ContentExtracted],
|
|
prompt: str,
|
|
aiObjects: Any,
|
|
options: Optional[AiCallOptions] = None,
|
|
operationId: Optional[str] = None,
|
|
parentOperationId: Optional[str] = None
|
|
) -> List[PartResult]:
|
|
"""Process content parts with model-aware chunking and proper mapping."""
|
|
|
|
# Collect all parts that need processing
|
|
partsToProcess = []
|
|
partIndex = 0
|
|
|
|
for ec in extractionResult:
|
|
for part in ec.parts:
|
|
if part.typeGroup in ("text", "table", "structure", "image", "container", "binary"):
|
|
# Skip empty container parts
|
|
if part.typeGroup == "container" and (not part.data or len(part.data.strip()) == 0):
|
|
logger.debug(f"Skipping empty container part: mimeType={part.mimeType}")
|
|
continue
|
|
|
|
partsToProcess.append({
|
|
'part': part,
|
|
'part_index': partIndex,
|
|
'document_id': ec.id
|
|
})
|
|
partIndex += 1
|
|
|
|
logger.info(f"Processing {len(partsToProcess)} parts with model-aware chunking")
|
|
|
|
totalParts = len(partsToProcess)
|
|
|
|
# Process parts in parallel
|
|
processedCount = [0] # Use list to allow modification in nested function
|
|
|
|
async def processSinglePart(partInfo: Dict) -> PartResult:
|
|
part = partInfo['part']
|
|
part_index = partInfo['part_index']
|
|
documentId = partInfo['document_id']
|
|
|
|
start_time = time.time()
|
|
|
|
# Create separate operation for each part with parent reference
|
|
partOperationId = None
|
|
if operationId:
|
|
workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
|
|
partOperationId = f"{operationId}_part_{part_index}"
|
|
self.services.chat.progressLogStart(
|
|
partOperationId,
|
|
"Content Processing",
|
|
f"Part {part_index + 1}",
|
|
f"Type: {part.typeGroup}",
|
|
parentOperationId=parentOperationId
|
|
)
|
|
|
|
try:
|
|
# Create AI call request with content part
|
|
request = AiCallRequest(
|
|
prompt=prompt,
|
|
context="", # Context is in the content part
|
|
options=options,
|
|
contentParts=[part] # Pass as list for unified processing
|
|
)
|
|
|
|
# Update progress - initiating
|
|
if partOperationId:
|
|
self.services.chat.progressLogUpdate(partOperationId, 0.3, "Initiating")
|
|
|
|
# Call AI with model-aware chunking (no progress callback - handled by parent operation)
|
|
response = await aiObjects.call(request)
|
|
|
|
# Update progress - completed
|
|
if partOperationId:
|
|
self.services.chat.progressLogUpdate(partOperationId, 0.9, "Completed")
|
|
self.services.chat.progressLogFinish(partOperationId, True)
|
|
|
|
processing_time = time.time() - start_time
|
|
|
|
return PartResult(
|
|
originalPart=part,
|
|
aiResult=response.content,
|
|
partIndex=part_index,
|
|
documentId=documentId,
|
|
processingTime=processing_time,
|
|
metadata={
|
|
"success": True,
|
|
"partSize": len(part.data) if part.data else 0,
|
|
"resultSize": len(response.content),
|
|
"typeGroup": part.typeGroup,
|
|
"modelName": response.modelName,
|
|
"priceUsd": response.priceUsd
|
|
}
|
|
)
|
|
|
|
except Exception as e:
|
|
processing_time = time.time() - start_time
|
|
logger.warning(f"Error processing part {part_index}: {str(e)}")
|
|
|
|
return PartResult(
|
|
originalPart=part,
|
|
aiResult=f"[Error processing part: {str(e)}]",
|
|
partIndex=part_index,
|
|
documentId=documentId,
|
|
processingTime=processing_time,
|
|
metadata={
|
|
"success": False,
|
|
"error": str(e),
|
|
"partSize": len(part.data) if part.data else 0,
|
|
"typeGroup": part.typeGroup
|
|
}
|
|
)
|
|
|
|
# Process parts with concurrency control
|
|
maxConcurrent = 5
|
|
if options and hasattr(options, 'maxConcurrentParts'):
|
|
maxConcurrent = options.maxConcurrentParts
|
|
|
|
semaphore = asyncio.Semaphore(maxConcurrent)
|
|
|
|
async def processWithSemaphore(partInfo):
|
|
async with semaphore:
|
|
return await processSinglePart(partInfo)
|
|
|
|
tasks = [processWithSemaphore(part_info) for part_info in partsToProcess]
|
|
partResults = await asyncio.gather(*tasks, return_exceptions=True)
|
|
|
|
# Handle exceptions
|
|
processedResults = []
|
|
for i, result in enumerate(partResults):
|
|
if isinstance(result, Exception):
|
|
part_info = partsToProcess[i]
|
|
processedResults.append(PartResult(
|
|
originalPart=part_info['part'],
|
|
aiResult=f"[Error in parallel processing: {str(result)}]",
|
|
partIndex=part_info['part_index'],
|
|
documentId=part_info['document_id'],
|
|
processingTime=0.0,
|
|
metadata={"success": False, "error": str(result)}
|
|
))
|
|
elif result is not None:
|
|
processedResults.append(result)
|
|
|
|
logger.info(f"Completed processing {len(processedResults)} parts")
|
|
return processedResults
|
|
|
|
def _convertToContentParts(
|
|
self, partResults: Union[List[PartResult], List[AiCallResponse]], originalContentParts: Optional[List[ContentPart]] = None
|
|
) -> List[ContentPart]:
|
|
"""Convert part results to ContentParts (internal helper for consolidation).
|
|
|
|
Handles both PartResult (from extraction workflow) and AiCallResponse (from content parts processing).
|
|
|
|
Args:
|
|
partResults: List of PartResult or AiCallResponse objects
|
|
originalContentParts: Optional list of original ContentPart objects to preserve typeGroup and metadata
|
|
"""
|
|
content_parts = []
|
|
|
|
if not partResults:
|
|
return content_parts
|
|
|
|
# Detect input type and convert accordingly
|
|
if isinstance(partResults[0], PartResult):
|
|
# Existing logic for PartResult (from processDocumentsPerChunk)
|
|
# Phase 7: Add originalIndex for explicit ordering
|
|
for i, part_result in enumerate(partResults):
|
|
content_part = ContentPart(
|
|
id=part_result.originalPart.id,
|
|
parentId=part_result.originalPart.parentId,
|
|
label=part_result.originalPart.label,
|
|
typeGroup=part_result.originalPart.typeGroup, # Use original typeGroup
|
|
mimeType=part_result.originalPart.mimeType,
|
|
data=part_result.aiResult, # Use AI result as data
|
|
metadata={
|
|
**part_result.originalPart.metadata,
|
|
"aiResult": True,
|
|
"originalIndex": i, # Phase 7: Explicit order index
|
|
"partIndex": part_result.partIndex,
|
|
"processingOrder": i, # Phase 7: Processing order
|
|
"documentId": part_result.documentId,
|
|
"processingTime": part_result.processingTime,
|
|
"success": part_result.metadata.get("success", False)
|
|
}
|
|
)
|
|
content_parts.append(content_part)
|
|
elif isinstance(partResults[0], AiCallResponse):
|
|
# Logic from interfaceAiObjects (from content parts processing)
|
|
# Phase 7: Add originalIndex for explicit ordering
|
|
# REQUIRED: originalContentParts must be provided for AiCallResponse path to preserve typeGroup
|
|
if not originalContentParts:
|
|
raise ValueError("originalContentParts is required when merging AiCallResponse objects. All callers must provide the original ContentPart objects to preserve typeGroup.")
|
|
|
|
for i, result in enumerate(partResults):
|
|
if result.content:
|
|
# Handle one-to-many relationships (e.g., chunking: 1 contentPart -> N chunkResults)
|
|
# If we have fewer originalContentParts than partResults, use the first one for all
|
|
if i < len(originalContentParts):
|
|
originalPart = originalContentParts[i]
|
|
else:
|
|
# One-to-many: use first originalContentPart for remaining results
|
|
originalPart = originalContentParts[0]
|
|
|
|
originalTypeGroup = originalPart.typeGroup or "text"
|
|
originalMimeType = originalPart.mimeType or "text/plain"
|
|
originalLabel = originalPart.label or f"ai_result_{i}"
|
|
|
|
content_part = ContentPart(
|
|
id=str(uuid.uuid4()),
|
|
parentId=None,
|
|
label=originalLabel,
|
|
typeGroup=originalTypeGroup, # Preserve original typeGroup from originalContentParts
|
|
mimeType=originalMimeType,
|
|
data=result.content,
|
|
metadata={
|
|
"aiResult": True,
|
|
"originalIndex": i, # Phase 7: Explicit order index
|
|
"processingOrder": i, # Phase 7: Processing order
|
|
"modelName": result.modelName,
|
|
"priceUsd": result.priceUsd,
|
|
"processingTime": result.processingTime,
|
|
"bytesSent": result.bytesSent,
|
|
"bytesReceived": result.bytesReceived
|
|
}
|
|
)
|
|
content_parts.append(content_part)
|
|
|
|
return content_parts
|
|
|
|
def mergePartResults(
|
|
self,
|
|
partResults: Union[List[PartResult], List[AiCallResponse]],
|
|
options: Optional[AiCallOptions] = None,
|
|
originalContentParts: Optional[List[ContentPart]] = None
|
|
) -> str:
|
|
"""Unified merge for both PartResult and AiCallResponse.
|
|
|
|
Consolidated from both interfaceAiObjects.py and existing serviceExtraction method.
|
|
|
|
Args:
|
|
partResults: List of PartResult or AiCallResponse objects to merge
|
|
options: Optional AiCallOptions for merge strategy
|
|
originalContentParts: Optional list of original ContentPart objects to preserve typeGroup
|
|
"""
|
|
if not partResults:
|
|
return ""
|
|
|
|
# Convert to ContentParts using unified helper, preserving original typeGroup
|
|
content_parts = self._convertToContentParts(partResults, originalContentParts)
|
|
|
|
# Determine merge strategy based on input type
|
|
if isinstance(partResults[0], PartResult):
|
|
# Phase 7: Use originalIndex for explicit ordering
|
|
# Use strategy for extraction workflow (group by document, order by originalIndex)
|
|
merge_strategy = MergeStrategy(
|
|
useIntelligentMerging=True,
|
|
groupBy="documentId", # Group by document
|
|
orderBy="originalIndex", # Phase 7: Order by originalIndex instead of partIndex
|
|
mergeType="concatenate"
|
|
)
|
|
else:
|
|
# Default strategy for content parts workflow
|
|
merge_strategy = MergeStrategy(
|
|
useIntelligentMerging=True,
|
|
groupBy="typeGroup",
|
|
orderBy="id",
|
|
mergeType="concatenate"
|
|
)
|
|
|
|
# Check if this is an elements response format (elements array structure)
|
|
# This is used for section content generation where multiple ContentParts are processed
|
|
isElementsResponse = self._isElementsResponse(content_parts)
|
|
|
|
if isElementsResponse:
|
|
# Merge JSON elements responses intelligently (merge tables, combine elements)
|
|
logger.info(f"Detected 'elements' JSON response format - merging {len(content_parts)} JSON responses")
|
|
merged_json = self._mergeElementsResponses(content_parts)
|
|
merged_json_str = json.dumps(merged_json, indent=2, ensure_ascii=False)
|
|
logger.info(f"Successfully merged 'elements' JSON responses into single unified JSON ({len(merged_json_str)} chars)")
|
|
return merged_json_str
|
|
|
|
# Check if this is a JSON extraction response format (extracted_content structure)
|
|
# If so, merge JSON structures intelligently before applying regular merging
|
|
isJsonExtractionResponse = self._isJsonExtractionResponse(content_parts)
|
|
|
|
if isJsonExtractionResponse:
|
|
# Merge JSON extraction responses intelligently
|
|
logger.info(f"Detected JSON extraction response format - merging {len(content_parts)} JSON responses")
|
|
merged_json = self._mergeJsonExtractionResponses(content_parts, originalContentParts)
|
|
merged_json_str = json.dumps(merged_json, indent=2, ensure_ascii=False)
|
|
logger.info(f"Successfully merged JSON extraction responses into single unified JSON ({len(merged_json_str)} chars)")
|
|
return merged_json_str
|
|
|
|
# Apply regular merging for non-JSON extraction responses
|
|
merged_parts = applyMerging(content_parts, merge_strategy)
|
|
|
|
# Phase 6: Enhanced format with metadata preservation
|
|
# CRITICAL: Don't add SOURCE markers for internal use - metadata is already preserved in ContentPart objects
|
|
# SOURCE markers should ONLY be added when content is returned directly to user for display/debugging
|
|
# For extraction content used in generation pipelines, metadata is in ContentPart.metadata, not in text markers
|
|
|
|
# Check if this is a generation response by looking at operationType or content structure
|
|
isGenerationResponse = False
|
|
if options and hasattr(options, 'operationType'):
|
|
# Generation responses use DATA_GENERATE operation type
|
|
from modules.datamodels.datamodelAi import OperationTypeEnum
|
|
isGenerationResponse = options.operationType == OperationTypeEnum.DATA_GENERATE
|
|
|
|
# Also check if content looks like JSON (starts with { or [)
|
|
if not isGenerationResponse and merged_parts:
|
|
firstPartData = merged_parts[0].data if merged_parts[0].data else ""
|
|
if isinstance(firstPartData, str) and firstPartData.strip().startswith(('{', '[')):
|
|
# Check if it's a complete JSON structure (not extracted content)
|
|
# Generation responses are complete JSON, extraction responses are text content
|
|
try:
|
|
# json is already imported at module level
|
|
json.loads(firstPartData.strip())
|
|
# If it parses as JSON and has "documents" key, it's likely a generation response
|
|
parsed = json.loads(firstPartData.strip())
|
|
if isinstance(parsed, dict) and "documents" in parsed:
|
|
isGenerationResponse = True
|
|
except:
|
|
pass
|
|
|
|
# ROOT CAUSE FIX: Never add SOURCE markers - metadata is preserved in ContentPart.metadata
|
|
# SOURCE markers pollute content and cause issues when content is used in generation pipelines
|
|
# If traceability is needed, use ContentPart.metadata fields (documentId, documentMimeType, label, etc.)
|
|
content_sections = []
|
|
for part in merged_parts:
|
|
# Always return clean content without SOURCE markers
|
|
# Metadata is available in ContentPart.metadata for traceability
|
|
content_sections.append(part.data if part.data else "")
|
|
|
|
final_content = "\n\n".join(content_sections)
|
|
|
|
logger.info(f"Merged {len(partResults)} parts using unified merging system with metadata preservation (generationResponse={isGenerationResponse})")
|
|
return final_content.strip()
|
|
|
|
def _isJsonExtractionResponse(self, content_parts: List[ContentPart]) -> bool:
|
|
"""Check if contentParts contain JSON extraction responses (extracted_content format)."""
|
|
if not content_parts:
|
|
return False
|
|
|
|
# Check first part to see if it's JSON extraction response format
|
|
firstPartData = content_parts[0].data if content_parts[0].data else ""
|
|
if not isinstance(firstPartData, str):
|
|
return False
|
|
|
|
# Strip markdown code fences (```json ... ```) before checking
|
|
strippedData = stripCodeFences(firstPartData.strip())
|
|
|
|
# Check if it starts with JSON object/array
|
|
if not strippedData.startswith(('{', '[')):
|
|
return False
|
|
|
|
try:
|
|
parsed = json.loads(strippedData)
|
|
# Check if it has the extraction response structure: {"extracted_content": {...}}
|
|
if isinstance(parsed, dict) and "extracted_content" in parsed:
|
|
return True
|
|
except:
|
|
pass
|
|
|
|
return False
|
|
|
|
def _isElementsResponse(self, content_parts: List[ContentPart]) -> bool:
|
|
"""Check if contentParts contain JSON responses with an 'elements' array (e.g., section content)."""
|
|
if not content_parts:
|
|
return False
|
|
|
|
firstPartData = content_parts[0].data if content_parts[0].data else ""
|
|
if not isinstance(firstPartData, str):
|
|
return False
|
|
|
|
strippedData = stripCodeFences(firstPartData.strip())
|
|
if not strippedData.startswith(('{', '[')):
|
|
return False
|
|
|
|
try:
|
|
parsed = json.loads(strippedData)
|
|
if isinstance(parsed, dict) and "elements" in parsed and isinstance(parsed["elements"], list):
|
|
return True
|
|
except:
|
|
pass
|
|
|
|
return False
|
|
|
|
def _mergeElementsResponses(self, content_parts: List[ContentPart]) -> Dict[str, Any]:
|
|
"""Merge multiple JSON responses with an 'elements' array into one unified response.
|
|
Specifically designed to merge tables within the 'elements' array.
|
|
Empty tables (no rows) are ignored if a table with the same headers already has data.
|
|
"""
|
|
merged_elements = []
|
|
table_headers_map: Dict[str, List[Dict[str, Any]]] = {} # headers_tuple -> [table_contents]
|
|
|
|
for part in content_parts:
|
|
if not part.data:
|
|
continue
|
|
|
|
# Handle multiple JSON blocks in a single response (separated by ---)
|
|
partDataBlocks = part.data.split('---')
|
|
|
|
for blockData in partDataBlocks:
|
|
if not blockData.strip():
|
|
continue
|
|
|
|
try:
|
|
strippedData = stripCodeFences(blockData.strip())
|
|
if not strippedData:
|
|
continue
|
|
|
|
parsed = json.loads(strippedData)
|
|
if isinstance(parsed, dict) and "elements" in parsed and isinstance(parsed["elements"], list):
|
|
for element in parsed["elements"]:
|
|
if isinstance(element, dict) and element.get("type") == "table" and "content" in element:
|
|
table_content = element["content"]
|
|
headers = table_content.get("headers", [])
|
|
rows = table_content.get("rows", [])
|
|
|
|
if headers:
|
|
headers_key = tuple(headers)
|
|
|
|
# If table has no rows, only add it if no table with these headers exists yet
|
|
if not rows:
|
|
if headers_key not in table_headers_map:
|
|
# No table with these headers exists - keep empty table for now
|
|
table_headers_map[headers_key] = []
|
|
# If a table with these headers already exists (with or without data), skip empty table
|
|
continue
|
|
|
|
# Table has rows - add to merge map
|
|
if headers_key not in table_headers_map:
|
|
table_headers_map[headers_key] = []
|
|
table_headers_map[headers_key].append(table_content)
|
|
else:
|
|
# Keep non-table elements as is, but avoid duplicates if possible
|
|
if element not in merged_elements:
|
|
merged_elements.append(element)
|
|
except Exception as e:
|
|
logger.warning(f"Failed to parse JSON elements response from part {part.id}: {str(e)}")
|
|
continue
|
|
|
|
# Merge tables by headers - combine rows from tables with same headers
|
|
for headers_key, tables in table_headers_map.items():
|
|
if not tables:
|
|
# Only empty tables with these headers - skip them
|
|
continue
|
|
|
|
all_rows = []
|
|
|
|
for table_content in tables:
|
|
rows = table_content.get("rows", [])
|
|
all_rows.extend(rows)
|
|
|
|
# Only add table if it has rows
|
|
if all_rows:
|
|
merged_elements.append({
|
|
"type": "table",
|
|
"content": {
|
|
"headers": list(headers_key),
|
|
"rows": all_rows
|
|
}
|
|
})
|
|
|
|
return {"elements": merged_elements}
|
|
|
|
def _mergeJsonExtractionResponses(self, content_parts: List[ContentPart], originalContentParts: Optional[List[ContentPart]] = None) -> Dict[str, Any]:
|
|
"""Merge multiple JSON extraction responses into one unified response.
|
|
|
|
Merges:
|
|
- Tables: Combines all table rows, preserves headers (duplicates preserved)
|
|
- Text: Combines all text blocks
|
|
- Headings: Combines all headings arrays
|
|
- Lists: Combines all list items
|
|
- Images: Combines all image descriptions
|
|
"""
|
|
merged = {
|
|
"extracted_content": {
|
|
"text": "",
|
|
"tables": [],
|
|
"headings": [],
|
|
"lists": [],
|
|
"images": []
|
|
}
|
|
}
|
|
|
|
# Track table headers to merge tables with same structure
|
|
table_headers_map: Dict[str, List[Dict[str, Any]]] = {} # headers_tuple -> [tables]
|
|
all_text_parts = []
|
|
all_headings = []
|
|
all_lists = []
|
|
all_images = []
|
|
|
|
# Collect per-part extracted data for debug file
|
|
per_part_extracted_data = []
|
|
# Track original parts and their extracted data
|
|
original_parts_extracted_data = []
|
|
|
|
for part_idx, part in enumerate(content_parts, 1):
|
|
logger.info(f"=== Processing ContentPart {part_idx}/{len(content_parts)}: id={part.id}, label={part.label}, typeGroup={part.typeGroup} ===")
|
|
|
|
if not part.data:
|
|
logger.warning(f"ContentPart {part.id} has no data, skipping")
|
|
continue
|
|
|
|
# Handle multiple JSON blocks in a single response (separated by ---)
|
|
# Split by --- to handle multiple JSON blocks per ContentPart
|
|
partDataBlocks = part.data.split('---')
|
|
logger.debug(f"ContentPart {part.id}: Found {len(partDataBlocks)} JSON block(s) (split by ---)")
|
|
|
|
for block_idx, blockData in enumerate(partDataBlocks, 1):
|
|
if not blockData.strip():
|
|
continue
|
|
|
|
try:
|
|
# Strip markdown code fences before parsing
|
|
strippedData = stripCodeFences(blockData.strip())
|
|
if not strippedData:
|
|
logger.debug(f"ContentPart {part.id}, Block {block_idx}: Empty after stripping code fences")
|
|
continue
|
|
|
|
parsed = json.loads(strippedData)
|
|
if not isinstance(parsed, dict) or "extracted_content" not in parsed:
|
|
logger.debug(f"ContentPart {part.id}, Block {block_idx}: Not a valid extraction response format")
|
|
continue
|
|
|
|
extracted = parsed["extracted_content"]
|
|
|
|
# Find corresponding original part (if available)
|
|
original_part = None
|
|
if originalContentParts and part_idx <= len(originalContentParts):
|
|
original_part = originalContentParts[part_idx - 1]
|
|
elif originalContentParts and len(originalContentParts) > 0:
|
|
# Handle one-to-many (chunking) - use first original part
|
|
original_part = originalContentParts[0]
|
|
|
|
# Store extracted data for this part/block for debug file
|
|
part_extracted = {
|
|
"contentPartId": part.id,
|
|
"contentPartLabel": part.label,
|
|
"contentPartTypeGroup": part.typeGroup,
|
|
"blockIndex": block_idx,
|
|
"extracted_content": extracted.copy() # Store full extracted content
|
|
}
|
|
per_part_extracted_data.append(part_extracted)
|
|
|
|
# Store original part extracted data
|
|
if original_part:
|
|
# Extract text from extracted_content for display
|
|
extracted_text = extracted.get("text", "") if isinstance(extracted.get("text"), str) else ""
|
|
if not extracted_text and extracted.get("tables"):
|
|
# If no text but has tables, create a text representation
|
|
table_texts = []
|
|
for table in extracted.get("tables", []):
|
|
if isinstance(table, dict):
|
|
headers = table.get("headers", [])
|
|
rows = table.get("rows", [])
|
|
if headers and rows:
|
|
table_texts.append(f"Table: {', '.join(headers)}\nRows: {len(rows)}")
|
|
extracted_text = "\n".join(table_texts) if table_texts else ""
|
|
|
|
original_part_data = {
|
|
"id": original_part.id,
|
|
"typeGroup": original_part.typeGroup,
|
|
"mimeType": original_part.mimeType or "text/plain",
|
|
"label": original_part.label,
|
|
"dataLength": len(extracted_text),
|
|
"metadata": {
|
|
"documentId": original_part.metadata.get("documentId") if original_part.metadata else None,
|
|
"documentMimeType": original_part.metadata.get("documentMimeType") if original_part.metadata else None,
|
|
"originalFileName": original_part.metadata.get("originalFileName") if original_part.metadata else None,
|
|
},
|
|
"data": extracted_text, # Full extracted text
|
|
"extracted_content": extracted.copy() # Full extracted content structure
|
|
}
|
|
original_parts_extracted_data.append(original_part_data)
|
|
|
|
# Log extracted content summary
|
|
extracted_summary = {
|
|
"text": len(extracted.get("text", "")) if extracted.get("text") else 0,
|
|
"tables": len(extracted.get("tables", [])) if isinstance(extracted.get("tables"), list) else 0,
|
|
"headings": len(extracted.get("headings", [])) if isinstance(extracted.get("headings"), list) else 0,
|
|
"lists": len(extracted.get("lists", [])) if isinstance(extracted.get("lists"), list) else 0,
|
|
"images": len(extracted.get("images", [])) if isinstance(extracted.get("images"), list) else 0,
|
|
}
|
|
logger.info(f"ContentPart {part.id}, Block {block_idx} extracted: text={extracted_summary['text']} chars, tables={extracted_summary['tables']}, headings={extracted_summary['headings']}, lists={extracted_summary['lists']}, images={extracted_summary['images']}")
|
|
|
|
# Log table details
|
|
if extracted_summary['tables'] > 0:
|
|
for table_idx, table in enumerate(extracted.get("tables", []), 1):
|
|
if isinstance(table, dict):
|
|
headers = table.get("headers", [])
|
|
rows = table.get("rows", [])
|
|
logger.info(f" Table {table_idx}: headers={headers}, rows={len(rows) if isinstance(rows, list) else 0}")
|
|
|
|
# Log list details
|
|
if extracted_summary['lists'] > 0:
|
|
for list_idx, list_item in enumerate(extracted.get("lists", []), 1):
|
|
if isinstance(list_item, dict):
|
|
list_type = list_item.get("type", "unknown")
|
|
items = list_item.get("items", [])
|
|
logger.info(f" List {list_idx}: type={list_type}, items={len(items) if isinstance(items, list) else 0}")
|
|
|
|
# Merge text
|
|
if "text" in extracted and extracted["text"]:
|
|
text_content = extracted["text"].strip()
|
|
if text_content:
|
|
all_text_parts.append(text_content)
|
|
|
|
# Merge tables - group by headers to merge compatible tables
|
|
if "tables" in extracted and isinstance(extracted["tables"], list):
|
|
for table in extracted["tables"]:
|
|
if not isinstance(table, dict) or "headers" not in table or "rows" not in table:
|
|
continue
|
|
|
|
headers = table["headers"]
|
|
rows = table["rows"]
|
|
|
|
if not headers or not rows:
|
|
continue
|
|
|
|
# Use headers as key for grouping
|
|
headers_key = tuple(headers)
|
|
if headers_key not in table_headers_map:
|
|
table_headers_map[headers_key] = []
|
|
table_headers_map[headers_key].append(table)
|
|
|
|
# Merge headings
|
|
if "headings" in extracted and isinstance(extracted["headings"], list):
|
|
for heading in extracted["headings"]:
|
|
if isinstance(heading, dict) and "text" in heading:
|
|
all_headings.append(heading)
|
|
|
|
# Merge lists
|
|
if "lists" in extracted and isinstance(extracted["lists"], list):
|
|
for list_item in extracted["lists"]:
|
|
if isinstance(list_item, dict) and "items" in list_item:
|
|
all_lists.append(list_item)
|
|
|
|
# Merge images
|
|
if "images" in extracted and isinstance(extracted["images"], list):
|
|
for image in extracted["images"]:
|
|
if isinstance(image, dict) and "description" in image:
|
|
all_images.append(image)
|
|
|
|
except Exception as e:
|
|
logger.warning(f"Failed to parse JSON extraction response block from part {part.id}: {str(e)}")
|
|
continue
|
|
|
|
# Combine text parts
|
|
if all_text_parts:
|
|
merged["extracted_content"]["text"] = "\n\n".join(all_text_parts)
|
|
|
|
# Merge tables by headers - combine rows from tables with same headers
|
|
for headers_key, tables in table_headers_map.items():
|
|
# Collect all rows from tables with same headers
|
|
all_rows = []
|
|
|
|
for table in tables:
|
|
rows = table.get("rows", [])
|
|
all_rows.extend(rows)
|
|
|
|
# Create merged table
|
|
if all_rows:
|
|
merged["extracted_content"]["tables"].append({
|
|
"headers": list(headers_key),
|
|
"rows": all_rows
|
|
})
|
|
|
|
# Add headings
|
|
if all_headings:
|
|
merged["extracted_content"]["headings"] = all_headings
|
|
|
|
# Add lists - keep them separate (like headings) to preserve document structure
|
|
if all_lists:
|
|
merged["extracted_content"]["lists"] = all_lists
|
|
|
|
# Add images
|
|
if all_images:
|
|
merged["extracted_content"]["images"] = all_images
|
|
|
|
logger.info(f"=== Merging Summary ===")
|
|
logger.info(f"Total ContentParts processed: {len(content_parts)}")
|
|
logger.info(f"Text parts collected: {len(all_text_parts)}")
|
|
logger.info(f"Table groups (by headers): {len(table_headers_map)}")
|
|
logger.info(f"Headings collected: {len(all_headings)}")
|
|
logger.info(f"Lists collected: {len(all_lists)}")
|
|
logger.info(f"Images collected: {len(all_images)}")
|
|
|
|
# Log table merging details
|
|
for headers_key, tables in table_headers_map.items():
|
|
total_rows = sum(len(table.get("rows", [])) for table in tables)
|
|
logger.info(f" Table group with headers {list(headers_key)}: {len(tables)} table(s), {total_rows} total rows")
|
|
|
|
logger.info(f"Merged JSON extraction responses: {len(table_headers_map)} table groups, {len(all_text_parts)} text parts, {len(all_headings)} headings, {len(all_lists)} lists, {len(all_images)} images")
|
|
|
|
# Write per-part extracted data to debug file
|
|
if per_part_extracted_data and self.services and hasattr(self.services, 'utils') and hasattr(self.services.utils, 'writeDebugFile'):
|
|
try:
|
|
debug_content = {
|
|
"summary": {
|
|
"totalContentParts": len(content_parts),
|
|
"totalExtractedBlocks": len(per_part_extracted_data),
|
|
"mergedResult": {
|
|
"textParts": len(all_text_parts),
|
|
"tableGroups": len(table_headers_map),
|
|
"headings": len(all_headings),
|
|
"lists": len(all_lists),
|
|
"images": len(all_images)
|
|
}
|
|
},
|
|
"perPartExtractedData": per_part_extracted_data
|
|
}
|
|
debug_json = json.dumps(debug_content, indent=2, ensure_ascii=False)
|
|
self.services.utils.writeDebugFile(debug_json, "content_extraction_per_part")
|
|
logger.info(f"Wrote per-part extracted data to debug file: {len(per_part_extracted_data)} blocks from {len(content_parts)} content parts")
|
|
except Exception as e:
|
|
logger.warning(f"Failed to write per-part extracted data to debug file: {str(e)}")
|
|
|
|
# Write original parts extracted data in extraction_result format
|
|
if original_parts_extracted_data and self.services and hasattr(self.services, 'utils') and hasattr(self.services.utils, 'writeDebugFile'):
|
|
try:
|
|
# Get document info from first original part if available
|
|
document_name = None
|
|
document_mime_type = None
|
|
if originalContentParts and len(originalContentParts) > 0:
|
|
first_part = originalContentParts[0]
|
|
if first_part.metadata:
|
|
document_name = first_part.metadata.get("originalFileName")
|
|
document_mime_type = first_part.metadata.get("documentMimeType")
|
|
|
|
# Format similar to extraction_result file
|
|
extraction_result_format = {
|
|
"documentName": document_name or "Unknown",
|
|
"documentMimeType": document_mime_type or "application/octet-stream",
|
|
"partsCount": len(original_parts_extracted_data),
|
|
"parts": []
|
|
}
|
|
|
|
for part_data in original_parts_extracted_data:
|
|
# Format each part similar to extraction_result format
|
|
formatted_part = {
|
|
"typeGroup": part_data["typeGroup"],
|
|
"mimeType": part_data["mimeType"],
|
|
"label": part_data["label"],
|
|
"dataLength": part_data["dataLength"],
|
|
"metadata": part_data["metadata"],
|
|
"data": part_data["data"], # Full extracted text
|
|
"extracted_content": part_data["extracted_content"] # Full structure
|
|
}
|
|
extraction_result_format["parts"].append(formatted_part)
|
|
|
|
result_json = json.dumps(extraction_result_format, indent=2, ensure_ascii=False)
|
|
self.services.utils.writeDebugFile(result_json, "content_extraction_original_parts")
|
|
logger.info(f"Wrote original parts extracted data to debug file: {len(original_parts_extracted_data)} original parts")
|
|
except Exception as e:
|
|
logger.warning(f"Failed to write original parts extracted data to debug file: {str(e)}")
|
|
|
|
return merged
|
|
|
|
async def chunkContentPartForAi(self, contentPart, model, options, prompt: str = "") -> List[Dict[str, Any]]:
|
|
"""Chunk a content part based on model capabilities, accounting for prompt, system message overhead, and maxTokens output.
|
|
|
|
Moved from interfaceAiObjects.py - model-aware chunking for AI processing.
|
|
Complementary to existing size-based chunking in extraction pipeline.
|
|
"""
|
|
# Calculate model-specific chunk sizes
|
|
modelContextTokens = model.contextLength # Total context in tokens
|
|
modelMaxOutputTokens = model.maxTokens # Maximum output tokens
|
|
|
|
# CRITICAL: Use same conservative token factor as in processContentPartWithFallback
|
|
# Real-world observation: Our calculation says 94k tokens, but API says 217k tokens (2.3x difference!)
|
|
TOKEN_SAFETY_FACTOR = 2.2 # Conservative: accounts for JSON tokenization and API overhead
|
|
|
|
# Reserve tokens for:
|
|
# 1. Prompt (user message) - use conservative factor
|
|
promptSize = len(prompt.encode('utf-8')) if prompt else 0
|
|
promptTokens = promptSize / TOKEN_SAFETY_FACTOR
|
|
|
|
# 2. System message wrapper ("Context from documents:\n")
|
|
systemMessageTokens = 10 # ~40 bytes = 10 tokens
|
|
|
|
# 3. Max output tokens (model will reserve space for completion)
|
|
outputTokens = modelMaxOutputTokens
|
|
|
|
# 4. JSON structure and message overhead (~100 tokens)
|
|
messageOverheadTokens = 100
|
|
|
|
# Total reserved tokens = input overhead + output reservation
|
|
totalReservedTokens = promptTokens + systemMessageTokens + messageOverheadTokens + outputTokens
|
|
|
|
# Available tokens for content = context length - reserved tokens
|
|
# Use 60% of available (same conservative margin as in processContentPartWithFallback)
|
|
availableContentTokens = int((modelContextTokens - totalReservedTokens) * 0.60)
|
|
|
|
# Ensure we have at least some space
|
|
if availableContentTokens < 100:
|
|
logger.warning(f"Very limited space for content: {availableContentTokens} tokens available. Model: {model.name}, contextLength: {modelContextTokens}, maxTokens: {modelMaxOutputTokens}, prompt: {promptTokens:.0f} tokens")
|
|
availableContentTokens = max(100, int(modelContextTokens * 0.1)) # Fallback to 10% of context
|
|
|
|
# Convert tokens to bytes using conservative factor (reverse: bytes = tokens * factor)
|
|
availableContentBytes = int(availableContentTokens * TOKEN_SAFETY_FACTOR)
|
|
|
|
logger.info(f"Chunking calculation for {model.name}: contextLength={modelContextTokens} tokens, maxTokens={modelMaxOutputTokens} tokens, prompt={promptTokens:.0f} tokens est., reserved={totalReservedTokens:.0f} tokens est., available={availableContentTokens} tokens est. ({availableContentBytes} bytes), factor={TOKEN_SAFETY_FACTOR}")
|
|
|
|
# Use 50% of available content bytes for text chunks (very conservative to ensure chunks fit)
|
|
# This ensures that even with token counting inaccuracies, chunks will fit
|
|
textChunkSize = int(availableContentBytes * 0.5)
|
|
structureChunkSize = int(availableContentBytes * 0.5) # CRITICAL: Also set for StructureChunker (JSON content)
|
|
tableChunkSize = int(availableContentBytes * 0.5) # Also set for TableChunker
|
|
imageChunkSize = int(availableContentBytes * 0.6) # 60% for image chunks
|
|
|
|
# Build chunking options - include ALL chunk size options for different chunkers
|
|
chunkingOptions = {
|
|
"textChunkSize": textChunkSize,
|
|
"structureChunkSize": structureChunkSize, # CRITICAL: Required for StructureChunker (JSON)
|
|
"tableChunkSize": tableChunkSize, # Required for TableChunker
|
|
"imageChunkSize": imageChunkSize,
|
|
"maxSize": availableContentBytes,
|
|
"chunkAllowed": True
|
|
}
|
|
|
|
logger.info(f"Chunking options: textChunkSize={textChunkSize} bytes, structureChunkSize={structureChunkSize} bytes, tableChunkSize={tableChunkSize} bytes, imageChunkSize={imageChunkSize} bytes, contentPartSize={len(contentPart.data.encode('utf-8')) if contentPart.data else 0} bytes")
|
|
|
|
# Get appropriate chunker (uses existing ChunkerRegistry ✅)
|
|
chunker = self._chunkerRegistry.resolve(contentPart.typeGroup)
|
|
|
|
if not chunker:
|
|
logger.warning(f"No chunker found for typeGroup: {contentPart.typeGroup}")
|
|
return []
|
|
|
|
# Chunk the content part
|
|
try:
|
|
contentSize = len(contentPart.data.encode('utf-8')) if contentPart.data else 0
|
|
logger.info(f"Chunking {contentPart.typeGroup} part: contentSize={contentSize} bytes, textChunkSize={textChunkSize} bytes, structureChunkSize={structureChunkSize} bytes")
|
|
chunks = chunker.chunk(contentPart, chunkingOptions)
|
|
logger.info(f"Created {len(chunks)} chunks for {contentPart.typeGroup} part (contentSize={contentSize} bytes)")
|
|
if chunks:
|
|
for i, chunk in enumerate(chunks):
|
|
chunkSize = len(chunk.get('data', '').encode('utf-8')) if chunk.get('data') else 0
|
|
logger.info(f" Chunk {i+1}/{len(chunks)}: {chunkSize} bytes")
|
|
return chunks
|
|
except Exception as e:
|
|
logger.error(f"Chunking failed for {contentPart.typeGroup}: {str(e)}")
|
|
return []
|
|
|
|
async def processContentPartWithFallback(self, contentPart, prompt: str, options, failoverModelList, aiObjects, progressCallback=None) -> AiCallResponse:
|
|
"""Process a single content part with model-aware chunking and fallback.
|
|
|
|
Moved from interfaceAiObjects.py - orchestrates chunking and merging.
|
|
Calls aiObjects._callWithModel() for actual AI calls.
|
|
"""
|
|
lastError = None
|
|
|
|
# Check if this is an image - Vision models need special handling
|
|
isImage = (contentPart.typeGroup == "image") or (contentPart.mimeType and contentPart.mimeType.startswith("image/"))
|
|
|
|
# Determine the correct operation type based on content type
|
|
actualOperationType = options.operationType
|
|
if isImage:
|
|
actualOperationType = OperationTypeEnum.IMAGE_ANALYSE
|
|
# Get vision-capable models for images
|
|
availableModels = modelRegistry.getAvailableModels()
|
|
visionFailoverList = modelSelector.getFailoverModelList(prompt, "", AiCallOptions(operationType=actualOperationType), availableModels)
|
|
if visionFailoverList:
|
|
logger.debug(f"Using {len(visionFailoverList)} vision-capable models for image processing")
|
|
failoverModelList = visionFailoverList
|
|
|
|
for attempt, model in enumerate(failoverModelList):
|
|
try:
|
|
logger.info(f"Processing content part with model: {model.name} (attempt {attempt + 1}/{len(failoverModelList)})")
|
|
|
|
# Special handling for images with Vision models
|
|
if isImage and hasattr(model, 'functionCall'):
|
|
try:
|
|
if not contentPart.data:
|
|
raise ValueError("Image content part has no data")
|
|
|
|
mimeType = contentPart.mimeType or "image/jpeg"
|
|
if not mimeType.startswith("image/"):
|
|
raise ValueError(f"Invalid mimeType for image: {mimeType}")
|
|
|
|
# Prepare base64 data
|
|
if isinstance(contentPart.data, str):
|
|
try:
|
|
base64.b64decode(contentPart.data, validate=True)
|
|
base64Data = contentPart.data
|
|
except Exception as e:
|
|
raise ValueError(f"Invalid base64 data in contentPart: {str(e)}")
|
|
elif isinstance(contentPart.data, bytes):
|
|
base64Data = base64.b64encode(contentPart.data).decode('utf-8')
|
|
else:
|
|
raise ValueError(f"Unsupported data type for image: {type(contentPart.data)}")
|
|
|
|
imageDataUrl = f"data:{mimeType};base64,{base64Data}"
|
|
|
|
modelCall = AiModelCall(
|
|
messages=[
|
|
{
|
|
"role": "user",
|
|
"content": [
|
|
{"type": "text", "text": prompt or ""},
|
|
{
|
|
"type": "image_url",
|
|
"image_url": {"url": imageDataUrl}
|
|
}
|
|
]
|
|
}
|
|
],
|
|
model=model,
|
|
options=AiCallOptions(operationType=actualOperationType)
|
|
)
|
|
|
|
modelResponse = await model.functionCall(modelCall)
|
|
|
|
if not modelResponse.success:
|
|
raise ValueError(f"Model call failed: {modelResponse.error}")
|
|
|
|
logger.info(f"✅ Image content part processed successfully with model: {model.name}")
|
|
|
|
processingTime = getattr(modelResponse, 'processingTime', None) or 0.0
|
|
|
|
return AiCallResponse(
|
|
content=modelResponse.content,
|
|
modelName=model.name,
|
|
priceUsd=0.0,
|
|
processingTime=processingTime,
|
|
bytesSent=0,
|
|
bytesReceived=0,
|
|
errorCount=0
|
|
)
|
|
except Exception as e:
|
|
lastError = e
|
|
logger.warning(f"❌ Image processing failed with model {model.name}: {str(e)}")
|
|
|
|
if attempt < len(failoverModelList) - 1:
|
|
logger.info(f"🔄 Trying next fallback model for image processing...")
|
|
continue
|
|
else:
|
|
logger.error(f"💥 All {len(failoverModelList)} models failed for image processing")
|
|
raise
|
|
|
|
# For non-image parts, check if part fits in model context
|
|
partSize = len(contentPart.data.encode('utf-8')) if contentPart.data else 0
|
|
|
|
modelContextTokens = model.contextLength
|
|
modelMaxOutputTokens = model.maxTokens
|
|
|
|
promptTokens = len(prompt.encode('utf-8')) / 4 if prompt else 0
|
|
systemMessageTokens = 10
|
|
outputTokens = modelMaxOutputTokens
|
|
messageOverheadTokens = 100
|
|
totalReservedTokens = promptTokens + systemMessageTokens + messageOverheadTokens + outputTokens
|
|
|
|
availableContentTokens = int((modelContextTokens - totalReservedTokens) * 0.8)
|
|
if availableContentTokens < 100:
|
|
availableContentTokens = max(100, int(modelContextTokens * 0.1))
|
|
|
|
availableContentBytes = availableContentTokens * 4
|
|
|
|
# Also check prompt size - prompt + content together must fit
|
|
promptSize = len(prompt.encode('utf-8')) if prompt else 0
|
|
|
|
# CRITICAL: Token counting approximation is VERY inaccurate for JSON/content
|
|
# Real-world observation: Our calculation says 94k tokens, but API says 217k tokens (2.3x difference!)
|
|
# This happens because:
|
|
# 1. JSON/structured content tokenizes differently (more tokens per byte)
|
|
# 2. API has message structure overhead (system prompts, message wrappers)
|
|
# 3. Tokenizer differences between our approximation and actual API tokenizer
|
|
# Use conservative factor: 1 token ≈ 2.2 bytes (instead of 4) to account for these differences
|
|
TOKEN_SAFETY_FACTOR = 2.2 # Conservative: accounts for JSON tokenization and API overhead
|
|
promptTokens = promptSize / TOKEN_SAFETY_FACTOR
|
|
contentTokens = partSize / TOKEN_SAFETY_FACTOR
|
|
totalTokens = promptTokens + contentTokens
|
|
|
|
# CRITICAL: Use very conservative margin (60%) because:
|
|
# 1. Token counting approximation is inaccurate - real tokens can be 2-3x more
|
|
# 2. API has additional overhead (message structure, system prompts, etc.)
|
|
# 3. Anthropic API is strict about the 200k limit
|
|
# 4. We've seen cases where our calculation says "fits" but API says "too long"
|
|
maxTotalTokens = int(modelContextTokens * 0.60)

                logger.info(f"Size check for {model.name}: partSize={partSize} bytes ({contentTokens:.0f} tokens est.), promptSize={promptSize} bytes ({promptTokens:.0f} tokens est.), total={totalTokens:.0f} tokens est., modelContext={modelContextTokens} tokens, maxTotal={maxTotalTokens} tokens (60% margin, conservative factor={TOKEN_SAFETY_FACTOR})")

                # CRITICAL: Always check totalTokens first - if prompt + content exceeds limit, MUST chunk
                # Token counting approximation may differ significantly from API, so use very conservative margin
                if totalTokens > maxTotalTokens:
                    logger.warning(f"⚠️ Total tokens ({totalTokens:.0f} est.) exceed model limit ({maxTotalTokens}), chunking required. Prompt: {promptTokens:.0f} tokens est., Content: {contentTokens:.0f} tokens est.")
                elif partSize > availableContentBytes:
                    logger.warning(f"⚠️ Content part ({contentTokens:.0f} tokens est.) exceeds available space ({availableContentBytes/TOKEN_SAFETY_FACTOR:.0f} tokens est.), chunking required")

                # If either condition fails, chunk the content
                # CRITICAL: IMAGE_GENERATE operations should NOT use chunking - they generate images from prompts, not process content chunks
                if (totalTokens > maxTotalTokens or partSize > availableContentBytes) and options.operationType != OperationTypeEnum.IMAGE_GENERATE:
                    # Part too large or total exceeds limit - chunk it (but not for image generation)
                    chunks = await self.chunkContentPartForAi(contentPart, model, options, prompt)
                    if not chunks:
                        raise ValueError(f"Failed to chunk content part for model {model.name}")

                    logger.info(f"Starting to process {len(chunks)} chunks with model {model.name}")

                    if progressCallback:
                        progressCallback(0.0, f"Starting to process {len(chunks)} chunks")

                    chunkResults = []
                    for idx, chunk in enumerate(chunks):
                        chunkNum = idx + 1
                        chunkData = chunk.get('data', '')
                        logger.info(f"Processing chunk {chunkNum}/{len(chunks)} with model {model.name}")

                        if progressCallback:
                            progressCallback(chunkNum / len(chunks), f"Processing chunk {chunkNum}/{len(chunks)}")

                        try:
                            chunkResponse = await aiObjects._callWithModel(model, prompt, chunkData, options)
                            chunkResults.append(chunkResponse)
                        except Exception as chunkError:
                            logger.error(f"Error processing chunk {chunkNum}/{len(chunks)}: {str(chunkError)}")
                            # Continue with other chunks even if one fails
                            continue

                    # Merge chunk results
                    if not chunkResults:
                        raise ValueError(f"All chunks failed for content part")

                    # Pass original contentPart to preserve typeGroup for all chunks (one-to-many: 1 part -> N chunks)
                    mergedContent = self.mergePartResults(chunkResults, options, [contentPart])
                    return AiCallResponse(
                        content=mergedContent,
                        modelName=model.name,
                        priceUsd=sum(r.priceUsd for r in chunkResults),
                        processingTime=sum(r.processingTime for r in chunkResults),
                        bytesSent=sum(r.bytesSent for r in chunkResults),
                        bytesReceived=sum(r.bytesReceived for r in chunkResults),
                        errorCount=sum(r.errorCount for r in chunkResults)
                    )
                else:
                    # Part fits - call AI directly via aiObjects interface
                    logger.info(f"✅ Content part fits within model limits, processing directly")
                    response = await aiObjects._callWithModel(model, prompt, contentPart.data, options)
                    logger.info(f"✅ Content part processed successfully with model: {model.name}")
                    return response
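                # NOTE: Both branches above return, so the duplicate chunking logic below is not
                # reached on this path; it mirrors the chunking branch above.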
                chunks = await self.chunkContentPartForAi(contentPart, model, options, prompt)
                if not chunks:
                    raise ValueError(f"Failed to chunk content part for model {model.name}")

                logger.info(f"Starting to process {len(chunks)} chunks with model {model.name}")

                if progressCallback:
                    progressCallback(0.0, f"Starting to process {len(chunks)} chunks")

                chunkResults = []
                for idx, chunk in enumerate(chunks):
                    chunkNum = idx + 1
                    chunkData = chunk.get('data', '')
                    logger.info(f"Processing chunk {chunkNum}/{len(chunks)} with model {model.name}")

                    if progressCallback:
                        progressCallback(chunkNum / len(chunks), f"Processing chunk {chunkNum}/{len(chunks)}")

                    try:
                        chunkResponse = await aiObjects._callWithModel(model, prompt, chunkData, options)
                        chunkResults.append(chunkResponse)
                        logger.info(f"✅ Chunk {chunkNum}/{len(chunks)} processed successfully")

                        if progressCallback:
                            progressCallback(chunkNum / len(chunks), f"Chunk {chunkNum}/{len(chunks)} processed")
                    except Exception as e:
                        logger.error(f"❌ Error processing chunk {chunkNum}/{len(chunks)}: {str(e)}")
                        raise

                # Merge chunk results using unified mergePartResults
                # Pass original contentPart to preserve typeGroup for all chunks (one-to-many: 1 part -> N chunks)
                mergedContent = self.mergePartResults(chunkResults, options, [contentPart])

                logger.info(f"✅ Content part chunked and processed with model: {model.name} ({len(chunks)} chunks)")
                return AiCallResponse(
                    content=mergedContent,
                    modelName=model.name,
                    priceUsd=sum(r.priceUsd for r in chunkResults),
                    processingTime=sum(r.processingTime for r in chunkResults),
                    bytesSent=sum(r.bytesSent for r in chunkResults),
                    bytesReceived=sum(r.bytesReceived for r in chunkResults),
                    errorCount=sum(r.errorCount for r in chunkResults)
                )
            except Exception as e:
                lastError = e
                error_msg = str(e) if str(e) else f"{type(e).__name__}"
                logger.warning(f"❌ Model {model.name} failed for content part: {error_msg}", exc_info=True)

                if attempt < len(failoverModelList) - 1:
                    logger.info(f"🔄 Trying next failover model...")
                    continue
                else:
                    logger.error(f"💥 All {len(failoverModelList)} models failed for content part")
                    break

        # All models failed
        return self._createErrorResponse(f"All models failed: {str(lastError)}", 0, 0)

    def _createErrorResponse(self, errorMsg: str, inputBytes: int, outputBytes: int) -> AiCallResponse:
        """Create an error response."""
        return AiCallResponse(
            content=errorMsg,
            modelName="error",
            priceUsd=0.0,
            processingTime=0.0,
            bytesSent=inputBytes,
            bytesReceived=outputBytes,
            errorCount=1
        )

    async def processContentPartsWithAi(
        self,
        request: AiCallRequest,
        aiObjects,  # Pass interface for AI calls
        progressCallback=None
    ) -> AiCallResponse:
        """Process content parts with model-aware chunking and AI calls in parallel.

        Moved from interfaceAiObjects.callWithContentParts() - entry point for content parts processing.
        Uses parallel processing similar to section generation for better performance.

        SPECIAL CASE: For DATA_EXTRACT operations, processes all contentParts together in ONE call
        to enable proper merging (e.g., merging tables from multiple PDFs into one table).
        """
        prompt = request.prompt
        options = request.options
        contentParts = request.contentParts

        # Get failover models
        availableModels = modelRegistry.getAvailableModels()
        failoverModelList = modelSelector.getFailoverModelList(prompt, "", options, availableModels)

        if not failoverModelList:
            return self._createErrorResponse("No suitable models found", 0, 0)

        totalParts = len(contentParts)
        if totalParts == 0:
            return self._createErrorResponse("No content parts to process", 0, 0)

        # NOTE: For DATA_EXTRACT operations, the extraction prompt explicitly asks the AI to merge
        # all contentParts into ONE unified JSON response. Even though we process parts separately,
        # each response should contain merged content. The mergePartResults will concatenate responses,
        # but the new prompt format (flat extracted_content structure) is designed for easier merging.

        # DEFAULT: Process parts in parallel
        # Thread-safe counter for progress tracking
        completedCount = [0]  # Use list to allow modification in nested function

        # Process parts in parallel with concurrency control
        maxConcurrent = 5
        if options and hasattr(options, 'maxConcurrentParts'):
            maxConcurrent = options.maxConcurrentParts

        semaphore = asyncio.Semaphore(maxConcurrent)
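        # The semaphore caps in-flight part processing at maxConcurrent (default 5, overridable
        # via options.maxConcurrentParts); each processSinglePart below acquires it before calling the AI.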

        async def processSinglePart(contentPart, partIndex: int) -> AiCallResponse:
            """Process a single content part with progress logging."""
            async with semaphore:
                partLabel = contentPart.label or f"Part {partIndex+1}"
                partType = contentPart.typeGroup or "unknown"

                # Log start of processing
                if progressCallback:
                    progressCallback(0.1 + (partIndex / totalParts) * 0.8, f"Processing {partLabel} ({partType}) - {partIndex+1}/{totalParts}")

                try:
                    # Process the part
                    partResult = await self.processContentPartWithFallback(
                        contentPart, prompt, options, failoverModelList, aiObjects, None  # Don't pass progressCallback to avoid double logging
                    )

                    # Write debug files for generation phase (section content generation)
                    # Check for DATA_GENERATE or DATA_ANALYSE (used for section generation)
                    isGenerationPhase = False
                    if options and hasattr(options, 'operationType'):
                        isGenerationPhase = (options.operationType == OperationTypeEnum.DATA_GENERATE or
                                             options.operationType == OperationTypeEnum.DATA_ANALYSE)

                    if isGenerationPhase:
                        if self.services and hasattr(self.services, 'utils') and hasattr(self.services.utils, 'writeDebugFile'):
                            try:
                                # Create debug filename with contentPart ID or label
                                partId = contentPart.id[:8] if contentPart.id else f"part_{partIndex+1}"
                                partLabelSafe = (contentPart.label or f"part_{partIndex+1}").replace(" ", "_").replace("/", "_").replace("\\", "_")[:30]
                                debugPrefix = f"generation_contentPart_{partId}_{partLabelSafe}"

                                # Write prompt
                                self.services.utils.writeDebugFile(prompt, f"{debugPrefix}_prompt")

                                # Write response
                                responseContent = partResult.content if partResult.content else ""
                                self.services.utils.writeDebugFile(responseContent, f"{debugPrefix}_response")

                                logger.debug(f"Wrote debug files for contentPart {partId} (generation): {debugPrefix}_prompt, {debugPrefix}_response")
                            except Exception as debugError:
                                logger.warning(f"Failed to write debug file for contentPart {contentPart.id}: {str(debugError)}")

                    # Update completed count and log progress
                    completedCount[0] += 1
                    if progressCallback:
                        progressCallback(0.1 + (completedCount[0] / totalParts) * 0.8, f"Completed {partLabel} ({partType}) - {completedCount[0]}/{totalParts}")

                    return partResult
                except Exception as e:
                    # Update completed count even on error
                    completedCount[0] += 1
                    logger.error(f"Error processing part {partIndex+1} ({partLabel}): {str(e)}")
                    if progressCallback:
                        progressCallback(0.1 + (completedCount[0] / totalParts) * 0.8, f"Error processing {partLabel} ({partType}) - {completedCount[0]}/{totalParts}")
                    # Return error response
                    return self._createErrorResponse(f"Error processing part: {str(e)}", 0, 0)

        # Create tasks for all parts
        tasks = [processSinglePart(contentPart, i) for i, contentPart in enumerate(contentParts)]

        # Execute all tasks in parallel with error handling
        results = await asyncio.gather(*tasks, return_exceptions=True)

        # Process results and handle exceptions
        allResults = []
        for i, result in enumerate(results):
            if isinstance(result, Exception):
                logger.error(f"Exception processing part {i+1}: {str(result)}")
                allResults.append(self._createErrorResponse(f"Exception: {str(result)}", 0, 0))
            elif result is not None:
                allResults.append(result)

        # Merge all results using unified mergePartResults, preserving original typeGroup
        mergedContent = self.mergePartResults(allResults, options, contentParts)

        return AiCallResponse(
            content=mergedContent,
            modelName="multiple",
            priceUsd=sum(r.priceUsd for r in allResults),
            processingTime=sum(r.processingTime for r in allResults),
            bytesSent=sum(r.bytesSent for r in allResults),
            bytesReceived=sum(r.bytesReceived for r in allResults),
            errorCount=sum(r.errorCount for r in allResults)
        )


# Module-level function for use by subPipeline and ExtractionService
def applyMerging(parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
    """Apply merging strategy to parts with intelligent token-aware merging.

    Moved from interfaceAiObjects.py to resolve dependency violations.
    Can be used as module-level function or called from ExtractionService methods.
    """
    logger.debug(f"applyMerging called with {len(parts)} parts")

    # Import merging dependencies (now local imports ✅)
    from .merging.mergerText import TextMerger
    from .merging.mergerTable import TableMerger
    from .merging.mergerDefault import DefaultMerger
    from .subMerger import IntelligentTokenAwareMerger

    # Check if intelligent merging is enabled
    if strategy.useIntelligentMerging:
        modelCapabilities = strategy.capabilities or {}
        subMerger = IntelligentTokenAwareMerger(modelCapabilities)

        # Use intelligent merging for all parts
        merged = subMerger.mergeChunksIntelligently(parts, strategy.prompt or "")

        # Calculate and log optimization stats
        stats = subMerger.calculateOptimizationStats(parts, merged)
        logger.info(f"🧠 Intelligent merging stats: {stats}")
        logger.debug(f"Intelligent merging: {stats['original_ai_calls']} → {stats['optimized_ai_calls']} calls ({stats['reduction_percent']}% reduction)")

        return merged

    # Fallback to traditional merging
    textMerger = TextMerger()
    tableMerger = TableMerger()
    defaultMerger = DefaultMerger()

    # Group by typeGroup
    textParts = [p for p in parts if p.typeGroup == "text"]
    tableParts = [p for p in parts if p.typeGroup == "table"]
    structureParts = [p for p in parts if p.typeGroup == "structure"]
    otherParts = [p for p in parts if p.typeGroup not in ("text", "table", "structure")]

    logger.debug(f"Grouped - text: {len(textParts)}, table: {len(tableParts)}, structure: {len(structureParts)}, other: {len(otherParts)}")

    merged: List[ContentPart] = []

    if textParts:
        textMerged = textMerger.merge(textParts, strategy)
        logger.debug(f"TextMerger merged {len(textParts)} parts into {len(textMerged)} parts")
        merged.extend(textMerged)
    if tableParts:
        tableMerged = tableMerger.merge(tableParts, strategy)
        logger.debug(f"TableMerger merged {len(tableParts)} parts into {len(tableMerged)} parts")
        merged.extend(tableMerged)
    if structureParts:
        # For now, treat structure like text
        structureMerged = textMerger.merge(structureParts, strategy)
        logger.debug(f"StructureMerger merged {len(structureParts)} parts into {len(structureMerged)} parts")
        merged.extend(structureMerged)
    if otherParts:
        otherMerged = defaultMerger.merge(otherParts, strategy)
        logger.debug(f"DefaultMerger merged {len(otherParts)} parts into {len(otherMerged)} parts")
        merged.extend(otherMerged)

    logger.debug(f"applyMerging returning {len(merged)} parts")
    return merged