# gateway/modules/services/serviceAi/subDocumentProcessing.py


import logging
from typing import Dict, Any, List, Optional, Tuple, Union
from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, ModelCapabilities, OperationType, Priority
from modules.datamodels.datamodelExtraction import ChunkResult, ContentExtracted
from modules.services.serviceExtraction.mainServiceExtraction import ExtractionService

logger = logging.getLogger(__name__)


class SubDocumentProcessing:
    """Document processing operations including chunking, processing, and merging."""

    def __init__(self, services, aiObjects):
        """Initialize the document processing service.

        Args:
            services: Service center instance for accessing other services
            aiObjects: Initialized AiObjects instance
        """
        self.services = services
        self.aiObjects = aiObjects
        self._extractionService = None

    @property
    def extractionService(self):
        """Lazily initialize the extraction service."""
        if self._extractionService is None:
            logger.info("Lazy initializing ExtractionService...")
            self._extractionService = ExtractionService(self.services)
        return self._extractionService

    def _calculateMaxContextBytes(self, options: Optional[AiCallOptions]) -> int:
        """Calculate the maximum context size in bytes from model capabilities and options."""
        if options and options.maxContextBytes:
            return options.maxContextBytes
        # Default model capabilities (should eventually come from an actual model registry)
        defaultMaxTokens = 4000
        safetyMargin = options.safetyMargin if options else 0.1
        # Convert tokens to bytes (estimate: ~4 characters per token)
        maxContextBytes = int(defaultMaxTokens * (1 - safetyMargin) * 4)
        return maxContextBytes
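
    # Worked example for the default path (no options): defaultMaxTokens = 4000
    # and safetyMargin = 0.1 give int(4000 * 0.9 * 4) = 14,400 context bytes.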

    async def processDocumentsPerChunk(
        self,
        documents: List[ChatDocument],
        prompt: str,
        options: Optional[AiCallOptions] = None
    ) -> str:
        """
        Process documents with per-chunk AI calls and merge the results.

        Preserves chunk relationships and document structure.

        Args:
            documents: List of ChatDocument objects to process
            prompt: AI prompt for processing
            options: AI call options

        Returns:
            Merged AI results as a string with preserved document structure
        """
        if not documents:
            return ""
        # Get model capabilities for size calculation
        model_capabilities = self._getModelCapabilitiesForContent(prompt, documents, options)
        # Build extraction options for chunking with intelligent merging
        extractionOptions: Dict[str, Any] = {
            "prompt": prompt,
            "operationType": options.operationType if options else "general",
            "processDocumentsIndividually": True,  # Process each document separately
            "maxSize": model_capabilities["maxContextBytes"],
            "chunkAllowed": True,
            "textChunkSize": model_capabilities["textChunkSize"],
            "imageChunkSize": model_capabilities["imageChunkSize"],
            "imageMaxPixels": 1024 * 1024,
            "imageQuality": 85,
            "mergeStrategy": {
                "useIntelligentMerging": True,  # Enable intelligent token-aware merging
                "modelCapabilities": model_capabilities,
                "prompt": prompt,
                "groupBy": "typeGroup",
                "orderBy": "id",
                "mergeType": "concatenate"
            },
        }
        logger.debug(f"Per-chunk extraction options: prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}")
        try:
            # Extract content with chunking
            extractionResult = self.extractionService.extractContent(documents, extractionOptions)
            if not isinstance(extractionResult, list):
                return "[Error: No extraction results]"
            # Process chunks with proper index-to-document mapping
            chunkResults = await self._processChunksWithMapping(extractionResult, prompt, options)
            # Merge with preserved chunk relationships
            mergedContent = self._mergeChunkResults(chunkResults, options)
            return mergedContent
        except Exception as e:
            logger.error(f"Error in per-chunk processing: {str(e)}")
            return f"[Error in per-chunk processing: {str(e)}]"

    async def processDocumentsPerChunkJson(
        self,
        documents: List[ChatDocument],
        prompt: str,
        options: Optional[AiCallOptions] = None
    ) -> Dict[str, Any]:
        """
        Process documents with per-chunk AI calls and merge the results in JSON mode.

        Returns a structured JSON document instead of text.
        """
        if not documents:
            return {"metadata": {"title": "Empty Document"}, "sections": []}
        # Get model capabilities for size calculation
        model_capabilities = self._getModelCapabilitiesForContent(prompt, documents, options)
        # Build extraction options for chunking with intelligent merging
        extractionOptions: Dict[str, Any] = {
            "prompt": prompt,
            "operationType": options.operationType if options else "general",
            "processDocumentsIndividually": True,  # Process each document separately
            "maxSize": model_capabilities["maxContextBytes"],
            "chunkAllowed": True,
            "textChunkSize": model_capabilities["textChunkSize"],
            "imageChunkSize": model_capabilities["imageChunkSize"],
            "imageMaxPixels": 1024 * 1024,
            "imageQuality": 85,
            "mergeStrategy": {
                "useIntelligentMerging": True,  # Enable intelligent token-aware merging
                "modelCapabilities": model_capabilities,
                "prompt": prompt,
                "groupBy": "typeGroup",
                "orderBy": "id",
                "mergeType": "concatenate"
            },
        }
        logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}")
        try:
            # Extract content with chunking
            extractionResult = self.extractionService.extractContent(documents, extractionOptions)
            if not isinstance(extractionResult, list):
                return {"metadata": {"title": "Error Document"}, "sections": []}
            # Process chunks with proper mapping
            chunkResults = await self._processChunksWithMapping(extractionResult, prompt, options, generate_json=True)
            # Merge in JSON mode
            mergedJsonDocument = self._mergeChunkResultsJson(chunkResults, options)
            return mergedJsonDocument
        except Exception as e:
            logger.error(f"Error in per-chunk processing (JSON mode): {str(e)}")
            return {"metadata": {"title": "Error Document"}, "sections": []}

    async def processDocumentsPerChunkJsonWithPrompt(
        self,
        documents: List[ChatDocument],
        custom_prompt: str,
        options: Optional[AiCallOptions] = None
    ) -> Dict[str, Any]:
        """
        Process documents with per-chunk AI calls and merge the results in JSON mode.

        Uses a custom prompt instead of the default extraction prompt.
        """
        if not documents:
            return {"metadata": {"title": "Empty Document"}, "sections": []}
        # Get model capabilities for size calculation
        model_capabilities = self._getModelCapabilitiesForContent(custom_prompt, documents, options)
        # Build extraction options for chunking with intelligent merging
        extractionOptions: Dict[str, Any] = {
            "prompt": custom_prompt,  # Use the custom prompt instead of the default
            "operationType": options.operationType if options else "general",
            "processDocumentsIndividually": True,  # Process each document separately
            "maxSize": model_capabilities["maxContextBytes"],
            "chunkAllowed": True,
            "textChunkSize": model_capabilities["textChunkSize"],
            "imageChunkSize": model_capabilities["imageChunkSize"],
            "imageMaxPixels": 1024 * 1024,
            "imageQuality": 85,
            "mergeStrategy": {
                "useIntelligentMerging": True,  # Enable intelligent token-aware merging
                "modelCapabilities": model_capabilities,
                "prompt": custom_prompt,  # Use the custom prompt
                "groupBy": "typeGroup",
                "orderBy": "id",
                "mergeType": "concatenate"
            },
        }
        logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}")
        try:
            # Extract content with chunking
            extractionResult = self.extractionService.extractContent(documents, extractionOptions)
            if not isinstance(extractionResult, list):
                return {"metadata": {"title": "Error Document"}, "sections": []}
            # Process chunks with proper mapping
            logger.info(f"Processing {len(extractionResult)} chunks with custom prompt")
            logger.debug(f"Custom prompt preview: {custom_prompt[:200]}...")
            # Debug: show which parts carry no data (before filtering)
            for ec in extractionResult:
                if hasattr(ec, 'parts'):
                    for j, part in enumerate(ec.parts):
                        if not (hasattr(part, 'data') and part.data):
                            # Empty container chunks are expected metadata-only parts
                            part_type = getattr(part, 'typeGroup', None)
                            part_mime = getattr(part, 'mimeType', '')
                            is_empty_container = (
                                part_type == "container" and
                                part_mime and
                                'document' in part_mime.lower()
                            )
                            if not is_empty_container:
                                logger.warning(f"Part {j} has no data - typeGroup='{part_type}', mimeType='{part_mime}'")
            chunkResults = await self._processChunksWithMapping(extractionResult, custom_prompt, options, generate_json=True)
            # Debug: show how many chunks survived filtering
            logger.info(f"After filtering: {len(chunkResults)} chunks will be processed")
            # Merge in JSON mode
            mergedJsonDocument = self._mergeChunkResultsJson(chunkResults, options)
            # Debug: show what the AI actually returned
            logger.info(f"AI returned document with keys: {list(mergedJsonDocument.keys())}")
            if 'documents' in mergedJsonDocument:
                logger.info(f"Number of documents: {len(mergedJsonDocument['documents'])}")
            elif 'sections' in mergedJsonDocument:
                logger.info(f"Number of sections: {len(mergedJsonDocument['sections'])}")
            return mergedJsonDocument
        except Exception as e:
            logger.error(f"Error in per-chunk JSON processing: {str(e)}")
            return {"metadata": {"title": "Error Document"}, "sections": []}

    async def callAiText(
        self,
        prompt: str,
        documents: Optional[List[ChatDocument]],
        options: AiCallOptions
    ) -> str:
        """
        Handle text calls with document processing through ExtractionService.

        Always uses per-chunk processing so that MIME-type checking, chunk
        mapping, and parallel processing are applied consistently.
        """
        return await self.processDocumentsPerChunk(documents, prompt, options)

    async def _processChunksWithMapping(
        self,
        extractionResult: List[ContentExtracted],
        prompt: str,
        options: Optional[AiCallOptions] = None,
        generate_json: bool = False
    ) -> List[ChunkResult]:
        """Process chunks with proper mapping to preserve chunk-to-document relationships."""
        import asyncio
        import time
        # Collect all chunks that need processing, assigning a global chunk index
        chunks_to_process = []
        chunk_index = 0
        for ec in extractionResult:
            # Get the document MIME type from part metadata
            document_mime_type = None
            for part in ec.parts:
                if part.metadata and 'documentMimeType' in part.metadata:
                    document_mime_type = part.metadata['documentMimeType']
                    break
            for part in ec.parts:
                if part.typeGroup in ("text", "table", "structure", "image", "container", "binary"):
                    # Skip empty container chunks (they are just metadata containers)
                    if part.typeGroup == "container" and (not part.data or len(part.data.strip()) == 0):
                        logger.debug(f"Skipping empty container chunk: mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}")
                        continue
                    chunks_to_process.append({
                        'part': part,
                        'chunk_index': chunk_index,
                        'document_id': ec.id,
                        'document_mime_type': document_mime_type
                    })
                    chunk_index += 1
        logger.info(f"Processing {len(chunks_to_process)} chunks with proper mapping")
        # Process chunks in parallel while keeping their original mapping;
        # returns None for chunks that should be skipped entirely
        async def process_single_chunk(chunk_info: Dict) -> Optional[ChunkResult]:
            part = chunk_info['part']
            chunk_index = chunk_info['chunk_index']
            document_id = chunk_info['document_id']
            document_mime_type = chunk_info.get('document_mime_type', part.mimeType)
            start_time = time.time()
            try:
                # Check the MIME type first, then fall back to typeGroup
                is_image = (
                    (document_mime_type and document_mime_type.startswith('image/')) or
                    (part.mimeType and part.mimeType.startswith('image/')) or
                    (part.typeGroup == "image")
                )
                # Debug logging
                self.services.utils.debugLogToFile(f"Chunk {chunk_index}: document_mime_type={document_mime_type}, part.mimeType={part.mimeType}, part.typeGroup={part.typeGroup}, is_image={is_image}", "AI_SERVICE")
                logger.info(f"Chunk {chunk_index}: document_mime_type={document_mime_type}, part.mimeType={part.mimeType}, part.typeGroup={part.typeGroup}, is_image={is_image}")
                if is_image:
                    # Reuse the extraction prompt for image analysis (it contains the table JSON format)
                    self.services.utils.debugLogToFile(f"Processing image chunk {chunk_index}: mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}", "AI_SERVICE")
                    # Check whether image data is available
                    if not part.data:
                        error_msg = f"No image data available for chunk {chunk_index}"
                        logger.warning(error_msg)
                        ai_result = f"Error: {error_msg}"
                    else:
                        try:
                            # Import here to avoid circular imports
                            from modules.services.serviceAi.subCoreAi import SubCoreAi
                            core_ai = SubCoreAi(self.services, self.aiObjects)
                            ai_result = await core_ai.readImage(
                                prompt=prompt,
                                imageData=part.data,
                                mimeType=part.mimeType,
                                options=options
                            )
                            self.services.utils.debugLogToFile(f"Image analysis result for chunk {chunk_index}: length={len(ai_result) if ai_result else 0}, preview={ai_result[:200] if ai_result else 'None'}...", "AI_SERVICE")
                            # Guard against an empty or None result
                            if not ai_result or not ai_result.strip():
                                logger.warning(f"Image chunk {chunk_index} returned empty response from AI")
                                ai_result = "No content detected in image"
                        except Exception as e:
                            logger.error(f"Error processing image chunk {chunk_index}: {str(e)}")
                            ai_result = f"Error analyzing image: {str(e)}"
                    # If generating JSON, clean and validate the image analysis result
                    if generate_json:
                        try:
                            import json
                            import re
                            # Clean the response - remove markdown code blocks if present
                            cleaned_result = ai_result.strip()
                            if cleaned_result.startswith('```json'):
                                cleaned_result = re.sub(r'^```json\s*', '', cleaned_result)
                                cleaned_result = re.sub(r'\s*```$', '', cleaned_result)
                            elif cleaned_result.startswith('```'):
                                cleaned_result = re.sub(r'^```\s*', '', cleaned_result)
                                cleaned_result = re.sub(r'\s*```$', '', cleaned_result)
                            # Extract the JSON object between the first '{' and the last '}'
                            first_brace = cleaned_result.find('{')
                            last_brace = cleaned_result.rfind('}')
                            if first_brace != -1 and last_brace != -1 and last_brace > first_brace:
                                cleaned_result = cleaned_result[first_brace:last_brace + 1]
                            cleaned_result = cleaned_result.strip()
                            # Validate JSON
                            json.loads(cleaned_result)
                            ai_result = cleaned_result  # Use the cleaned version
                            self.services.utils.debugLogToFile(f"Image chunk {chunk_index} JSON validation successful", "AI_SERVICE")
                        except json.JSONDecodeError as e:
                            logger.warning(f"Image chunk {chunk_index} returned invalid JSON: {str(e)}")
                            logger.warning(f"Raw response was: '{ai_result[:500]}...'")
                            # Build fallback JSON around the actual response content, not the error message
                            fallback_content = ai_result if ai_result and ai_result.strip() else "No content detected"
                            self.services.utils.debugLogToFile(f"IMAGE FALLBACK CONTENT PREVIEW: '{fallback_content[:200]}...'", "AI_SERVICE")
                            ai_result = json.dumps({
                                "metadata": {"title": f"Image Analysis - Chunk {chunk_index}"},
                                "sections": [{
                                    "id": f"image_section_{chunk_index}",
                                    "type": "paragraph",
                                    "data": {"text": fallback_content}
                                }]
                            })
                            self.services.utils.debugLogToFile(f"Created fallback JSON for image chunk {chunk_index} with actual content", "AI_SERVICE")
elif part.typeGroup in ("container", "binary"):
# Handle ALL container and binary content generically - let AI process any document type
self.services.utils.debugLogToFile(f"DEBUG: Chunk {chunk_index}: typeGroup={part.typeGroup}, mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}", "AI_SERVICE")
# Skip empty container chunks (they're just metadata containers)
if part.typeGroup == "container" and (not part.data or len(part.data.strip()) == 0):
self.services.utils.debugLogToFile(f"DEBUG: Skipping empty container - mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}", "AI_SERVICE")
logger.info(f"Chunk {chunk_index}: Skipping empty container - mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}")
# Skip processing this chunk
pass
elif part.mimeType and part.data and len(part.data.strip()) > 0:
# Process any document container as text content
request_options = options if options is not None else AiCallOptions()
request_options.operationType = OperationType.GENERAL
self.services.utils.debugLogToFile(f"EXTRACTION CONTAINER CHUNK {chunk_index}: Processing {part.mimeType} container as text with generate_json={generate_json}", "AI_SERVICE")
logger.info(f"Chunk {chunk_index}: Processing {part.mimeType} container as text with generate_json={generate_json}")
# Log extraction prompt and context
self.services.utils.debugLogToFile(f"EXTRACTION PROMPT: {prompt}", "AI_SERVICE")
self.services.utils.debugLogToFile(f"EXTRACTION CONTEXT LENGTH: {len(part.data) if part.data else 0} characters", "AI_SERVICE")
request = AiCallRequest(
prompt=prompt,
context=part.data,
options=request_options
)
response = await self.aiObjects.call(request)
ai_result = response.content
# Log extraction response
self.services.utils.debugLogToFile(f"EXTRACTION RESPONSE LENGTH: {len(ai_result) if ai_result else 0} characters", "AI_SERVICE")
# Save full extraction prompt and response to debug file - only if debug enabled
debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
if debug_enabled:
try:
import os
from datetime import datetime, UTC
ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
debug_root = "./test-chat/ai"
os.makedirs(debug_root, exist_ok=True)
with open(os.path.join(debug_root, f"{ts}_extraction_container_chunk_{chunk_index}.txt"), "w", encoding="utf-8") as f:
f.write(f"EXTRACTION PROMPT:\n{prompt}\n\n")
f.write(f"EXTRACTION CONTEXT:\n{part.data if part.data else 'No context'}\n\n")
f.write(f"EXTRACTION RESPONSE:\n{ai_result if ai_result else 'No response'}\n")
except Exception:
pass
# If generating JSON, validate the response
if generate_json:
try:
import json
import re
# Clean the response - remove markdown code blocks if present
cleaned_result = ai_result.strip()
# Remove various markdown patterns
if cleaned_result.startswith('```json'):
cleaned_result = re.sub(r'^```json\s*', '', cleaned_result)
cleaned_result = re.sub(r'\s*```$', '', cleaned_result)
elif cleaned_result.startswith('```'):
cleaned_result = re.sub(r'^```\s*', '', cleaned_result)
cleaned_result = re.sub(r'\s*```$', '', cleaned_result)
# Remove any leading/trailing text that's not JSON
# Look for the first { and last } to extract JSON
first_brace = cleaned_result.find('{')
last_brace = cleaned_result.rfind('}')
if first_brace != -1 and last_brace != -1 and last_brace > first_brace:
cleaned_result = cleaned_result[first_brace:last_brace + 1]
# Additional cleaning for common AI response issues
cleaned_result = cleaned_result.strip()
# Validate JSON
json.loads(cleaned_result)
ai_result = cleaned_result # Use cleaned version
except json.JSONDecodeError as e:
logger.warning(f"Container chunk {chunk_index} ({part.mimeType}) returned invalid JSON: {str(e)}")
logger.warning(f"Raw response was: '{ai_result[:500]}...'")
# Create fallback JSON with the actual response content (not the error message)
# Use the original AI response content, not the error message
fallback_content = ai_result if ai_result and ai_result.strip() else "No content detected"
self.services.utils.debugLogToFile(f"FALLBACK CONTENT PREVIEW: '{fallback_content[:200]}...'", "AI_SERVICE")
ai_result = json.dumps({
"metadata": {"title": f"Document Analysis - Chunk {chunk_index}"},
"sections": [{
"id": f"analysis_section_{chunk_index}",
"type": "paragraph",
"data": {"text": fallback_content}
}]
})
self.services.utils.debugLogToFile(f"Created fallback JSON for container chunk {chunk_index} with actual content", "AI_SERVICE")
else:
# Skip empty or invalid container/binary content - don't create a result
self.services.utils.debugLogToFile(f"DEBUG: Chunk {chunk_index}: Skipping empty container - mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}", "AI_SERVICE")
# Return None to indicate this chunk should be completely skipped
return None
                else:
                    # Ensure options is not None and set the correct operation type for text
                    request_options = options if options is not None else AiCallOptions()
                    request_options.operationType = OperationType.GENERAL
                    self.services.utils.debugLogToFile(f"EXTRACTION CHUNK {chunk_index}: Calling aiObjects.call with operationType={request_options.operationType}, generate_json={generate_json}", "AI_SERVICE")
                    logger.info(f"Chunk {chunk_index}: Calling aiObjects.call with operationType={request_options.operationType}, generate_json={generate_json}")
                    # Log the extraction context length
                    self.services.utils.debugLogToFile(f"EXTRACTION CONTEXT LENGTH: {len(part.data) if part.data else 0} characters", "AI_SERVICE")
                    # Debug: log the actual prompt being sent to the AI
                    logger.debug(f"AI PROMPT PREVIEW: {prompt[:300]}...")
                    logger.debug(f"AI CONTEXT PREVIEW: {part.data[:200] if part.data else 'None'}...")
                    request = AiCallRequest(
                        prompt=prompt,
                        context=part.data,
                        options=request_options
                    )
                    response = await self.aiObjects.call(request)
                    # Debug: log what the AI actually returned
                    logger.debug(f"AI RESPONSE PREVIEW: {response.content[:300] if response.content else 'None'}...")
                    ai_result = response.content
                    # Log the extraction response length
                    self.services.utils.debugLogToFile(f"EXTRACTION RESPONSE LENGTH: {len(ai_result) if ai_result else 0} characters", "AI_SERVICE")
                    # Save the extraction response to a debug file (without the verbose prompt) - only if debug is enabled
                    debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
                    if debug_enabled:
                        try:
                            import os
                            from datetime import datetime, UTC
                            ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
                            debug_root = "./test-chat/ai"
                            os.makedirs(debug_root, exist_ok=True)
                            with open(os.path.join(debug_root, f"{ts}_extraction_chunk_{chunk_index}.txt"), "w", encoding="utf-8") as f:
                                f.write(f"EXTRACTION RESPONSE:\n{ai_result if ai_result else 'No response'}\n")
                        except Exception:
                            pass
                    # If generating JSON, validate the response
                    if generate_json:
                        try:
                            import json
                            import re
                            # Clean the response - remove markdown code block markers and extra formatting
                            cleaned_result = ai_result.strip()
                            cleaned_result = re.sub(r'^```(?:json)?\s*', '', cleaned_result, flags=re.MULTILINE)
                            cleaned_result = re.sub(r'\s*```\s*$', '', cleaned_result, flags=re.MULTILINE)
                            # Remove any remaining ``` markers anywhere in the text
                            cleaned_result = re.sub(r'```', '', cleaned_result)
                            # Try to extract JSON from the response if it is embedded in other text
                            json_match = re.search(r'\{.*\}', cleaned_result, re.DOTALL)
                            if json_match:
                                cleaned_result = json_match.group(0)
                            # Validate JSON
                            json.loads(cleaned_result)
                            ai_result = cleaned_result  # Use the cleaned version
                        except json.JSONDecodeError as e:
                            logger.warning(f"Chunk {chunk_index} returned invalid JSON: {str(e)}")
                            # Create fallback JSON
                            ai_result = json.dumps({
                                "metadata": {"title": "Error Section"},
                                "sections": [{
                                    "id": f"error_section_{chunk_index}",
                                    "type": "paragraph",
                                    "data": {"text": f"Error parsing JSON: {str(e)}"}
                                }]
                            })
                processing_time = time.time() - start_time
                logger.info(f"Chunk {chunk_index} processed: {len(ai_result)} chars in {processing_time:.2f}s")
                return ChunkResult(
                    originalChunk=part,
                    aiResult=ai_result,
                    chunkIndex=chunk_index,
                    documentId=document_id,
                    processingTime=processing_time,
                    metadata={
                        "success": True,
                        "chunkSize": len(part.data) if part.data else 0,
                        "resultSize": len(ai_result),
                        "typeGroup": part.typeGroup
                    }
                )
            except Exception as e:
                processing_time = time.time() - start_time
                logger.warning(f"Error processing chunk {chunk_index}: {str(e)}")
                return ChunkResult(
                    originalChunk=part,
                    aiResult=f"[Error processing chunk: {str(e)}]",
                    chunkIndex=chunk_index,
                    documentId=document_id,
                    processingTime=processing_time,
                    metadata={
                        "success": False,
                        "error": str(e),
                        "chunkSize": len(part.data) if part.data else 0,
                        "typeGroup": part.typeGroup
                    }
                )
        # Process chunks with concurrency control; use getattr with a truthiness
        # check so an attribute that exists but is None does not override the default
        max_concurrent = 5  # Default concurrency
        if options and getattr(options, 'maxConcurrentChunks', None):
            max_concurrent = options.maxConcurrentChunks
        elif options and getattr(options, 'maxParallelChunks', None):
            max_concurrent = options.maxParallelChunks
        logger.info(f"Processing {len(chunks_to_process)} chunks with max concurrency: {max_concurrent}")
        self.services.utils.debugLogToFile(f"DEBUG: Chunks to process: {len(chunks_to_process)}", "AI_SERVICE")
        for i, chunk_info in enumerate(chunks_to_process):
            self.services.utils.debugLogToFile(f"DEBUG: Chunk {i}: typeGroup={chunk_info['part'].typeGroup}, mimeType={chunk_info['part'].mimeType}, data_length={len(chunk_info['part'].data) if chunk_info['part'].data else 0}", "AI_SERVICE")
        # Create a semaphore for concurrency control
        semaphore = asyncio.Semaphore(max_concurrent)

        async def process_with_semaphore(chunk_info):
            async with semaphore:
                return await process_single_chunk(chunk_info)

        # Process all chunks in parallel under the concurrency limit
        tasks = [process_with_semaphore(chunk_info) for chunk_info in chunks_to_process]
        self.services.utils.debugLogToFile(f"DEBUG: Created {len(tasks)} tasks for parallel processing", "AI_SERVICE")
        chunk_results = await asyncio.gather(*tasks, return_exceptions=True)
        self.services.utils.debugLogToFile(f"DEBUG: Got {len(chunk_results)} results from parallel processing", "AI_SERVICE")
        # Handle any exceptions surfaced by the gather itself
        processed_results = []
        for i, result in enumerate(chunk_results):
            if isinstance(result, Exception):
                # Create an error ChunkResult for the failed task
                chunk_info = chunks_to_process[i]
                processed_results.append(ChunkResult(
                    originalChunk=chunk_info['part'],
                    aiResult=f"[Error in parallel processing: {str(result)}]",
                    chunkIndex=chunk_info['chunk_index'],
                    documentId=chunk_info['document_id'],
                    processingTime=0.0,
                    metadata={"success": False, "error": str(result)}
                ))
            elif result is not None:
                # Only keep non-None results (empty containers are skipped)
                processed_results.append(result)
        logger.info(f"Completed processing {len(processed_results)} chunks")
        return processed_results

    def _mergeChunkResults(
        self,
        chunkResults: List[ChunkResult],
        options: Optional[AiCallOptions] = None
    ) -> str:
        """Merge chunk results while preserving document structure and chunk order."""
        if not chunkResults:
            return ""
        # Get merging configuration from options (truthiness-checked so that
        # attributes set to None fall back to the defaults)
        chunk_separator = "\n\n---\n\n"
        include_document_headers = True
        include_chunk_metadata = False
        if options:
            if getattr(options, 'chunkSeparator', None):
                chunk_separator = options.chunkSeparator
            elif getattr(options, 'mergeStrategy', None):
                chunk_separator = options.mergeStrategy.get("chunkSeparator", "\n\n---\n\n")
            # Check for enhanced options
            if getattr(options, 'preserveChunkMetadata', None) is not None:
                include_chunk_metadata = options.preserveChunkMetadata
        # Group chunk results by document
        results_by_document = {}
        for chunk_result in chunkResults:
            doc_id = chunk_result.documentId
            if doc_id not in results_by_document:
                results_by_document[doc_id] = []
            results_by_document[doc_id].append(chunk_result)
        # Sort chunks within each document by chunk index
        for doc_id in results_by_document:
            results_by_document[doc_id].sort(key=lambda x: x.chunkIndex)
        # Merge the results for each document
        merged_documents = []
        for doc_id, doc_chunks in results_by_document.items():
            # Build a document header if enabled
            doc_header = ""
            if include_document_headers:
                doc_header = f"\n\n=== DOCUMENT: {doc_id} ===\n\n"
            # Merge the chunks for this document
            doc_content = ""
            for i, chunk_result in enumerate(doc_chunks):
                # Add the chunk separator (except before the first chunk)
                if i > 0:
                    doc_content += chunk_separator
                # Add the chunk content, with optional metadata
                chunk_metadata = chunk_result.metadata
                if chunk_metadata.get("success", False):
                    chunk_content = chunk_result.aiResult
                    # Prefix chunk metadata if enabled
                    if include_chunk_metadata:
                        chunk_info = f"[Chunk {chunk_result.chunkIndex} - {chunk_metadata.get('typeGroup', 'unknown')} - {chunk_metadata.get('chunkSize', 0)} chars]"
                        chunk_content = f"{chunk_info}\n{chunk_content}"
                    doc_content += chunk_content
                else:
                    # Surface error chunks inline
                    error_msg = f"[ERROR in chunk {chunk_result.chunkIndex}: {chunk_metadata.get('error', 'Unknown error')}]"
                    doc_content += error_msg
            merged_documents.append(doc_header + doc_content)
        # Join all documents
        final_result = "\n\n".join(merged_documents)
        logger.info(f"Merged {len(chunkResults)} chunks from {len(results_by_document)} documents")
        return final_result.strip()
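
    # Example of the merged text layout (document headers enabled, default
    # "---" separator), assuming two successful chunks for document "doc-1":
    #
    #   === DOCUMENT: doc-1 ===
    #
    #   <chunk 0 result>
    #
    #   ---
    #
    #   <chunk 1 result>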

    def _mergeChunkResultsClean(
        self,
        chunkResults: List[ChunkResult],
        options: Optional[AiCallOptions] = None
    ) -> str:
        """Merge chunk results in clean mode - no debug metadata or document headers."""
        if not chunkResults:
            return ""
        # Get merging configuration from options
        chunk_separator = "\n\n"
        if options:
            if getattr(options, 'chunkSeparator', None):
                chunk_separator = options.chunkSeparator
            elif getattr(options, 'mergeStrategy', None):
                chunk_separator = options.mergeStrategy.get("chunkSeparator", "\n\n")
        # Group chunk results by document
        results_by_document = {}
        for chunk_result in chunkResults:
            doc_id = chunk_result.documentId
            if doc_id not in results_by_document:
                results_by_document[doc_id] = []
            results_by_document[doc_id].append(chunk_result)
        # Sort chunks within each document by chunk index
        for doc_id in results_by_document:
            results_by_document[doc_id].sort(key=lambda x: x.chunkIndex)
        # Merge the results for each document in clean mode; collect the kept
        # chunks first so skipped chunks do not leave stray separators behind
        merged_documents = []
        for doc_id, doc_chunks in results_by_document.items():
            kept_chunks = []
            for chunk_result in doc_chunks:
                chunk_metadata = chunk_result.metadata
                if not chunk_metadata.get("success", False):
                    # Drop error chunks silently in clean mode
                    continue
                chunk_content = chunk_result.aiResult
                # Clean mode: skip container/binary placeholder chunks entirely
                if chunk_content.startswith("[Skipped ") and "content:" in chunk_content:
                    continue
                # Clean mode: skip empty or whitespace-only chunks
                if not chunk_content.strip():
                    continue
                kept_chunks.append(chunk_content)
            merged_documents.append(chunk_separator.join(kept_chunks))
        # Join all documents
        final_result = "\n\n".join(merged_documents)
        return final_result.strip()

    def _mergeChunkResultsJson(
        self,
        chunkResults: List[ChunkResult],
        options: Optional[AiCallOptions] = None
    ) -> Dict[str, Any]:
        """Merge chunk results in JSON mode - returns a structured JSON document."""
        import json
        if not chunkResults:
            return {"metadata": {"title": "Empty Document"}, "sections": []}
        # Group chunk results by document
        results_by_document = {}
        for chunk_result in chunkResults:
            doc_id = chunk_result.documentId
            if doc_id not in results_by_document:
                results_by_document[doc_id] = []
            results_by_document[doc_id].append(chunk_result)
        # Sort chunks within each document by chunk index
        for doc_id in results_by_document:
            results_by_document[doc_id].sort(key=lambda x: x.chunkIndex)
        # Merge the JSON results for each document
        all_documents = []
        all_sections = []
        document_titles = []
        combined_metadata = {"title": "Merged Document", "splitStrategy": "by_section"}
        for doc_id, doc_chunks in results_by_document.items():
            # Process each chunk's JSON result
            for chunk_result in doc_chunks:
                chunk_metadata = chunk_result.metadata
                if chunk_metadata.get("success", False):
                    try:
                        # Parse the JSON from the AI result
                        chunk_json = json.loads(chunk_result.aiResult)
                        # Check whether this is a multi-file response (has a "documents" key)
                        if isinstance(chunk_json, dict) and "documents" in chunk_json:
                            # Multi-file response - merge all of its documents
                            logger.debug(f"Processing multi-file response from chunk {chunk_result.chunkIndex} with {len(chunk_json['documents'])} documents")
                            for doc in chunk_json["documents"]:
                                # Add chunk context to the document
                                doc["metadata"] = doc.get("metadata", {})
                                doc["metadata"]["source_chunk"] = chunk_result.chunkIndex
                                doc["metadata"]["source_document"] = doc_id
                                all_documents.append(doc)
                            # Update the combined metadata
                            if "metadata" in chunk_json:
                                combined_metadata.update(chunk_json["metadata"])
                        # Extract sections from a single-file response (fallback)
                        elif isinstance(chunk_json, dict) and "sections" in chunk_json:
                            for section in chunk_json["sections"]:
                                # Add document context to the section
                                section["metadata"] = section.get("metadata", {})
                                section["metadata"]["source_document"] = doc_id
                                section["metadata"]["chunk_index"] = chunk_result.chunkIndex
                                all_sections.append(section)
                        # Extract the document title
                        if isinstance(chunk_json, dict) and "metadata" in chunk_json:
                            title = chunk_json["metadata"].get("title", "")
                            if title and title not in document_titles:
                                document_titles.append(title)
                    except json.JSONDecodeError as e:
                        logger.warning(f"Failed to parse JSON from chunk {chunk_result.chunkIndex}: {str(e)}")
                        # Create a fallback section for invalid JSON
                        fallback_section = {
                            "id": f"error_section_{chunk_result.chunkIndex}",
                            "title": "Error Section",
                            "content_type": "paragraph",
                            "elements": [{
                                "text": f"Error parsing chunk {chunk_result.chunkIndex}: {str(e)}"
                            }],
                            "order": chunk_result.chunkIndex,
                            "metadata": {
                                "source_document": doc_id,
                                "chunk_index": chunk_result.chunkIndex,
                                "error": str(e)
                            }
                        }
                        all_sections.append(fallback_section)
                else:
                    # Handle error chunks
                    error_section = {
                        "id": f"error_section_{chunk_result.chunkIndex}",
                        "title": "Error Section",
                        "content_type": "paragraph",
                        "elements": [{
                            "text": f"Error in chunk {chunk_result.chunkIndex}: {chunk_metadata.get('error', 'Unknown error')}"
                        }],
                        "order": chunk_result.chunkIndex,
                        "metadata": {
                            "source_document": doc_id,
                            "chunk_index": chunk_result.chunkIndex,
                            "error": chunk_metadata.get('error', 'Unknown error')
                        }
                    }
                    all_sections.append(error_section)
        # Sort sections by order
        all_sections.sort(key=lambda x: x.get("order", 0))
        # If multi-file responses produced merged documents, return them
        if all_documents:
            logger.info(f"Merged {len(all_documents)} documents from {len(chunkResults)} chunks")
            return {
                "metadata": combined_metadata,
                "documents": all_documents
            }
        # Otherwise, create a merged document with sections (single-file fallback)
        merged_document = {
            "metadata": {
                "title": document_titles[0] if document_titles else "Merged Document",
                "source_documents": list(results_by_document.keys()),
                "extraction_method": "ai_json_extraction",
                "version": "1.0"
            },
            "sections": all_sections,
            "summary": f"Merged document from {len(results_by_document)} source documents",
            "tags": ["merged", "ai_generated"]
        }
        logger.info(f"Merged {len(chunkResults)} chunks from {len(results_by_document)} documents (JSON mode)")
        return merged_document
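
    # Note: if any chunk returned a multi-file response ({"documents": [...]}),
    # the merge above yields {"metadata": ..., "documents": [...]}; otherwise it
    # falls back to the single-file {"metadata", "sections", "summary", "tags"}
    # shape documented near processDocumentsPerChunkJson.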

    def _getModelCapabilitiesForContent(self, prompt: str, documents: Optional[List[ChatDocument]], options: Optional[AiCallOptions]) -> Dict[str, int]:
        """
        Get model capabilities for content processing, including size limits appropriate for chunking.
        """
        # Estimate the total content size
        prompt_size = len(prompt.encode('utf-8'))
        document_size = 0
        if documents:
            # Rough estimate of the document content size
            for doc in documents:
                document_size += doc.fileSize or 0
        total_size = prompt_size + document_size
        # Simulate AiObjects model selection by scanning the available models
        from modules.interfaces.interfaceAiObjects import aiModels
        # Guard against None options (callers may pass no options at all)
        operation_type = options.operationType if options else None
        # Find the best model for this content size and operation
        best_model = None
        best_context_length = 0
        fallback_model = None
        fallback_context_length = 0
        for model_info in aiModels.values():
            context_length = model_info.get("contextLength", 0)
            # Skip models with no declared context length
            if context_length == 0:
                continue
            # Check that the model supports the requested operation type
            capabilities = model_info.get("capabilities", [])
            if operation_type == OperationType.IMAGE_ANALYSIS and "image_analysis" not in capabilities:
                continue
            elif operation_type == OperationType.IMAGE_GENERATION and "image_generation" not in capabilities:
                continue
            elif operation_type == OperationType.WEB_RESEARCH and "web_search" not in capabilities:
                continue
            elif "text_generation" not in capabilities:
                continue
            # Prefer models whose context covers at least 80% of the content
            # (a conservative check: the token count is compared directly against
            # the byte size); otherwise track the largest model for chunked use
            if context_length >= total_size * 0.8:
                if context_length > best_context_length:
                    best_model = model_info
                    best_context_length = context_length
            elif context_length > fallback_context_length:
                fallback_model = model_info
                fallback_context_length = context_length
        # Fall back to the largest available model if none covered the content
        if best_model is None:
            best_model = fallback_model
        # Fall back to a reasonable default if no model was found at all
        if best_model is None:
            best_model = {
                "contextLength": 128000,  # GPT-4o default
                "llmName": "gpt-4o"
            }
        # Calculate appropriate sizes.
        # Convert tokens to bytes (rough estimate: 1 token ≈ 4 characters)
        context_length_bytes = int(best_model["contextLength"] * 4)
        max_context_bytes = int(context_length_bytes * 0.9)  # 90% of the context length
        text_chunk_size = int(max_context_bytes * 0.7)  # 70% of max context for text chunks
        image_chunk_size = int(max_context_bytes * 0.8)  # 80% of max context for image chunks
        logger.debug(f"Selected model: {best_model.get('llmName', 'unknown')} with context length: {best_model['contextLength']}")
        logger.debug(f"Content size: {total_size} bytes, Max context: {max_context_bytes} bytes")
        logger.debug(f"Text chunk size: {text_chunk_size} bytes, Image chunk size: {image_chunk_size} bytes")
        return {
            "maxContextBytes": max_context_bytes,
            "textChunkSize": text_chunk_size,
            "imageChunkSize": image_chunk_size
        }
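
    # Worked example: for the 128,000-token fallback model, the byte budgets are
    #   context_length_bytes = 128000 * 4   = 512,000
    #   max_context_bytes    = 512000 * 0.9 = 460,800
    #   text_chunk_size      = 460800 * 0.7 = 322,560
    #   image_chunk_size     = 460800 * 0.8 = 368,640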