import json
import logging
import re
import time
from typing import Dict, Any, List, Optional, Tuple, Union

from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, ModelCapabilities, OperationType, Priority
from modules.datamodels.datamodelExtraction import ChunkResult, ContentExtracted
from modules.services.serviceExtraction.mainServiceExtraction import ExtractionService

logger = logging.getLogger(__name__)


class SubDocumentProcessing:
    """Document processing operations: chunking, per-chunk AI processing, and result merging."""

    def __init__(self, services, aiObjects):
        """Initialize the document processing service.

        Args:
            services: Service center instance for accessing other services
            aiObjects: Initialized AiObjects instance
        """
        self.services = services
        self.aiObjects = aiObjects
        self._extractionService = None

    @property
    def extractionService(self):
        """Lazy initialization of the extraction service."""
        if self._extractionService is None:
            logger.info("Lazy initializing ExtractionService...")
            self._extractionService = ExtractionService(self.services)
        return self._extractionService

    def _calculateMaxContextBytes(self, options: Optional[AiCallOptions]) -> int:
        """Calculate maximum context bytes based on model capabilities and options."""
        if options and options.maxContextBytes:
            return options.maxContextBytes

        # Default model capabilities (this should be enhanced with an actual model registry)
        defaultMaxTokens = 4000
        safetyMargin = options.safetyMargin if options else 0.1

        # Calculate bytes (4 chars per token estimation)
        maxContextBytes = int(defaultMaxTokens * (1 - safetyMargin) * 4)

        return maxContextBytes

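    # Example (hedged sketch): with the defaults above and no options,
    # _calculateMaxContextBytes(None) returns int(4000 * (1 - 0.1) * 4) = 14400 bytes.
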
    async def processDocumentsPerChunk(
        self,
        documents: List[ChatDocument],
        prompt: str,
        options: Optional[AiCallOptions] = None
    ) -> str:
        """
        Process documents with per-chunk AI calls and merge results.
        FIXED: Now preserves chunk relationships and document structure.

        Args:
            documents: List of ChatDocument objects to process
            prompt: AI prompt for processing
            options: AI call options

        Returns:
            Merged AI results as a string with preserved document structure
        """
        if not documents:
            return ""

        # Get model capabilities for size calculation
        model_capabilities = self._getModelCapabilitiesForContent(prompt, documents, options)

        # Build extraction options for chunking with intelligent merging
        extractionOptions: Dict[str, Any] = {
            "prompt": prompt,
            "operationType": options.operationType if options else "general",
            "processDocumentsIndividually": True,  # Process each document separately
            "maxSize": model_capabilities["maxContextBytes"],
            "chunkAllowed": True,
            "textChunkSize": model_capabilities["textChunkSize"],
            "imageChunkSize": model_capabilities["imageChunkSize"],
            "imageMaxPixels": 1024 * 1024,
            "imageQuality": 85,
            "mergeStrategy": {
                "useIntelligentMerging": True,  # Enable intelligent token-aware merging
                "modelCapabilities": model_capabilities,
                "prompt": prompt,
                "groupBy": "typeGroup",
                "orderBy": "id",
                "mergeType": "concatenate"
            },
        }

        logger.debug(f"Per-chunk extraction options: prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}")

        try:
            # Extract content with chunking
            extractionResult = self.extractionService.extractContent(documents, extractionOptions)

            if not isinstance(extractionResult, list):
                return "[Error: No extraction results]"

            # FIXED: Process chunks with proper mapping
            chunkResults = await self._processChunksWithMapping(extractionResult, prompt, options)

            # FIXED: Merge with preserved chunk relationships
            mergedContent = self._mergeChunkResults(chunkResults, options)

            # Save merged extraction content for debugging
            self.services.utils.writeDebugFile(mergedContent or '', "extractionMergedText")

            return mergedContent

        except Exception as e:
            logger.error(f"Error in per-chunk processing: {str(e)}")
            return f"[Error in per-chunk processing: {str(e)}]"

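    # Usage sketch (hedged; assumes `services` and `aiObjects` are initialized
    # elsewhere in the application):
    #
    #   processor = SubDocumentProcessing(services, aiObjects)
    #   merged = await processor.processDocumentsPerChunk(
    #       documents, "Summarize each document.", AiCallOptions()
    #   )
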
    async def processDocumentsPerChunkJson(
        self,
        documents: List[ChatDocument],
        prompt: str,
        options: Optional[AiCallOptions] = None
    ) -> Dict[str, Any]:
        """
        Process documents with per-chunk AI calls and merge results in JSON mode.
        Returns a structured JSON document instead of text.
        """
        if not documents:
            return {"metadata": {"title": "Empty Document"}, "sections": []}

        # Get model capabilities for size calculation
        model_capabilities = self._getModelCapabilitiesForContent(prompt, documents, options)

        # Build extraction options for chunking with intelligent merging
        extractionOptions: Dict[str, Any] = {
            "prompt": prompt,
            "operationType": options.operationType if options else "general",
            "processDocumentsIndividually": True,  # Process each document separately
            "maxSize": model_capabilities["maxContextBytes"],
            "chunkAllowed": True,
            "textChunkSize": model_capabilities["textChunkSize"],
            "imageChunkSize": model_capabilities["imageChunkSize"],
            "imageMaxPixels": 1024 * 1024,
            "imageQuality": 85,
            "mergeStrategy": {
                "useIntelligentMerging": True,  # Enable intelligent token-aware merging
                "modelCapabilities": model_capabilities,
                "prompt": prompt,
                "groupBy": "typeGroup",
                "orderBy": "id",
                "mergeType": "concatenate"
            },
        }

        logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}")

        try:
            # Extract content with chunking
            extractionResult = self.extractionService.extractContent(documents, extractionOptions)

            if not isinstance(extractionResult, list):
                return {"metadata": {"title": "Error Document"}, "sections": []}

            # Process chunks with proper mapping
            chunkResults = await self._processChunksWithMapping(extractionResult, prompt, options, generate_json=True)

            # Merge in JSON mode
            mergedJsonDocument = self._mergeChunkResultsJson(chunkResults, options)

            # Normalize merged JSON into a single canonical table (only if table content exists)
            try:
                from modules.services.serviceNormalization.mainServiceNormalization import NormalizationService
                normalizer = NormalizationService(self.services)
                inventory = normalizer.discoverStructures(mergedJsonDocument)

                # Check whether any table content was discovered
                tableHeaders = inventory.get("tableHeaders", [])
                if not tableHeaders:
                    logger.info("No table content found in merged JSON, skipping normalization and returning original structure")
                else:
                    # Use the workflow id as the cache key
                    cacheKey = self.services.currentWorkflow.id
                    # Provide the extraction/merge prompt context when available to help mapping
                    mergePrompt = prompt
                    mapping = await normalizer.requestHeaderMapping(inventory, cacheKey, None, mergePrompt)
                    canonical = normalizer.applyMapping(mergedJsonDocument, mapping)
                    report = normalizer.validateCanonical(canonical)
                    if report.get('success'):
                        mergedJsonDocument = canonical
                    else:
                        raise ValueError('Normalization produced zero rows')
            except Exception as e:
                # Log the normalization failure but don't re-raise; continue with the original merged JSON
                logger.warning(f"Normalization failed (expected): {str(e)}")
                logger.debug(f"Normalization error type: {type(e).__name__}")

            # Save merged JSON extraction content for debugging
            jsonStr = json.dumps(mergedJsonDocument, ensure_ascii=False, indent=2)
            self.services.utils.writeDebugFile(jsonStr, "extractionMergedJson")

            return mergedJsonDocument

        except Exception as e:
            logger.error(f"Error in per-chunk processing (JSON mode): {str(e)}")
            logger.error(f"Exception type: {type(e).__name__}")
            logger.error(f"Exception args: {e.args}")
            import traceback
            logger.error(f"Traceback: {traceback.format_exc()}")
            return {"metadata": {"title": "Error Document"}, "sections": []}

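    # Returned shape (hedged sketch): either the canonical normalized table or, when
    # normalization is skipped, the merged structure from _mergeChunkResultsJson, e.g.
    #   {"metadata": {"title": "..."}, "sections": [...]}   for single-file responses, or
    #   {"metadata": {...}, "documents": [...]}             for multi-file responses.
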
    async def processDocumentsPerChunkJsonWithPrompt(
        self,
        documents: List[ChatDocument],
        custom_prompt: str,
        options: Optional[AiCallOptions] = None
    ) -> Dict[str, Any]:
        """
        Process documents with per-chunk AI calls and merge results in JSON mode.
        Uses a custom prompt instead of the default extraction prompt.
        Enhanced with partial-results continuation logic.
        """
        if not documents:
            return {"metadata": {"title": "Empty Document"}, "sections": []}

        # Get model capabilities for size calculation
        model_capabilities = self._getModelCapabilitiesForContent(custom_prompt, documents, options)

        # Build extraction options for chunking with intelligent merging
        extractionOptions: Dict[str, Any] = {
            "prompt": custom_prompt,  # Use the custom prompt instead of the default
            "operationType": options.operationType if options else "general",
            "processDocumentsIndividually": True,  # Process each document separately
            "maxSize": model_capabilities["maxContextBytes"],
            "chunkAllowed": True,
            "textChunkSize": model_capabilities["textChunkSize"],
            "imageChunkSize": model_capabilities["imageChunkSize"],
            "imageMaxPixels": 1024 * 1024,
            "imageQuality": 85,
            "mergeStrategy": {
                "useIntelligentMerging": True,  # Enable intelligent token-aware merging
                "modelCapabilities": model_capabilities,
                "prompt": custom_prompt,  # Use the custom prompt
                "groupBy": "typeGroup",
                "orderBy": "id",
                "mergeType": "concatenate"
            },
        }

        logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}")

        try:
            # Extract content with chunking
            extractionResult = self.extractionService.extractContent(documents, extractionOptions)

            if not isinstance(extractionResult, list):
                return {"metadata": {"title": "Error Document"}, "sections": []}

            # Process chunks with proper mapping
            logger.info(f"Processing {len(extractionResult)} chunks with custom prompt")
            logger.debug(f"Custom prompt preview: {custom_prompt[:200]}...")

            # Debug: show what content is being processed (before filtering)
            for i, ec in enumerate(extractionResult):
                if hasattr(ec, 'parts'):
                    for j, part in enumerate(ec.parts):
                        if not (hasattr(part, 'data') and part.data):
                            # Check whether this is an empty container chunk (which is expected)
                            part_type = getattr(part, 'typeGroup', None)
                            part_mime = getattr(part, 'mimeType', '')

                            is_empty_container = (
                                part_type == "container" and
                                part_mime and
                                'document' in part_mime.lower()
                            )

                            if not is_empty_container:
                                logger.warning(f"Part {j} has no data - typeGroup='{part_type}', mimeType='{part_mime}'")

            chunkResults = await self._processChunksWithMapping(extractionResult, custom_prompt, options, generate_json=True)

            # Debug: show what chunks were actually processed (after filtering)
            logger.info(f"After filtering: {len(chunkResults)} chunks will be processed")

            # Merge in JSON mode
            mergedJsonDocument = self._mergeChunkResultsJson(chunkResults, options)

            # Debug: show what the AI actually returned
            logger.info(f"AI returned document with keys: {list(mergedJsonDocument.keys())}")
            if 'documents' in mergedJsonDocument:
                logger.info(f"Number of documents: {len(mergedJsonDocument['documents'])}")
            elif 'sections' in mergedJsonDocument:
                logger.info(f"Number of sections: {len(mergedJsonDocument['sections'])}")

            return mergedJsonDocument

        except Exception as e:
            logger.error(f"Error in per-chunk JSON processing: {str(e)}")
            return {"metadata": {"title": "Error Document"}, "sections": []}

    async def processDocumentsWithContinuation(
        self,
        documents: List[ChatDocument],
        custom_prompt: str,
        options: Optional[AiCallOptions] = None
    ) -> Dict[str, Any]:
        """
        Process documents with partial-results continuation logic.
        Handles AI responses that indicate partial completion and loops until complete.
        """
        if not documents:
            return {"metadata": {"title": "Empty Document"}, "sections": []}

        logger.info("Starting document processing with continuation logic")

        # Build an enhanced prompt with continuation instructions
        enhanced_prompt = self._buildContinuationPrompt(custom_prompt)

        # Process with continuation logic
        return await self._processWithContinuationLoop(documents, enhanced_prompt, options)

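    # Usage sketch (hedged; `processor` as in the sketch after processDocumentsPerChunk):
    #
    #   result = await processor.processDocumentsWithContinuation(
    #       documents, "Generate a full report.", AiCallOptions()
    #   )
    #   result["metadata"]["iterations"]  # how many continuation rounds were needed
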
    def _buildContinuationPrompt(self, base_prompt: str) -> str:
        """
        Build a prompt that includes partial-results continuation instructions.
        """
        continuation_instructions = """

IMPORTANT CHUNKING LOGIC:
- If the response is too large to generate completely in one response, set "continue": true
- When "continue": true, include a "continuation_context" field with:
  - "last_section_id": "id of the last completed section"
  - "last_element_index": "index of the last completed element in that section"
  - "remaining_requirements": "brief description of what still needs to be generated"
- The AI will be called again with this context to continue generation
- Only set "continue": false when the response is completely generated

OUTPUT FORMAT: Return only valid JSON in this exact structure:
{
  "metadata": {
    "title": "Document Title"
  },
  "sections": [
    {
      "id": "section_1",
      "content_type": "paragraph",
      "elements": [
        {
          "text": "This is the actual content that should be generated."
        }
      ],
      "order": 1
    }
  ],
  "continue": false,
  "continuation_context": {
    "last_section_id": "section_1",
    "last_element_index": 0,
    "remaining_requirements": "description of what still needs to be generated"
  }
}

The AI should generate content using the canonical format with "sections" and "elements".
"""

        return f"{base_prompt}{continuation_instructions}"

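    # Example (hedged; values are illustrative): a model that runs out of room would
    # reply with "continue": true and a continuation_context such as
    #   {"last_section_id": "section_3", "last_element_index": 2,
    #    "remaining_requirements": "remaining chapters"}
    # which _processWithContinuationLoop feeds into _buildContinuationIterationPrompt
    # on the next iteration.
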
    async def _processWithContinuationLoop(
        self,
        documents: List[ChatDocument],
        enhanced_prompt: str,
        options: Optional[AiCallOptions] = None
    ) -> Dict[str, Any]:
        """
        Process documents in a continuation loop until generation is complete.
        """
        max_iterations = 10  # Prevent infinite loops
        iteration = 0
        accumulated_sections = []
        continuation_context = None

        while iteration < max_iterations:
            iteration += 1
            logger.info(f"Continuation iteration {iteration}/{max_iterations}")

            # Build the prompt for this iteration
            if continuation_context:
                iteration_prompt = self._buildContinuationIterationPrompt(
                    enhanced_prompt, continuation_context, accumulated_sections
                )
            else:
                iteration_prompt = enhanced_prompt

            # Process documents for this iteration
            try:
                # Use the existing processing method
                result = await self.processDocumentsPerChunkJsonWithPrompt(
                    documents, iteration_prompt, options
                )

                # Check that this is a valid JSON response
                if not isinstance(result, dict):
                    logger.warning(f"Iteration {iteration}: Invalid result type, stopping")
                    break

                # Extract sections from the result
                sections = result.get("sections", [])
                if not sections:
                    logger.warning(f"Iteration {iteration}: No sections found, stopping")
                    break

                # Add sections to the accumulated results
                for section in sections:
                    # Update the section order to maintain the sequence
                    section["order"] = len(accumulated_sections) + 1
                    accumulated_sections.append(section)

                # Check whether continuation is needed
                continue_flag = result.get("continue", False)
                continuation_context = result.get("continuation_context")

                logger.info(f"Iteration {iteration}: Added {len(sections)} sections, continue={continue_flag}")

                if not continue_flag:
                    logger.info(f"Continuation complete after {iteration} iterations")
                    break

                if not continuation_context:
                    logger.warning(f"Iteration {iteration}: continue=true but no continuation_context, stopping")
                    break

            except Exception as e:
                logger.error(f"Iteration {iteration} failed: {str(e)}")
                break

        if iteration >= max_iterations:
            logger.warning(f"Continuation stopped after maximum iterations ({max_iterations})")

        # Build the final result
        final_result = {
            "metadata": {
                "title": "Generated Document",
                "total_sections": len(accumulated_sections),
                "iterations": iteration,
                "continuation_used": iteration > 1
            },
            "sections": accumulated_sections,
            "continue": False
        }

        logger.info(f"Final result: {len(accumulated_sections)} sections from {iteration} iterations")
        return final_result

    def _buildContinuationIterationPrompt(
        self,
        base_prompt: str,
        continuation_context: Dict[str, Any],
        accumulated_sections: List[Dict[str, Any]]
    ) -> str:
        """
        Build a prompt for a continuation iteration, including context.
        """
        last_section_id = continuation_context.get("last_section_id", "")
        last_element_index = continuation_context.get("last_element_index", 0)
        remaining_requirements = continuation_context.get("remaining_requirements", "")

        # Build context describing what has already been generated
        context_summary = "PREVIOUSLY GENERATED CONTENT:\n"
        for i, section in enumerate(accumulated_sections[-3:]):  # Show the last 3 sections for context
            context_summary += f"Section {i+1}: {section.get('id', 'unknown')}\n"
            if 'elements' in section and section['elements']:
                first_element = section['elements'][0]
                if 'text' in first_element:
                    preview = first_element['text'][:100] + "..." if len(first_element['text']) > 100 else first_element['text']
                    context_summary += f" Preview: {preview}\n"

        continuation_prompt = f"""
{base_prompt}

{context_summary}

CONTINUATION INSTRUCTIONS:
- Continue from where you left off
- Last completed section: {last_section_id}
- Last completed element index: {last_element_index}
- Remaining requirements: {remaining_requirements}
- Generate the next part of the content
- Maintain consistency with previously generated content
- Use the same JSON format as before
- Set "continue": true if more content is needed, false if complete
"""

        return continuation_prompt

    async def callAiText(
        self,
        prompt: str,
        documents: Optional[List[ChatDocument]],
        options: AiCallOptions
    ) -> str:
        """
        Handle text calls with document processing through the ExtractionService.
        UNIFIED PROCESSING: Always use per-chunk processing for consistency.
        """
        # UNIFIED PROCESSING: Always use per-chunk processing for consistency.
        # This ensures MIME-type checking, chunk mapping, and parallel processing.
        return await self.processDocumentsPerChunk(documents, prompt, options)

    async def _processChunksWithMapping(
        self,
        extractionResult: List[ContentExtracted],
        prompt: str,
        options: Optional[AiCallOptions] = None,
        generate_json: bool = False
    ) -> List[ChunkResult]:
        """Process chunks with proper mapping to preserve relationships."""
        from modules.datamodels.datamodelExtraction import ChunkResult
        import asyncio

        # Collect all chunks that need processing, with proper indexing
        chunks_to_process = []
        chunk_index = 0

        for ec in extractionResult:
            # Get the document MIME type from metadata
            document_mime_type = None
            for part in ec.parts:
                if part.metadata and 'documentMimeType' in part.metadata:
                    document_mime_type = part.metadata['documentMimeType']
                    break

            for part in ec.parts:
                if part.typeGroup in ("text", "table", "structure", "image", "container", "binary"):
                    # Skip empty container chunks (they are just metadata containers)
                    if part.typeGroup == "container" and (not part.data or len(part.data.strip()) == 0):
                        logger.debug(f"Skipping empty container chunk: mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}")
                        continue

                    chunks_to_process.append({
                        'part': part,
                        'chunk_index': chunk_index,
                        'document_id': ec.id,
                        'document_mime_type': document_mime_type
                    })
                    chunk_index += 1

        logger.info(f"Processing {len(chunks_to_process)} chunks with proper mapping")

        # Process chunks in parallel with proper mapping
        async def process_single_chunk(chunk_info: Dict) -> Optional[ChunkResult]:
            part = chunk_info['part']
            chunk_index = chunk_info['chunk_index']
            document_id = chunk_info['document_id']
            document_mime_type = chunk_info.get('document_mime_type', part.mimeType)

            start_time = time.time()

            try:
                # FIXED: Check the MIME type first, then fall back to typeGroup
                is_image = (
                    (document_mime_type and document_mime_type.startswith('image/')) or
                    (part.mimeType and part.mimeType.startswith('image/')) or
                    (part.typeGroup == "image")
                )

                # Debug logging
                self.services.utils.debugLogToFile(f"Chunk {chunk_index}: document_mime_type={document_mime_type}, part.mimeType={part.mimeType}, part.typeGroup={part.typeGroup}, is_image={is_image}", "AI_SERVICE")
                logger.info(f"Chunk {chunk_index}: document_mime_type={document_mime_type}, part.mimeType={part.mimeType}, part.typeGroup={part.typeGroup}, is_image={is_image}")

                if is_image:
                    # Use the same extraction prompt for image analysis (contains the table JSON format)
                    self.services.utils.debugLogToFile(f"Processing image chunk {chunk_index}: mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}", "AI_SERVICE")

                    # Check whether image data is available
                    if not part.data:
                        error_msg = f"No image data available for chunk {chunk_index}"
                        logger.warning(error_msg)
                        ai_result = f"Error: {error_msg}"
                    else:
                        try:
                            # Import here to avoid circular imports
                            from modules.services.serviceAi.subCoreAi import SubCoreAi
                            core_ai = SubCoreAi(self.services, self.aiObjects)

                            ai_result = await core_ai.readImage(
                                prompt=prompt,
                                imageData=part.data,
                                mimeType=part.mimeType,
                                options=options
                            )

                            self.services.utils.debugLogToFile(f"Image analysis result for chunk {chunk_index}: length={len(ai_result) if ai_result else 0}, preview={ai_result[:200] if ai_result else 'None'}...", "AI_SERVICE")
                            # Save the image extraction response to a debug file
                            self.services.utils.writeDebugFile(ai_result or 'No response', f"extraction_image_chunk_{chunk_index}")

                            # Check whether the result is empty or None
                            if not ai_result or not ai_result.strip():
                                logger.warning(f"Image chunk {chunk_index} returned empty response from AI")
                                ai_result = "No content detected in image"

                        except Exception as e:
                            logger.error(f"Error processing image chunk {chunk_index}: {str(e)}")
                            ai_result = f"Error analyzing image: {str(e)}"

                    # If generating JSON, clean the image analysis result
                    if generate_json:
                        try:
                            # Clean the response: remove markdown code blocks if present
                            cleaned_result = ai_result.strip()

                            # Remove various markdown patterns
                            if cleaned_result.startswith('```json'):
                                cleaned_result = re.sub(r'^```json\s*', '', cleaned_result)
                                cleaned_result = re.sub(r'\s*```$', '', cleaned_result)
                            elif cleaned_result.startswith('```'):
                                cleaned_result = re.sub(r'^```\s*', '', cleaned_result)
                                cleaned_result = re.sub(r'\s*```$', '', cleaned_result)

                            # Remove any leading/trailing text that is not JSON:
                            # look for the first { and the last } to extract the JSON
                            first_brace = cleaned_result.find('{')
                            last_brace = cleaned_result.rfind('}')

                            if first_brace != -1 and last_brace != -1 and last_brace > first_brace:
                                cleaned_result = cleaned_result[first_brace:last_brace + 1]

                            # Additional cleaning for common AI response issues
                            cleaned_result = cleaned_result.strip()

                            # Validate the JSON
                            json.loads(cleaned_result)
                            ai_result = cleaned_result  # Use the cleaned version
                            self.services.utils.debugLogToFile(f"Image chunk {chunk_index} JSON validation successful", "AI_SERVICE")

                        except json.JSONDecodeError as e:
                            logger.warning(f"Image chunk {chunk_index} returned invalid JSON: {str(e)}")
                            logger.warning(f"Raw response was: '{ai_result[:500]}...'")

                            # Create fallback JSON with the actual response content (not the error message)
                            fallback_content = ai_result if ai_result and ai_result.strip() else "No content detected"

                            self.services.utils.debugLogToFile(f"IMAGE FALLBACK CONTENT PREVIEW: '{fallback_content[:200]}...'", "AI_SERVICE")

                            ai_result = json.dumps({
                                "metadata": {"title": f"Image Analysis - Chunk {chunk_index}"},
                                "sections": [{
                                    "id": f"image_section_{chunk_index}",
                                    "content_type": "paragraph",
                                    "elements": [{"text": fallback_content}]
                                }]
                            })
                            self.services.utils.debugLogToFile(f"Created fallback JSON for image chunk {chunk_index} with actual content", "AI_SERVICE")
                elif part.typeGroup in ("container", "binary"):
                    # Handle ALL container and binary content generically; let the AI process any document type
                    self.services.utils.debugLogToFile(f"DEBUG: Chunk {chunk_index}: typeGroup={part.typeGroup}, mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}", "AI_SERVICE")

                    # Skip empty container chunks (they are just metadata containers)
                    if part.typeGroup == "container" and (not part.data or len(part.data.strip()) == 0):
                        self.services.utils.debugLogToFile(f"DEBUG: Skipping empty container - mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}", "AI_SERVICE")
                        logger.info(f"Chunk {chunk_index}: Skipping empty container - mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}")
                        # Skip processing this chunk entirely; falling through here would
                        # reference ai_result before assignment
                        return None
                    elif part.mimeType and part.data and len(part.data.strip()) > 0:
                        # Process any document container as text content
                        request_options = options if options is not None else AiCallOptions()
                        request_options.operationType = OperationType.GENERAL
                        self.services.utils.debugLogToFile(f"EXTRACTION CONTAINER CHUNK {chunk_index}: Processing {part.mimeType} container as text with generate_json={generate_json}", "AI_SERVICE")
                        logger.info(f"Chunk {chunk_index}: Processing {part.mimeType} container as text with generate_json={generate_json}")

                        # Log the extraction prompt and context
                        self.services.utils.debugLogToFile(f"EXTRACTION PROMPT: {prompt}", "AI_SERVICE")
                        self.services.utils.debugLogToFile(f"EXTRACTION CONTEXT LENGTH: {len(part.data) if part.data else 0} characters", "AI_SERVICE")

                        # Strengthen the prompt to forbid fabrication for text/container extraction
                        augmented_prompt = (
                            f"{prompt}\n\n"
                            "CRITICAL RULES (NO FABRICATION):\n"
                            "- Use ONLY content present in the provided CONTEXT.\n"
                            "- Do NOT create, infer, or guess values not explicitly in the context.\n"
                            "- If a value is missing, leave the cell empty or omit the row.\n"
                        )
                        request = AiCallRequest(
                            prompt=augmented_prompt,
                            context=part.data,
                            options=request_options
                        )
                        response = await self.aiObjects.call(request)
                        ai_result = response.content

                        # Log the extraction response
                        self.services.utils.debugLogToFile(f"EXTRACTION RESPONSE LENGTH: {len(ai_result) if ai_result else 0} characters", "AI_SERVICE")

                        # Save the extraction prompt and response for debugging
                        self.services.utils.writeDebugFile(augmented_prompt, f"extraction-Chunk{chunk_index}-Prompt")
                        self.services.utils.writeDebugFile(ai_result or '', f"extraction-Chunk{chunk_index}-Response")

                        # If generating JSON, validate the response
                        if generate_json:
                            try:
                                # Clean the response: remove markdown code blocks if present
                                cleaned_result = ai_result.strip()

                                # Remove various markdown patterns
                                if cleaned_result.startswith('```json'):
                                    cleaned_result = re.sub(r'^```json\s*', '', cleaned_result)
                                    cleaned_result = re.sub(r'\s*```$', '', cleaned_result)
                                elif cleaned_result.startswith('```'):
                                    cleaned_result = re.sub(r'^```\s*', '', cleaned_result)
                                    cleaned_result = re.sub(r'\s*```$', '', cleaned_result)

                                # Remove any leading/trailing text that is not JSON:
                                # look for the first { and the last } to extract the JSON
                                first_brace = cleaned_result.find('{')
                                last_brace = cleaned_result.rfind('}')

                                if first_brace != -1 and last_brace != -1 and last_brace > first_brace:
                                    cleaned_result = cleaned_result[first_brace:last_brace + 1]

                                # Additional cleaning for common AI response issues
                                cleaned_result = cleaned_result.strip()

                                # Validate the JSON
                                json.loads(cleaned_result)
                                ai_result = cleaned_result  # Use the cleaned version

                            except json.JSONDecodeError as e:
                                logger.warning(f"Container chunk {chunk_index} ({part.mimeType}) returned invalid JSON: {str(e)}")
                                logger.warning(f"Raw response was: '{ai_result[:500]}...'")

                                # Create fallback JSON with the actual response content (not the error message)
                                fallback_content = ai_result if ai_result and ai_result.strip() else "No content detected"

                                self.services.utils.debugLogToFile(f"FALLBACK CONTENT PREVIEW: '{fallback_content[:200]}...'", "AI_SERVICE")

                                ai_result = json.dumps({
                                    "metadata": {"title": f"Document Analysis - Chunk {chunk_index}"},
                                    "sections": [{
                                        "id": f"analysis_section_{chunk_index}",
                                        "content_type": "paragraph",
                                        "elements": [{"text": fallback_content}]
                                    }]
                                })
                                self.services.utils.debugLogToFile(f"Created fallback JSON for container chunk {chunk_index} with actual content", "AI_SERVICE")
                    else:
                        # Skip empty or invalid container/binary content; don't create a result
                        self.services.utils.debugLogToFile(f"DEBUG: Chunk {chunk_index}: Skipping empty container - mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}", "AI_SERVICE")
                        # Return None to indicate this chunk should be skipped entirely
                        return None
                else:
                    # Ensure options is not None and set the correct operation type for text
                    request_options = options if options is not None else AiCallOptions()
                    # FIXED: Set the operation type to general for text processing
                    request_options.operationType = OperationType.GENERAL
                    self.services.utils.debugLogToFile(f"EXTRACTION CHUNK {chunk_index}: Calling aiObjects.call with operationType={request_options.operationType}, generate_json={generate_json}", "AI_SERVICE")
                    logger.info(f"Chunk {chunk_index}: Calling aiObjects.call with operationType={request_options.operationType}, generate_json={generate_json}")

                    # Log the extraction context length
                    self.services.utils.debugLogToFile(f"EXTRACTION CONTEXT LENGTH: {len(part.data) if part.data else 0} characters", "AI_SERVICE")

                    # Debug: log the actual prompt being sent to the AI
                    logger.debug(f"AI PROMPT PREVIEW: {prompt[:300]}...")
                    logger.debug(f"AI CONTEXT PREVIEW: {part.data[:200] if part.data else 'None'}...")

                    # Strengthen the prompt to forbid fabrication for text extraction
                    augmented_prompt_text = (
                        f"{prompt}\n\n"
                        "CRITICAL RULES (NO FABRICATION):\n"
                        "- Use ONLY content present in the provided CONTEXT.\n"
                        "- Do NOT create, infer, or guess values not explicitly in the context.\n"
                        "- If a value is missing, leave the cell empty or omit the row.\n"
                    )
                    request = AiCallRequest(
                        prompt=augmented_prompt_text,
                        context=part.data,
                        options=request_options
                    )
                    response = await self.aiObjects.call(request)

                    # Debug: log what the AI actually returned
                    logger.debug(f"AI RESPONSE PREVIEW: {response.content[:300] if response.content else 'None'}...")
                    ai_result = response.content

                    # Log the extraction response length
                    self.services.utils.debugLogToFile(f"EXTRACTION RESPONSE LENGTH: {len(ai_result) if ai_result else 0} characters", "AI_SERVICE")

                    # Save the extraction prompt and response for debugging
                    self.services.utils.writeDebugFile(augmented_prompt_text, f"extractionChunk{chunk_index}-Prompt")
                    self.services.utils.writeDebugFile(ai_result or '', f"extractionChunk{chunk_index}-Response")

                    # If generating JSON, validate the response
                    if generate_json:
                        try:
                            # Clean the response: remove markdown code blocks and extra formatting
                            cleaned_result = ai_result.strip()

                            # Remove any markdown code block markers (```json, ```, etc.)
                            cleaned_result = re.sub(r'^```(?:json)?\s*', '', cleaned_result, flags=re.MULTILINE)
                            cleaned_result = re.sub(r'\s*```\s*$', '', cleaned_result, flags=re.MULTILINE)

                            # Remove any remaining ``` markers anywhere in the text
                            cleaned_result = re.sub(r'```', '', cleaned_result)

                            # Try to extract JSON from the response if it is embedded in other text
                            json_match = re.search(r'\{.*\}', cleaned_result, re.DOTALL)
                            if json_match:
                                cleaned_result = json_match.group(0)

                            # Validate the JSON
                            json.loads(cleaned_result)
                            ai_result = cleaned_result  # Use the cleaned version

                        except json.JSONDecodeError as e:
                            logger.warning(f"Chunk {chunk_index} returned invalid JSON: {str(e)}")
                            # Create fallback JSON
                            ai_result = json.dumps({
                                "metadata": {"title": "Error Section"},
                                "sections": [{
                                    "id": f"error_section_{chunk_index}",
                                    "content_type": "paragraph",
                                    "elements": [{"text": f"Error parsing JSON: {str(e)}"}]
                                }]
                            })

                processing_time = time.time() - start_time

                logger.info(f"Chunk {chunk_index} processed: {len(ai_result)} chars in {processing_time:.2f}s")

                return ChunkResult(
                    originalChunk=part,
                    aiResult=ai_result,
                    chunkIndex=chunk_index,
                    documentId=document_id,
                    processingTime=processing_time,
                    metadata={
                        "success": True,
                        "chunkSize": len(part.data) if part.data else 0,
                        "resultSize": len(ai_result),
                        "typeGroup": part.typeGroup
                    }
                )

            except Exception as e:
                processing_time = time.time() - start_time
                logger.warning(f"Error processing chunk {chunk_index}: {str(e)}")

                return ChunkResult(
                    originalChunk=part,
                    aiResult=f"[Error processing chunk: {str(e)}]",
                    chunkIndex=chunk_index,
                    documentId=document_id,
                    processingTime=processing_time,
                    metadata={
                        "success": False,
                        "error": str(e),
                        "chunkSize": len(part.data) if part.data else 0,
                        "typeGroup": part.typeGroup
                    }
                )

        # Process chunks with concurrency control
        max_concurrent = 5  # Default concurrency
        if options and hasattr(options, 'maxConcurrentChunks'):
            max_concurrent = options.maxConcurrentChunks
        elif options and hasattr(options, 'maxParallelChunks'):
            max_concurrent = options.maxParallelChunks

        logger.info(f"Processing {len(chunks_to_process)} chunks with max concurrency: {max_concurrent}")
        self.services.utils.debugLogToFile(f"DEBUG: Chunks to process: {len(chunks_to_process)}", "AI_SERVICE")
        for i, chunk_info in enumerate(chunks_to_process):
            self.services.utils.debugLogToFile(f"DEBUG: Chunk {i}: typeGroup={chunk_info['part'].typeGroup}, mimeType={chunk_info['part'].mimeType}, data_length={len(chunk_info['part'].data) if chunk_info['part'].data else 0}", "AI_SERVICE")

        # Create a semaphore for concurrency control
        semaphore = asyncio.Semaphore(max_concurrent)

        async def process_with_semaphore(chunk_info):
            async with semaphore:
                return await process_single_chunk(chunk_info)

        # Process all chunks in parallel with concurrency control
        tasks = [process_with_semaphore(chunk_info) for chunk_info in chunks_to_process]
        self.services.utils.debugLogToFile(f"DEBUG: Created {len(tasks)} tasks for parallel processing", "AI_SERVICE")
        chunk_results = await asyncio.gather(*tasks, return_exceptions=True)
        self.services.utils.debugLogToFile(f"DEBUG: Got {len(chunk_results)} results from parallel processing", "AI_SERVICE")

        # Handle any exceptions raised inside the gather itself
        processed_results = []
        for i, result in enumerate(chunk_results):
            if isinstance(result, Exception):
                # Create an error ChunkResult
                chunk_info = chunks_to_process[i]
                processed_results.append(ChunkResult(
                    originalChunk=chunk_info['part'],
                    aiResult=f"[Error in parallel processing: {str(result)}]",
                    chunkIndex=chunk_info['chunk_index'],
                    documentId=chunk_info['document_id'],
                    processingTime=0.0,
                    metadata={"success": False, "error": str(result)}
                ))
            elif result is not None:
                # Only add non-None results (skip empty containers)
                processed_results.append(result)

        logger.info(f"Completed processing {len(processed_results)} chunks")
        return processed_results

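    # Sketch of the JSON clean-up applied above (hedged, illustrative values):
    #   raw     = '```json\n{"sections": []}\n```'
    #   cleaned = re.sub(r'^```(?:json)?\s*', '', raw, flags=re.MULTILINE)
    #   cleaned = re.sub(r'\s*```\s*$', '', cleaned, flags=re.MULTILINE)
    #   json.loads(cleaned)  # -> {'sections': []}
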
    def _mergeChunkResults(
        self,
        chunkResults: List[ChunkResult],
        options: Optional[AiCallOptions] = None
    ) -> str:
        """Merge chunk results while preserving document structure and chunk order."""

        if not chunkResults:
            return ""

        # Get the merging configuration from options
        chunk_separator = "\n\n---\n\n"
        include_document_headers = True
        include_chunk_metadata = False

        if options:
            if hasattr(options, 'chunkSeparator'):
                chunk_separator = options.chunkSeparator
            elif hasattr(options, 'mergeStrategy') and options.mergeStrategy:
                chunk_separator = options.mergeStrategy.get("chunkSeparator", "\n\n---\n\n")

            # Check for enhanced options
            if hasattr(options, 'preserveChunkMetadata'):
                include_chunk_metadata = options.preserveChunkMetadata

        # Group chunk results by document
        results_by_document = {}
        for chunk_result in chunkResults:
            doc_id = chunk_result.documentId
            if doc_id not in results_by_document:
                results_by_document[doc_id] = []
            results_by_document[doc_id].append(chunk_result)

        # Sort chunks within each document by chunk index
        for doc_id in results_by_document:
            results_by_document[doc_id].sort(key=lambda x: x.chunkIndex)

        # Merge the results for each document
        merged_documents = []

        for doc_id, doc_chunks in results_by_document.items():
            # Build the document header if enabled
            doc_header = ""
            if include_document_headers:
                doc_header = f"\n\n=== DOCUMENT: {doc_id} ===\n\n"

            # Merge the chunks for this document
            doc_content = ""
            for i, chunk_result in enumerate(doc_chunks):
                # Add the chunk separator (except before the first chunk)
                if i > 0:
                    doc_content += chunk_separator

                # Add the chunk content with optional metadata
                chunk_metadata = chunk_result.metadata
                if chunk_metadata.get("success", False):
                    chunk_content = chunk_result.aiResult

                    # Add chunk metadata if enabled
                    if include_chunk_metadata:
                        chunk_info = f"[Chunk {chunk_result.chunkIndex} - {chunk_metadata.get('typeGroup', 'unknown')} - {chunk_metadata.get('chunkSize', 0)} chars]"
                        chunk_content = f"{chunk_info}\n{chunk_content}"

                    doc_content += chunk_content
                else:
                    # Handle error chunks
                    error_msg = f"[ERROR in chunk {chunk_result.chunkIndex}: {chunk_metadata.get('error', 'Unknown error')}]"
                    doc_content += error_msg

            merged_documents.append(doc_header + doc_content)

        # Join all documents
        final_result = "\n\n".join(merged_documents)

        logger.info(f"Merged {len(chunkResults)} chunks from {len(results_by_document)} documents")
        return final_result.strip()

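    # Illustrative merged output with default settings (hedged; shapes only):
    #
    #   === DOCUMENT: doc-1 ===
    #
    #   <AI result of chunk 0>
    #
    #   ---
    #
    #   <AI result of chunk 1>
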
    def _mergeChunkResultsClean(
        self,
        chunkResults: List[ChunkResult],
        options: Optional[AiCallOptions] = None
    ) -> str:
        """Merge chunk results in CLEAN mode: no debug metadata or document headers."""

        if not chunkResults:
            return ""

        # Get the merging configuration from options
        chunk_separator = "\n\n"
        include_document_headers = False  # CLEAN MODE: no document headers
        include_chunk_metadata = False  # CLEAN MODE: no chunk metadata

        if options:
            if hasattr(options, 'chunkSeparator'):
                chunk_separator = options.chunkSeparator
            elif hasattr(options, 'mergeStrategy') and options.mergeStrategy:
                chunk_separator = options.mergeStrategy.get("chunkSeparator", "\n\n")

        # Group chunk results by document
        results_by_document = {}
        for chunk_result in chunkResults:
            doc_id = chunk_result.documentId
            if doc_id not in results_by_document:
                results_by_document[doc_id] = []
            results_by_document[doc_id].append(chunk_result)

        # Sort chunks within each document by chunk index
        for doc_id in results_by_document:
            results_by_document[doc_id].sort(key=lambda x: x.chunkIndex)

        # Merge the results for each document in CLEAN mode
        merged_documents = []

        for doc_id, doc_chunks in results_by_document.items():
            # CLEAN MODE: no document headers
            doc_header = ""

            # Merge the chunks for this document
            doc_content = ""
            for i, chunk_result in enumerate(doc_chunks):
                # Add the chunk separator (except before the first chunk)
                if i > 0:
                    doc_content += chunk_separator

                # Add the chunk content without metadata
                chunk_metadata = chunk_result.metadata
                if chunk_metadata.get("success", False):
                    chunk_content = chunk_result.aiResult

                    # CLEAN MODE: skip container/binary chunks entirely
                    if chunk_content.startswith("[Skipped ") and "content:" in chunk_content:
                        continue

                    # CLEAN MODE: skip empty or whitespace-only chunks
                    if not chunk_content.strip():
                        continue

                    # CLEAN MODE: no chunk metadata
                    doc_content += chunk_content
                else:
                    # Handle error chunks silently in clean mode
                    continue

            merged_documents.append(doc_header + doc_content)

        # Join all documents
        final_result = "\n\n".join(merged_documents)

        return final_result.strip()

    def _mergeChunkResultsJson(
        self,
        chunkResults: List[ChunkResult],
        options: Optional[AiCallOptions] = None
    ) -> Dict[str, Any]:
        """Merge chunk results in JSON mode; returns a structured JSON document."""

        if not chunkResults:
            return {"metadata": {"title": "Empty Document"}, "sections": []}

        # Group chunk results by document
        results_by_document = {}
        for chunk_result in chunkResults:
            doc_id = chunk_result.documentId
            if doc_id not in results_by_document:
                results_by_document[doc_id] = []
            results_by_document[doc_id].append(chunk_result)

        # Sort chunks within each document by chunk index
        for doc_id in results_by_document:
            results_by_document[doc_id].sort(key=lambda x: x.chunkIndex)

        # Merge the JSON results for each document
        all_documents = []
        all_sections = []
        document_titles = []
        combined_metadata = {"title": "Merged Document", "splitStrategy": "by_section"}

        for doc_id, doc_chunks in results_by_document.items():
            # Process each chunk's JSON result
            for chunk_result in doc_chunks:
                chunk_metadata = chunk_result.metadata
                if chunk_metadata.get("success", False):
                    try:
                        # Parse the JSON from the AI result
                        chunk_json = json.loads(chunk_result.aiResult)

                        # Check whether this is a multi-file response (has a "documents" key)
                        if isinstance(chunk_json, dict) and "documents" in chunk_json:
                            # This is a multi-file response; merge all documents
                            logger.debug(f"Processing multi-file response from chunk {chunk_result.chunkIndex} with {len(chunk_json['documents'])} documents")

                            # Add all documents from this chunk
                            for doc in chunk_json["documents"]:
                                # Add chunk context to the document
                                doc["metadata"] = doc.get("metadata", {})
                                doc["metadata"]["source_chunk"] = chunk_result.chunkIndex
                                doc["metadata"]["source_document"] = doc_id
                                all_documents.append(doc)

                            # Update the combined metadata
                            if "metadata" in chunk_json:
                                combined_metadata.update(chunk_json["metadata"])

                        # Extract sections from a single-file response (fallback)
                        elif isinstance(chunk_json, dict) and "sections" in chunk_json:
                            for section in chunk_json["sections"]:
                                # Add document context to the section
                                section["metadata"] = section.get("metadata", {})
                                section["metadata"]["source_document"] = doc_id
                                section["metadata"]["chunk_index"] = chunk_result.chunkIndex
                                all_sections.append(section)

                        # Extract the document title
                        if isinstance(chunk_json, dict) and "metadata" in chunk_json:
                            title = chunk_json["metadata"].get("title", "")
                            if title and title not in document_titles:
                                document_titles.append(title)

                    except json.JSONDecodeError as e:
                        logger.warning(f"Failed to parse JSON from chunk {chunk_result.chunkIndex}: {str(e)}")
                        # Create a fallback section for invalid JSON
                        fallback_section = {
                            "id": f"error_section_{chunk_result.chunkIndex}",
                            "title": "Error Section",
                            "content_type": "paragraph",
                            "elements": [{
                                "text": f"Error parsing chunk {chunk_result.chunkIndex}: {str(e)}"
                            }],
                            "order": chunk_result.chunkIndex,
                            "metadata": {
                                "source_document": doc_id,
                                "chunk_index": chunk_result.chunkIndex,
                                "error": str(e)
                            }
                        }
                        all_sections.append(fallback_section)
                else:
                    # Handle error chunks
                    error_section = {
                        "id": f"error_section_{chunk_result.chunkIndex}",
                        "title": "Error Section",
                        "content_type": "paragraph",
                        "elements": [{
                            "text": f"Error in chunk {chunk_result.chunkIndex}: {chunk_metadata.get('error', 'Unknown error')}"
                        }],
                        "order": chunk_result.chunkIndex,
                        "metadata": {
                            "source_document": doc_id,
                            "chunk_index": chunk_result.chunkIndex,
                            "error": chunk_metadata.get('error', 'Unknown error')
                        }
                    }
                    all_sections.append(error_section)

        # Sort sections by order
        all_sections.sort(key=lambda x: x.get("order", 0))

        # If we have merged documents from multi-file responses, return them
        if all_documents:
            logger.info(f"Merged {len(all_documents)} documents from {len(chunkResults)} chunks")
            return {
                "metadata": combined_metadata,
                "documents": all_documents
            }

        # Otherwise, create a merged document with sections (single-file fallback)
        merged_document = {
            "metadata": {
                "title": document_titles[0] if document_titles else "Merged Document",
                "source_documents": list(results_by_document.keys()),
                "extraction_method": "ai_json_extraction",
                "version": "1.0"
            },
            "sections": all_sections,
            "summary": f"Merged document from {len(results_by_document)} source documents",
            "tags": ["merged", "ai_generated"]
        }

        logger.info(f"Merged {len(chunkResults)} chunks from {len(results_by_document)} documents (JSON mode)")
        return merged_document

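    # Illustrative merged result in the single-file fallback (hedged; values are examples):
    #   {
    #     "metadata": {"title": "Merged Document", "source_documents": ["doc-1"],
    #                  "extraction_method": "ai_json_extraction", "version": "1.0"},
    #     "sections": [{"id": "section_1", "content_type": "paragraph",
    #                   "elements": [{"text": "..."}], "order": 1,
    #                   "metadata": {"source_document": "doc-1", "chunk_index": 0}}],
    #     "summary": "Merged document from 1 source documents",
    #     "tags": ["merged", "ai_generated"]
    #   }
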
    def _getModelCapabilitiesForContent(self, prompt: str, documents: Optional[List[ChatDocument]], options: AiCallOptions) -> Dict[str, int]:
        """
        Get model capabilities for content processing, including appropriate size limits for chunking.
        """
        # Estimate the total content size
        prompt_size = len(prompt.encode('utf-8'))
        document_size = 0
        if documents:
            # Rough estimate of the document content size
            for doc in documents:
                document_size += doc.fileSize or 0

        total_size = prompt_size + document_size

        # Use AiObjects to select the best model for this content size.
        # We simulate model selection by checking the available models.
        from modules.interfaces.interfaceAiObjects import aiModels

        # Find the best model for this content size and operation
        best_model = None
        best_context_length = 0

        for model_name, model_info in aiModels.items():
            context_length = model_info.get("contextLength", 0)

            # Skip models with no context length
            if context_length == 0:
                continue

            # Check whether the model supports the operation type
            capabilities = model_info.get("capabilities", [])
            if options.operationType == OperationType.IMAGE_ANALYSIS and "image_analysis" not in capabilities:
                continue
            elif options.operationType == OperationType.IMAGE_GENERATION and "image_generation" not in capabilities:
                continue
            elif options.operationType == OperationType.WEB_RESEARCH and "web_search" not in capabilities:
                continue
            elif "text_generation" not in capabilities:
                continue

            # Prefer models that can handle the content without chunking, but allow chunking if needed
            if context_length >= total_size * 0.8:  # 80% of the content size
                if context_length > best_context_length:
                    best_model = model_info
                    best_context_length = context_length
            elif best_model is None:  # Fall back to the largest available model
                if context_length > best_context_length:
                    best_model = model_info
                    best_context_length = context_length

        # Fall back to a reasonable default if no model was found
        if best_model is None:
            best_model = {
                "contextLength": 128000,  # GPT-4o default
                "llmName": "gpt-4o"
            }

        # Calculate appropriate sizes.
        # Convert tokens to bytes (rough estimate: 1 token ≈ 4 characters)
        context_length_bytes = int(best_model["contextLength"] * 4)
        max_context_bytes = int(context_length_bytes * 0.9)  # 90% of the context length
        text_chunk_size = int(max_context_bytes * 0.7)  # 70% of the max context for text chunks
        image_chunk_size = int(max_context_bytes * 0.8)  # 80% of the max context for image chunks

        logger.debug(f"Selected model: {best_model.get('llmName', 'unknown')} with context length: {best_model['contextLength']}")
        logger.debug(f"Content size: {total_size} bytes, Max context: {max_context_bytes} bytes")
        logger.debug(f"Text chunk size: {text_chunk_size} bytes, Image chunk size: {image_chunk_size} bytes")

        return {
            "maxContextBytes": max_context_bytes,
            "textChunkSize": text_chunk_size,
            "imageChunkSize": image_chunk_size
        }
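
    # Worked example (hedged): with the GPT-4o fallback above,
    #   context_length_bytes = 128000 * 4   = 512000
    #   max_context_bytes    = 512000 * 0.9 = 460800
    #   text_chunk_size      = 460800 * 0.7 = 322560
    #   image_chunk_size     = 460800 * 0.8 = 368640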