gateway/modules/services/serviceAi/subDocumentProcessing.py

1459 lines
72 KiB
Python

import json
import logging
import re
import time
from typing import Dict, Any, List, Optional, Tuple, Union
from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum
from modules.datamodels.datamodelExtraction import ChunkResult, ContentExtracted
from modules.services.serviceExtraction.mainServiceExtraction import ExtractionService
logger = logging.getLogger(__name__)
class SubDocumentProcessing:
"""Document processing operations including chunking, processing, and merging."""
def __init__(self, services, aiObjects):
"""Initialize document processing service.
Args:
services: Service center instance for accessing other services
aiObjects: Initialized AiObjects instance
"""
self.services = services
self.aiObjects = aiObjects
self._extractionService = None
@property
def extractionService(self):
"""Lazy initialization of extraction service."""
if self._extractionService is None:
logger.info("Lazy initializing ExtractionService...")
self._extractionService = ExtractionService(self.services)
return self._extractionService
def _calculateMaxContextBytes(self, options: Optional[AiCallOptions]) -> int:
"""Calculate maximum context bytes based on model capabilities and options."""
if options and options.maxContextBytes:
return options.maxContextBytes
# Default model capabilities (this should be enhanced with actual model registry)
defaultMaxTokens = 4000
safetyMargin = options.safetyMargin if options else 0.1
# Calculate bytes (4 chars per token estimation)
maxContextBytes = int(defaultMaxTokens * (1 - safetyMargin) * 4)
return maxContextBytes
async def processDocumentsPerChunk(
self,
documents: List[ChatDocument],
prompt: str,
options: Optional[AiCallOptions] = None
) -> str:
"""
Process documents with model-aware chunking and merge results.
NEW: Uses model-aware chunking in AI call phase instead of extraction phase.
Args:
documents: List of ChatDocument objects to process
prompt: AI prompt for processing
options: AI call options
Returns:
Merged AI results as string with preserved document structure
"""
if not documents:
return ""
# Build extraction options WITHOUT chunking parameters
extractionOptions: Dict[str, Any] = {
"prompt": prompt,
"operationType": options.operationType if options else "general",
"processDocumentsIndividually": True,
# REMOVED: maxSize, textChunkSize, imageChunkSize
"mergeStrategy": {
"useIntelligentMerging": True,
"prompt": prompt,
"groupBy": "typeGroup",
"orderBy": "id",
"mergeType": "concatenate"
},
}
logger.debug(f"Per-chunk extraction options: prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}")
try:
# Extract content WITHOUT chunking
extractionResult = self.extractionService.extractContent(documents, extractionOptions)
if not isinstance(extractionResult, list):
return "[Error: No extraction results]"
# Process parts (not chunks) with model-aware AI calls
partResults = await self._processPartsWithMapping(extractionResult, prompt, options)
# Merge results using existing merging system
mergedContent = self._mergePartResults(partResults, options)
# Save merged extraction content to debug
self.services.utils.writeDebugFile(mergedContent or '', "extractionMergedText")
return mergedContent
except Exception as e:
logger.error(f"Error in per-chunk processing: {str(e)}")
return f"[Error in per-chunk processing: {str(e)}]"
async def processDocumentsPerChunkJson(
    self,
    documents: List[ChatDocument],
    prompt: str,
    options: Optional[AiCallOptions] = None
) -> Dict[str, Any]:
    """
    Process documents with model-aware chunking and merge results in JSON mode.
    Returns structured JSON document instead of text.

    Pipeline: extract content (no chunking at extraction time), run each
    part through the AI in parallel, convert part results to a JSON
    document, then attempt table normalization. Normalization is
    best-effort: any failure is logged and the un-normalized merged JSON
    is returned instead.

    Args:
        documents: Documents to process; empty input short-circuits.
        prompt: AI prompt used for processing and for normalization mapping.
        options: Optional AI call options (operationType is read).

    Returns:
        Merged JSON document dict; a stub "Error Document" on failure.
    """
    if not documents:
        return {"metadata": {"title": "Empty Document"}, "sections": []}
    # Build extraction options WITHOUT chunking parameters
    extractionOptions: Dict[str, Any] = {
        "prompt": prompt,
        "operationType": options.operationType if options else "general",
        "processDocumentsIndividually": True,
        "mergeStrategy": {
            "useIntelligentMerging": True,
            "prompt": prompt,
            "groupBy": "typeGroup",
            "orderBy": "id",
            "mergeType": "concatenate"
        },
    }
    logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}")
    try:
        # Extract content WITHOUT chunking
        extractionResult = self.extractionService.extractContent(documents, extractionOptions)
        if not isinstance(extractionResult, list):
            return {"metadata": {"title": "Error Document"}, "sections": []}
        # Process parts with model-aware chunking
        partResults = await self._processPartsWithMapping(extractionResult, prompt, options)
        # Convert to JSON format (simplified for now)
        mergedJsonDocument = self._convertPartResultsToJson(partResults, options)
        # Normalize merged JSON into a single canonical table (only if table content exists)
        try:
            # Imported lazily here to avoid a module-level dependency cycle
            # with the normalization service.
            from modules.services.serviceNormalization.mainServiceNormalization import NormalizationService
            normalizer = NormalizationService(self.services)
            inventory = normalizer.discoverStructures(mergedJsonDocument)
            # Check if any table content was discovered
            tableHeaders = inventory.get("tableHeaders", [])
            if not tableHeaders:
                logger.info("No table content found in merged JSON, skipping normalization and returning original structure")
            else:
                # Use workflow id as cache key
                cacheKey = self.services.currentWorkflow.id
                # Provide the extraction/merge prompt context when available to help mapping
                mergePrompt = prompt
                mapping = await normalizer.requestHeaderMapping(inventory, cacheKey, None, mergePrompt)
                canonical = normalizer.applyMapping(mergedJsonDocument, mapping)
                report = normalizer.validateCanonical(canonical)
                if report.get('success'):
                    mergedJsonDocument = canonical
                else:
                    # Routes into the inner except below, which logs and
                    # keeps the un-normalized document.
                    raise ValueError('Normalization produced zero rows')
        except Exception as e:
            # Log normalization failure but don't re-raise - continue with original merged JSON
            logger.warning(f"Normalization failed (expected): {str(e)}")
            logger.debug(f"Normalization error type: {type(e).__name__}")
            # Continue with original merged JSON instead of re-raising
        # Save merged JSON extraction content to debug
        jsonStr = json.dumps(mergedJsonDocument, ensure_ascii=False, indent=2)
        self.services.utils.writeDebugFile(jsonStr, "extractionMergedJson")
        return mergedJsonDocument
    except Exception as e:
        logger.error(f"Error in per-chunk processing (JSON mode): {str(e)}")
        logger.error(f"Exception type: {type(e).__name__}")
        logger.error(f"Exception args: {e.args}")
        import traceback
        logger.error(f"Traceback: {traceback.format_exc()}")
        return {"metadata": {"title": "Error Document"}, "sections": []}
async def processDocumentsPerChunkJsonWithPrompt(
self,
documents: List[ChatDocument],
custom_prompt: str,
options: Optional[AiCallOptions] = None
) -> Dict[str, Any]:
"""
Process documents with per-chunk AI calls and merge results in JSON mode.
Uses a custom prompt instead of the default extraction prompt.
Enhanced with partial results continuation logic.
"""
if not documents:
return {"metadata": {"title": "Empty Document"}, "sections": []}
# Get model capabilities for size calculation
model_capabilities = self._getModelCapabilitiesForContent(custom_prompt, documents, options)
# Build extraction options for chunking with intelligent merging
extractionOptions: Dict[str, Any] = {
"prompt": custom_prompt, # Use the custom prompt instead of default
"operationType": options.operationType if options else "general",
"processDocumentsIndividually": True, # Process each document separately
"maxSize": model_capabilities["maxContextBytes"],
"chunkAllowed": True,
"textChunkSize": model_capabilities["textChunkSize"],
"imageChunkSize": model_capabilities["imageChunkSize"],
"imageMaxPixels": 1024 * 1024,
"imageQuality": 85,
"mergeStrategy": {
"useIntelligentMerging": True, # Enable intelligent token-aware merging
"capabilities": model_capabilities,
"prompt": custom_prompt, # Use the custom prompt
"groupBy": "typeGroup",
"orderBy": "id",
"mergeType": "concatenate"
},
}
logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}")
try:
# Extract content with chunking
extractionResult = self.extractionService.extractContent(documents, extractionOptions)
if not isinstance(extractionResult, list):
return {"metadata": {"title": "Error Document"}, "sections": []}
# Process chunks with proper mapping
logger.info(f"Processing {len(extractionResult)} chunks with custom prompt")
logger.debug(f"Custom prompt preview: {custom_prompt[:200]}...")
# Debug: Show what content is being processed (before filtering)
for i, ec in enumerate(extractionResult):
if hasattr(ec, 'parts'):
for j, part in enumerate(ec.parts):
if not (hasattr(part, 'data') and part.data):
# Check if this is an empty container chunk (which is expected)
part_type = getattr(part, 'typeGroup', None)
part_mime = getattr(part, 'mimeType', '')
is_empty_container = (
part_type == "container" and
part_mime and
'document' in part_mime.lower()
)
if not is_empty_container:
logger.warning(f"Part {j} has no data - typeGroup='{part_type}', mimeType='{part_mime}'")
chunkResults = await self._processChunksWithMapping(extractionResult, custom_prompt, options, generate_json=True)
# Debug: Show what chunks were actually processed (after filtering)
logger.info(f"After filtering: {len(chunkResults)} chunks will be processed")
# Merge with JSON mode
mergedJsonDocument = self._mergeChunkResultsJson(chunkResults, options)
# Debug: Show what the AI actually returned
logger.info(f"AI returned document with keys: {list(mergedJsonDocument.keys())}")
if 'documents' in mergedJsonDocument:
logger.info(f"Number of documents: {len(mergedJsonDocument['documents'])}")
elif 'sections' in mergedJsonDocument:
logger.info(f"Number of sections: {len(mergedJsonDocument['sections'])}")
return mergedJsonDocument
except Exception as e:
logger.error(f"Error in per-chunk JSON processing: {str(e)}")
return {"metadata": {"title": "Error Document"}, "sections": []}
async def processDocumentsWithContinuation(
self,
documents: List[ChatDocument],
custom_prompt: str,
options: Optional[AiCallOptions] = None
) -> Dict[str, Any]:
"""
Process documents with partial results continuation logic.
Handles AI responses that indicate partial completion and loops until complete.
"""
if not documents:
return {"metadata": {"title": "Empty Document"}, "sections": []}
logger.info("Starting document processing with continuation logic")
# Build enhanced prompt with continuation instructions
enhanced_prompt = self._buildContinuationPrompt(custom_prompt)
# Process with continuation logic
return await self._processWithContinuationLoop(documents, enhanced_prompt, options)
def _buildContinuationPrompt(self, base_prompt: str) -> str:
    """
    Build a prompt that includes partial results continuation instructions.

    Appends a fixed instruction block telling the AI how to signal a
    partial response ("continue": true plus a "continuation_context"
    object) and the exact JSON output structure expected.

    Args:
        base_prompt: The caller's original prompt.

    Returns:
        base_prompt with the continuation instruction block appended.
    """
    # NOTE: this literal is part of the AI contract — the "continue" and
    # "continuation_context" fields it describes are read back by
    # _processWithContinuationLoop; keep them in sync.
    continuation_instructions = """
IMPORTANT CHUNKING LOGIC:
- If the response is too large to generate completely in one response, set "continue": true
- When "continue": true, include a "continuation_context" field with:
- "last_section_id": "id of the last completed section"
- "last_element_index": "index of the last completed element in that section"
- "remaining_requirements": "brief description of what still needs to be generated"
- The AI will be called again with this context to continue generation
- Only set "continue": false when the response is completely generated
OUTPUT FORMAT: Return only valid JSON in this exact structure:
{
"metadata": {
"title": "Document Title"
},
"sections": [
{
"id": "section_1",
"content_type": "paragraph",
"elements": [
{
"text": "This is the actual content that should be generated."
}
],
"order": 1
}
],
"continue": false,
"continuation_context": {
"last_section_id": "section_1",
"last_element_index": 0,
"remaining_requirements": "description of what still needs to be generated"
}
}
The AI should generate content using the canonical format with "sections" and "elements".
"""
    return f"{base_prompt}{continuation_instructions}"
async def _processWithContinuationLoop(
self,
documents: List[ChatDocument],
enhanced_prompt: str,
options: Optional[AiCallOptions] = None
) -> Dict[str, Any]:
"""
Process documents with continuation loop until complete.
"""
max_iterations = 10 # Prevent infinite loops
iteration = 0
accumulated_sections = []
continuation_context = None
while iteration < max_iterations:
iteration += 1
logger.info(f"Continuation iteration {iteration}/{max_iterations}")
# Build prompt for this iteration
if continuation_context:
iteration_prompt = self._buildContinuationIterationPrompt(
enhanced_prompt, continuation_context, accumulated_sections
)
else:
iteration_prompt = enhanced_prompt
# Process documents for this iteration
try:
# Use the existing processing method
result = await self.processDocumentsPerChunkJsonWithPrompt(
documents, iteration_prompt, options
)
# Check if this is a valid JSON response
if not isinstance(result, dict):
logger.warning(f"Iteration {iteration}: Invalid result type, stopping")
break
# Extract sections from result
sections = result.get("sections", [])
if not sections:
logger.warning(f"Iteration {iteration}: No sections found, stopping")
break
# Add sections to accumulated results
for section in sections:
# Update section order to maintain sequence
section["order"] = len(accumulated_sections) + 1
accumulated_sections.append(section)
# Check if continuation is needed
continue_flag = result.get("continue", False)
continuation_context = result.get("continuation_context")
logger.info(f"Iteration {iteration}: Added {len(sections)} sections, continue={continue_flag}")
if not continue_flag:
logger.info(f"Continuation complete after {iteration} iterations")
break
if not continuation_context:
logger.warning(f"Iteration {iteration}: continue=true but no continuation_context, stopping")
break
except Exception as e:
logger.error(f"Iteration {iteration} failed: {str(e)}")
break
if iteration >= max_iterations:
logger.warning(f"Continuation stopped after maximum iterations ({max_iterations})")
# Build final result
final_result = {
"metadata": {
"title": "Generated Document",
"total_sections": len(accumulated_sections),
"iterations": iteration,
"continuation_used": iteration > 1
},
"sections": accumulated_sections,
"continue": False
}
logger.info(f"Final result: {len(accumulated_sections)} sections from {iteration} iterations")
return final_result
def _buildContinuationIterationPrompt(
self,
base_prompt: str,
continuation_context: Dict[str, Any],
accumulated_sections: List[Dict[str, Any]]
) -> str:
"""
Build a prompt for continuation iteration with context.
"""
last_section_id = continuation_context.get("last_section_id", "")
last_element_index = continuation_context.get("last_element_index", 0)
remaining_requirements = continuation_context.get("remaining_requirements", "")
# Build context of what's already been generated
context_summary = "PREVIOUSLY GENERATED CONTENT:\n"
for i, section in enumerate(accumulated_sections[-3:]): # Show last 3 sections for context
context_summary += f"Section {i+1}: {section.get('id', 'unknown')}\n"
if 'elements' in section and section['elements']:
first_element = section['elements'][0]
if 'text' in first_element:
preview = first_element['text'][:100] + "..." if len(first_element['text']) > 100 else first_element['text']
context_summary += f" Preview: {preview}\n"
continuation_prompt = f"""
{base_prompt}
{context_summary}
CONTINUATION INSTRUCTIONS:
- Continue from where you left off
- Last completed section: {last_section_id}
- Last completed element index: {last_element_index}
- Remaining requirements: {remaining_requirements}
- Generate the next part of the content
- Maintain consistency with previously generated content
- Use the same JSON format as before
- Set "continue": true if more content is needed, false if complete
"""
return continuation_prompt
async def callAiText(
self,
prompt: str,
documents: Optional[List[ChatDocument]],
options: AiCallOptions
) -> str:
"""
Handle text calls with document processing through ExtractionService.
UNIFIED PROCESSING: Always use per-chunk processing for consistency.
"""
return await self.processDocumentsPerChunk(documents, prompt, options)
async def _processPartsWithMapping(
    self,
    extractionResult: List[ContentExtracted],
    prompt: str,
    options: Optional[AiCallOptions] = None
) -> List['PartResult']:
    """Fan content parts out to parallel AI calls, preserving their mapping.

    Parts are processed concurrently (bounded by a semaphore) and each
    result keeps its original part, index and document id so the merge
    step can reassemble them in order. Failures are captured as error
    PartResults instead of being raised.
    """
    from modules.datamodels.datamodelExtraction import PartResult
    from modules.datamodels.datamodelAi import AiCallRequest
    import asyncio
    processable = ("text", "table", "structure", "image", "container", "binary")
    pending = []
    next_index = 0
    for extracted in extractionResult:
        for part in extracted.parts:
            if part.typeGroup not in processable:
                continue
            # Empty containers carry no content worth sending to the AI.
            if part.typeGroup == "container" and (not part.data or len(part.data.strip()) == 0):
                logger.debug(f"Skipping empty container part: mimeType={part.mimeType}")
                continue
            pending.append({
                'part': part,
                'part_index': next_index,
                'document_id': extracted.id
            })
            next_index += 1
    logger.info(f"Processing {len(pending)} parts with model-aware chunking")

    async def run_one(info: Dict) -> PartResult:
        # Process a single content part and wrap the outcome (or error)
        # in a PartResult carrying its original mapping.
        part = info['part']
        idx = info['part_index']
        doc_id = info['document_id']
        started = time.time()
        try:
            request = AiCallRequest(
                prompt=prompt,
                context="",  # the content travels in the content part itself
                options=options,
                contentParts=[part]
            )
            response = await self.aiObjects.call(request)
            elapsed = time.time() - started
            return PartResult(
                originalPart=part,
                aiResult=response.content,
                partIndex=idx,
                documentId=doc_id,
                processingTime=elapsed,
                metadata={
                    "success": True,
                    "partSize": len(part.data) if part.data else 0,
                    "resultSize": len(response.content),
                    "typeGroup": part.typeGroup,
                    "modelName": response.modelName,
                    "priceUsd": response.priceUsd
                }
            )
        except Exception as e:
            elapsed = time.time() - started
            logger.warning(f"Error processing part {idx}: {str(e)}")
            return PartResult(
                originalPart=part,
                aiResult=f"[Error processing part: {str(e)}]",
                partIndex=idx,
                documentId=doc_id,
                processingTime=elapsed,
                metadata={
                    "success": False,
                    "error": str(e),
                    "partSize": len(part.data) if part.data else 0,
                    "typeGroup": part.typeGroup
                }
            )

    # Bound parallelism; callers may override via options.maxConcurrentParts.
    limit = 5
    if options and hasattr(options, 'maxConcurrentParts'):
        limit = options.maxConcurrentParts
    gate = asyncio.Semaphore(limit)

    async def run_gated(info):
        async with gate:
            return await run_one(info)

    raw = await asyncio.gather(*(run_gated(info) for info in pending), return_exceptions=True)
    # Fold gather-level exceptions into error PartResults; drop None results.
    results = []
    for info, outcome in zip(pending, raw):
        if isinstance(outcome, Exception):
            results.append(PartResult(
                originalPart=info['part'],
                aiResult=f"[Error in parallel processing: {str(outcome)}]",
                partIndex=info['part_index'],
                documentId=info['document_id'],
                processingTime=0.0,
                metadata={"success": False, "error": str(outcome)}
            ))
        elif outcome is not None:
            results.append(outcome)
    logger.info(f"Completed processing {len(results)} parts")
    return results
async def _processChunksWithMapping(
    self,
    extractionResult: List[ContentExtracted],
    prompt: str,
    options: Optional[AiCallOptions] = None,
    generate_json: bool = False
) -> List[ChunkResult]:
    """Process chunks with proper mapping to preserve relationships.

    Fans extracted chunks out to parallel AI calls (bounded by a
    semaphore) while keeping each result tied to its original chunk,
    index and document id. Image chunks go through image analysis;
    container/binary and plain text chunks go through regular AI calls
    with an anti-fabrication prompt suffix. When generate_json is set,
    each response is stripped of markdown fences, validated as JSON, and
    wrapped in a fallback JSON document on parse failure.

    Args:
        extractionResult: Extracted documents whose parts become chunks.
        prompt: Prompt sent with every chunk.
        options: Optional AI call options. NOTE: mutated in place —
            operationType is forced to GENERAL on the container/text paths.
        generate_json: When True, clean/validate every AI response as JSON.

    Returns:
        List of ChunkResult objects; empty containers are dropped.
    """
    from modules.datamodels.datamodelExtraction import ChunkResult
    import asyncio
    # Collect all chunks that need processing with proper indexing
    chunks_to_process = []
    chunk_index = 0
    for ec in extractionResult:
        # Get document MIME type from metadata (first part that carries it wins)
        document_mime_type = None
        for part in ec.parts:
            if part.metadata and 'documentMimeType' in part.metadata:
                document_mime_type = part.metadata['documentMimeType']
                break
        for part in ec.parts:
            if part.typeGroup in ("text", "table", "structure", "image", "container", "binary"):
                # Skip empty container chunks (they're just metadata containers)
                if part.typeGroup == "container" and (not part.data or len(part.data.strip()) == 0):
                    logger.debug(f"Skipping empty container chunk: mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}")
                    continue
                chunks_to_process.append({
                    'part': part,
                    'chunk_index': chunk_index,
                    'document_id': ec.id,
                    'document_mime_type': document_mime_type
                })
                chunk_index += 1
    logger.info(f"Processing {len(chunks_to_process)} chunks with proper mapping")
    # Process chunks in parallel with proper mapping
    async def process_single_chunk(chunk_info: Dict) -> ChunkResult:
        # Process one chunk; returns a ChunkResult, or None for chunks
        # that must be skipped entirely (empty containers).
        part = chunk_info['part']
        chunk_index = chunk_info['chunk_index']
        document_id = chunk_info['document_id']
        document_mime_type = chunk_info.get('document_mime_type', part.mimeType)
        start_time = time.time()
        try:
            # FIXED: Check MIME type first, then fallback to typeGroup
            is_image = (
                (document_mime_type and document_mime_type.startswith('image/')) or
                (part.mimeType and part.mimeType.startswith('image/')) or
                (part.typeGroup == "image")
            )
            # Debug logging
            self.services.utils.debugLogToFile(f"Chunk {chunk_index}: document_mime_type={document_mime_type}, part.mimeType={part.mimeType}, part.typeGroup={part.typeGroup}, is_image={is_image}", "AI_SERVICE")
            logger.info(f"Chunk {chunk_index}: document_mime_type={document_mime_type}, part.mimeType={part.mimeType}, part.typeGroup={part.typeGroup}, is_image={is_image}")
            if is_image:
                # Use the same extraction prompt for image analysis (contains table JSON format)
                self.services.utils.debugLogToFile(f"Processing image chunk {chunk_index}: mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}", "AI_SERVICE")
                # Check if image data is available
                if not part.data:
                    error_msg = f"No image data available for chunk {chunk_index}"
                    logger.warning(error_msg)
                    ai_result = f"Error: {error_msg}"
                else:
                    try:
                        # Import here to avoid circular imports
                        from modules.services.serviceAi.subCoreAi import SubCoreAi
                        core_ai = SubCoreAi(self.services, self.aiObjects)
                        ai_result = await core_ai.readImage(
                            prompt=prompt,
                            imageData=part.data,
                            mimeType=part.mimeType,
                            options=options
                        )
                        self.services.utils.debugLogToFile(f"Image analysis result for chunk {chunk_index}: length={len(ai_result) if ai_result else 0}, preview={ai_result[:200] if ai_result else 'None'}...", "AI_SERVICE")
                        # Save image extraction response to debug file
                        self.services.utils.writeDebugFile(ai_result or 'No response', f"extraction_image_chunk_{chunk_index}")
                        # Check if result is empty or None
                        if not ai_result or not ai_result.strip():
                            logger.warning(f"Image chunk {chunk_index} returned empty response from AI")
                            ai_result = "No content detected in image"
                    except Exception as e:
                        logger.error(f"Error processing image chunk {chunk_index}: {str(e)}")
                        ai_result = f"Error analyzing image: {str(e)}"
                # If generating JSON, clean image analysis result
                if generate_json:
                    try:
                        # Clean the response - remove markdown code blocks if present
                        cleaned_result = ai_result.strip()
                        # Remove various markdown patterns
                        if cleaned_result.startswith('```json'):
                            cleaned_result = re.sub(r'^```json\s*', '', cleaned_result)
                            cleaned_result = re.sub(r'\s*```$', '', cleaned_result)
                        elif cleaned_result.startswith('```'):
                            cleaned_result = re.sub(r'^```\s*', '', cleaned_result)
                            cleaned_result = re.sub(r'\s*```$', '', cleaned_result)
                        # Remove any leading/trailing text that's not JSON
                        # Look for the first { and last } to extract JSON
                        first_brace = cleaned_result.find('{')
                        last_brace = cleaned_result.rfind('}')
                        if first_brace != -1 and last_brace != -1 and last_brace > first_brace:
                            cleaned_result = cleaned_result[first_brace:last_brace + 1]
                        # Additional cleaning for common AI response issues
                        cleaned_result = cleaned_result.strip()
                        # Validate JSON
                        json.loads(cleaned_result)
                        ai_result = cleaned_result  # Use cleaned version
                        self.services.utils.debugLogToFile(f"Image chunk {chunk_index} JSON validation successful", "AI_SERVICE")
                    except json.JSONDecodeError as e:
                        logger.warning(f"Image chunk {chunk_index} returned invalid JSON: {str(e)}")
                        logger.warning(f"Raw response was: '{ai_result[:500]}...'")
                        # Create fallback JSON with the actual response content (not the error message)
                        # Use the original AI response content, not the error message
                        fallback_content = ai_result if ai_result and ai_result.strip() else "No content detected"
                        self.services.utils.debugLogToFile(f"IMAGE FALLBACK CONTENT PREVIEW: '{fallback_content[:200]}...'", "AI_SERVICE")
                        ai_result = json.dumps({
                            "metadata": {"title": f"Image Analysis - Chunk {chunk_index}"},
                            "sections": [{
                                "id": f"image_section_{chunk_index}",
                                "content_type": "paragraph",
                                "elements": [{"text": fallback_content}]
                            }]
                        })
                        self.services.utils.debugLogToFile(f"Created fallback JSON for image chunk {chunk_index} with actual content", "AI_SERVICE")
            elif part.typeGroup in ("container", "binary"):
                # Handle ALL container and binary content generically - let AI process any document type
                self.services.utils.debugLogToFile(f"DEBUG: Chunk {chunk_index}: typeGroup={part.typeGroup}, mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}", "AI_SERVICE")
                # Skip empty container chunks (they're just metadata containers)
                if part.typeGroup == "container" and (not part.data or len(part.data.strip()) == 0):
                    self.services.utils.debugLogToFile(f"DEBUG: Skipping empty container - mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}", "AI_SERVICE")
                    logger.info(f"Chunk {chunk_index}: Skipping empty container - mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}")
                    # Skip processing this chunk
                    # NOTE(review): this path leaves ai_result unbound, so the
                    # len(ai_result) below raises NameError and is caught by the
                    # outer except (producing an error ChunkResult). In practice
                    # it appears unreachable because empty containers are already
                    # filtered out when chunks_to_process is built — confirm.
                    pass
                elif part.mimeType and part.data and len(part.data.strip()) > 0:
                    # Process any document container as text content
                    request_options = options if options is not None else AiCallOptions()
                    request_options.operationType = OperationTypeEnum.GENERAL
                    self.services.utils.debugLogToFile(f"EXTRACTION CONTAINER CHUNK {chunk_index}: Processing {part.mimeType} container as text with generate_json={generate_json}", "AI_SERVICE")
                    logger.info(f"Chunk {chunk_index}: Processing {part.mimeType} container as text with generate_json={generate_json}")
                    # Log extraction prompt and context
                    self.services.utils.debugLogToFile(f"EXTRACTION PROMPT: {prompt}", "AI_SERVICE")
                    self.services.utils.debugLogToFile(f"EXTRACTION CONTEXT LENGTH: {len(part.data) if part.data else 0} characters", "AI_SERVICE")
                    # Strengthen prompt to forbid fabrication for text/container extraction
                    augmented_prompt = (
                        f"{prompt}\n\n"
                        "CRITICAL RULES (NO FABRICATION):\n"
                        "- Use ONLY content present in the provided CONTEXT.\n"
                        "- Do NOT create, infer, or guess values not explicitly in the context.\n"
                        "- If a value is missing, leave the cell empty or omit the row.\n"
                    )
                    request = AiCallRequest(
                        prompt=augmented_prompt,
                        context=part.data,
                        options=request_options
                    )
                    response = await self.aiObjects.call(request)
                    ai_result = response.content
                    # Log extraction response
                    self.services.utils.debugLogToFile(f"EXTRACTION RESPONSE LENGTH: {len(ai_result) if ai_result else 0} characters", "AI_SERVICE")
                    # Save extraction prompt and response to debug
                    self.services.utils.writeDebugFile(augmented_prompt, f"extraction-Chunk{chunk_index}-Prompt")
                    self.services.utils.writeDebugFile(ai_result or '', f"extraction-Chunk{chunk_index}-Response")
                    # If generating JSON, validate the response
                    if generate_json:
                        try:
                            # Clean the response - remove markdown code blocks if present
                            cleaned_result = ai_result.strip()
                            # Remove various markdown patterns
                            if cleaned_result.startswith('```json'):
                                cleaned_result = re.sub(r'^```json\s*', '', cleaned_result)
                                cleaned_result = re.sub(r'\s*```$', '', cleaned_result)
                            elif cleaned_result.startswith('```'):
                                cleaned_result = re.sub(r'^```\s*', '', cleaned_result)
                                cleaned_result = re.sub(r'\s*```$', '', cleaned_result)
                            # Remove any leading/trailing text that's not JSON
                            # Look for the first { and last } to extract JSON
                            first_brace = cleaned_result.find('{')
                            last_brace = cleaned_result.rfind('}')
                            if first_brace != -1 and last_brace != -1 and last_brace > first_brace:
                                cleaned_result = cleaned_result[first_brace:last_brace + 1]
                            # Additional cleaning for common AI response issues
                            cleaned_result = cleaned_result.strip()
                            # Validate JSON
                            json.loads(cleaned_result)
                            ai_result = cleaned_result  # Use cleaned version
                        except json.JSONDecodeError as e:
                            logger.warning(f"Container chunk {chunk_index} ({part.mimeType}) returned invalid JSON: {str(e)}")
                            logger.warning(f"Raw response was: '{ai_result[:500]}...'")
                            # Create fallback JSON with the actual response content (not the error message)
                            # Use the original AI response content, not the error message
                            fallback_content = ai_result if ai_result and ai_result.strip() else "No content detected"
                            self.services.utils.debugLogToFile(f"FALLBACK CONTENT PREVIEW: '{fallback_content[:200]}...'", "AI_SERVICE")
                            ai_result = json.dumps({
                                "metadata": {"title": f"Document Analysis - Chunk {chunk_index}"},
                                "sections": [{
                                    "id": f"analysis_section_{chunk_index}",
                                    "content_type": "paragraph",
                                    "elements": [{"text": fallback_content}]
                                }]
                            })
                            self.services.utils.debugLogToFile(f"Created fallback JSON for container chunk {chunk_index} with actual content", "AI_SERVICE")
                else:
                    # Skip empty or invalid container/binary content - don't create a result
                    self.services.utils.debugLogToFile(f"DEBUG: Chunk {chunk_index}: Skipping empty container - mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}", "AI_SERVICE")
                    # Return None to indicate this chunk should be completely skipped
                    return None
            else:
                # Plain text/table/structure content path.
                # Ensure options is not None and set correct operation type for text
                request_options = options if options is not None else AiCallOptions()
                # FIXED: Set operation type to general for text processing
                request_options.operationType = OperationTypeEnum.GENERAL
                self.services.utils.debugLogToFile(f"EXTRACTION CHUNK {chunk_index}: Calling aiObjects.call with operationType={request_options.operationType}, generate_json={generate_json}", "AI_SERVICE")
                logger.info(f"Chunk {chunk_index}: Calling aiObjects.call with operationType={request_options.operationType}, generate_json={generate_json}")
                # Log extraction context length
                self.services.utils.debugLogToFile(f"EXTRACTION CONTEXT LENGTH: {len(part.data) if part.data else 0} characters", "AI_SERVICE")
                # Debug: Log the actual prompt being sent to AI
                logger.debug(f"AI PROMPT PREVIEW: {prompt[:300]}...")
                logger.debug(f"AI CONTEXT PREVIEW: {part.data[:200] if part.data else 'None'}...")
                # Strengthen prompt to forbid fabrication for text extraction
                augmented_prompt_text = (
                    f"{prompt}\n\n"
                    "CRITICAL RULES (NO FABRICATION):\n"
                    "- Use ONLY content present in the provided CONTEXT.\n"
                    "- Do NOT create, infer, or guess values not explicitly in the context.\n"
                    "- If a value is missing, leave the cell empty or omit the row.\n"
                )
                request = AiCallRequest(
                    prompt=augmented_prompt_text,
                    context=part.data,
                    options=request_options
                )
                response = await self.aiObjects.call(request)
                # Debug: Log what AI actually returned
                logger.debug(f"AI RESPONSE PREVIEW: {response.content[:300] if response.content else 'None'}...")
                ai_result = response.content
                # Log extraction response length
                self.services.utils.debugLogToFile(f"EXTRACTION RESPONSE LENGTH: {len(ai_result) if ai_result else 0} characters", "AI_SERVICE")
                # Save extraction prompt and response to debug
                self.services.utils.writeDebugFile(augmented_prompt_text, f"extractionChunk{chunk_index}-Prompt")
                self.services.utils.writeDebugFile(ai_result or '', f"extractionChunk{chunk_index}-Response")
                # If generating JSON, validate the response
                if generate_json:
                    try:
                        # Clean the response - remove markdown code blocks and extra formatting
                        cleaned_result = ai_result.strip()
                        # Remove any markdown code block markers (```json, ```, etc.)
                        cleaned_result = re.sub(r'^```(?:json)?\s*', '', cleaned_result, flags=re.MULTILINE)
                        cleaned_result = re.sub(r'\s*```\s*$', '', cleaned_result, flags=re.MULTILINE)
                        # Remove any remaining ``` markers anywhere in the text
                        cleaned_result = re.sub(r'```', '', cleaned_result)
                        # Try to extract JSON from the response if it's embedded in other text
                        json_match = re.search(r'\{.*\}', cleaned_result, re.DOTALL)
                        if json_match:
                            cleaned_result = json_match.group(0)
                        # Validate JSON
                        json.loads(cleaned_result)
                        ai_result = cleaned_result  # Use cleaned version
                    except json.JSONDecodeError as e:
                        logger.warning(f"Chunk {chunk_index} returned invalid JSON: {str(e)}")
                        # Create fallback JSON
                        ai_result = json.dumps({
                            "metadata": {"title": "Error Section"},
                            "sections": [{
                                "id": f"error_section_{chunk_index}",
                                "content_type": "paragraph",
                                "elements": [{"text": f"Error parsing JSON: {str(e)}"}]
                            }]
                        })
            processing_time = time.time() - start_time
            logger.info(f"Chunk {chunk_index} processed: {len(ai_result)} chars in {processing_time:.2f}s")
            return ChunkResult(
                originalChunk=part,
                aiResult=ai_result,
                chunkIndex=chunk_index,
                documentId=document_id,
                processingTime=processing_time,
                metadata={
                    "success": True,
                    "chunkSize": len(part.data) if part.data else 0,
                    "resultSize": len(ai_result),
                    "typeGroup": part.typeGroup
                }
            )
        except Exception as e:
            # Any failure above becomes an error ChunkResult rather than
            # aborting the whole batch.
            processing_time = time.time() - start_time
            logger.warning(f"Error processing chunk {chunk_index}: {str(e)}")
            return ChunkResult(
                originalChunk=part,
                aiResult=f"[Error processing chunk: {str(e)}]",
                chunkIndex=chunk_index,
                documentId=document_id,
                processingTime=processing_time,
                metadata={
                    "success": False,
                    "error": str(e),
                    "chunkSize": len(part.data) if part.data else 0,
                    "typeGroup": part.typeGroup
                }
            )
    # Process chunks with concurrency control
    max_concurrent = 5  # Default concurrency
    if options and hasattr(options, 'maxConcurrentChunks'):
        max_concurrent = options.maxConcurrentChunks
    elif options and hasattr(options, 'maxParallelChunks'):
        max_concurrent = options.maxParallelChunks
    logger.info(f"Processing {len(chunks_to_process)} chunks with max concurrency: {max_concurrent}")
    self.services.utils.debugLogToFile(f"DEBUG: Chunks to process: {len(chunks_to_process)}", "AI_SERVICE")
    for i, chunk_info in enumerate(chunks_to_process):
        self.services.utils.debugLogToFile(f"DEBUG: Chunk {i}: typeGroup={chunk_info['part'].typeGroup}, mimeType={chunk_info['part'].mimeType}, data_length={len(chunk_info['part'].data) if chunk_info['part'].data else 0}", "AI_SERVICE")
    # Create semaphore for concurrency control
    semaphore = asyncio.Semaphore(max_concurrent)
    async def process_with_semaphore(chunk_info):
        async with semaphore:
            return await process_single_chunk(chunk_info)
    # Process all chunks in parallel with concurrency control
    tasks = [process_with_semaphore(chunk_info) for chunk_info in chunks_to_process]
    self.services.utils.debugLogToFile(f"DEBUG: Created {len(tasks)} tasks for parallel processing", "AI_SERVICE")
    chunk_results = await asyncio.gather(*tasks, return_exceptions=True)
    self.services.utils.debugLogToFile(f"DEBUG: Got {len(chunk_results)} results from parallel processing", "AI_SERVICE")
    # Handle any exceptions in the gather itself
    processed_results = []
    for i, result in enumerate(chunk_results):
        if isinstance(result, Exception):
            # Create error ChunkResult
            chunk_info = chunks_to_process[i]
            processed_results.append(ChunkResult(
                originalChunk=chunk_info['part'],
                aiResult=f"[Error in parallel processing: {str(result)}]",
                chunkIndex=chunk_info['chunk_index'],
                documentId=chunk_info['document_id'],
                processingTime=0.0,
                metadata={"success": False, "error": str(result)}
            ))
        elif result is not None:
            # Only add non-None results (skip empty containers)
            processed_results.append(result)
    logger.info(f"Completed processing {len(processed_results)} chunks")
    return processed_results
def _mergePartResults(
self,
partResults: List['PartResult'],
options: Optional[AiCallOptions] = None
) -> str:
"""Merge part results using existing sophisticated merging system."""
if not partResults:
return ""
# Convert PartResults back to ContentParts for existing merger system
from modules.datamodels.datamodelExtraction import ContentPart
content_parts = []
for part_result in partResults:
# Create ContentPart from PartResult with proper typeGroup
content_part = ContentPart(
id=part_result.originalPart.id,
parentId=part_result.originalPart.parentId,
label=part_result.originalPart.label,
typeGroup=part_result.originalPart.typeGroup, # Use original typeGroup
mimeType=part_result.originalPart.mimeType,
data=part_result.aiResult, # Use AI result as data
metadata={
**part_result.originalPart.metadata,
"aiResult": True,
"partIndex": part_result.partIndex,
"documentId": part_result.documentId,
"processingTime": part_result.processingTime,
"success": part_result.metadata.get("success", False)
}
)
content_parts.append(content_part)
# Use existing merging strategy from options
merge_strategy = {
"useIntelligentMerging": True,
"groupBy": "documentId", # Group by document
"orderBy": "partIndex", # Order by part index
"mergeType": "concatenate"
}
if options and hasattr(options, 'mergeStrategy'):
merge_strategy.update(options.mergeStrategy)
# Apply existing merging logic using the sophisticated merging system
from modules.services.serviceExtraction.subPipeline import _applyMerging
merged_parts = _applyMerging(content_parts, merge_strategy)
# Convert merged parts back to final string
final_content = "\n\n".join([part.data for part in merged_parts])
logger.info(f"Merged {len(partResults)} parts using existing sophisticated merging system")
return final_content.strip()
def _convertPartResultsToJson(
self,
partResults: List['PartResult'],
options: Optional[AiCallOptions] = None
) -> Dict[str, Any]:
"""Convert part results to JSON format using existing sophisticated merging system."""
if not partResults:
return {"metadata": {"title": "Empty Document"}, "sections": []}
# Convert PartResults back to ContentParts for existing merger system
from modules.datamodels.datamodelExtraction import ContentPart
content_parts = []
for part_result in partResults:
# Create ContentPart from PartResult with proper typeGroup
content_part = ContentPart(
id=part_result.originalPart.id,
parentId=part_result.originalPart.parentId,
label=part_result.originalPart.label,
typeGroup=part_result.originalPart.typeGroup, # Use original typeGroup
mimeType=part_result.originalPart.mimeType,
data=part_result.aiResult, # Use AI result as data
metadata={
**part_result.originalPart.metadata,
"aiResult": True,
"partIndex": part_result.partIndex,
"documentId": part_result.documentId,
"processingTime": part_result.processingTime,
"success": part_result.metadata.get("success", False)
}
)
content_parts.append(content_part)
# Use existing merging strategy for JSON mode
merge_strategy = {
"useIntelligentMerging": True,
"groupBy": "documentId", # Group by document
"orderBy": "partIndex", # Order by part index
"mergeType": "concatenate"
}
if options and hasattr(options, 'mergeStrategy'):
merge_strategy.update(options.mergeStrategy)
# Apply existing merging logic using the sophisticated merging system
from modules.services.serviceExtraction.subPipeline import _applyMerging
merged_parts = _applyMerging(content_parts, merge_strategy)
# Convert merged parts to JSON format
all_sections = []
document_titles = []
for part in merged_parts:
if part.metadata.get("success", False):
try:
# Parse JSON from AI result
part_json = json.loads(part.data)
# Check if this is a multi-file response (has "documents" key)
if isinstance(part_json, dict) and "documents" in part_json:
# This is a multi-file response - merge all documents
logger.debug(f"Processing multi-file response from part {part.id} with {len(part_json['documents'])} documents")
# Return multi-file response directly
return {
"metadata": part_json.get("metadata", {"title": "Merged Document"}),
"documents": part_json["documents"]
}
# Extract sections from single-file response
elif isinstance(part_json, dict) and "sections" in part_json:
for section in part_json["sections"]:
# Add part context to section
section["metadata"] = section.get("metadata", {})
section["metadata"]["source_part"] = part.id
section["metadata"]["source_document"] = part.metadata.get("documentId", "unknown")
section["metadata"]["part_index"] = part.metadata.get("partIndex", 0)
all_sections.append(section)
# Extract document title
if isinstance(part_json, dict) and "metadata" in part_json:
title = part_json["metadata"].get("title", "")
if title and title not in document_titles:
document_titles.append(title)
except json.JSONDecodeError as e:
logger.warning(f"Failed to parse JSON from part {part.id}: {str(e)}")
# Create a fallback section for invalid JSON
fallback_section = {
"id": f"error_section_{part.id}",
"title": "Error Section",
"content_type": "paragraph",
"elements": [{
"text": f"Error parsing part {part.id}: {str(e)}"
}],
"order": part.metadata.get("partIndex", 0),
"metadata": {
"source_document": part.metadata.get("documentId", "unknown"),
"part_id": part.id,
"error": str(e)
}
}
all_sections.append(fallback_section)
else:
# Handle error parts
error_section = {
"id": f"error_section_{part.id}",
"title": "Error Section",
"content_type": "paragraph",
"elements": [{
"text": f"Error in part {part.id}: {part.metadata.get('error', 'Unknown error')}"
}],
"order": part.metadata.get("partIndex", 0),
"metadata": {
"source_document": part.metadata.get("documentId", "unknown"),
"part_id": part.id,
"error": part.metadata.get('error', 'Unknown error')
}
}
all_sections.append(error_section)
# Sort sections by order
all_sections.sort(key=lambda x: x.get("order", 0))
# Create merged document with sections
merged_document = {
"metadata": {
"title": document_titles[0] if document_titles else "Merged Document",
"extraction_method": "model_aware_chunking_with_merging",
"version": "2.0"
},
"sections": all_sections,
"summary": f"Merged document using sophisticated merging system",
"tags": ["merged", "ai_generated", "model_aware", "sophisticated_merging"]
}
logger.info(f"Converted {len(partResults)} parts to JSON format using existing sophisticated merging system")
return merged_document
def _mergeChunkResults(
self,
chunkResults: List[ChunkResult],
options: Optional[AiCallOptions] = None
) -> str:
"""Merge chunk results using existing sophisticated merging system."""
if not chunkResults:
return ""
# Convert ChunkResults back to ContentParts for existing merger system
from modules.datamodels.datamodelExtraction import ContentPart
content_parts = []
for chunk_result in chunkResults:
# Create ContentPart from ChunkResult with proper typeGroup
content_part = ContentPart(
id=chunk_result.originalChunk.id,
parentId=chunk_result.originalChunk.parentId,
label=chunk_result.originalChunk.label,
typeGroup=chunk_result.originalChunk.typeGroup, # Use original typeGroup
mimeType=chunk_result.originalChunk.mimeType,
data=chunk_result.aiResult, # Use AI result as data
metadata={
**chunk_result.originalChunk.metadata,
"aiResult": True,
"chunk": True,
"chunkIndex": chunk_result.chunkIndex,
"documentId": chunk_result.documentId,
"processingTime": chunk_result.processingTime,
"success": chunk_result.metadata.get("success", False)
}
)
content_parts.append(content_part)
# Use existing merging strategy from options
merge_strategy = {
"useIntelligentMerging": True,
"groupBy": "documentId", # Group by document
"orderBy": "chunkIndex", # Order by chunk index
"mergeType": "concatenate"
}
if options and hasattr(options, 'mergeStrategy'):
merge_strategy.update(options.mergeStrategy)
# Apply existing merging logic using the sophisticated merging system
from modules.services.serviceExtraction.subPipeline import _applyMerging
merged_parts = _applyMerging(content_parts, merge_strategy)
# Convert merged parts back to final string
final_content = "\n\n".join([part.data for part in merged_parts])
logger.info(f"Merged {len(chunkResults)} chunks using existing sophisticated merging system")
return final_content.strip()
def _mergeChunkResultsClean(
self,
chunkResults: List[ChunkResult],
options: Optional[AiCallOptions] = None
) -> str:
"""Merge chunk results in CLEAN mode using existing sophisticated merging system."""
if not chunkResults:
return ""
# Convert ChunkResults back to ContentParts for existing merger system
from modules.datamodels.datamodelExtraction import ContentPart
content_parts = []
for chunk_result in chunkResults:
# Skip empty or error chunks in clean mode
if not chunk_result.metadata.get("success", False):
continue
if not chunk_result.aiResult or not chunk_result.aiResult.strip():
continue
# Skip container/binary chunks in clean mode
if chunk_result.aiResult.startswith("[Skipped ") and "content:" in chunk_result.aiResult:
continue
# Create ContentPart from ChunkResult with proper typeGroup
content_part = ContentPart(
id=chunk_result.originalChunk.id,
parentId=chunk_result.originalChunk.parentId,
label=chunk_result.originalChunk.label,
typeGroup=chunk_result.originalChunk.typeGroup, # Use original typeGroup
mimeType=chunk_result.originalChunk.mimeType,
data=chunk_result.aiResult, # Use AI result as data
metadata={
**chunk_result.originalChunk.metadata,
"aiResult": True,
"chunk": True,
"chunkIndex": chunk_result.chunkIndex,
"documentId": chunk_result.documentId,
"processingTime": chunk_result.processingTime,
"success": chunk_result.metadata.get("success", False)
}
)
content_parts.append(content_part)
# Use existing merging strategy for clean mode
merge_strategy = {
"useIntelligentMerging": True,
"groupBy": "documentId", # Group by document
"orderBy": "chunkIndex", # Order by chunk index
"mergeType": "concatenate"
}
if options and hasattr(options, 'mergeStrategy'):
merge_strategy.update(options.mergeStrategy)
# Apply existing merging logic using the sophisticated merging system
from modules.services.serviceExtraction.subPipeline import _applyMerging
merged_parts = _applyMerging(content_parts, merge_strategy)
# Convert merged parts back to final string
final_content = "\n\n".join([part.data for part in merged_parts])
logger.info(f"Merged {len(content_parts)} chunks in clean mode using existing sophisticated merging system")
return final_content.strip()
def _mergeChunkResultsJson(
self,
chunkResults: List[ChunkResult],
options: Optional[AiCallOptions] = None
) -> Dict[str, Any]:
"""Merge chunk results in JSON mode using existing sophisticated merging system."""
if not chunkResults:
return {"metadata": {"title": "Empty Document"}, "sections": []}
# Convert ChunkResults back to ContentParts for existing merger system
from modules.datamodels.datamodelExtraction import ContentPart
content_parts = []
for chunk_result in chunkResults:
# Create ContentPart from ChunkResult with proper typeGroup
content_part = ContentPart(
id=chunk_result.originalChunk.id,
parentId=chunk_result.originalChunk.parentId,
label=chunk_result.originalChunk.label,
typeGroup=chunk_result.originalChunk.typeGroup, # Use original typeGroup
mimeType=chunk_result.originalChunk.mimeType,
data=chunk_result.aiResult, # Use AI result as data
metadata={
**chunk_result.originalChunk.metadata,
"aiResult": True,
"chunk": True,
"chunkIndex": chunk_result.chunkIndex,
"documentId": chunk_result.documentId,
"processingTime": chunk_result.processingTime,
"success": chunk_result.metadata.get("success", False)
}
)
content_parts.append(content_part)
# Use existing merging strategy for JSON mode
merge_strategy = {
"useIntelligentMerging": True,
"groupBy": "documentId", # Group by document
"orderBy": "chunkIndex", # Order by chunk index
"mergeType": "concatenate"
}
if options and hasattr(options, 'mergeStrategy'):
merge_strategy.update(options.mergeStrategy)
# Apply existing merging logic using the sophisticated merging system
from modules.services.serviceExtraction.subPipeline import _applyMerging
merged_parts = _applyMerging(content_parts, merge_strategy)
# Convert merged parts to JSON format
all_sections = []
document_titles = []
for part in merged_parts:
if part.metadata.get("success", False):
try:
# Parse JSON from AI result
chunk_json = json.loads(part.data)
# Check if this is a multi-file response (has "documents" key)
if isinstance(chunk_json, dict) and "documents" in chunk_json:
# This is a multi-file response - merge all documents
logger.debug(f"Processing multi-file response from part {part.id} with {len(chunk_json['documents'])} documents")
# Return multi-file response directly
return {
"metadata": chunk_json.get("metadata", {"title": "Merged Document"}),
"documents": chunk_json["documents"]
}
# Extract sections from single-file response
elif isinstance(chunk_json, dict) and "sections" in chunk_json:
for section in chunk_json["sections"]:
# Add part context to section
section["metadata"] = section.get("metadata", {})
section["metadata"]["source_part"] = part.id
section["metadata"]["source_document"] = part.metadata.get("documentId", "unknown")
section["metadata"]["chunk_index"] = part.metadata.get("chunkIndex", 0)
all_sections.append(section)
# Extract document title
if isinstance(chunk_json, dict) and "metadata" in chunk_json:
title = chunk_json["metadata"].get("title", "")
if title and title not in document_titles:
document_titles.append(title)
except json.JSONDecodeError as e:
logger.warning(f"Failed to parse JSON from part {part.id}: {str(e)}")
# Create a fallback section for invalid JSON
fallback_section = {
"id": f"error_section_{part.id}",
"title": "Error Section",
"content_type": "paragraph",
"elements": [{
"text": f"Error parsing part {part.id}: {str(e)}"
}],
"order": part.metadata.get("chunkIndex", 0),
"metadata": {
"source_document": part.metadata.get("documentId", "unknown"),
"part_id": part.id,
"error": str(e)
}
}
all_sections.append(fallback_section)
else:
# Handle error parts
error_section = {
"id": f"error_section_{part.id}",
"title": "Error Section",
"content_type": "paragraph",
"elements": [{
"text": f"Error in part {part.id}: {part.metadata.get('error', 'Unknown error')}"
}],
"order": part.metadata.get("chunkIndex", 0),
"metadata": {
"source_document": part.metadata.get("documentId", "unknown"),
"part_id": part.id,
"error": part.metadata.get('error', 'Unknown error')
}
}
all_sections.append(error_section)
# Sort sections by order
all_sections.sort(key=lambda x: x.get("order", 0))
# Create merged document with sections
merged_document = {
"metadata": {
"title": document_titles[0] if document_titles else "Merged Document",
"extraction_method": "ai_json_extraction_with_merging",
"version": "2.0"
},
"sections": all_sections,
"summary": f"Merged document using sophisticated merging system",
"tags": ["merged", "ai_generated", "sophisticated_merging"]
}
logger.info(f"Merged {len(chunkResults)} chunks using existing sophisticated merging system (JSON mode)")
return merged_document
# REMOVED: _getModelCapabilitiesForContent method - no longer needed with model-aware chunking