import logging from typing import Dict, Any, List, Optional, Tuple, Union from modules.datamodels.datamodelChat import ChatDocument from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, ModelCapabilities, OperationType, Priority from modules.datamodels.datamodelExtraction import ChunkResult, ContentExtracted from modules.services.serviceExtraction.mainServiceExtraction import ExtractionService logger = logging.getLogger(__name__) class SubDocumentProcessing: """Document processing operations including chunking, processing, and merging.""" def __init__(self, services, aiObjects): """Initialize document processing service. Args: services: Service center instance for accessing other services aiObjects: Initialized AiObjects instance """ self.services = services self.aiObjects = aiObjects self._extractionService = None @property def extractionService(self): """Lazy initialization of extraction service.""" if self._extractionService is None: logger.info("Lazy initializing ExtractionService...") self._extractionService = ExtractionService(self.services) return self._extractionService def _calculateMaxContextBytes(self, options: Optional[AiCallOptions]) -> int: """Calculate maximum context bytes based on model capabilities and options.""" if options and options.maxContextBytes: return options.maxContextBytes # Default model capabilities (this should be enhanced with actual model registry) defaultMaxTokens = 4000 safetyMargin = options.safetyMargin if options else 0.1 # Calculate bytes (4 chars per token estimation) maxContextBytes = int(defaultMaxTokens * (1 - safetyMargin) * 4) return maxContextBytes async def processDocumentsPerChunk( self, documents: List[ChatDocument], prompt: str, options: Optional[AiCallOptions] = None ) -> str: """ Process documents with per-chunk AI calls and merge results. FIXED: Now preserves chunk relationships and document structure. Args: documents: List of ChatDocument objects to process prompt: AI prompt for processing options: AI call options Returns: Merged AI results as string with preserved document structure """ if not documents: return "" # Get model capabilities for size calculation model_capabilities = self._getModelCapabilitiesForContent(prompt, documents, options) # Build extraction options for chunking with intelligent merging extractionOptions: Dict[str, Any] = { "prompt": prompt, "operationType": options.operationType if options else "general", "processDocumentsIndividually": True, # Process each document separately "maxSize": model_capabilities["maxContextBytes"], "chunkAllowed": True, "textChunkSize": model_capabilities["textChunkSize"], "imageChunkSize": model_capabilities["imageChunkSize"], "imageMaxPixels": 1024 * 1024, "imageQuality": 85, "mergeStrategy": { "useIntelligentMerging": True, # Enable intelligent token-aware merging "modelCapabilities": model_capabilities, "prompt": prompt, "groupBy": "typeGroup", "orderBy": "id", "mergeType": "concatenate" }, } logger.debug(f"Per-chunk extraction options: prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}") try: # Extract content with chunking extractionResult = self.extractionService.extractContent(documents, extractionOptions) if not isinstance(extractionResult, list): return "[Error: No extraction results]" # FIXED: Process chunks with proper mapping chunkResults = await self._processChunksWithMapping(extractionResult, prompt, options) # FIXED: Merge with preserved chunk relationships mergedContent = self._mergeChunkResults(chunkResults, options) # Save merged extraction content to debug file - only if debug enabled try: debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False) if debug_enabled: import os from datetime import datetime, UTC ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S") debug_root = "./test-chat/ai" os.makedirs(debug_root, exist_ok=True) with open(os.path.join(debug_root, f"{ts}_extraction_merged.txt"), "w", encoding="utf-8") as f: f.write(mergedContent or "") except Exception: pass return mergedContent except Exception as e: logger.error(f"Error in per-chunk processing: {str(e)}") return f"[Error in per-chunk processing: {str(e)}]" async def processDocumentsPerChunkJson( self, documents: List[ChatDocument], prompt: str, options: Optional[AiCallOptions] = None ) -> Dict[str, Any]: """ Process documents with per-chunk AI calls and merge results in JSON mode. Returns structured JSON document instead of text. """ if not documents: return {"metadata": {"title": "Empty Document"}, "sections": []} # Get model capabilities for size calculation model_capabilities = self._getModelCapabilitiesForContent(prompt, documents, options) # Build extraction options for chunking with intelligent merging extractionOptions: Dict[str, Any] = { "prompt": prompt, "operationType": options.operationType if options else "general", "processDocumentsIndividually": True, # Process each document separately "maxSize": model_capabilities["maxContextBytes"], "chunkAllowed": True, "textChunkSize": model_capabilities["textChunkSize"], "imageChunkSize": model_capabilities["imageChunkSize"], "imageMaxPixels": 1024 * 1024, "imageQuality": 85, "mergeStrategy": { "useIntelligentMerging": True, # Enable intelligent token-aware merging "modelCapabilities": model_capabilities, "prompt": prompt, "groupBy": "typeGroup", "orderBy": "id", "mergeType": "concatenate" }, } logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}") try: # Extract content with chunking extractionResult = self.extractionService.extractContent(documents, extractionOptions) if not isinstance(extractionResult, list): return {"metadata": {"title": "Error Document"}, "sections": []} # Process chunks with proper mapping chunkResults = await self._processChunksWithMapping(extractionResult, prompt, options, generate_json=True) # Merge with JSON mode mergedJsonDocument = self._mergeChunkResultsJson(chunkResults, options) # Normalize merged JSON into a single canonical table (only if table content exists) try: from modules.services.serviceNormalization.mainServiceNormalization import NormalizationService normalizer = NormalizationService(self.services) inventory = normalizer.discoverStructures(mergedJsonDocument) # Check if any table content was discovered tableHeaders = inventory.get("tableHeaders", []) if not tableHeaders: logger.info("No table content found in merged JSON, skipping normalization and returning original structure") else: # Use workflow id as cache key cacheKey = self.services.currentWorkflow.id # Provide the extraction/merge prompt context when available to help mapping mergePrompt = prompt mapping = await normalizer.requestHeaderMapping(inventory, cacheKey, None, mergePrompt) canonical = normalizer.applyMapping(mergedJsonDocument, mapping) report = normalizer.validateCanonical(canonical) if report.get('success'): mergedJsonDocument = canonical else: raise ValueError('Normalization produced zero rows') except Exception as e: # Surface normalization failure while leaving original merged JSON (single-path expectation is to fail) raise # Save merged JSON extraction content to debug file - only if debug enabled try: debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False) if debug_enabled: import os import json as _json from datetime import datetime, UTC ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S") debug_root = "./test-chat/ai" os.makedirs(debug_root, exist_ok=True) with open(os.path.join(debug_root, f"{ts}_extraction_merged.json"), "w", encoding="utf-8") as f: f.write(_json.dumps(mergedJsonDocument, ensure_ascii=False, indent=2)) except Exception: pass return mergedJsonDocument except Exception as e: logger.error(f"Error in per-chunk processing (JSON mode): {str(e)}") return {"metadata": {"title": "Error Document"}, "sections": []} async def processDocumentsPerChunkJsonWithPrompt( self, documents: List[ChatDocument], custom_prompt: str, options: Optional[AiCallOptions] = None ) -> Dict[str, Any]: """ Process documents with per-chunk AI calls and merge results in JSON mode. Uses a custom prompt instead of the default extraction prompt. """ if not documents: return {"metadata": {"title": "Empty Document"}, "sections": []} # Get model capabilities for size calculation model_capabilities = self._getModelCapabilitiesForContent(custom_prompt, documents, options) # Build extraction options for chunking with intelligent merging extractionOptions: Dict[str, Any] = { "prompt": custom_prompt, # Use the custom prompt instead of default "operationType": options.operationType if options else "general", "processDocumentsIndividually": True, # Process each document separately "maxSize": model_capabilities["maxContextBytes"], "chunkAllowed": True, "textChunkSize": model_capabilities["textChunkSize"], "imageChunkSize": model_capabilities["imageChunkSize"], "imageMaxPixels": 1024 * 1024, "imageQuality": 85, "mergeStrategy": { "useIntelligentMerging": True, # Enable intelligent token-aware merging "modelCapabilities": model_capabilities, "prompt": custom_prompt, # Use the custom prompt "groupBy": "typeGroup", "orderBy": "id", "mergeType": "concatenate" }, } logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}") try: # Extract content with chunking extractionResult = self.extractionService.extractContent(documents, extractionOptions) if not isinstance(extractionResult, list): return {"metadata": {"title": "Error Document"}, "sections": []} # Process chunks with proper mapping logger.info(f"Processing {len(extractionResult)} chunks with custom prompt") logger.debug(f"Custom prompt preview: {custom_prompt[:200]}...") # Debug: Show what content is being processed (before filtering) for i, ec in enumerate(extractionResult): if hasattr(ec, 'parts'): for j, part in enumerate(ec.parts): if not (hasattr(part, 'data') and part.data): # Check if this is an empty container chunk (which is expected) part_type = getattr(part, 'typeGroup', None) part_mime = getattr(part, 'mimeType', '') is_empty_container = ( part_type == "container" and part_mime and 'document' in part_mime.lower() ) if not is_empty_container: logger.warning(f"Part {j} has no data - typeGroup='{part_type}', mimeType='{part_mime}'") chunkResults = await self._processChunksWithMapping(extractionResult, custom_prompt, options, generate_json=True) # Debug: Show what chunks were actually processed (after filtering) logger.info(f"After filtering: {len(chunkResults)} chunks will be processed") # Merge with JSON mode mergedJsonDocument = self._mergeChunkResultsJson(chunkResults, options) # Debug: Show what the AI actually returned logger.info(f"AI returned document with keys: {list(mergedJsonDocument.keys())}") if 'documents' in mergedJsonDocument: logger.info(f"Number of documents: {len(mergedJsonDocument['documents'])}") elif 'sections' in mergedJsonDocument: logger.info(f"Number of sections: {len(mergedJsonDocument['sections'])}") return mergedJsonDocument except Exception as e: logger.error(f"Error in per-chunk JSON processing: {str(e)}") return {"metadata": {"title": "Error Document"}, "sections": []} async def callAiText( self, prompt: str, documents: Optional[List[ChatDocument]], options: AiCallOptions ) -> str: """ Handle text calls with document processing through ExtractionService. UNIFIED PROCESSING: Always use per-chunk processing for consistency. """ # UNIFIED PROCESSING: Always use per-chunk processing for consistency # This ensures MIME-type checking, chunk mapping, and parallel processing return await self.processDocumentsPerChunk(documents, prompt, options) async def _processChunksWithMapping( self, extractionResult: List[ContentExtracted], prompt: str, options: Optional[AiCallOptions] = None, generate_json: bool = False ) -> List[ChunkResult]: """Process chunks with proper mapping to preserve relationships.""" from modules.datamodels.datamodelExtraction import ChunkResult import asyncio import time # Collect all chunks that need processing with proper indexing chunks_to_process = [] chunk_index = 0 for ec in extractionResult: # Get document MIME type from metadata document_mime_type = None for part in ec.parts: if part.metadata and 'documentMimeType' in part.metadata: document_mime_type = part.metadata['documentMimeType'] break for part in ec.parts: if part.typeGroup in ("text", "table", "structure", "image", "container", "binary"): # Skip empty container chunks (they're just metadata containers) if part.typeGroup == "container" and (not part.data or len(part.data.strip()) == 0): logger.debug(f"Skipping empty container chunk: mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}") continue chunks_to_process.append({ 'part': part, 'chunk_index': chunk_index, 'document_id': ec.id, 'document_mime_type': document_mime_type }) chunk_index += 1 logger.info(f"Processing {len(chunks_to_process)} chunks with proper mapping") # Process chunks in parallel with proper mapping async def process_single_chunk(chunk_info: Dict) -> ChunkResult: part = chunk_info['part'] chunk_index = chunk_info['chunk_index'] document_id = chunk_info['document_id'] document_mime_type = chunk_info.get('document_mime_type', part.mimeType) start_time = time.time() try: # FIXED: Check MIME type first, then fallback to typeGroup is_image = ( (document_mime_type and document_mime_type.startswith('image/')) or (part.mimeType and part.mimeType.startswith('image/')) or (part.typeGroup == "image") ) # Debug logging self.services.utils.debugLogToFile(f"Chunk {chunk_index}: document_mime_type={document_mime_type}, part.mimeType={part.mimeType}, part.typeGroup={part.typeGroup}, is_image={is_image}", "AI_SERVICE") logger.info(f"Chunk {chunk_index}: document_mime_type={document_mime_type}, part.mimeType={part.mimeType}, part.typeGroup={part.typeGroup}, is_image={is_image}") if is_image: # Use the same extraction prompt for image analysis (contains table JSON format) self.services.utils.debugLogToFile(f"Processing image chunk {chunk_index}: mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}", "AI_SERVICE") # Check if image data is available if not part.data: error_msg = f"No image data available for chunk {chunk_index}" logger.warning(error_msg) ai_result = f"Error: {error_msg}" else: try: # Import here to avoid circular imports from modules.services.serviceAi.subCoreAi import SubCoreAi core_ai = SubCoreAi(self.services, self.aiObjects) ai_result = await core_ai.readImage( prompt=prompt, imageData=part.data, mimeType=part.mimeType, options=options ) self.services.utils.debugLogToFile(f"Image analysis result for chunk {chunk_index}: length={len(ai_result) if ai_result else 0}, preview={ai_result[:200] if ai_result else 'None'}...", "AI_SERVICE") # Save image extraction response to debug file - only if debug enabled debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False) if debug_enabled: try: import os from datetime import datetime, UTC ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S") debug_root = "./test-chat/ai" os.makedirs(debug_root, exist_ok=True) with open(os.path.join(debug_root, f"{ts}_extraction_image_chunk_{chunk_index}.txt"), "w", encoding="utf-8") as f: f.write(f"EXTRACTION IMAGE RESPONSE:\n{ai_result if ai_result else 'No response'}\n") except Exception: pass # Check if result is empty or None if not ai_result or not ai_result.strip(): logger.warning(f"Image chunk {chunk_index} returned empty response from AI") ai_result = "No content detected in image" except Exception as e: logger.error(f"Error processing image chunk {chunk_index}: {str(e)}") ai_result = f"Error analyzing image: {str(e)}" # If generating JSON, clean image analysis result if generate_json: try: import json import re # Clean the response - remove markdown code blocks if present cleaned_result = ai_result.strip() # Remove various markdown patterns if cleaned_result.startswith('```json'): cleaned_result = re.sub(r'^```json\s*', '', cleaned_result) cleaned_result = re.sub(r'\s*```$', '', cleaned_result) elif cleaned_result.startswith('```'): cleaned_result = re.sub(r'^```\s*', '', cleaned_result) cleaned_result = re.sub(r'\s*```$', '', cleaned_result) # Remove any leading/trailing text that's not JSON # Look for the first { and last } to extract JSON first_brace = cleaned_result.find('{') last_brace = cleaned_result.rfind('}') if first_brace != -1 and last_brace != -1 and last_brace > first_brace: cleaned_result = cleaned_result[first_brace:last_brace + 1] # Additional cleaning for common AI response issues cleaned_result = cleaned_result.strip() # Validate JSON json.loads(cleaned_result) ai_result = cleaned_result # Use cleaned version self.services.utils.debugLogToFile(f"Image chunk {chunk_index} JSON validation successful", "AI_SERVICE") except json.JSONDecodeError as e: logger.warning(f"Image chunk {chunk_index} returned invalid JSON: {str(e)}") logger.warning(f"Raw response was: '{ai_result[:500]}...'") # Create fallback JSON with the actual response content (not the error message) # Use the original AI response content, not the error message fallback_content = ai_result if ai_result and ai_result.strip() else "No content detected" self.services.utils.debugLogToFile(f"IMAGE FALLBACK CONTENT PREVIEW: '{fallback_content[:200]}...'", "AI_SERVICE") ai_result = json.dumps({ "metadata": {"title": f"Image Analysis - Chunk {chunk_index}"}, "sections": [{ "id": f"image_section_{chunk_index}", "content_type": "paragraph", "elements": [{"text": fallback_content}] }] }) self.services.utils.debugLogToFile(f"Created fallback JSON for image chunk {chunk_index} with actual content", "AI_SERVICE") elif part.typeGroup in ("container", "binary"): # Handle ALL container and binary content generically - let AI process any document type self.services.utils.debugLogToFile(f"DEBUG: Chunk {chunk_index}: typeGroup={part.typeGroup}, mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}", "AI_SERVICE") # Skip empty container chunks (they're just metadata containers) if part.typeGroup == "container" and (not part.data or len(part.data.strip()) == 0): self.services.utils.debugLogToFile(f"DEBUG: Skipping empty container - mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}", "AI_SERVICE") logger.info(f"Chunk {chunk_index}: Skipping empty container - mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}") # Skip processing this chunk pass elif part.mimeType and part.data and len(part.data.strip()) > 0: # Process any document container as text content request_options = options if options is not None else AiCallOptions() request_options.operationType = OperationType.GENERAL self.services.utils.debugLogToFile(f"EXTRACTION CONTAINER CHUNK {chunk_index}: Processing {part.mimeType} container as text with generate_json={generate_json}", "AI_SERVICE") logger.info(f"Chunk {chunk_index}: Processing {part.mimeType} container as text with generate_json={generate_json}") # Log extraction prompt and context self.services.utils.debugLogToFile(f"EXTRACTION PROMPT: {prompt}", "AI_SERVICE") self.services.utils.debugLogToFile(f"EXTRACTION CONTEXT LENGTH: {len(part.data) if part.data else 0} characters", "AI_SERVICE") # Strengthen prompt to forbid fabrication for text/container extraction augmented_prompt = ( f"{prompt}\n\n" "CRITICAL RULES (NO FABRICATION):\n" "- Use ONLY content present in the provided CONTEXT.\n" "- Do NOT create, infer, or guess values not explicitly in the context.\n" "- If a value is missing, leave the cell empty or omit the row.\n" ) request = AiCallRequest( prompt=augmented_prompt, context=part.data, options=request_options ) response = await self.aiObjects.call(request) ai_result = response.content # Log extraction response self.services.utils.debugLogToFile(f"EXTRACTION RESPONSE LENGTH: {len(ai_result) if ai_result else 0} characters", "AI_SERVICE") # Save full extraction prompt and response to debug file - only if debug enabled debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False) if debug_enabled: try: import os from datetime import datetime, UTC ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S") debug_root = "./test-chat/ai" os.makedirs(debug_root, exist_ok=True) with open(os.path.join(debug_root, f"{ts}_extraction_container_chunk_{chunk_index}.txt"), "w", encoding="utf-8") as f: f.write(f"EXTRACTION PROMPT:\n{prompt}\n\n") f.write(f"EXTRACTION CONTEXT:\n{part.data if part.data else 'No context'}\n\n") f.write(f"EXTRACTION RESPONSE:\n{ai_result if ai_result else 'No response'}\n") except Exception: pass # If generating JSON, validate the response if generate_json: try: import json import re # Clean the response - remove markdown code blocks if present cleaned_result = ai_result.strip() # Remove various markdown patterns if cleaned_result.startswith('```json'): cleaned_result = re.sub(r'^```json\s*', '', cleaned_result) cleaned_result = re.sub(r'\s*```$', '', cleaned_result) elif cleaned_result.startswith('```'): cleaned_result = re.sub(r'^```\s*', '', cleaned_result) cleaned_result = re.sub(r'\s*```$', '', cleaned_result) # Remove any leading/trailing text that's not JSON # Look for the first { and last } to extract JSON first_brace = cleaned_result.find('{') last_brace = cleaned_result.rfind('}') if first_brace != -1 and last_brace != -1 and last_brace > first_brace: cleaned_result = cleaned_result[first_brace:last_brace + 1] # Additional cleaning for common AI response issues cleaned_result = cleaned_result.strip() # Validate JSON json.loads(cleaned_result) ai_result = cleaned_result # Use cleaned version except json.JSONDecodeError as e: logger.warning(f"Container chunk {chunk_index} ({part.mimeType}) returned invalid JSON: {str(e)}") logger.warning(f"Raw response was: '{ai_result[:500]}...'") # Create fallback JSON with the actual response content (not the error message) # Use the original AI response content, not the error message fallback_content = ai_result if ai_result and ai_result.strip() else "No content detected" self.services.utils.debugLogToFile(f"FALLBACK CONTENT PREVIEW: '{fallback_content[:200]}...'", "AI_SERVICE") ai_result = json.dumps({ "metadata": {"title": f"Document Analysis - Chunk {chunk_index}"}, "sections": [{ "id": f"analysis_section_{chunk_index}", "content_type": "paragraph", "elements": [{"text": fallback_content}] }] }) self.services.utils.debugLogToFile(f"Created fallback JSON for container chunk {chunk_index} with actual content", "AI_SERVICE") else: # Skip empty or invalid container/binary content - don't create a result self.services.utils.debugLogToFile(f"DEBUG: Chunk {chunk_index}: Skipping empty container - mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}", "AI_SERVICE") # Return None to indicate this chunk should be completely skipped return None else: # Ensure options is not None and set correct operation type for text request_options = options if options is not None else AiCallOptions() # FIXED: Set operation type to general for text processing request_options.operationType = OperationType.GENERAL self.services.utils.debugLogToFile(f"EXTRACTION CHUNK {chunk_index}: Calling aiObjects.call with operationType={request_options.operationType}, generate_json={generate_json}", "AI_SERVICE") logger.info(f"Chunk {chunk_index}: Calling aiObjects.call with operationType={request_options.operationType}, generate_json={generate_json}") # Log extraction context length self.services.utils.debugLogToFile(f"EXTRACTION CONTEXT LENGTH: {len(part.data) if part.data else 0} characters", "AI_SERVICE") # Debug: Log the actual prompt being sent to AI logger.debug(f"AI PROMPT PREVIEW: {prompt[:300]}...") logger.debug(f"AI CONTEXT PREVIEW: {part.data[:200] if part.data else 'None'}...") # Strengthen prompt to forbid fabrication for text extraction augmented_prompt_text = ( f"{prompt}\n\n" "CRITICAL RULES (NO FABRICATION):\n" "- Use ONLY content present in the provided CONTEXT.\n" "- Do NOT create, infer, or guess values not explicitly in the context.\n" "- If a value is missing, leave the cell empty or omit the row.\n" ) request = AiCallRequest( prompt=augmented_prompt_text, context=part.data, options=request_options ) response = await self.aiObjects.call(request) # Debug: Log what AI actually returned logger.debug(f"AI RESPONSE PREVIEW: {response.content[:300] if response.content else 'None'}...") ai_result = response.content # Log extraction response length self.services.utils.debugLogToFile(f"EXTRACTION RESPONSE LENGTH: {len(ai_result) if ai_result else 0} characters", "AI_SERVICE") # Save extraction response to debug file (without verbose prompt) - only if debug enabled debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False) if debug_enabled: try: import os from datetime import datetime, UTC ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S") debug_root = "./test-chat/ai" os.makedirs(debug_root, exist_ok=True) with open(os.path.join(debug_root, f"{ts}_extraction_chunk_{chunk_index}.txt"), "w", encoding="utf-8") as f: f.write(f"EXTRACTION RESPONSE:\n{ai_result if ai_result else 'No response'}\n") except Exception: pass # If generating JSON, validate the response if generate_json: try: import json import re # Clean the response - remove markdown code blocks and extra formatting cleaned_result = ai_result.strip() # Remove any markdown code block markers (```json, ```, etc.) cleaned_result = re.sub(r'^```(?:json)?\s*', '', cleaned_result, flags=re.MULTILINE) cleaned_result = re.sub(r'\s*```\s*$', '', cleaned_result, flags=re.MULTILINE) # Remove any remaining ``` markers anywhere in the text cleaned_result = re.sub(r'```', '', cleaned_result) # Try to extract JSON from the response if it's embedded in other text json_match = re.search(r'\{.*\}', cleaned_result, re.DOTALL) if json_match: cleaned_result = json_match.group(0) # Validate JSON json.loads(cleaned_result) ai_result = cleaned_result # Use cleaned version except json.JSONDecodeError as e: logger.warning(f"Chunk {chunk_index} returned invalid JSON: {str(e)}") # Create fallback JSON ai_result = json.dumps({ "metadata": {"title": "Error Section"}, "sections": [{ "id": f"error_section_{chunk_index}", "content_type": "paragraph", "elements": [{"text": f"Error parsing JSON: {str(e)}"}] }] }) processing_time = time.time() - start_time logger.info(f"Chunk {chunk_index} processed: {len(ai_result)} chars in {processing_time:.2f}s") return ChunkResult( originalChunk=part, aiResult=ai_result, chunkIndex=chunk_index, documentId=document_id, processingTime=processing_time, metadata={ "success": True, "chunkSize": len(part.data) if part.data else 0, "resultSize": len(ai_result), "typeGroup": part.typeGroup } ) except Exception as e: processing_time = time.time() - start_time logger.warning(f"Error processing chunk {chunk_index}: {str(e)}") return ChunkResult( originalChunk=part, aiResult=f"[Error processing chunk: {str(e)}]", chunkIndex=chunk_index, documentId=document_id, processingTime=processing_time, metadata={ "success": False, "error": str(e), "chunkSize": len(part.data) if part.data else 0, "typeGroup": part.typeGroup } ) # Process chunks with concurrency control max_concurrent = 5 # Default concurrency if options and hasattr(options, 'maxConcurrentChunks'): max_concurrent = options.maxConcurrentChunks elif options and hasattr(options, 'maxParallelChunks'): max_concurrent = options.maxParallelChunks logger.info(f"Processing {len(chunks_to_process)} chunks with max concurrency: {max_concurrent}") self.services.utils.debugLogToFile(f"DEBUG: Chunks to process: {len(chunks_to_process)}", "AI_SERVICE") for i, chunk_info in enumerate(chunks_to_process): self.services.utils.debugLogToFile(f"DEBUG: Chunk {i}: typeGroup={chunk_info['part'].typeGroup}, mimeType={chunk_info['part'].mimeType}, data_length={len(chunk_info['part'].data) if chunk_info['part'].data else 0}", "AI_SERVICE") # Create semaphore for concurrency control semaphore = asyncio.Semaphore(max_concurrent) async def process_with_semaphore(chunk_info): async with semaphore: return await process_single_chunk(chunk_info) # Process all chunks in parallel with concurrency control tasks = [process_with_semaphore(chunk_info) for chunk_info in chunks_to_process] self.services.utils.debugLogToFile(f"DEBUG: Created {len(tasks)} tasks for parallel processing", "AI_SERVICE") chunk_results = await asyncio.gather(*tasks, return_exceptions=True) self.services.utils.debugLogToFile(f"DEBUG: Got {len(chunk_results)} results from parallel processing", "AI_SERVICE") # Handle any exceptions in the gather itself processed_results = [] for i, result in enumerate(chunk_results): if isinstance(result, Exception): # Create error ChunkResult chunk_info = chunks_to_process[i] processed_results.append(ChunkResult( originalChunk=chunk_info['part'], aiResult=f"[Error in parallel processing: {str(result)}]", chunkIndex=chunk_info['chunk_index'], documentId=chunk_info['document_id'], processingTime=0.0, metadata={"success": False, "error": str(result)} )) elif result is not None: # Only add non-None results (skip empty containers) processed_results.append(result) logger.info(f"Completed processing {len(processed_results)} chunks") return processed_results def _mergeChunkResults( self, chunkResults: List[ChunkResult], options: Optional[AiCallOptions] = None ) -> str: """Merge chunk results while preserving document structure and chunk order.""" if not chunkResults: return "" # Get merging configuration from options chunk_separator = "\n\n---\n\n" include_document_headers = True include_chunk_metadata = False if options: if hasattr(options, 'chunkSeparator'): chunk_separator = options.chunkSeparator elif hasattr(options, 'mergeStrategy') and options.mergeStrategy: chunk_separator = options.mergeStrategy.get("chunkSeparator", "\n\n---\n\n") # Check for enhanced options if hasattr(options, 'preserveChunkMetadata'): include_chunk_metadata = options.preserveChunkMetadata # Group chunk results by document results_by_document = {} for chunk_result in chunkResults: doc_id = chunk_result.documentId if doc_id not in results_by_document: results_by_document[doc_id] = [] results_by_document[doc_id].append(chunk_result) # Sort chunks within each document by chunk index for doc_id in results_by_document: results_by_document[doc_id].sort(key=lambda x: x.chunkIndex) # Merge results for each document merged_documents = [] for doc_id, doc_chunks in results_by_document.items(): # Build document header if enabled doc_header = "" if include_document_headers: doc_header = f"\n\n=== DOCUMENT: {doc_id} ===\n\n" # Merge chunks for this document doc_content = "" for i, chunk_result in enumerate(doc_chunks): # Add chunk separator (except for first chunk) if i > 0: doc_content += chunk_separator # Add chunk content with optional metadata chunk_metadata = chunk_result.metadata if chunk_metadata.get("success", False): chunk_content = chunk_result.aiResult # Add chunk metadata if enabled if include_chunk_metadata: chunk_info = f"[Chunk {chunk_result.chunkIndex} - {chunk_metadata.get('typeGroup', 'unknown')} - {chunk_metadata.get('chunkSize', 0)} chars]" chunk_content = f"{chunk_info}\n{chunk_content}" doc_content += chunk_content else: # Handle error chunks error_msg = f"[ERROR in chunk {chunk_result.chunkIndex}: {chunk_metadata.get('error', 'Unknown error')}]" doc_content += error_msg merged_documents.append(doc_header + doc_content) # Join all documents final_result = "\n\n".join(merged_documents) logger.info(f"Merged {len(chunkResults)} chunks from {len(results_by_document)} documents") return final_result.strip() def _mergeChunkResultsClean( self, chunkResults: List[ChunkResult], options: Optional[AiCallOptions] = None ) -> str: """Merge chunk results in CLEAN mode - no debug metadata or document headers.""" if not chunkResults: return "" # Get merging configuration from options chunk_separator = "\n\n" include_document_headers = False # CLEAN MODE: No document headers include_chunk_metadata = False # CLEAN MODE: No chunk metadata if options: if hasattr(options, 'chunkSeparator'): chunk_separator = options.chunkSeparator elif hasattr(options, 'mergeStrategy') and options.mergeStrategy: chunk_separator = options.mergeStrategy.get("chunkSeparator", "\n\n") # Group chunk results by document results_by_document = {} for chunk_result in chunkResults: doc_id = chunk_result.documentId if doc_id not in results_by_document: results_by_document[doc_id] = [] results_by_document[doc_id].append(chunk_result) # Sort chunks within each document by chunk index for doc_id in results_by_document: results_by_document[doc_id].sort(key=lambda x: x.chunkIndex) # Merge results for each document in CLEAN mode merged_documents = [] for doc_id, doc_chunks in results_by_document.items(): # CLEAN MODE: No document headers doc_header = "" # Merge chunks for this document doc_content = "" for i, chunk_result in enumerate(doc_chunks): # Add chunk separator (except for first chunk) if i > 0: doc_content += chunk_separator # Add chunk content without metadata chunk_metadata = chunk_result.metadata if chunk_metadata.get("success", False): chunk_content = chunk_result.aiResult # CLEAN MODE: Skip container/binary chunks entirely if chunk_content.startswith("[Skipped ") and "content:" in chunk_content: continue # Skip container/binary chunks in clean mode # CLEAN MODE: Skip empty or whitespace-only chunks if not chunk_content.strip(): continue # Skip empty chunks in clean mode # CLEAN MODE: No chunk metadata doc_content += chunk_content else: # Handle error chunks silently in clean mode continue merged_documents.append(doc_header + doc_content) # Join all documents final_result = "\n\n".join(merged_documents) return final_result.strip() def _mergeChunkResultsJson( self, chunkResults: List[ChunkResult], options: Optional[AiCallOptions] = None ) -> Dict[str, Any]: """Merge chunk results in JSON mode - returns structured JSON document.""" import json if not chunkResults: return {"metadata": {"title": "Empty Document"}, "sections": []} # Group chunk results by document results_by_document = {} for chunk_result in chunkResults: doc_id = chunk_result.documentId if doc_id not in results_by_document: results_by_document[doc_id] = [] results_by_document[doc_id].append(chunk_result) # Sort chunks within each document by chunk index for doc_id in results_by_document: results_by_document[doc_id].sort(key=lambda x: x.chunkIndex) # Merge JSON results for each document all_documents = [] all_sections = [] document_titles = [] combined_metadata = {"title": "Merged Document", "splitStrategy": "by_section"} for doc_id, doc_chunks in results_by_document.items(): # Process each chunk's JSON result for chunk_result in doc_chunks: chunk_metadata = chunk_result.metadata if chunk_metadata.get("success", False): try: # Parse JSON from AI result chunk_json = json.loads(chunk_result.aiResult) # Check if this is a multi-file response (has "documents" key) if isinstance(chunk_json, dict) and "documents" in chunk_json: # This is a multi-file response - merge all documents logger.debug(f"Processing multi-file response from chunk {chunk_result.chunkIndex} with {len(chunk_json['documents'])} documents") # Add all documents from this chunk for doc in chunk_json["documents"]: # Add chunk context to document doc["metadata"] = doc.get("metadata", {}) doc["metadata"]["source_chunk"] = chunk_result.chunkIndex doc["metadata"]["source_document"] = doc_id all_documents.append(doc) # Update combined metadata if "metadata" in chunk_json: combined_metadata.update(chunk_json["metadata"]) # Extract sections from single-file response (fallback) elif isinstance(chunk_json, dict) and "sections" in chunk_json: for section in chunk_json["sections"]: # Add document context to section section["metadata"] = section.get("metadata", {}) section["metadata"]["source_document"] = doc_id section["metadata"]["chunk_index"] = chunk_result.chunkIndex all_sections.append(section) # Extract document title if isinstance(chunk_json, dict) and "metadata" in chunk_json: title = chunk_json["metadata"].get("title", "") if title and title not in document_titles: document_titles.append(title) except json.JSONDecodeError as e: logger.warning(f"Failed to parse JSON from chunk {chunk_result.chunkIndex}: {str(e)}") # Create a fallback section for invalid JSON fallback_section = { "id": f"error_section_{chunk_result.chunkIndex}", "title": "Error Section", "content_type": "paragraph", "elements": [{ "text": f"Error parsing chunk {chunk_result.chunkIndex}: {str(e)}" }], "order": chunk_result.chunkIndex, "metadata": { "source_document": doc_id, "chunk_index": chunk_result.chunkIndex, "error": str(e) } } all_sections.append(fallback_section) else: # Handle error chunks error_section = { "id": f"error_section_{chunk_result.chunkIndex}", "title": "Error Section", "content_type": "paragraph", "elements": [{ "text": f"Error in chunk {chunk_result.chunkIndex}: {chunk_metadata.get('error', 'Unknown error')}" }], "order": chunk_result.chunkIndex, "metadata": { "source_document": doc_id, "chunk_index": chunk_result.chunkIndex, "error": chunk_metadata.get('error', 'Unknown error') } } all_sections.append(error_section) # Sort sections by order all_sections.sort(key=lambda x: x.get("order", 0)) # If we have merged documents from multi-file responses, return them if all_documents: logger.info(f"Merged {len(all_documents)} documents from {len(chunkResults)} chunks") return { "metadata": combined_metadata, "documents": all_documents } # Otherwise, create merged document with sections (single-file fallback) merged_document = { "metadata": { "title": document_titles[0] if document_titles else "Merged Document", "source_documents": list(results_by_document.keys()), "extraction_method": "ai_json_extraction", "version": "1.0" }, "sections": all_sections, "summary": f"Merged document from {len(results_by_document)} source documents", "tags": ["merged", "ai_generated"] } logger.info(f"Merged {len(chunkResults)} chunks from {len(results_by_document)} documents (JSON mode)") return merged_document def _getModelCapabilitiesForContent(self, prompt: str, documents: Optional[List[ChatDocument]], options: AiCallOptions) -> Dict[str, int]: """ Get model capabilities for content processing, including appropriate size limits for chunking. """ # Estimate total content size prompt_size = len(prompt.encode('utf-8')) document_size = 0 if documents: # Rough estimate of document content size for doc in documents: document_size += doc.fileSize or 0 total_size = prompt_size + document_size # Use AiObjects to select the best model for this content size # We'll simulate the model selection by checking available models from modules.interfaces.interfaceAiObjects import aiModels # Find the best model for this content size and operation best_model = None best_context_length = 0 for model_name, model_info in aiModels.items(): context_length = model_info.get("contextLength", 0) # Skip models with no context length or too small for content if context_length == 0: continue # Check if model supports the operation type capabilities = model_info.get("capabilities", []) if options.operationType == OperationType.IMAGE_ANALYSIS and "image_analysis" not in capabilities: continue elif options.operationType == OperationType.IMAGE_GENERATION and "image_generation" not in capabilities: continue elif options.operationType == OperationType.WEB_RESEARCH and "web_search" not in capabilities: continue elif "text_generation" not in capabilities: continue # Prefer models that can handle the content without chunking, but allow chunking if needed if context_length >= total_size * 0.8: # 80% of content size if context_length > best_context_length: best_model = model_info best_context_length = context_length elif best_model is None: # Fallback to largest available model if context_length > best_context_length: best_model = model_info best_context_length = context_length # Fallback to a reasonable default if no model found if best_model is None: best_model = { "contextLength": 128000, # GPT-4o default "llmName": "gpt-4o" } # Calculate appropriate sizes # Convert tokens to bytes (rough estimate: 1 token ≈ 4 characters) context_length_bytes = int(best_model["contextLength"] * 4) max_context_bytes = int(context_length_bytes * 0.9) # 90% of context length text_chunk_size = int(max_context_bytes * 0.7) # 70% of max context for text chunks image_chunk_size = int(max_context_bytes * 0.8) # 80% of max context for image chunks logger.debug(f"Selected model: {best_model.get('llmName', 'unknown')} with context length: {best_model['contextLength']}") logger.debug(f"Content size: {total_size} bytes, Max context: {max_context_bytes} bytes") logger.debug(f"Text chunk size: {text_chunk_size} bytes, Image chunk size: {image_chunk_size} bytes") return { "maxContextBytes": max_context_bytes, "textChunkSize": text_chunk_size, "imageChunkSize": image_chunk_size }