import json
import logging
import re
import time
from typing import Dict, Any, List, Optional, Tuple, Union

from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum
from modules.datamodels.datamodelExtraction import ChunkResult, ContentExtracted
from modules.services.serviceExtraction.mainServiceExtraction import ExtractionService

logger = logging.getLogger(__name__)


class SubDocumentProcessing:
    """Document processing operations including chunking, processing, and merging."""

    def __init__(self, services, aiObjects):
        """Initialize document processing service.

        Args:
            services: Service center instance for accessing other services
            aiObjects: Initialized AiObjects instance
        """
        self.services = services
        self.aiObjects = aiObjects
        self._extractionService = None

    @property
    def extractionService(self):
        """Lazy initialization of extraction service."""
        if self._extractionService is None:
            logger.info("Lazy initializing ExtractionService...")
            self._extractionService = ExtractionService(self.services)
        return self._extractionService

    def _calculateMaxContextBytes(self, options: Optional[AiCallOptions]) -> int:
        """Calculate maximum context bytes based on model capabilities and options."""
        if options and options.maxContextBytes:
            return options.maxContextBytes

        # Default model capabilities (this should be enhanced with actual model registry)
        defaultMaxTokens = 4000
        safetyMargin = options.safetyMargin if options else 0.1

        # Calculate bytes (4 chars per token estimation)
        maxContextBytes = int(defaultMaxTokens * (1 - safetyMargin) * 4)
        return maxContextBytes
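    # Worked example of the budget above with the default values (numbers are
    # illustrative; real limits should come from the model registry):
    #
    #     defaultMaxTokens = 4000, safetyMargin = 0.1
    #     maxContextBytes  = int(4000 * (1 - 0.1) * 4) = 14400 bytes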
    async def processDocumentsPerChunk(
        self,
        documents: List[ChatDocument],
        prompt: str,
        options: Optional[AiCallOptions] = None
    ) -> str:
        """
        Process documents with per-chunk AI calls and merge results.
        FIXED: Now preserves chunk relationships and document structure.

        Args:
            documents: List of ChatDocument objects to process
            prompt: AI prompt for processing
            options: AI call options

        Returns:
            Merged AI results as string with preserved document structure
        """
        if not documents:
            return ""

        # Get model capabilities for size calculation
        model_capabilities = self._getModelCapabilitiesForContent(prompt, documents, options)

        # Build extraction options for chunking with intelligent merging
        extractionOptions: Dict[str, Any] = {
            "prompt": prompt,
            "operationType": options.operationType if options else "general",
            "processDocumentsIndividually": True,  # Process each document separately
            "maxSize": model_capabilities["maxContextBytes"],
            "chunkAllowed": True,
            "textChunkSize": model_capabilities["textChunkSize"],
            "imageChunkSize": model_capabilities["imageChunkSize"],
            "imageMaxPixels": 1024 * 1024,
            "imageQuality": 85,
            "mergeStrategy": {
                "useIntelligentMerging": True,  # Enable intelligent token-aware merging
                "capabilities": model_capabilities,
                "prompt": prompt,
                "groupBy": "typeGroup",
                "orderBy": "id",
                "mergeType": "concatenate"
            },
        }

        logger.debug(f"Per-chunk extraction options: prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}")

        try:
            # Extract content with chunking
            extractionResult = self.extractionService.extractContent(documents, extractionOptions)
            if not isinstance(extractionResult, list):
                return "[Error: No extraction results]"

            # FIXED: Process chunks with proper mapping
            chunkResults = await self._processChunksWithMapping(extractionResult, prompt, options)

            # FIXED: Merge with preserved chunk relationships
            mergedContent = self._mergeChunkResults(chunkResults, options)

            # Save merged extraction content to debug
            self.services.utils.writeDebugFile(mergedContent or '', "extractionMergedText")

            return mergedContent
        except Exception as e:
            logger.error(f"Error in per-chunk processing: {str(e)}")
            return f"[Error in per-chunk processing: {str(e)}]"
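    # Usage sketch (hypothetical caller; the AiCallOptions constructor
    # arguments shown here are assumptions, not confirmed by this module):
    #
    #     processor = SubDocumentProcessing(services, aiObjects)
    #     merged = await processor.processDocumentsPerChunk(
    #         documents=docs,
    #         prompt="Summarize each document",
    #         options=AiCallOptions(operationType=OperationTypeEnum.GENERAL),
    #     )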
""" if not documents: return {"metadata": {"title": "Empty Document"}, "sections": []} # Get model capabilities for size calculation model_capabilities = self._getModelCapabilitiesForContent(prompt, documents, options) # Build extraction options for chunking with intelligent merging extractionOptions: Dict[str, Any] = { "prompt": prompt, "operationType": options.operationType if options else "general", "processDocumentsIndividually": True, # Process each document separately "maxSize": model_capabilities["maxContextBytes"], "chunkAllowed": True, "textChunkSize": model_capabilities["textChunkSize"], "imageChunkSize": model_capabilities["imageChunkSize"], "imageMaxPixels": 1024 * 1024, "imageQuality": 85, "mergeStrategy": { "useIntelligentMerging": True, # Enable intelligent token-aware merging "capabilities": model_capabilities, "prompt": prompt, "groupBy": "typeGroup", "orderBy": "id", "mergeType": "concatenate" }, } logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}") try: # Extract content with chunking extractionResult = self.extractionService.extractContent(documents, extractionOptions) if not isinstance(extractionResult, list): return {"metadata": {"title": "Error Document"}, "sections": []} # Process chunks with proper mapping chunkResults = await self._processChunksWithMapping(extractionResult, prompt, options, generate_json=True) # Merge with JSON mode mergedJsonDocument = self._mergeChunkResultsJson(chunkResults, options) # Normalize merged JSON into a single canonical table (only if table content exists) try: from modules.services.serviceNormalization.mainServiceNormalization import NormalizationService normalizer = NormalizationService(self.services) inventory = normalizer.discoverStructures(mergedJsonDocument) # Check if any table content was discovered tableHeaders = inventory.get("tableHeaders", []) if not tableHeaders: logger.info("No table content found in merged JSON, skipping normalization and returning original structure") else: # Use workflow id as cache key cacheKey = self.services.currentWorkflow.id # Provide the extraction/merge prompt context when available to help mapping mergePrompt = prompt mapping = await normalizer.requestHeaderMapping(inventory, cacheKey, None, mergePrompt) canonical = normalizer.applyMapping(mergedJsonDocument, mapping) report = normalizer.validateCanonical(canonical) if report.get('success'): mergedJsonDocument = canonical else: raise ValueError('Normalization produced zero rows') except Exception as e: # Log normalization failure but don't re-raise - continue with original merged JSON logger.warning(f"Normalization failed (expected): {str(e)}") logger.debug(f"Normalization error type: {type(e).__name__}") # Continue with original merged JSON instead of re-raising # Save merged JSON extraction content to debug jsonStr = json.dumps(mergedJsonDocument, ensure_ascii=False, indent=2) self.services.utils.writeDebugFile(jsonStr, "extractionMergedJson") return mergedJsonDocument except Exception as e: logger.error(f"Error in per-chunk processing (JSON mode): {str(e)}") logger.error(f"Exception type: {type(e).__name__}") logger.error(f"Exception args: {e.args}") import traceback logger.error(f"Traceback: {traceback.format_exc()}") return {"metadata": {"title": "Error Document"}, "sections": []} async def processDocumentsPerChunkJsonWithPrompt( self, documents: List[ChatDocument], custom_prompt: str, options: 
    async def processDocumentsPerChunkJsonWithPrompt(
        self,
        documents: List[ChatDocument],
        custom_prompt: str,
        options: Optional[AiCallOptions] = None
    ) -> Dict[str, Any]:
        """
        Process documents with per-chunk AI calls and merge results in JSON mode.
        Uses a custom prompt instead of the default extraction prompt.
        Enhanced with partial results continuation logic.
        """
        if not documents:
            return {"metadata": {"title": "Empty Document"}, "sections": []}

        # Get model capabilities for size calculation
        model_capabilities = self._getModelCapabilitiesForContent(custom_prompt, documents, options)

        # Build extraction options for chunking with intelligent merging
        extractionOptions: Dict[str, Any] = {
            "prompt": custom_prompt,  # Use the custom prompt instead of default
            "operationType": options.operationType if options else "general",
            "processDocumentsIndividually": True,  # Process each document separately
            "maxSize": model_capabilities["maxContextBytes"],
            "chunkAllowed": True,
            "textChunkSize": model_capabilities["textChunkSize"],
            "imageChunkSize": model_capabilities["imageChunkSize"],
            "imageMaxPixels": 1024 * 1024,
            "imageQuality": 85,
            "mergeStrategy": {
                "useIntelligentMerging": True,  # Enable intelligent token-aware merging
                "capabilities": model_capabilities,
                "prompt": custom_prompt,  # Use the custom prompt
                "groupBy": "typeGroup",
                "orderBy": "id",
                "mergeType": "concatenate"
            },
        }

        logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}")

        try:
            # Extract content with chunking
            extractionResult = self.extractionService.extractContent(documents, extractionOptions)
            if not isinstance(extractionResult, list):
                return {"metadata": {"title": "Error Document"}, "sections": []}

            # Process chunks with proper mapping
            logger.info(f"Processing {len(extractionResult)} chunks with custom prompt")
            logger.debug(f"Custom prompt preview: {custom_prompt[:200]}...")

            # Debug: Show what content is being processed (before filtering)
            for i, ec in enumerate(extractionResult):
                if hasattr(ec, 'parts'):
                    for j, part in enumerate(ec.parts):
                        if not (hasattr(part, 'data') and part.data):
                            # Check if this is an empty container chunk (which is expected)
                            part_type = getattr(part, 'typeGroup', None)
                            part_mime = getattr(part, 'mimeType', '')
                            is_empty_container = (
                                part_type == "container"
                                and part_mime
                                and 'document' in part_mime.lower()
                            )
                            if not is_empty_container:
                                logger.warning(f"Part {j} has no data - typeGroup='{part_type}', mimeType='{part_mime}'")

            chunkResults = await self._processChunksWithMapping(extractionResult, custom_prompt, options, generate_json=True)

            # Debug: Show what chunks were actually processed (after filtering)
            logger.info(f"After filtering: {len(chunkResults)} chunks will be processed")

            # Merge with JSON mode
            mergedJsonDocument = self._mergeChunkResultsJson(chunkResults, options)

            # Debug: Show what the AI actually returned
            logger.info(f"AI returned document with keys: {list(mergedJsonDocument.keys())}")
            if 'documents' in mergedJsonDocument:
                logger.info(f"Number of documents: {len(mergedJsonDocument['documents'])}")
            elif 'sections' in mergedJsonDocument:
                logger.info(f"Number of sections: {len(mergedJsonDocument['sections'])}")

            return mergedJsonDocument
        except Exception as e:
            logger.error(f"Error in per-chunk JSON processing: {str(e)}")
            return {"metadata": {"title": "Error Document"}, "sections": []}
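    # The merged result arrives in one of two shapes (matching the logging
    # above): a multi-file envelope or a single-document fallback:
    #
    #     {"metadata": {...}, "documents": [...]}   # multi-file response
    #     {"metadata": {...}, "sections":  [...]}   # single-file fallback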
    async def processDocumentsWithContinuation(
        self,
        documents: List[ChatDocument],
        custom_prompt: str,
        options: Optional[AiCallOptions] = None
    ) -> Dict[str, Any]:
        """
        Process documents with partial results continuation logic.
        Handles AI responses that indicate partial completion and loops until complete.
        """
        if not documents:
            return {"metadata": {"title": "Empty Document"}, "sections": []}

        logger.info("Starting document processing with continuation logic")

        # Build enhanced prompt with continuation instructions
        enhanced_prompt = self._buildContinuationPrompt(custom_prompt)

        # Process with continuation logic
        return await self._processWithContinuationLoop(documents, enhanced_prompt, options)

    def _buildContinuationPrompt(self, base_prompt: str) -> str:
        """
        Build a prompt that includes partial results continuation instructions.
        """
        continuation_instructions = """

IMPORTANT CHUNKING LOGIC:
- If the response is too large to generate completely in one response, set "continue": true
- When "continue": true, include a "continuation_context" field with:
  - "last_section_id": "id of the last completed section"
  - "last_element_index": "index of the last completed element in that section"
  - "remaining_requirements": "brief description of what still needs to be generated"
- The AI will be called again with this context to continue generation
- Only set "continue": false when the response is completely generated

OUTPUT FORMAT:
Return only valid JSON in this exact structure:
{
  "metadata": {
    "title": "Document Title"
  },
  "sections": [
    {
      "id": "section_1",
      "content_type": "paragraph",
      "elements": [
        {
          "text": "This is the actual content that should be generated."
        }
      ],
      "order": 1
    }
  ],
  "continue": false,
  "continuation_context": {
    "last_section_id": "section_1",
    "last_element_index": 0,
    "remaining_requirements": "description of what still needs to be generated"
  }
}

The AI should generate content using the canonical format with "sections" and "elements".
"""
        return f"{base_prompt}{continuation_instructions}"
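    # Minimal sketch of how a caller could read the continuation envelope
    # defined above. Assumes the model honoured the OUTPUT FORMAT contract;
    # illustrative only - the loop below inlines the same checks.
    @staticmethod
    def _readContinuationEnvelope(result: Dict[str, Any]) -> Tuple[bool, Optional[Dict[str, Any]]]:
        continue_flag = bool(result.get("continue", False))
        continuation_context = result.get("continuation_context") if continue_flag else None
        return continue_flag, continuation_context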
""" max_iterations = 10 # Prevent infinite loops iteration = 0 accumulated_sections = [] continuation_context = None while iteration < max_iterations: iteration += 1 logger.info(f"Continuation iteration {iteration}/{max_iterations}") # Build prompt for this iteration if continuation_context: iteration_prompt = self._buildContinuationIterationPrompt( enhanced_prompt, continuation_context, accumulated_sections ) else: iteration_prompt = enhanced_prompt # Process documents for this iteration try: # Use the existing processing method result = await self.processDocumentsPerChunkJsonWithPrompt( documents, iteration_prompt, options ) # Check if this is a valid JSON response if not isinstance(result, dict): logger.warning(f"Iteration {iteration}: Invalid result type, stopping") break # Extract sections from result sections = result.get("sections", []) if not sections: logger.warning(f"Iteration {iteration}: No sections found, stopping") break # Add sections to accumulated results for section in sections: # Update section order to maintain sequence section["order"] = len(accumulated_sections) + 1 accumulated_sections.append(section) # Check if continuation is needed continue_flag = result.get("continue", False) continuation_context = result.get("continuation_context") logger.info(f"Iteration {iteration}: Added {len(sections)} sections, continue={continue_flag}") if not continue_flag: logger.info(f"Continuation complete after {iteration} iterations") break if not continuation_context: logger.warning(f"Iteration {iteration}: continue=true but no continuation_context, stopping") break except Exception as e: logger.error(f"Iteration {iteration} failed: {str(e)}") break if iteration >= max_iterations: logger.warning(f"Continuation stopped after maximum iterations ({max_iterations})") # Build final result final_result = { "metadata": { "title": "Generated Document", "total_sections": len(accumulated_sections), "iterations": iteration, "continuation_used": iteration > 1 }, "sections": accumulated_sections, "continue": False } logger.info(f"Final result: {len(accumulated_sections)} sections from {iteration} iterations") return final_result def _buildContinuationIterationPrompt( self, base_prompt: str, continuation_context: Dict[str, Any], accumulated_sections: List[Dict[str, Any]] ) -> str: """ Build a prompt for continuation iteration with context. """ last_section_id = continuation_context.get("last_section_id", "") last_element_index = continuation_context.get("last_element_index", 0) remaining_requirements = continuation_context.get("remaining_requirements", "") # Build context of what's already been generated context_summary = "PREVIOUSLY GENERATED CONTENT:\n" for i, section in enumerate(accumulated_sections[-3:]): # Show last 3 sections for context context_summary += f"Section {i+1}: {section.get('id', 'unknown')}\n" if 'elements' in section and section['elements']: first_element = section['elements'][0] if 'text' in first_element: preview = first_element['text'][:100] + "..." 
    def _buildContinuationIterationPrompt(
        self,
        base_prompt: str,
        continuation_context: Dict[str, Any],
        accumulated_sections: List[Dict[str, Any]]
    ) -> str:
        """
        Build a prompt for continuation iteration with context.
        """
        last_section_id = continuation_context.get("last_section_id", "")
        last_element_index = continuation_context.get("last_element_index", 0)
        remaining_requirements = continuation_context.get("remaining_requirements", "")

        # Build context of what's already been generated
        context_summary = "PREVIOUSLY GENERATED CONTENT:\n"
        for i, section in enumerate(accumulated_sections[-3:]):  # Show last 3 sections for context
            context_summary += f"Section {i+1}: {section.get('id', 'unknown')}\n"
            if 'elements' in section and section['elements']:
                first_element = section['elements'][0]
                if 'text' in first_element:
                    preview = first_element['text'][:100] + "..." if len(first_element['text']) > 100 else first_element['text']
                    context_summary += f"  Preview: {preview}\n"

        continuation_prompt = f"""
{base_prompt}

{context_summary}

CONTINUATION INSTRUCTIONS:
- Continue from where you left off
- Last completed section: {last_section_id}
- Last completed element index: {last_element_index}
- Remaining requirements: {remaining_requirements}
- Generate the next part of the content
- Maintain consistency with previously generated content
- Use the same JSON format as before
- Set "continue": true if more content is needed, false if complete
"""
        return continuation_prompt

    async def callAiText(
        self,
        prompt: str,
        documents: Optional[List[ChatDocument]],
        options: AiCallOptions
    ) -> str:
        """
        Handle text calls with document processing through ExtractionService.
        UNIFIED PROCESSING: Always use per-chunk processing for consistency.
        """
        return await self.processDocumentsPerChunk(documents, prompt, options)
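    # Sketch of the MIME-first dispatch used inside _processChunksWithMapping
    # below: the document-level MIME type wins, then the part's own MIME type,
    # then the coarse typeGroup. Illustrative helper, not called by the class.
    @staticmethod
    def _looksLikeImage(document_mime_type: Optional[str],
                        part_mime_type: Optional[str],
                        type_group: Optional[str]) -> bool:
        return bool(
            (document_mime_type and document_mime_type.startswith("image/"))
            or (part_mime_type and part_mime_type.startswith("image/"))
            or type_group == "image"
        )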
    async def _processChunksWithMapping(
        self,
        extractionResult: List[ContentExtracted],
        prompt: str,
        options: Optional[AiCallOptions] = None,
        generate_json: bool = False
    ) -> List[ChunkResult]:
        """Process chunks with proper mapping to preserve relationships."""
        import asyncio

        # Collect all chunks that need processing with proper indexing
        chunks_to_process = []
        chunk_index = 0

        for ec in extractionResult:
            # Get document MIME type from metadata
            document_mime_type = None
            for part in ec.parts:
                if part.metadata and 'documentMimeType' in part.metadata:
                    document_mime_type = part.metadata['documentMimeType']
                    break

            for part in ec.parts:
                if part.typeGroup in ("text", "table", "structure", "image", "container", "binary"):
                    # Skip empty container chunks (they're just metadata containers)
                    if part.typeGroup == "container" and (not part.data or len(part.data.strip()) == 0):
                        logger.debug(f"Skipping empty container chunk: mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}")
                        continue

                    chunks_to_process.append({
                        'part': part,
                        'chunk_index': chunk_index,
                        'document_id': ec.id,
                        'document_mime_type': document_mime_type
                    })
                    chunk_index += 1

        logger.info(f"Processing {len(chunks_to_process)} chunks with proper mapping")

        # Process chunks in parallel with proper mapping
        async def process_single_chunk(chunk_info: Dict) -> Optional[ChunkResult]:
            part = chunk_info['part']
            chunk_index = chunk_info['chunk_index']
            document_id = chunk_info['document_id']
            document_mime_type = chunk_info.get('document_mime_type', part.mimeType)

            start_time = time.time()
            try:
                # FIXED: Check MIME type first, then fallback to typeGroup
                is_image = (
                    (document_mime_type and document_mime_type.startswith('image/'))
                    or (part.mimeType and part.mimeType.startswith('image/'))
                    or (part.typeGroup == "image")
                )

                # Debug logging
                self.services.utils.debugLogToFile(f"Chunk {chunk_index}: document_mime_type={document_mime_type}, part.mimeType={part.mimeType}, part.typeGroup={part.typeGroup}, is_image={is_image}", "AI_SERVICE")
                logger.info(f"Chunk {chunk_index}: document_mime_type={document_mime_type}, part.mimeType={part.mimeType}, part.typeGroup={part.typeGroup}, is_image={is_image}")

                if is_image:
                    # Use the same extraction prompt for image analysis (contains table JSON format)
                    self.services.utils.debugLogToFile(f"Processing image chunk {chunk_index}: mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}", "AI_SERVICE")

                    # Check if image data is available
                    if not part.data:
                        error_msg = f"No image data available for chunk {chunk_index}"
                        logger.warning(error_msg)
                        ai_result = f"Error: {error_msg}"
                    else:
                        try:
                            # Import here to avoid circular imports
                            from modules.services.serviceAi.subCoreAi import SubCoreAi
                            core_ai = SubCoreAi(self.services, self.aiObjects)
                            ai_result = await core_ai.readImage(
                                prompt=prompt,
                                imageData=part.data,
                                mimeType=part.mimeType,
                                options=options
                            )
                            self.services.utils.debugLogToFile(f"Image analysis result for chunk {chunk_index}: length={len(ai_result) if ai_result else 0}, preview={ai_result[:200] if ai_result else 'None'}...", "AI_SERVICE")

                            # Save image extraction response to debug file
                            self.services.utils.writeDebugFile(ai_result or 'No response', f"extraction_image_chunk_{chunk_index}")

                            # Check if result is empty or None
                            if not ai_result or not ai_result.strip():
                                logger.warning(f"Image chunk {chunk_index} returned empty response from AI")
                                ai_result = "No content detected in image"
                        except Exception as e:
                            logger.error(f"Error processing image chunk {chunk_index}: {str(e)}")
                            ai_result = f"Error analyzing image: {str(e)}"

                    # If generating JSON, clean image analysis result
                    if generate_json:
                        try:
                            # Clean the response - remove markdown code blocks if present
                            cleaned_result = ai_result.strip()

                            # Remove various markdown patterns
                            if cleaned_result.startswith('```json'):
                                cleaned_result = re.sub(r'^```json\s*', '', cleaned_result)
                                cleaned_result = re.sub(r'\s*```$', '', cleaned_result)
                            elif cleaned_result.startswith('```'):
                                cleaned_result = re.sub(r'^```\s*', '', cleaned_result)
                                cleaned_result = re.sub(r'\s*```$', '', cleaned_result)

                            # Remove any leading/trailing text that's not JSON
                            # Look for the first { and last } to extract JSON
                            first_brace = cleaned_result.find('{')
                            last_brace = cleaned_result.rfind('}')
                            if first_brace != -1 and last_brace != -1 and last_brace > first_brace:
                                cleaned_result = cleaned_result[first_brace:last_brace + 1]

                            # Additional cleaning for common AI response issues
                            cleaned_result = cleaned_result.strip()

                            # Validate JSON
                            json.loads(cleaned_result)
                            ai_result = cleaned_result  # Use cleaned version
                            self.services.utils.debugLogToFile(f"Image chunk {chunk_index} JSON validation successful", "AI_SERVICE")
                        except json.JSONDecodeError as e:
                            logger.warning(f"Image chunk {chunk_index} returned invalid JSON: {str(e)}")
                            logger.warning(f"Raw response was: '{ai_result[:500]}...'")

                            # Create fallback JSON with the actual response content (not the error message)
                            fallback_content = ai_result if ai_result and ai_result.strip() else "No content detected"
                            self.services.utils.debugLogToFile(f"IMAGE FALLBACK CONTENT PREVIEW: '{fallback_content[:200]}...'", "AI_SERVICE")
                            ai_result = json.dumps({
                                "metadata": {"title": f"Image Analysis - Chunk {chunk_index}"},
                                "sections": [{
                                    "id": f"image_section_{chunk_index}",
                                    "content_type": "paragraph",
                                    "elements": [{"text": fallback_content}]
                                }]
                            })
                            self.services.utils.debugLogToFile(f"Created fallback JSON for image chunk {chunk_index} with actual content", "AI_SERVICE")
                elif part.typeGroup in ("container", "binary"):
                    # Handle ALL container and binary content generically - let AI process any document type
                    self.services.utils.debugLogToFile(f"DEBUG: Chunk {chunk_index}: typeGroup={part.typeGroup}, mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}", "AI_SERVICE")

                    # Skip empty container chunks (they're just metadata containers)
                    if part.typeGroup == "container" and (not part.data or len(part.data.strip()) == 0):
                        self.services.utils.debugLogToFile(f"DEBUG: Skipping empty container - mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}", "AI_SERVICE")
                        logger.info(f"Chunk {chunk_index}: Skipping empty container - mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}")
                        # Skip processing this chunk entirely (a bare `pass` here would fall
                        # through and reference ai_result before assignment)
                        return None
                    elif part.mimeType and part.data and len(part.data.strip()) > 0:
                        # Process any document container as text content
                        request_options = options if options is not None else AiCallOptions()
                        request_options.operationType = OperationTypeEnum.GENERAL

                        self.services.utils.debugLogToFile(f"EXTRACTION CONTAINER CHUNK {chunk_index}: Processing {part.mimeType} container as text with generate_json={generate_json}", "AI_SERVICE")
                        logger.info(f"Chunk {chunk_index}: Processing {part.mimeType} container as text with generate_json={generate_json}")

                        # Log extraction prompt and context
                        self.services.utils.debugLogToFile(f"EXTRACTION PROMPT: {prompt}", "AI_SERVICE")
                        self.services.utils.debugLogToFile(f"EXTRACTION CONTEXT LENGTH: {len(part.data) if part.data else 0} characters", "AI_SERVICE")

                        # Strengthen prompt to forbid fabrication for text/container extraction
                        augmented_prompt = (
                            f"{prompt}\n\n"
                            "CRITICAL RULES (NO FABRICATION):\n"
                            "- Use ONLY content present in the provided CONTEXT.\n"
                            "- Do NOT create, infer, or guess values not explicitly in the context.\n"
                            "- If a value is missing, leave the cell empty or omit the row.\n"
                        )
                        request = AiCallRequest(
                            prompt=augmented_prompt,
                            context=part.data,
                            options=request_options
                        )
                        response = await self.aiObjects.call(request)
                        ai_result = response.content

                        # Log extraction response
                        self.services.utils.debugLogToFile(f"EXTRACTION RESPONSE LENGTH: {len(ai_result) if ai_result else 0} characters", "AI_SERVICE")

                        # Save extraction prompt and response to debug
                        self.services.utils.writeDebugFile(augmented_prompt, f"extraction-Chunk{chunk_index}-Prompt")
                        self.services.utils.writeDebugFile(ai_result or '', f"extraction-Chunk{chunk_index}-Response")

                        # If generating JSON, validate the response
                        if generate_json:
                            try:
                                # Clean the response - remove markdown code blocks if present
                                cleaned_result = ai_result.strip()

                                # Remove various markdown patterns
                                if cleaned_result.startswith('```json'):
                                    cleaned_result = re.sub(r'^```json\s*', '', cleaned_result)
                                    cleaned_result = re.sub(r'\s*```$', '', cleaned_result)
                                elif cleaned_result.startswith('```'):
                                    cleaned_result = re.sub(r'^```\s*', '', cleaned_result)
                                    cleaned_result = re.sub(r'\s*```$', '', cleaned_result)

                                # Remove any leading/trailing text that's not JSON
                                # Look for the first { and last } to extract JSON
                                first_brace = cleaned_result.find('{')
                                last_brace = cleaned_result.rfind('}')
                                if first_brace != -1 and last_brace != -1 and last_brace > first_brace:
                                    cleaned_result = cleaned_result[first_brace:last_brace + 1]

                                # Additional cleaning for common AI response issues
                                cleaned_result = cleaned_result.strip()

                                # Validate JSON
                                json.loads(cleaned_result)
                                ai_result = cleaned_result  # Use cleaned version
                            except json.JSONDecodeError as e:
                                logger.warning(f"Container chunk {chunk_index} ({part.mimeType}) returned invalid JSON: {str(e)}")
                                logger.warning(f"Raw response was: '{ai_result[:500]}...'")

                                # Create fallback JSON with the actual response content (not the error message)
                                fallback_content = ai_result if ai_result and ai_result.strip() else "No content detected"
                                self.services.utils.debugLogToFile(f"FALLBACK CONTENT PREVIEW: '{fallback_content[:200]}...'", "AI_SERVICE")
                                ai_result = json.dumps({
                                    "metadata": {"title": f"Document Analysis - Chunk {chunk_index}"},
                                    "sections": [{
                                        "id": f"analysis_section_{chunk_index}",
                                        "content_type": "paragraph",
                                        "elements": [{"text": fallback_content}]
                                    }]
                                })
                                self.services.utils.debugLogToFile(f"Created fallback JSON for container chunk {chunk_index} with actual content", "AI_SERVICE")
                    else:
                        # Skip empty or invalid container/binary content - don't create a result
                        self.services.utils.debugLogToFile(f"DEBUG: Chunk {chunk_index}: Skipping empty container - mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}", "AI_SERVICE")
                        # Return None to indicate this chunk should be completely skipped
                        return None

                else:
                    # Ensure options is not None and set correct operation type for text
                    request_options = options if options is not None else AiCallOptions()
                    # FIXED: Set operation type to general for text processing
                    request_options.operationType = OperationTypeEnum.GENERAL

                    self.services.utils.debugLogToFile(f"EXTRACTION CHUNK {chunk_index}: Calling aiObjects.call with operationType={request_options.operationType}, generate_json={generate_json}", "AI_SERVICE")
                    logger.info(f"Chunk {chunk_index}: Calling aiObjects.call with operationType={request_options.operationType}, generate_json={generate_json}")

                    # Log extraction context length
                    self.services.utils.debugLogToFile(f"EXTRACTION CONTEXT LENGTH: {len(part.data) if part.data else 0} characters", "AI_SERVICE")

                    # Debug: Log the actual prompt being sent to AI
                    logger.debug(f"AI PROMPT PREVIEW: {prompt[:300]}...")
                    logger.debug(f"AI CONTEXT PREVIEW: {part.data[:200] if part.data else 'None'}...")

                    # Strengthen prompt to forbid fabrication for text extraction
                    augmented_prompt_text = (
                        f"{prompt}\n\n"
                        "CRITICAL RULES (NO FABRICATION):\n"
                        "- Use ONLY content present in the provided CONTEXT.\n"
                        "- Do NOT create, infer, or guess values not explicitly in the context.\n"
                        "- If a value is missing, leave the cell empty or omit the row.\n"
                    )
                    request = AiCallRequest(
                        prompt=augmented_prompt_text,
                        context=part.data,
                        options=request_options
                    )
                    response = await self.aiObjects.call(request)

                    # Debug: Log what AI actually returned
                    logger.debug(f"AI RESPONSE PREVIEW: {response.content[:300] if response.content else 'None'}...")

                    ai_result = response.content

                    # Log extraction response length
                    self.services.utils.debugLogToFile(f"EXTRACTION RESPONSE LENGTH: {len(ai_result) if ai_result else 0} characters", "AI_SERVICE")

                    # Save extraction prompt and response to debug
                    self.services.utils.writeDebugFile(augmented_prompt_text, f"extractionChunk{chunk_index}-Prompt")
                    self.services.utils.writeDebugFile(ai_result or '', f"extractionChunk{chunk_index}-Response")

                    # If generating JSON, validate the response
                    if generate_json:
                        try:
                            # Clean the response - remove markdown code blocks and extra formatting
                            cleaned_result = ai_result.strip()

                            # Remove any markdown code block markers (```json, ```, etc.)
                            cleaned_result = re.sub(r'^```(?:json)?\s*', '', cleaned_result, flags=re.MULTILINE)
                            cleaned_result = re.sub(r'\s*```\s*$', '', cleaned_result, flags=re.MULTILINE)

                            # Remove any remaining ``` markers anywhere in the text
                            cleaned_result = re.sub(r'```', '', cleaned_result)

                            # Try to extract JSON from the response if it's embedded in other text
                            json_match = re.search(r'\{.*\}', cleaned_result, re.DOTALL)
                            if json_match:
                                cleaned_result = json_match.group(0)

                            # Validate JSON
                            json.loads(cleaned_result)
                            ai_result = cleaned_result  # Use cleaned version
                        except json.JSONDecodeError as e:
                            logger.warning(f"Chunk {chunk_index} returned invalid JSON: {str(e)}")
                            # Create fallback JSON
                            ai_result = json.dumps({
                                "metadata": {"title": "Error Section"},
                                "sections": [{
                                    "id": f"error_section_{chunk_index}",
                                    "content_type": "paragraph",
                                    "elements": [{"text": f"Error parsing JSON: {str(e)}"}]
                                }]
                            })

                processing_time = time.time() - start_time
                logger.info(f"Chunk {chunk_index} processed: {len(ai_result)} chars in {processing_time:.2f}s")

                return ChunkResult(
                    originalChunk=part,
                    aiResult=ai_result,
                    chunkIndex=chunk_index,
                    documentId=document_id,
                    processingTime=processing_time,
                    metadata={
                        "success": True,
                        "chunkSize": len(part.data) if part.data else 0,
                        "resultSize": len(ai_result),
                        "typeGroup": part.typeGroup
                    }
                )
            except Exception as e:
                processing_time = time.time() - start_time
                logger.warning(f"Error processing chunk {chunk_index}: {str(e)}")
                return ChunkResult(
                    originalChunk=part,
                    aiResult=f"[Error processing chunk: {str(e)}]",
                    chunkIndex=chunk_index,
                    documentId=document_id,
                    processingTime=processing_time,
                    metadata={
                        "success": False,
                        "error": str(e),
                        "chunkSize": len(part.data) if part.data else 0,
                        "typeGroup": part.typeGroup
                    }
                )

        # Process chunks with concurrency control
        max_concurrent = 5  # Default concurrency
        if options and hasattr(options, 'maxConcurrentChunks'):
            max_concurrent = options.maxConcurrentChunks
        elif options and hasattr(options, 'maxParallelChunks'):
            max_concurrent = options.maxParallelChunks

        logger.info(f"Processing {len(chunks_to_process)} chunks with max concurrency: {max_concurrent}")
        self.services.utils.debugLogToFile(f"DEBUG: Chunks to process: {len(chunks_to_process)}", "AI_SERVICE")
        for i, chunk_info in enumerate(chunks_to_process):
            self.services.utils.debugLogToFile(f"DEBUG: Chunk {i}: typeGroup={chunk_info['part'].typeGroup}, mimeType={chunk_info['part'].mimeType}, data_length={len(chunk_info['part'].data) if chunk_info['part'].data else 0}", "AI_SERVICE")

        # Create semaphore for concurrency control
        semaphore = asyncio.Semaphore(max_concurrent)

        async def process_with_semaphore(chunk_info):
            async with semaphore:
                return await process_single_chunk(chunk_info)

        # Process all chunks in parallel with concurrency control
        tasks = [process_with_semaphore(chunk_info) for chunk_info in chunks_to_process]
        self.services.utils.debugLogToFile(f"DEBUG: Created {len(tasks)} tasks for parallel processing", "AI_SERVICE")
        chunk_results = await asyncio.gather(*tasks, return_exceptions=True)
        self.services.utils.debugLogToFile(f"DEBUG: Got {len(chunk_results)} results from parallel processing", "AI_SERVICE")

        # Handle any exceptions in the gather itself
        processed_results = []
        for i, result in enumerate(chunk_results):
            if isinstance(result, Exception):
                # Create error ChunkResult
                chunk_info = chunks_to_process[i]
                processed_results.append(ChunkResult(
                    originalChunk=chunk_info['part'],
                    aiResult=f"[Error in parallel processing: {str(result)}]",
                    chunkIndex=chunk_info['chunk_index'],
                    documentId=chunk_info['document_id'],
                    processingTime=0.0,
                    metadata={"success": False, "error": str(result)}
                ))
            elif result is not None:
                # Only add non-None results (skip empty containers)
                processed_results.append(result)

        logger.info(f"Completed processing {len(processed_results)} chunks")
        return processed_results
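    # The markdown-fence / brace-extraction cleanup above is repeated in three
    # branches; this is a minimal standalone sketch of the same normalization
    # (assumption: the response wraps a single JSON object). Illustrative only.
    @staticmethod
    def _stripJsonDecorations(raw: str) -> str:
        cleaned = raw.strip()
        # Drop leading/trailing markdown code fences
        cleaned = re.sub(r'^```(?:json)?\s*', '', cleaned, flags=re.MULTILINE)
        cleaned = re.sub(r'\s*```\s*$', '', cleaned, flags=re.MULTILINE)
        # Keep only the outermost {...} span, if one exists
        first, last = cleaned.find('{'), cleaned.rfind('}')
        if first != -1 and last > first:
            cleaned = cleaned[first:last + 1]
        return cleaned.strip()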
    def _mergeChunkResults(
        self,
        chunkResults: List[ChunkResult],
        options: Optional[AiCallOptions] = None
    ) -> str:
        """Merge chunk results while preserving document structure and chunk order."""
        if not chunkResults:
            return ""

        # Get merging configuration from options
        chunk_separator = "\n\n---\n\n"
        include_document_headers = True
        include_chunk_metadata = False

        if options:
            if hasattr(options, 'chunkSeparator'):
                chunk_separator = options.chunkSeparator
            elif hasattr(options, 'mergeStrategy') and options.mergeStrategy:
                chunk_separator = options.mergeStrategy.get("chunkSeparator", "\n\n---\n\n")

            # Check for enhanced options
            if hasattr(options, 'preserveChunkMetadata'):
                include_chunk_metadata = options.preserveChunkMetadata

        # Group chunk results by document
        results_by_document = {}
        for chunk_result in chunkResults:
            doc_id = chunk_result.documentId
            if doc_id not in results_by_document:
                results_by_document[doc_id] = []
            results_by_document[doc_id].append(chunk_result)

        # Sort chunks within each document by chunk index
        for doc_id in results_by_document:
            results_by_document[doc_id].sort(key=lambda x: x.chunkIndex)

        # Merge results for each document
        merged_documents = []
        for doc_id, doc_chunks in results_by_document.items():
            # Build document header if enabled
            doc_header = ""
            if include_document_headers:
                doc_header = f"\n\n=== DOCUMENT: {doc_id} ===\n\n"

            # Merge chunks for this document
            doc_content = ""
            for i, chunk_result in enumerate(doc_chunks):
                # Add chunk separator (except for first chunk)
                if i > 0:
                    doc_content += chunk_separator

                # Add chunk content with optional metadata
                chunk_metadata = chunk_result.metadata
                if chunk_metadata.get("success", False):
                    chunk_content = chunk_result.aiResult

                    # Add chunk metadata if enabled
                    if include_chunk_metadata:
                        chunk_info = f"[Chunk {chunk_result.chunkIndex} - {chunk_metadata.get('typeGroup', 'unknown')} - {chunk_metadata.get('chunkSize', 0)} chars]"
                        chunk_content = f"{chunk_info}\n{chunk_content}"

                    doc_content += chunk_content
                else:
                    # Handle error chunks
                    error_msg = f"[ERROR in chunk {chunk_result.chunkIndex}: {chunk_metadata.get('error', 'Unknown error')}]"
                    doc_content += error_msg

            merged_documents.append(doc_header + doc_content)

        # Join all documents
        final_result = "\n\n".join(merged_documents)

        logger.info(f"Merged {len(chunkResults)} chunks from {len(results_by_document)} documents")
        return final_result.strip()

    def _mergeChunkResultsClean(
        self,
        chunkResults: List[ChunkResult],
        options: Optional[AiCallOptions] = None
    ) -> str:
        """Merge chunk results in CLEAN mode - no debug metadata or document headers."""
        if not chunkResults:
            return ""

        # Get merging configuration from options
        chunk_separator = "\n\n"
        include_document_headers = False  # CLEAN MODE: No document headers
        include_chunk_metadata = False  # CLEAN MODE: No chunk metadata

        if options:
            if hasattr(options, 'chunkSeparator'):
                chunk_separator = options.chunkSeparator
            elif hasattr(options, 'mergeStrategy') and options.mergeStrategy:
                chunk_separator = options.mergeStrategy.get("chunkSeparator", "\n\n")

        # Group chunk results by document
        results_by_document = {}
        for chunk_result in chunkResults:
            doc_id = chunk_result.documentId
            if doc_id not in results_by_document:
                results_by_document[doc_id] = []
            results_by_document[doc_id].append(chunk_result)

        # Sort chunks within each document by chunk index
        for doc_id in results_by_document:
            results_by_document[doc_id].sort(key=lambda x: x.chunkIndex)

        # Merge results for each document in CLEAN mode
        merged_documents = []
        for doc_id, doc_chunks in results_by_document.items():
            # CLEAN MODE: No document headers
            doc_header = ""

            # Merge chunks for this document
            doc_content = ""
            for i, chunk_result in enumerate(doc_chunks):
                # Add chunk separator (except for first chunk)
                if i > 0:
                    doc_content += chunk_separator

                # Add chunk content without metadata
                chunk_metadata = chunk_result.metadata
                if chunk_metadata.get("success", False):
                    chunk_content = chunk_result.aiResult

                    # CLEAN MODE: Skip container/binary chunks entirely
                    if chunk_content.startswith("[Skipped ") and "content:" in chunk_content:
                        continue  # Skip container/binary chunks in clean mode

                    # CLEAN MODE: Skip empty or whitespace-only chunks
                    if not chunk_content.strip():
                        continue  # Skip empty chunks in clean mode

                    # CLEAN MODE: No chunk metadata
                    doc_content += chunk_content
                else:
                    # Handle error chunks silently in clean mode
                    continue

            merged_documents.append(doc_header + doc_content)

        # Join all documents
        final_result = "\n\n".join(merged_documents)
        return final_result.strip()
chunk_json["metadata"].get("title", "") if title and title not in document_titles: document_titles.append(title) except json.JSONDecodeError as e: logger.warning(f"Failed to parse JSON from chunk {chunk_result.chunkIndex}: {str(e)}") # Create a fallback section for invalid JSON fallback_section = { "id": f"error_section_{chunk_result.chunkIndex}", "title": "Error Section", "content_type": "paragraph", "elements": [{ "text": f"Error parsing chunk {chunk_result.chunkIndex}: {str(e)}" }], "order": chunk_result.chunkIndex, "metadata": { "source_document": doc_id, "chunk_index": chunk_result.chunkIndex, "error": str(e) } } all_sections.append(fallback_section) else: # Handle error chunks error_section = { "id": f"error_section_{chunk_result.chunkIndex}", "title": "Error Section", "content_type": "paragraph", "elements": [{ "text": f"Error in chunk {chunk_result.chunkIndex}: {chunk_metadata.get('error', 'Unknown error')}" }], "order": chunk_result.chunkIndex, "metadata": { "source_document": doc_id, "chunk_index": chunk_result.chunkIndex, "error": chunk_metadata.get('error', 'Unknown error') } } all_sections.append(error_section) # Sort sections by order all_sections.sort(key=lambda x: x.get("order", 0)) # If we have merged documents from multi-file responses, return them if all_documents: logger.info(f"Merged {len(all_documents)} documents from {len(chunkResults)} chunks") return { "metadata": combined_metadata, "documents": all_documents } # Otherwise, create merged document with sections (single-file fallback) merged_document = { "metadata": { "title": document_titles[0] if document_titles else "Merged Document", "source_documents": list(results_by_document.keys()), "extraction_method": "ai_json_extraction", "version": "1.0" }, "sections": all_sections, "summary": f"Merged document from {len(results_by_document)} source documents", "tags": ["merged", "ai_generated"] } logger.info(f"Merged {len(chunkResults)} chunks from {len(results_by_document)} documents (JSON mode)") return merged_document def _getModelCapabilitiesForContent(self, prompt: str, documents: Optional[List[ChatDocument]], options: AiCallOptions) -> Dict[str, int]: """ Get model capabilities for content processing, including appropriate size limits for chunking. Uses centralized model selection to determine chunking parameters. 
""" # Estimate total content size prompt_size = len(prompt.encode('utf-8')) document_size = 0 if documents: # Rough estimate of document content size for doc in documents: document_size += doc.fileSize or 0 total_size = prompt_size + document_size # Use centralized model selection to get the best model for chunking parameters try: from modules.aicore.aicoreModelRegistry import modelRegistry from modules.aicore.aicoreModelSelector import model_selector # Get available models and select the best one for this operation availableModels = modelRegistry.getAvailableModels() selectedModel = model_selector.selectModel(prompt, "", options, availableModels) if selectedModel: context_length = selectedModel.contextLength model_name = selectedModel.name logger.debug(f"Selected model for chunking: {model_name} with context length: {context_length}") else: # Fallback to conservative default if no model selected context_length = 128000 # GPT-4o default model_name = "fallback" logger.warning(f"No model selected for chunking, using fallback context length: {context_length}") except Exception as e: # Fallback to conservative default if model selection fails context_length = 128000 # GPT-4o default model_name = "fallback" logger.error(f"Model selection failed for chunking: {e}, using fallback context length: {context_length}") # Calculate appropriate sizes # Convert tokens to bytes (rough estimate: 1 token ≈ 4 characters) context_length_bytes = int(context_length * 4) max_context_bytes = int(context_length_bytes * 0.9) # 90% of context length text_chunk_size = int(max_context_bytes * 0.7) # 70% of max context for text chunks image_chunk_size = int(max_context_bytes * 0.8) # 80% of max context for image chunks logger.debug(f"Content size: {total_size} bytes, Max context: {max_context_bytes} bytes") logger.debug(f"Text chunk size: {text_chunk_size} bytes, Image chunk size: {image_chunk_size} bytes") return { "maxContextBytes": max_context_bytes, "textChunkSize": text_chunk_size, "imageChunkSize": image_chunk_size }