"""Document generation sub-service.

Provides unified single-file and multi-file AI document generation:
prompt-intent analysis, content extraction, structural validation,
optional AI enhancement of extracted JSON, and rendering into the
requested output format.
"""

import json
import logging
import re
import time
from datetime import UTC, datetime
from typing import Dict, Any, List, Optional, Tuple, Union

from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType

logger = logging.getLogger(__name__)


class SubDocumentGeneration:
    """Document generation operations including single-file and multi-file generation."""

    def __init__(self, services, aiObjects, documentProcessor):
        """Initialize document generation service.

        Args:
            services: Service center instance for accessing other services
            aiObjects: Initialized AiObjects instance
            documentProcessor: Document processing service instance
        """
        self.services = services
        self.aiObjects = aiObjects
        self.documentProcessor = documentProcessor

    async def callAiWithDocumentGeneration(
        self,
        prompt: str,
        documents: Optional[List[ChatDocument]],
        options: AiCallOptions,
        outputFormat: str,
        title: Optional[str]
    ) -> Dict[str, Any]:
        """
        Unified document generation method that handles both single and multi-file cases.
        Always uses multi-file approach internally.

        Args:
            prompt: The main prompt for the AI call
            documents: Optional list of documents to process
            options: AI call configuration options
            outputFormat: Target output format (html, pdf, docx, txt, md, json, csv, xlsx)
            title: Optional title for generated documents

        Returns:
            Dict with generated documents and metadata in unified structure;
            on failure, the error structure from _buildErrorResult.
        """
        try:
            # 1. Analyze prompt intent (self is passed as the AI service that
            #    performs the analysis call).
            promptAnalysis = await self._analyzePromptIntent(prompt, self)
            logger.info(f"Prompt analysis result: {promptAnalysis}")

            # 2. Get unified extraction prompt.
            # Imported locally — presumably to avoid a circular import with
            # the generation service; keep it function-scoped.
            from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
            generationService = GenerationService(self.services)
            extractionPrompt = await generationService.getAdaptiveExtractionPrompt(
                outputFormat=outputFormat,
                userPrompt=prompt,
                title=title,
                promptAnalysis=promptAnalysis,
                aiService=self
            )

            # 3. Process with unified pipeline (always multi-file approach)
            aiResponse = await self._processDocumentsUnified(
                documents, extractionPrompt, options
            )

            # 4. Return unified result structure
            return await self._buildUnifiedResult(aiResponse, outputFormat, title, promptAnalysis)
        except Exception as e:
            logger.error(f"Error in unified document generation: {str(e)}")
            return self._buildErrorResult(str(e), outputFormat, title)

    async def _processDocumentsUnified(
        self,
        documents: Optional[List[ChatDocument]],
        extractionPrompt: str,
        options: AiCallOptions
    ) -> Dict[str, Any]:
        """
        Unified document processing that handles both single and multi-file cases.
        Always processes as multi-file structure internally.

        Tracks progress through the workflow progress logger, writes prompt and
        response debug files, validates the AI response structure, and posts the
        raw extraction as a chat attachment (best-effort).

        Raises:
            Exception: If the AI response fails structural validation, or any
                underlying processing step raises (re-raised after marking the
                progress operation as failed).
        """
        # Create progress logger
        workflow = self.services.currentWorkflow
        progressLogger = self.services.workflow.createProgressLogger(workflow)
        operationId = f"docGenUnified_{workflow.id}_{int(time.time())}"

        try:
            # Start progress tracking
            progressLogger.startOperation(
                operationId,
                "Generate",
                "Unified Document Generation",
                f"Processing {len(documents) if documents else 0} documents"
            )

            # Update progress - generating extraction prompt
            progressLogger.updateProgress(operationId, 0.1, "Generating prompt")

            # Write prompt to debug file (project debug helper; local import)
            from modules.shared.debugLogger import writeDebugFile
            writeDebugFile(extractionPrompt, "extraction_prompt", documents)

            # Process with unified JSON pipeline using continuation logic
            aiResponse = await self.documentProcessor.processDocumentsWithContinuation(
                documents, extractionPrompt, options
            )

            # Update progress - AI processing completed
            progressLogger.updateProgress(operationId, 0.6, "Processing done")

            # Write AI response to debug file
            response_json = (
                json.dumps(aiResponse, indent=2, ensure_ascii=False)
                if isinstance(aiResponse, dict)
                else str(aiResponse)
            )
            writeDebugFile(response_json, "ai_response", documents)

            # Validate response structure
            if not self._validateUnifiedResponseStructure(aiResponse):
                raise Exception("AI response is not valid unified document structure")

            # Emit raw extracted data as a chat message attachment (best-effort:
            # a failure here must not abort document generation).
            try:
                await self._postRawDataChatMessage(aiResponse, label="raw_extraction_unified")
            except Exception:
                logger.warning("Failed to emit raw extraction chat message (unified)")

            # Complete progress tracking
            progressLogger.completeOperation(operationId, True)
            return aiResponse
        except Exception as e:
            logger.error(f"Error in unified document processing: {str(e)}")
            progressLogger.completeOperation(operationId, False)
            raise

    def _validateUnifiedResponseStructure(self, response: Dict[str, Any]) -> bool:
        """
        Unified validation that checks for document structure.

        Handles both multi-file ("documents" array) and single-file
        ("sections" array) structures. A single-file response is converted
        in place into the multi-file format (a one-element "documents" list).

        Returns:
            True if at least one valid document is present (or the single-file
            structure could be converted); False otherwise.
        """
        try:
            if not isinstance(response, dict):
                logger.warning(f"Response validation failed: Response is not a dict, got {type(response)}")
                return False

            # Check for documents array (multi-file structure)
            hasDocuments = "documents" in response
            isDocumentsList = isinstance(response.get("documents"), list)

            # Check for sections array (single-file structure)
            hasSections = "sections" in response
            isSectionsList = isinstance(response.get("sections"), list)

            if hasDocuments and isDocumentsList:
                # Multi-file structure
                documents = response.get("documents", [])
                if not documents:
                    logger.warning("Unified validation failed: documents array is empty")
                    return False

                # Validate each document individually; one bad document does
                # not fail the batch.
                validDocuments = 0
                for i, doc in enumerate(documents):
                    if self._validateDocumentStructure(doc, i):
                        validDocuments += 1
                    else:
                        logger.warning(f"Document {i} failed validation, but continuing with others")

                # Process succeeds if at least one document is valid
                if validDocuments == 0:
                    logger.error("Unified validation failed: no valid documents found")
                    return False

                logger.info(f"Unified validation passed: {validDocuments}/{len(documents)} documents valid")
                return True
            elif hasSections and isSectionsList:
                # Single-file structure - convert to multi-file format
                logger.info("Converting single-file structure to multi-file format")
                sections = response.get("sections", [])
                if not sections:
                    logger.warning("Unified validation failed: sections array is empty")
                    return False

                # Convert to documents array format (mutates `response`).
                response["documents"] = [{
                    "id": "document_1",
                    "title": response.get("metadata", {}).get("title", "Generated Document"),
                    "filename": "document_1",
                    "sections": sections
                }]
                logger.info("Successfully converted single-file structure to multi-file format")
                return True
            else:
                # No valid structure found - fail with clear error details
                logger.error("Unified validation failed: No valid structure found")
                logger.error(f"Response type: {type(response)}")
                logger.error(f"Available keys: {list(response.keys()) if isinstance(response, dict) else 'Not a dict'}")
                logger.error(f"hasDocuments={hasDocuments}, isDocumentsList={isDocumentsList}")
                logger.error(f"hasSections={hasSections}, isSectionsList={isSectionsList}")
                logger.error(f"Full response: {response}")
                return False
        except Exception as e:
            logger.warning(f"Unified response validation failed with exception: {str(e)}")
            return False

    def _validateDocumentStructure(self, document: Dict[str, Any], documentIndex: int) -> bool:
        """
        Validate individual document structure.

        A document is valid when it is a dict with a "title" key and a
        non-empty "sections" list.

        Returns:
            True if document is valid, False otherwise. Does not fail the
            entire process if one document is invalid.
        """
        try:
            if not isinstance(document, dict):
                logger.error(f"Document {documentIndex} validation failed: not a dict, got {type(document)}")
                logger.error(f"Document {documentIndex} content: {document}")
                return False

            # Check for required fields
            hasTitle = "title" in document
            hasSections = "sections" in document
            isSectionsList = isinstance(document.get("sections"), list)

            logger.debug(f"Document {documentIndex} structure check:")
            logger.debug(f"  - hasTitle: {hasTitle}")
            logger.debug(f"  - hasSections: {hasSections}")
            logger.debug(f"  - isSectionsList: {isSectionsList}")
            logger.debug(f"  - available keys: {list(document.keys())}")

            if not (hasTitle and hasSections and isSectionsList):
                logger.error(f"Document {documentIndex} validation failed:")
                logger.error(f"  - title present: {hasTitle}")
                logger.error(f"  - sections present: {hasSections}")
                logger.error(f"  - sections is list: {isSectionsList}")
                logger.error(f"  - document content: {document}")
                return False

            sections = document.get("sections", [])
            if not sections:
                logger.error(f"Document {documentIndex} validation failed: sections array is empty")
                logger.error(f"  - document content: {document}")
                return False

            logger.info(f"Document {documentIndex} validation passed")
            return True
        except Exception as e:
            logger.error(f"Document {documentIndex} validation failed with exception: {str(e)}")
            logger.error(f"  - document content: {document}")
            return False

    async def _buildUnifiedResult(
        self,
        aiResponse: Dict[str, Any],
        outputFormat: str,
        title: Optional[str],
        promptAnalysis: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Build unified result structure that always returns array-based format.
        Content is always a multi-document structure.

        Documents that fail processing are skipped; the call fails (error
        result) only when no document could be processed at all.
        """
        try:
            # Process all documents uniformly
            generatedDocuments = []
            documents = aiResponse.get("documents", [])

            for i, docData in enumerate(documents):
                try:
                    processedDocument = await self._processDocument(
                        docData, outputFormat, title, i
                    )
                    generatedDocuments.append(processedDocument)
                except Exception as e:
                    logger.warning(f"Failed to process document {i}: {str(e)}, skipping")
                    continue

            if not generatedDocuments:
                raise Exception("No documents could be processed successfully")

            # Build unified result
            result = {
                "success": True,
                "content": aiResponse,  # Always multi-document structure
                "documents": generatedDocuments,  # Always array
                "is_multi_file": len(generatedDocuments) > 1,
                "format": outputFormat,
                "title": title,
                "split_strategy": promptAnalysis.get("strategy", "single"),
                "total_documents": len(generatedDocuments),
                "processed_documents": len(generatedDocuments)
            }
            return result
        except Exception as e:
            logger.error(f"Error building unified result: {str(e)}")
            return self._buildErrorResult(str(e), outputFormat, title)

    async def _processDocument(
        self,
        docData: Dict[str, Any],
        outputFormat: str,
        title: Optional[str],
        documentIndex: int
    ) -> Dict[str, Any]:
        """
        Process individual document with content enhancement and rendering.

        Attempts an AI pass to enhance the extracted JSON; falls back to the
        original extracted content whenever enhancement fails or returns
        invalid JSON. The (possibly enhanced) content is then rendered to
        the target output format.

        Returns:
            Dict with documentName, documentData, mimeType, title and
            documentIndex for the rendered document.

        Raises:
            Exception: If rendering itself fails (enhancement failures are
                non-fatal and logged).
        """
        try:
            # Get generation service (local import — presumably to avoid a
            # circular import; see callAiWithDocumentGeneration)
            from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
            generationService = GenerationService(self.services)

            # Use AI generation to enhance the extracted JSON before rendering
            enhancedContent = docData  # Default to original
            if docData.get("sections"):
                try:
                    # Get generation prompt
                    generationPrompt = await generationService.getGenerationPrompt(
                        outputFormat=outputFormat,
                        userPrompt=title,
                        title=docData.get("title", title),
                        aiService=self
                    )

                    # Prepare the AI call
                    requestOptions = AiCallOptions()
                    requestOptions.operationType = OperationType.GENERAL

                    # Create context with the extracted JSON content
                    context = f"Extracted JSON content:\n{json.dumps(docData, indent=2)}"
                    request = AiCallRequest(
                        prompt=generationPrompt,
                        context=context,
                        options=requestOptions
                    )

                    # Call AI to enhance the content
                    response = await self.aiObjects.call(request)
                    if response and response.content:
                        # Parse the AI response as JSON
                        try:
                            result = response.content.strip()

                            # Extract JSON from markdown code fences if present
                            jsonMatch = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL)
                            if jsonMatch:
                                result = jsonMatch.group(1).strip()
                            elif result.startswith('```json'):
                                result = re.sub(r'^```json\s*', '', result)
                                result = re.sub(r'\s*```$', '', result)
                            elif result.startswith('```'):
                                result = re.sub(r'^```\s*', '', result)
                                result = re.sub(r'\s*```$', '', result)

                            # Try to parse JSON
                            enhancedContent = json.loads(result)
                            logger.info(f"AI enhanced JSON content successfully for document {documentIndex}")
                        except json.JSONDecodeError as e:
                            logger.warning(f"AI generation returned invalid JSON for document {documentIndex}: {str(e)}, using original content")
                            enhancedContent = docData
                    else:
                        logger.warning(f"AI generation returned empty response for document {documentIndex}, using original content")
                        enhancedContent = docData
                except Exception as e:
                    logger.warning(f"AI generation failed for document {documentIndex}: {str(e)}, using original content")
                    enhancedContent = docData

            # Render the enhanced JSON content
            renderedContent, mimeType = await generationService.renderReport(
                extractedContent=enhancedContent,
                outputFormat=outputFormat,
                title=docData.get("title", title),
                userPrompt=title,
                aiService=self
            )

            # Generate proper filename: strip any existing extension, then
            # append the extension for the requested format.
            baseFilename = docData.get("filename", f"document_{documentIndex + 1}")
            if '.' in baseFilename:
                baseFilename = baseFilename.rsplit('.', 1)[0]

            # Known formats get a normalized lowercase extension; any other
            # format keeps its verbatim spelling (original behavior).
            knownExtensions = {"docx": ".docx", "pdf": ".pdf", "html": ".html"}
            extension = knownExtensions.get(outputFormat.lower(), f".{outputFormat}")
            filename = f"{baseFilename}{extension}"

            return {
                "documentName": filename,
                "documentData": renderedContent,
                "mimeType": mimeType,
                "title": docData.get("title", title),
                "documentIndex": documentIndex
            }
        except Exception as e:
            logger.error(f"Error processing document {documentIndex}: {str(e)}")
            raise

    def _buildErrorResult(self, errorMessage: str, outputFormat: str, title: Optional[str]) -> Dict[str, Any]:
        """
        Build error result with unified structure.

        Mirrors the shape of the success result from _buildUnifiedResult so
        callers can handle both uniformly.
        """
        return {
            "success": False,
            "error": errorMessage,
            "content": {},
            "documents": [],
            "is_multi_file": False,
            "format": outputFormat,
            "title": title,
            "split_strategy": "error",
            "total_documents": 0,
            "processed_documents": 0
        }

    async def _callAiJson(
        self,
        prompt: str,
        documents: Optional[List[ChatDocument]],
        options: AiCallOptions
    ) -> Dict[str, Any]:
        """
        Handle AI calls with document processing for JSON output.
        Returns structured JSON document instead of text.
        """
        # Process documents with JSON merging
        return await self.documentProcessor.processDocumentsPerChunkJson(documents, prompt, options)

    async def _analyzePromptIntent(self, prompt: str, ai_service=None) -> Dict[str, Any]:
        """Use AI to analyze user prompt and determine processing requirements.

        Args:
            prompt: Raw user prompt to classify.
            ai_service: Object exposing ``aiObjects.call``; when falsy, the
                analysis is skipped and the single-file default is returned.

        Returns:
            Analysis dict with at least ``is_multi_file``, ``strategy`` and
            ``criteria`` keys (the AI may add more, e.g. ``reasoning``).
        """
        # Single-file fallback used whenever analysis cannot be performed.
        defaultAnalysis = {"is_multi_file": False, "strategy": "single", "criteria": None}
        if not ai_service:
            return defaultAnalysis

        try:
            analysis_prompt = f"""
Analyze this user request and determine if it requires multiple file output or single file output.

User request: "{self.services.ai.sanitizePromptContent(prompt, 'userinput')}"

Respond with JSON only in this exact format:
{{
    "is_multi_file": true/false,
    "strategy": "single|per_entity|by_section|by_criteria|custom",
    "criteria": "description of how to split content",
    "file_naming_pattern": "suggested pattern for filenames",
    "reasoning": "brief explanation of the analysis"
}}

Consider:
- Does the user want separate files for different entities (customers, products, etc.)?
- Does the user want to split content into multiple documents?
- What would be the most logical way to organize the content?
- What language is the request in? (analyze in the original language)

Return only the JSON response.
"""
            request_options = AiCallOptions()
            request_options.operationType = OperationType.GENERAL
            request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
            response = await ai_service.aiObjects.call(request)

            if response and response.content:
                # Extract the first JSON object from the response text.
                result = response.content.strip()
                json_match = re.search(r'\{.*\}', result, re.DOTALL)
                if json_match:
                    result = json_match.group(0)
                analysis = json.loads(result)
                return analysis
            else:
                return defaultAnalysis
        except Exception as e:
            logger.warning(f"AI prompt analysis failed: {str(e)}, defaulting to single file")
            return defaultAnalysis

    async def _postRawDataChatMessage(self, payload: Any, label: str = "raw_extraction") -> None:
        """
        Create a ChatMessage with the extracted raw JSON attached as a file so
        the user has access to the data even if downstream processing fails.

        Args:
            payload: JSON-serializable raw extraction data.
            label: Prefix used for the attachment filename and message label.

        Any failure is swallowed silently: this is a best-effort side channel
        and must never abort the main generation flow.
        """
        try:
            services = self.services
            workflow = services.currentWorkflow

            # Serialize payload to pretty-printed UTF-8 JSON with a UTC
            # timestamp in the filename.
            ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
            content_text = json.dumps(payload, ensure_ascii=False, indent=2)
            content_bytes = content_text.encode('utf-8')

            # Store as file via component storage
            file_name = f"{label}_{ts}.json"
            file_item = services.interfaceDbComponent.createFile(
                name=file_name,
                mimeType="application/json",
                content=content_bytes
            )
            services.interfaceDbComponent.createFileData(file_item.id, content_bytes)

            # Lookup file info for ChatDocument; fall back to local values if
            # the lookup returns nothing.
            file_info = services.workflow.getFileInfo(file_item.id)
            doc = ChatDocument(
                messageId="",  # set after message creation
                fileId=file_item.id,
                fileName=file_info.get("fileName", file_name) if file_info else file_name,
                fileSize=file_info.get("size", len(content_bytes)) if file_info else len(content_bytes),
                mimeType=file_info.get("mimeType", "application/json") if file_info else "application/json"
            )

            # Create message referencing the file - include document in initial call
            messageData = {
                "workflowId": workflow.id,
                "role": "assistant",
                "message": "Raw extraction data saved",
                "status": "data",
                "sequenceNr": len(getattr(workflow, 'messages', []) or []) + 1,
                "publishedAt": services.utils.getUtcTimestamp(),
                "documentsLabel": label,
                "documents": []
            }

            # Store message with document included from the start.
            # NOTE(review): this passes services.workflow.workflow rather than
            # the local `workflow` (services.currentWorkflow) — confirm both
            # refer to the same workflow object.
            services.workflow.storeMessageWithDocuments(services.workflow.workflow, messageData, [doc])
        except Exception:
            # Non-fatal; ignore if storage or chat creation fails
            return