import re
import json
import logging
import time
from datetime import datetime, UTC
from typing import Dict, Any, List, Optional, Tuple, Union

from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum

logger = logging.getLogger(__name__)


class SubDocumentGeneration:
    """Document generation operations including single-file and multi-file generation.

    Orchestrates a unified pipeline: prompt-intent analysis -> adaptive extraction
    prompt -> chunked AI processing -> structure validation -> per-document
    enhancement and rendering. Single-file AI responses are normalized into the
    multi-file ("documents" array) structure so all downstream code handles one
    shape only.
    """

    def __init__(self, services, aiObjects, documentProcessor):
        """Initialize document generation service.

        Args:
            services: Service center instance for accessing other services
                (workflow, utils, ai, interfaceDbComponent are used here).
            aiObjects: Initialized AiObjects instance (provides async ``call``).
            documentProcessor: Document processing service instance (provides
                ``processDocumentsWithContinuation`` / ``processDocumentsPerChunkJson``).
        """
        self.services = services
        self.aiObjects = aiObjects
        self.documentProcessor = documentProcessor

    async def callAiWithDocumentGeneration(
        self,
        prompt: str,
        documents: Optional[List[ChatDocument]],
        options: AiCallOptions,
        outputFormat: str,
        title: Optional[str]
    ) -> Dict[str, Any]:
        """
        Unified document generation method that handles both single and multi-file cases.
        Always uses multi-file approach internally.

        Args:
            prompt: The main prompt for the AI call
            documents: Optional list of documents to process
            options: AI call configuration options
            outputFormat: Target output format (html, pdf, docx, txt, md, json, csv, xlsx)
            title: Optional title for generated documents

        Returns:
            Dict with generated documents and metadata in unified structure
            (see _buildUnifiedResult / _buildErrorResult for the exact keys).
            Never raises: any failure is converted into an error result dict.
        """
        try:
            # 1. Analyze prompt intent
            # NOTE: self is passed as the ai_service so the analyzer can reach
            # self.aiObjects.call (see _analyzePromptIntent).
            promptAnalysis = await self._analyzePromptIntent(prompt, self)
            logger.info(f"Prompt analysis result: {promptAnalysis}")

            # 2. Get unified extraction prompt
            # Local import — presumably avoids a circular import at module load;
            # TODO confirm.
            from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
            generationService = GenerationService(self.services)
            extractionPrompt = await generationService.getAdaptiveExtractionPrompt(
                outputFormat=outputFormat,
                userPrompt=prompt,
                title=title,
                promptAnalysis=promptAnalysis,
                aiService=self
            )

            # 3. Process with unified pipeline (always multi-file approach)
            aiResponse = await self._processDocumentsUnified(
                documents, extractionPrompt, options
            )

            # 4. Return unified result structure
            return await self._buildUnifiedResult(aiResponse, outputFormat, title, promptAnalysis)
        except Exception as e:
            logger.error(f"Error in unified document generation: {str(e)}")
            return self._buildErrorResult(str(e), outputFormat, title)

    async def _processDocumentsUnified(
        self,
        documents: Optional[List[ChatDocument]],
        extractionPrompt: str,
        options: AiCallOptions
    ) -> Dict[str, Any]:
        """
        Unified document processing that handles both single and multi-file cases.
        Always processes as multi-file structure internally.

        Runs the AI extraction with progress tracking and debug-file dumps,
        validates (and possibly normalizes, in place) the response structure,
        and best-effort posts the raw extraction as a chat attachment.

        Raises:
            Exception: if the AI response fails unified structure validation,
                or if any underlying service call fails (progress is finished
                with failure status before re-raising).
        """
        # Init progress logger; operationId is unique per workflow + second.
        workflow = self.services.currentWorkflow
        operationId = f"docGenUnified_{workflow.id}_{int(time.time())}"
        try:
            # Start progress tracking
            self.services.workflow.progressLogStart(
                operationId, "Generate", "Unified Document Generation",
                f"Processing {len(documents) if documents else 0} documents"
            )

            # Update progress - generating extraction prompt
            self.services.workflow.progressLogUpdate(operationId, 0.1, "Generating prompt")

            # Write prompt to debug file
            self.services.utils.writeDebugFile(extractionPrompt, "extraction_prompt", documents)

            # Process with unified JSON pipeline using continuation logic
            aiResponse = await self.documentProcessor.processDocumentsWithContinuation(
                documents, extractionPrompt, options
            )

            # Update progress - AI processing completed
            self.services.workflow.progressLogUpdate(operationId, 0.6, "Processing done")

            # Write AI response to debug file (pretty JSON when it is a dict)
            response_json = json.dumps(aiResponse, indent=2, ensure_ascii=False) if isinstance(aiResponse, dict) else str(aiResponse)
            self.services.utils.writeDebugFile(response_json, "ai_response", documents)

            # Validate response structure; may convert a single-file "sections"
            # response into the multi-file "documents" form in place.
            if not self._validateUnifiedResponseStructure(aiResponse):
                raise Exception("AI response is not valid unified document structure")

            # Emit raw extracted data as a chat message attachment — deliberately
            # best-effort: a storage failure must not abort generation.
            try:
                await self._postRawDataChatMessage(aiResponse, label="raw_extraction_unified")
            except Exception:
                logger.warning("Failed to emit raw extraction chat message (unified)")

            # Complete progress tracking
            self.services.workflow.progressLogFinish(operationId, True)
            return aiResponse
        except Exception as e:
            logger.error(f"Error in unified document processing: {str(e)}")
            # Mark the operation failed before propagating to the caller.
            self.services.workflow.progressLogFinish(operationId, False)
            raise

    def _validateUnifiedResponseStructure(self, response: Dict[str, Any]) -> bool:
        """
        Unified validation that checks for document structure.
        Handles both multi-file (documents array) and single-file (sections array) structures.

        Side effect: a valid single-file response is converted IN PLACE into the
        multi-file form by adding a one-element ``response["documents"]`` list.

        Returns:
            True if the response contains at least one valid document (or was
            successfully converted from single-file form), False otherwise.
        """
        try:
            if not isinstance(response, dict):
                logger.warning(f"Response validation failed: Response is not a dict, got {type(response)}")
                return False

            # Check for documents array (multi-file structure)
            hasDocuments = "documents" in response
            isDocumentsList = isinstance(response.get("documents"), list)

            # Check for sections array (single-file structure)
            hasSections = "sections" in response
            isSectionsList = isinstance(response.get("sections"), list)

            if hasDocuments and isDocumentsList:
                # Multi-file structure
                documents = response.get("documents", [])
                if not documents:
                    logger.warning("Unified validation failed: documents array is empty")
                    return False

                # Validate each document individually — one bad document does
                # not fail the batch.
                validDocuments = 0
                for i, doc in enumerate(documents):
                    if self._validateDocumentStructure(doc, i):
                        validDocuments += 1
                    else:
                        logger.warning(f"Document {i} failed validation, but continuing with others")

                # Process succeeds if at least one document is valid
                if validDocuments == 0:
                    logger.error("Unified validation failed: no valid documents found")
                    return False

                logger.info(f"Unified validation passed: {validDocuments}/{len(documents)} documents valid")
                return True
            elif hasSections and isSectionsList:
                # Single-file structure - convert to multi-file format
                logger.info("Converting single-file structure to multi-file format")
                sections = response.get("sections", [])
                if not sections:
                    logger.warning("Unified validation failed: sections array is empty")
                    return False

                # Convert to documents array format (title falls back to a
                # metadata title when present)
                response["documents"] = [{
                    "id": "document_1",
                    "title": response.get("metadata", {}).get("title", "Generated Document"),
                    "filename": "document_1",
                    "sections": sections
                }]
                logger.info("Successfully converted single-file structure to multi-file format")
                return True
            else:
                # No valid structure found - fail with clear error details
                logger.error("Unified validation failed: No valid structure found")
                logger.error(f"Response type: {type(response)}")
                logger.error(f"Available keys: {list(response.keys()) if isinstance(response, dict) else 'Not a dict'}")
                logger.error(f"hasDocuments={hasDocuments}, isDocumentsList={isDocumentsList}")
                logger.error(f"hasSections={hasSections}, isSectionsList={isSectionsList}")
                logger.error(f"Full response: {response}")
                return False
        except Exception as e:
            logger.warning(f"Unified response validation failed with exception: {str(e)}")
            return False

    def _validateDocumentStructure(self, document: Dict[str, Any], documentIndex: int) -> bool:
        """
        Validate individual document structure.

        A valid document is a dict with a "title" key and a non-empty "sections"
        list. Returns True if document is valid, False otherwise.
        Does not fail the entire process if one document is invalid.
        """
        try:
            if not isinstance(document, dict):
                logger.error(f"Document {documentIndex} validation failed: not a dict, got {type(document)}")
                logger.error(f"Document {documentIndex} content: {document}")
                return False

            # Check for required fields
            hasTitle = "title" in document
            hasSections = "sections" in document
            isSectionsList = isinstance(document.get("sections"), list)

            logger.debug(f"Document {documentIndex} structure check:")
            logger.debug(f"  - hasTitle: {hasTitle}")
            logger.debug(f"  - hasSections: {hasSections}")
            logger.debug(f"  - isSectionsList: {isSectionsList}")
            logger.debug(f"  - available keys: {list(document.keys())}")

            if not (hasTitle and hasSections and isSectionsList):
                logger.error(f"Document {documentIndex} validation failed:")
                logger.error(f"  - title present: {hasTitle}")
                logger.error(f"  - sections present: {hasSections}")
                logger.error(f"  - sections is list: {isSectionsList}")
                logger.error(f"  - document content: {document}")
                return False

            sections = document.get("sections", [])
            if not sections:
                logger.error(f"Document {documentIndex} validation failed: sections array is empty")
                logger.error(f"  - document content: {document}")
                return False

            logger.info(f"Document {documentIndex} validation passed")
            return True
        except Exception as e:
            logger.error(f"Document {documentIndex} validation failed with exception: {str(e)}")
            logger.error(f"  - document content: {document}")
            return False

    async def _buildUnifiedResult(
        self,
        aiResponse: Dict[str, Any],
        outputFormat: str,
        title: Optional[str],
        promptAnalysis: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Build unified result structure that always returns array-based format.
        Content is always a multi-document structure.

        Each document in the AI response is enhanced/rendered independently;
        individual failures are skipped, but at least one document must succeed
        or an error result is returned instead.
        """
        try:
            # Process all documents uniformly
            generatedDocuments = []
            documents = aiResponse.get("documents", [])
            for i, docData in enumerate(documents):
                try:
                    processedDocument = await self._processDocument(
                        docData, outputFormat, title, i
                    )
                    generatedDocuments.append(processedDocument)
                except Exception as e:
                    logger.warning(f"Failed to process document {i}: {str(e)}, skipping")
                    continue

            if not generatedDocuments:
                raise Exception("No documents could be processed successfully")

            # Build unified result
            result = {
                "success": True,
                "content": aiResponse,  # Always multi-document structure
                "documents": generatedDocuments,  # Always array
                "is_multi_file": len(generatedDocuments) > 1,
                "format": outputFormat,
                "title": title,
                "split_strategy": promptAnalysis.get("strategy", "single"),
                "total_documents": len(generatedDocuments),
                "processed_documents": len(generatedDocuments)
            }
            return result
        except Exception as e:
            logger.error(f"Error building unified result: {str(e)}")
            return self._buildErrorResult(str(e), outputFormat, title)

    async def _processDocument(
        self,
        docData: Dict[str, Any],
        outputFormat: str,
        title: Optional[str],
        documentIndex: int
    ) -> Dict[str, Any]:
        """
        Process individual document with content enhancement and rendering.

        A second AI pass (best-effort) refines the extracted JSON; on any
        enhancement failure the original docData is rendered instead. The
        enhanced JSON is then rendered to the target format and returned as
        {documentName, documentData, mimeType, title, documentIndex}.

        Raises:
            Exception: if rendering itself fails (enhancement failures are
                swallowed and logged).
        """
        try:
            # Get generation service (local import, same reason as above)
            from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
            generationService = GenerationService(self.services)

            # Use AI generation to enhance the extracted JSON before rendering
            enhancedContent = docData  # Default to original
            if docData.get("sections"):
                try:
                    # Get generation prompt — NOTE: the overall title doubles as
                    # the userPrompt here; TODO confirm that is intentional.
                    generationPrompt = await generationService.getGenerationPrompt(
                        outputFormat=outputFormat,
                        userPrompt=title,
                        title=docData.get("title", title),
                        aiService=self
                    )

                    # Prepare the AI call
                    from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum
                    requestOptions = AiCallOptions()
                    requestOptions.operationType = OperationTypeEnum.DATA_GENERATE

                    # Create context with the extracted JSON content
                    context = f"Extracted JSON content:\n{json.dumps(docData, indent=2)}"
                    request = AiCallRequest(
                        prompt=generationPrompt,
                        context=context,
                        options=requestOptions
                    )

                    # Write document generation prompt to debug file
                    self.services.utils.writeDebugFile(generationPrompt, "document_generation_enhancement_prompt")

                    # Call AI to enhance the content
                    response = await self.aiObjects.call(request)

                    # Write document generation response to debug file
                    self.services.utils.writeDebugFile(response.content or '', "document_generation_enhancement_response")

                    if response and response.content:
                        # Parse the AI response as JSON
                        try:
                            result = response.content.strip()

                            # Extract JSON from markdown if present: prefer a
                            # complete ```json fenced block, else strip stray
                            # opening/closing fences.
                            jsonMatch = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL)
                            if jsonMatch:
                                result = jsonMatch.group(1).strip()
                            elif result.startswith('```json'):
                                result = re.sub(r'^```json\s*', '', result)
                                result = re.sub(r'\s*```$', '', result)
                            elif result.startswith('```'):
                                result = re.sub(r'^```\s*', '', result)
                                result = re.sub(r'\s*```$', '', result)

                            # Try to parse JSON
                            enhancedContent = json.loads(result)
                            logger.info(f"AI enhanced JSON content successfully for document {documentIndex}")
                        except json.JSONDecodeError as e:
                            logger.warning(f"AI generation returned invalid JSON for document {documentIndex}: {str(e)}, using original content")
                            enhancedContent = docData
                    else:
                        logger.warning(f"AI generation returned empty response for document {documentIndex}, using original content")
                        enhancedContent = docData
                except Exception as e:
                    logger.warning(f"AI generation failed for document {documentIndex}: {str(e)}, using original content")
                    enhancedContent = docData

            # Render the enhanced JSON content
            renderedContent, mimeType = await generationService.renderReport(
                extractedContent=enhancedContent,
                outputFormat=outputFormat,
                title=docData.get("title", title),
                userPrompt=title,
                aiService=self
            )

            # Generate proper filename (strip any existing extension first)
            baseFilename = docData.get("filename", f"document_{documentIndex + 1}")
            if '.' in baseFilename:
                baseFilename = baseFilename.rsplit('.', 1)[0]

            # Add proper extension based on output format
            if outputFormat.lower() == "docx":
                filename = f"{baseFilename}.docx"
            elif outputFormat.lower() == "pdf":
                filename = f"{baseFilename}.pdf"
            elif outputFormat.lower() == "html":
                filename = f"{baseFilename}.html"
            else:
                filename = f"{baseFilename}.{outputFormat}"

            return {
                "documentName": filename,
                "documentData": renderedContent,
                "mimeType": mimeType,
                "title": docData.get("title", title),
                "documentIndex": documentIndex
            }
        except Exception as e:
            logger.error(f"Error processing document {documentIndex}: {str(e)}")
            raise

    def _buildErrorResult(self, errorMessage: str, outputFormat: str, title: Optional[str]) -> Dict[str, Any]:
        """
        Build error result with unified structure.

        Mirrors the success-result keys so callers can consume either shape.
        """
        return {
            "success": False,
            "error": errorMessage,
            "content": {},
            "documents": [],
            "is_multi_file": False,
            "format": outputFormat,
            "title": title,
            "split_strategy": "error",
            "total_documents": 0,
            "processed_documents": 0
        }

    async def _callAiJson(
        self,
        prompt: str,
        documents: Optional[List[ChatDocument]],
        options: AiCallOptions
    ) -> Dict[str, Any]:
        """
        Handle AI calls with document processing for JSON output.
        Returns structured JSON document instead of text.
        """
        # Process documents with JSON merging
        return await self.documentProcessor.processDocumentsPerChunkJson(documents, prompt, options)

    async def _analyzePromptIntent(self, prompt: str, ai_service=None) -> Dict[str, Any]:
        """Use AI to analyze user prompt and determine processing requirements.

        Returns the parsed analysis dict; on any failure (no ai_service, empty
        response, unparseable JSON) falls back to a single-file default.
        """
        if not ai_service:
            return {"is_multi_file": False, "strategy": "single", "criteria": None}
        try:
            analysis_prompt = f"""
Analyze this user request and determine if it requires multiple file output or single file output.

User request: "{self.services.ai.sanitizePromptContent(prompt, 'userinput')}"

Respond with JSON only in this exact format:
{{
    "is_multi_file": true/false,
    "strategy": "single|per_entity|by_section|by_criteria|custom",
    "criteria": "description of how to split content",
    "file_naming_pattern": "suggested pattern for filenames",
    "reasoning": "brief explanation of the analysis"
}}

Consider:
- Does the user want separate files for different entities (customers, products, etc.)?
- Does the user want to split content into multiple documents?
- What would be the most logical way to organize the content?
- What language is the request in? (analyze in the original language)

Return only the JSON response.
"""
            from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum
            request_options = AiCallOptions()
            request_options.operationType = OperationTypeEnum.DATA_GENERATE
            request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
            response = await ai_service.aiObjects.call(request)
            if response and response.content:
                # Extract JSON from response (first {...} span if present);
                # json.loads errors are caught by the outer except and fall
                # back to the single-file default.
                result = response.content.strip()
                json_match = re.search(r'\{.*\}', result, re.DOTALL)
                if json_match:
                    result = json_match.group(0)
                analysis = json.loads(result)
                return analysis
            else:
                return {"is_multi_file": False, "strategy": "single", "criteria": None}
        except Exception as e:
            logger.warning(f"AI prompt analysis failed: {str(e)}, defaulting to single file")
            return {"is_multi_file": False, "strategy": "single", "criteria": None}

    async def _postRawDataChatMessage(self, payload: Any, label: str = "raw_extraction") -> None:
        """
        Create a ChatMessage with the extracted raw JSON attached as a file so
        the user has access to the data even if downstream processing fails.

        Entirely best-effort: any storage or chat-creation failure is swallowed
        (callers treat this as a non-fatal convenience feature).
        """
        try:
            services = self.services
            workflow = services.currentWorkflow

            # Serialize payload; timestamp makes the attachment name unique.
            ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
            content_text = json.dumps(payload, ensure_ascii=False, indent=2)
            content_bytes = content_text.encode('utf-8')

            # Store as file via component storage
            file_name = f"{label}_{ts}.json"
            file_item = services.interfaceDbComponent.createFile(
                name=file_name,
                mimeType="application/json",
                content=content_bytes
            )
            services.interfaceDbComponent.createFileData(file_item.id, content_bytes)

            # Lookup file info for ChatDocument; fall back to locally known
            # values when the lookup returns nothing.
            file_info = services.workflow.getFileInfo(file_item.id)
            doc = ChatDocument(
                messageId="",  # set after message creation
                fileId=file_item.id,
                fileName=file_info.get("fileName", file_name) if file_info else file_name,
                fileSize=file_info.get("size", len(content_bytes)) if file_info else len(content_bytes),
                mimeType=file_info.get("mimeType", "application/json") if file_info else "application/json"
            )

            # Create message referencing the file - include document in initial call
            messageData = {
                "workflowId": workflow.id,
                "role": "assistant",
                "message": "Raw extraction data saved",
                "status": "data",
                "sequenceNr": len(getattr(workflow, 'messages', []) or []) + 1,
                "publishedAt": services.utils.timestampGetUtc(),
                "documentsLabel": label,
                "documents": []
            }

            # Store message with document included from the start
            services.workflow.storeMessageWithDocuments(services.workflow.workflow, messageData, [doc])
        except Exception:
            # Non-fatal; ignore if storage or chat creation fails
            return