gateway/modules/services/serviceAi/subDocumentGeneration.py

554 lines
No EOL
24 KiB
Python

import logging
from typing import Dict, Any, List, Optional, Tuple, Union
from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
logger = logging.getLogger(__name__)
class SubDocumentGeneration:
    """Document generation operations including single-file and multi-file generation."""

    def __init__(self, services, aiObjects, documentProcessor):
        """Create the document generation service.

        Args:
            services: Service center instance for accessing other services
            aiObjects: Initialized AiObjects instance
            documentProcessor: Document processing service instance
        """
        # Keep plain references; all collaborators are owned elsewhere.
        self.documentProcessor = documentProcessor
        self.aiObjects = aiObjects
        self.services = services
async def callAiWithDocumentGeneration(
    self,
    prompt: str,
    documents: Optional[List[ChatDocument]],
    options: AiCallOptions,
    outputFormat: str,
    title: Optional[str]
) -> Dict[str, Any]:
    """
    Unified document generation entry point for both single and multi-file output.

    Internally everything is treated as the multi-file case; single-file
    requests simply yield a one-element documents array.

    Args:
        prompt: The main prompt for the AI call
        documents: Optional list of documents to process
        options: AI call configuration options
        outputFormat: Target output format (html, pdf, docx, txt, md, json, csv, xlsx)
        title: Optional title for generated documents

    Returns:
        Dict with generated documents and metadata in unified structure;
        on any failure the standard error-result structure is returned
        instead of raising.
    """
    try:
        # Step 1: classify the request (single vs. multiple output files).
        promptAnalysis = await self._analyzePromptIntent(prompt, self)
        logger.info(f"Prompt analysis result: {promptAnalysis}")

        # Step 2: build the adaptive extraction prompt for the target format.
        from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
        generationService = GenerationService(self.services)
        extractionPrompt = await generationService.getAdaptiveExtractionPrompt(
            outputFormat=outputFormat,
            userPrompt=prompt,
            title=title,
            promptAnalysis=promptAnalysis,
            aiService=self
        )

        # Step 3: run the documents through the unified (multi-file) pipeline.
        aiResponse = await self._processDocumentsUnified(
            documents, extractionPrompt, options
        )

        # Step 4: normalize into the unified result structure.
        return await self._buildUnifiedResult(aiResponse, outputFormat, title, promptAnalysis)
    except Exception as e:
        logger.error(f"Error in unified document generation: {str(e)}")
        return self._buildErrorResult(str(e), outputFormat, title)
async def _processDocumentsUnified(
    self,
    documents: Optional[List[ChatDocument]],
    extractionPrompt: str,
    options: AiCallOptions
) -> Dict[str, Any]:
    """
    Run the unified multi-file extraction pipeline over the given documents.

    Tracks progress on the current workflow, validates the AI response
    structure, and attaches the raw extraction to the chat (best effort).
    Raises on an invalid response so the caller can build an error result.
    """
    import time

    # Progress is reported against the workflow that is currently active.
    workflow = self.services.currentWorkflow
    progressLogger = self.services.workflow.createProgressLogger(workflow)
    operationId = f"docGenUnified_{workflow.id}_{int(time.time())}"
    try:
        progressLogger.startOperation(
            operationId,
            "Generate",
            "Unified Document Generation",
            f"Processing {len(documents) if documents else 0} documents"
        )
        progressLogger.updateProgress(operationId, 0.1, "Generating prompt")

        # Continuation-aware processing handles responses that span
        # several AI calls.
        aiResponse = await self.documentProcessor.processDocumentsWithContinuation(
            documents, extractionPrompt, options
        )
        progressLogger.updateProgress(operationId, 0.6, "Processing done")

        # Trace the raw response before validating it (debug aid).
        logger.info(f"AI response received for validation:")
        logger.info(f" - Type: {type(aiResponse)}")
        logger.info(f" - Keys: {list(aiResponse.keys()) if isinstance(aiResponse, dict) else 'Not a dict'}")
        logger.info(f" - Content: {aiResponse}")

        if not self._validateUnifiedResponseStructure(aiResponse):
            raise Exception("AI response is not valid unified document structure")

        # Best effort: give the user the raw data even if rendering later fails.
        try:
            await self._postRawDataChatMessage(aiResponse, label="raw_extraction_unified")
        except Exception:
            logger.warning("Failed to emit raw extraction chat message (unified)")

        progressLogger.completeOperation(operationId, True)
        return aiResponse
    except Exception as e:
        logger.error(f"Error in unified document processing: {str(e)}")
        progressLogger.completeOperation(operationId, False)
        raise
def _validateUnifiedResponseStructure(self, response: Dict[str, Any]) -> bool:
    """
    Validate (and normalize) the unified document response.

    Accepts either a multi-file payload ({"documents": [...]}) or a
    single-file payload ({"sections": [...]}); the latter is converted
    in place into the multi-file shape. Returns True when at least one
    usable document is present; never raises.
    """
    try:
        if not isinstance(response, dict):
            logger.warning(f"Response validation failed: Response is not a dict, got {type(response)}")
            return False

        # Probe both accepted shapes up front.
        hasDocuments = "documents" in response
        isDocumentsList = isinstance(response.get("documents"), list)
        hasSections = "sections" in response
        isSectionsList = isinstance(response.get("sections"), list)

        if hasDocuments and isDocumentsList:
            # Multi-file shape: at least one document must validate.
            documents = response.get("documents", [])
            if not documents:
                logger.warning("Unified validation failed: documents array is empty")
                return False
            validDocuments = 0
            for idx, doc in enumerate(documents):
                if not self._validateDocumentStructure(doc, idx):
                    logger.warning(f"Document {idx} failed validation, but continuing with others")
                    continue
                validDocuments += 1
            if validDocuments == 0:
                logger.error("Unified validation failed: no valid documents found")
                return False
            logger.info(f"Unified validation passed: {validDocuments}/{len(documents)} documents valid")
            return True

        if hasSections and isSectionsList:
            # Single-file shape: wrap the sections into a one-element
            # documents array so downstream code sees one structure only.
            logger.info("Converting single-file structure to multi-file format")
            sections = response.get("sections", [])
            if not sections:
                logger.warning("Unified validation failed: sections array is empty")
                return False
            response["documents"] = [{
                "id": "document_1",
                "title": response.get("metadata", {}).get("title", "Generated Document"),
                "filename": "document_1",
                "sections": sections
            }]
            logger.info("Successfully converted single-file structure to multi-file format")
            return True

        # Neither shape matched: dump diagnostics and reject.
        logger.error("Unified validation failed: No valid structure found")
        logger.error(f"Response type: {type(response)}")
        logger.error(f"Available keys: {list(response.keys()) if isinstance(response, dict) else 'Not a dict'}")
        logger.error(f"hasDocuments={hasDocuments}, isDocumentsList={isDocumentsList}")
        logger.error(f"hasSections={hasSections}, isSectionsList={isSectionsList}")
        logger.error(f"Full response: {response}")
        return False
    except Exception as e:
        logger.warning(f"Unified response validation failed with exception: {str(e)}")
        return False
def _validateDocumentStructure(self, document: Dict[str, Any], documentIndex: int) -> bool:
    """
    Check that a single extracted document is a dict with a title and a
    non-empty sections list.

    Never raises; an invalid document only returns False so the caller
    can keep processing the remaining documents.
    """
    try:
        if not isinstance(document, dict):
            logger.error(f"Document {documentIndex} validation failed: not a dict, got {type(document)}")
            logger.error(f"Document {documentIndex} content: {document}")
            return False

        # Field probes, logged at debug level for troubleshooting.
        hasTitle = "title" in document
        hasSections = "sections" in document
        isSectionsList = isinstance(document.get("sections"), list)

        logger.debug(f"Document {documentIndex} structure check:")
        logger.debug(f" - hasTitle: {hasTitle}")
        logger.debug(f" - hasSections: {hasSections}")
        logger.debug(f" - isSectionsList: {isSectionsList}")
        logger.debug(f" - available keys: {list(document.keys())}")

        if not (hasTitle and hasSections and isSectionsList):
            logger.error(f"Document {documentIndex} validation failed:")
            logger.error(f" - title present: {hasTitle}")
            logger.error(f" - sections present: {hasSections}")
            logger.error(f" - sections is list: {isSectionsList}")
            logger.error(f" - document content: {document}")
            return False

        if not document.get("sections", []):
            logger.error(f"Document {documentIndex} validation failed: sections array is empty")
            logger.error(f" - document content: {document}")
            return False

        logger.info(f"Document {documentIndex} validation passed")
        return True
    except Exception as e:
        logger.error(f"Document {documentIndex} validation failed with exception: {str(e)}")
        logger.error(f" - document content: {document}")
        return False
async def _buildUnifiedResult(
    self,
    aiResponse: Dict[str, Any],
    outputFormat: str,
    title: Optional[str],
    promptAnalysis: Dict[str, Any]
) -> Dict[str, Any]:
    """
    Build the unified, array-based result structure.

    Renders each entry of ``aiResponse["documents"]`` individually; a
    document that fails to render is logged and skipped rather than
    aborting the whole batch.

    Args:
        aiResponse: Validated unified response containing a "documents" list.
        outputFormat: Target output format (html, pdf, docx, ...).
        title: Optional fallback title (annotation fixed: callers pass
            Optional[str], not str).
        promptAnalysis: Prompt-intent analysis; its "strategy" key is
            surfaced as "split_strategy".

    Returns:
        Dict with success flag, rendered documents and batch metadata; if
        no document at all could be processed, the standard error result.
    """
    try:
        generatedDocuments = []
        documents = aiResponse.get("documents", [])
        for i, docData in enumerate(documents):
            try:
                processedDocument = await self._processDocument(
                    docData, outputFormat, title, i
                )
                generatedDocuments.append(processedDocument)
            except Exception as e:
                # One bad document must not sink the batch.
                logger.warning(f"Failed to process document {i}: {str(e)}, skipping")
                continue

        if not generatedDocuments:
            raise Exception("No documents could be processed successfully")

        return {
            "success": True,
            "content": aiResponse,  # Always the full multi-document structure
            "documents": generatedDocuments,  # Always an array
            "is_multi_file": len(generatedDocuments) > 1,
            "format": outputFormat,
            "title": title,
            "split_strategy": promptAnalysis.get("strategy", "single"),
            "total_documents": len(generatedDocuments),
            "processed_documents": len(generatedDocuments)
        }
    except Exception as e:
        logger.error(f"Error building unified result: {str(e)}")
        return self._buildErrorResult(str(e), outputFormat, title)
async def _processDocument(
    self,
    docData: Dict[str, Any],
    outputFormat: str,
    title: Optional[str],
    documentIndex: int
) -> Dict[str, Any]:
    """
    Enhance and render a single extracted document.

    The extracted JSON is optionally round-tripped through the AI for
    enhancement (falling back to the original on any failure), then
    rendered to the requested output format.

    Args:
        docData: Extracted document dict ("title", "sections", "filename").
        outputFormat: Target output format, e.g. "pdf", "docx", "html".
        title: Fallback title (annotation fixed: callers pass Optional[str]).
        documentIndex: Zero-based index, used for logging and filenames.

    Returns:
        Dict with documentName, documentData, mimeType, title, documentIndex.

    Raises:
        Exception: Re-raised on rendering failure so the caller can skip
        this document.
    """
    try:
        from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
        import json
        import re

        generationService = GenerationService(self.services)

        def stripCodeFences(text: str) -> str:
            # Pull JSON out of a ```json ... ``` (or bare ```) fence if present;
            # factored out of the previously duplicated regex branches.
            fenced = re.search(r'```json\s*\n(.*?)\n```', text, re.DOTALL)
            if fenced:
                return fenced.group(1).strip()
            if text.startswith('```json'):
                text = re.sub(r'^```json\s*', '', text)
                return re.sub(r'\s*```$', '', text)
            if text.startswith('```'):
                text = re.sub(r'^```\s*', '', text)
                return re.sub(r'\s*```$', '', text)
            return text

        # Default to the extracted JSON; replace only when AI enhancement
        # round-trips to valid JSON.
        enhancedContent = docData
        if docData.get("sections"):
            try:
                generationPrompt = await generationService.getGenerationPrompt(
                    outputFormat=outputFormat,
                    # NOTE(review): title doubles as the user prompt here and
                    # in renderReport below - confirm this is intended.
                    userPrompt=title,
                    title=docData.get("title", title),
                    aiService=self
                )
                # AiCallOptions/OperationType/AiCallRequest come from the
                # module-level import; the previous local re-import was redundant.
                requestOptions = AiCallOptions()
                requestOptions.operationType = OperationType.GENERAL
                context = f"Extracted JSON content:\n{json.dumps(docData, indent=2)}"
                request = AiCallRequest(
                    prompt=generationPrompt,
                    context=context,
                    options=requestOptions
                )
                response = await self.aiObjects.call(request)
                if response and response.content:
                    try:
                        enhancedContent = json.loads(stripCodeFences(response.content.strip()))
                        logger.info(f"AI enhanced JSON content successfully for document {documentIndex}")
                    except json.JSONDecodeError as e:
                        logger.warning(f"AI generation returned invalid JSON for document {documentIndex}: {str(e)}, using original content")
                        enhancedContent = docData
                else:
                    logger.warning(f"AI generation returned empty response for document {documentIndex}, using original content")
                    enhancedContent = docData
            except Exception as e:
                logger.warning(f"AI generation failed for document {documentIndex}: {str(e)}, using original content")
                enhancedContent = docData

        # Render the (possibly enhanced) JSON content to the output format.
        renderedContent, mimeType = await generationService.renderReport(
            extractedContent=enhancedContent,
            outputFormat=outputFormat,
            title=docData.get("title", title),
            userPrompt=title,
            aiService=self
        )

        # Build the output filename: strip any existing extension, then append
        # one derived from the output format. The well-known formats are
        # normalized to lowercase (same behavior as the former if/elif chain);
        # unknown formats keep the caller's casing.
        baseFilename = docData.get("filename", f"document_{documentIndex + 1}")
        if '.' in baseFilename:
            baseFilename = baseFilename.rsplit('.', 1)[0]
        formatKey = outputFormat.lower()
        extension = formatKey if formatKey in ("docx", "pdf", "html") else outputFormat
        filename = f"{baseFilename}.{extension}"

        return {
            "documentName": filename,
            "documentData": renderedContent,
            "mimeType": mimeType,
            "title": docData.get("title", title),
            "documentIndex": documentIndex
        }
    except Exception as e:
        logger.error(f"Error processing document {documentIndex}: {str(e)}")
        raise
def _buildErrorResult(self, errorMessage: str, outputFormat: str, title: str) -> Dict[str, Any]:
"""
Build error result with unified structure.
"""
return {
"success": False,
"error": errorMessage,
"content": {},
"documents": [],
"is_multi_file": False,
"format": outputFormat,
"title": title,
"split_strategy": "error",
"total_documents": 0,
"processed_documents": 0
}
async def _callAiJson(
    self,
    prompt: str,
    documents: Optional[List[ChatDocument]],
    options: AiCallOptions
) -> Dict[str, Any]:
    """
    Thin wrapper over the per-chunk JSON document pipeline.

    Delegates entirely to the document processor and returns its merged
    structured JSON result (a dict, not plain text).
    """
    processor = self.documentProcessor
    return await processor.processDocumentsPerChunkJson(documents, prompt, options)
async def _analyzePromptIntent(self, prompt: str, ai_service=None) -> Dict[str, Any]:
    """Use AI to classify a user prompt as single- vs multi-file output.

    Args:
        prompt: The raw user request to analyze.
        ai_service: Object exposing ``aiObjects.call``; when None (or on
            any failure) the safe single-file default is returned.

    Returns:
        Dict with at least "is_multi_file", "strategy" and "criteria";
        when the AI answers with valid JSON it also carries
        "file_naming_pattern" and "reasoning".
    """
    # Single safe default, previously duplicated in three places.
    fallback = {"is_multi_file": False, "strategy": "single", "criteria": None}
    if not ai_service:
        return fallback
    try:
        analysis_prompt = f"""
Analyze this user request and determine if it requires multiple file output or single file output.
User request: "{self.services.ai.sanitizePromptContent(prompt, 'userinput')}"
Respond with JSON only in this exact format:
{{
"is_multi_file": true/false,
"strategy": "single|per_entity|by_section|by_criteria|custom",
"criteria": "description of how to split content",
"file_naming_pattern": "suggested pattern for filenames",
"reasoning": "brief explanation of the analysis"
}}
Consider:
- Does the user want separate files for different entities (customers, products, etc.)?
- Does the user want to split content into multiple documents?
- What would be the most logical way to organize the content?
- What language is the request in? (analyze in the original language)
Return only the JSON response.
"""
        # AiCallOptions/OperationType/AiCallRequest come from the module-level
        # import; the previous local re-import was redundant.
        requestOptions = AiCallOptions()
        requestOptions.operationType = OperationType.GENERAL
        request = AiCallRequest(prompt=analysis_prompt, context="", options=requestOptions)
        response = await ai_service.aiObjects.call(request)
        if not (response and response.content):
            return fallback

        import json
        import re
        result = response.content.strip()
        # The model sometimes wraps the JSON in prose; grab the outermost braces.
        jsonMatch = re.search(r'\{.*\}', result, re.DOTALL)
        if jsonMatch:
            result = jsonMatch.group(0)
        analysis = json.loads(result)
        if not isinstance(analysis, dict):
            # Robustness fix: a JSON scalar/array parses fine but would crash
            # callers that do analysis.get(...), so fall back instead.
            logger.warning("AI prompt analysis returned non-dict JSON, defaulting to single file")
            return fallback
        return analysis
    except Exception as e:
        logger.warning(f"AI prompt analysis failed: {str(e)}, defaulting to single file")
        return fallback
async def _postRawDataChatMessage(self, payload: Any, label: str = "raw_extraction") -> None:
    """
    Create a ChatMessage with the extracted raw JSON attached as a file so the user
    has access to the data even if downstream processing fails.

    Args:
        payload: Any JSON-serializable object to persist and attach.
        label: Prefix for the generated attachment filename.

    Returns:
        None. All failures are swallowed deliberately: this is a best-effort
        convenience and must never break the main generation flow.
    """
    try:
        services = self.services
        workflow = services.currentWorkflow
        # Serialize payload to pretty-printed UTF-8 JSON.
        import json as _json
        # NOTE(review): datetime.UTC requires Python 3.11+ - confirm runtime version.
        from datetime import datetime, UTC
        ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
        content_text = _json.dumps(payload, ensure_ascii=False, indent=2)
        content_bytes = content_text.encode('utf-8')
        # Store as file via component storage (file record + its data blob).
        file_name = f"{label}_{ts}.json"
        file_item = services.interfaceDbComponent.createFile(
            name=file_name,
            mimeType="application/json",
            content=content_bytes
        )
        services.interfaceDbComponent.createFileData(file_item.id, content_bytes)
        # Lookup file info for ChatDocument; fall back to the locally known
        # values whenever the lookup returns nothing.
        file_info = services.workflow.getFileInfo(file_item.id)
        doc = ChatDocument(
            messageId="",  # set after message creation
            fileId=file_item.id,
            fileName=file_info.get("fileName", file_name) if file_info else file_name,
            fileSize=file_info.get("size", len(content_bytes)) if file_info else len(content_bytes),
            mimeType=file_info.get("mimeType", "application/json") if file_info else "application/json"
        )
        # Create message referencing the file - include document in initial call
        messageData = {
            "workflowId": workflow.id,
            "role": "assistant",
            "message": "Raw extraction data saved",
            "status": "data",
            # Next sequence number; the workflow's messages list may be absent.
            "sequenceNr": len(getattr(workflow, 'messages', []) or []) + 1,
            "publishedAt": services.utils.getUtcTimestamp(),
            "documentsLabel": label,
            "documents": []
        }
        # Store message with document included from the start.
        # NOTE(review): this passes services.workflow.workflow while the rest of
        # this method uses services.currentWorkflow - confirm both refer to the
        # same workflow instance.
        services.workflow.storeMessageWithDocuments(services.workflow.workflow, messageData, [doc])
    except Exception:
        # Non-fatal; ignore if storage or chat creation fails
        return