552 lines
No EOL
24 KiB
Python
552 lines
No EOL
24 KiB
Python
import re
|
|
import json
|
|
import logging
|
|
import time
|
|
from datetime import datetime, UTC
|
|
from typing import Dict, Any, List, Optional, Tuple, Union
|
|
from modules.datamodels.datamodelChat import ChatDocument
|
|
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class SubDocumentGeneration:
    """Document generation operations including single-file and multi-file generation."""

    def __init__(self, services, aiObjects, documentProcessor):
        """Wire up the collaborators this service delegates to.

        Args:
            services: Service center instance for accessing other services
            aiObjects: Initialized AiObjects instance
            documentProcessor: Document processing service instance
        """
        # Keep plain attribute references; no copies, no validation --
        # the service center owns the lifecycle of these collaborators.
        self.documentProcessor = documentProcessor
        self.aiObjects = aiObjects
        self.services = services
|
|
|
|
async def callAiWithDocumentGeneration(
|
|
self,
|
|
prompt: str,
|
|
documents: Optional[List[ChatDocument]],
|
|
options: AiCallOptions,
|
|
outputFormat: str,
|
|
title: Optional[str]
|
|
) -> Dict[str, Any]:
|
|
"""
|
|
Unified document generation method that handles both single and multi-file cases.
|
|
Always uses multi-file approach internally.
|
|
|
|
Args:
|
|
prompt: The main prompt for the AI call
|
|
documents: Optional list of documents to process
|
|
options: AI call configuration options
|
|
outputFormat: Target output format (html, pdf, docx, txt, md, json, csv, xlsx)
|
|
title: Optional title for generated documents
|
|
|
|
Returns:
|
|
Dict with generated documents and metadata in unified structure
|
|
"""
|
|
try:
|
|
# 1. Analyze prompt intent
|
|
promptAnalysis = await self._analyzePromptIntent(prompt, self)
|
|
logger.info(f"Prompt analysis result: {promptAnalysis}")
|
|
|
|
# 2. Get unified extraction prompt
|
|
from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
|
|
generationService = GenerationService(self.services)
|
|
|
|
extractionPrompt = await generationService.getAdaptiveExtractionPrompt(
|
|
outputFormat=outputFormat,
|
|
userPrompt=prompt,
|
|
title=title,
|
|
promptAnalysis=promptAnalysis,
|
|
aiService=self
|
|
)
|
|
|
|
# 3. Process with unified pipeline (always multi-file approach)
|
|
aiResponse = await self._processDocumentsUnified(
|
|
documents, extractionPrompt, options
|
|
)
|
|
|
|
# 4. Return unified result structure
|
|
return await self._buildUnifiedResult(aiResponse, outputFormat, title, promptAnalysis)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error in unified document generation: {str(e)}")
|
|
return self._buildErrorResult(str(e), outputFormat, title)
|
|
|
|
    async def _processDocumentsUnified(
        self,
        documents: Optional[List[ChatDocument]],
        extractionPrompt: str,
        options: AiCallOptions
    ) -> Dict[str, Any]:
        """
        Unified document processing that handles both single and multi-file cases.
        Always processes as multi-file structure internally.

        Wraps the continuation-based processing pipeline with progress-log
        bookkeeping and debug-file dumps of both the prompt and the response.

        Args:
            documents: Source documents to process; may be None or empty.
            extractionPrompt: Fully-built extraction prompt for the AI call.
            options: AI call configuration options.

        Returns:
            The validated AI response dict (possibly normalized in place by
            _validateUnifiedResponseStructure).

        Raises:
            Exception: when the AI response fails structure validation, or on
            any underlying service/AI error; the progress entry is marked
            failed before re-raising.
        """

        # Init progress logger; operationId embeds the workflow id plus epoch
        # seconds so concurrent/repeated runs get distinct progress entries.
        workflow = self.services.currentWorkflow
        operationId = f"docGenUnified_{workflow.id}_{int(time.time())}"

        try:
            # Start progress tracking
            self.services.workflow.progressLogStart(
                operationId,
                "Generate",
                "Unified Document Generation",
                f"Processing {len(documents) if documents else 0} documents"
            )

            # Update progress - generating extraction prompt
            self.services.workflow.progressLogUpdate(operationId, 0.1, "Generating prompt")

            # Write prompt to debug file
            self.services.utils.writeDebugFile(extractionPrompt, "extraction_prompt", documents)

            # Process with unified JSON pipeline using continuation logic
            aiResponse = await self.documentProcessor.processDocumentsWithContinuation(
                documents, extractionPrompt, options
            )

            # Update progress - AI processing completed
            self.services.workflow.progressLogUpdate(operationId, 0.6, "Processing done")

            # Write AI response to debug file (pretty-printed JSON when it is a dict)
            response_json = json.dumps(aiResponse, indent=2, ensure_ascii=False) if isinstance(aiResponse, dict) else str(aiResponse)
            self.services.utils.writeDebugFile(response_json, "ai_response", documents)

            # Validate response structure; NOTE: this call may also convert a
            # single-file ("sections") payload into the multi-file shape in place.
            if not self._validateUnifiedResponseStructure(aiResponse):
                raise Exception("AI response is not valid unified document structure")

            # Emit raw extracted data as a chat message attachment (best effort;
            # a failure here must not abort the generation run)
            try:
                await self._postRawDataChatMessage(aiResponse, label="raw_extraction_unified")
            except Exception:
                logger.warning("Failed to emit raw extraction chat message (unified)")

            # Complete progress tracking
            self.services.workflow.progressLogFinish(operationId, True)

            return aiResponse

        except Exception as e:
            logger.error(f"Error in unified document processing: {str(e)}")
            # Mark the progress entry failed before propagating to the caller
            self.services.workflow.progressLogFinish(operationId, False)
            raise
|
|
|
|
def _validateUnifiedResponseStructure(self, response: Dict[str, Any]) -> bool:
|
|
"""
|
|
Unified validation that checks for document structure.
|
|
Handles both multi-file (documents array) and single-file (sections array) structures.
|
|
"""
|
|
try:
|
|
if not isinstance(response, dict):
|
|
logger.warning(f"Response validation failed: Response is not a dict, got {type(response)}")
|
|
return False
|
|
|
|
# Check for documents array (multi-file structure)
|
|
hasDocuments = "documents" in response
|
|
isDocumentsList = isinstance(response.get("documents"), list)
|
|
|
|
# Check for sections array (single-file structure)
|
|
hasSections = "sections" in response
|
|
isSectionsList = isinstance(response.get("sections"), list)
|
|
|
|
if hasDocuments and isDocumentsList:
|
|
# Multi-file structure
|
|
documents = response.get("documents", [])
|
|
if not documents:
|
|
logger.warning("Unified validation failed: documents array is empty")
|
|
return False
|
|
|
|
# Validate each document individually
|
|
validDocuments = 0
|
|
for i, doc in enumerate(documents):
|
|
if self._validateDocumentStructure(doc, i):
|
|
validDocuments += 1
|
|
else:
|
|
logger.warning(f"Document {i} failed validation, but continuing with others")
|
|
|
|
# Process succeeds if at least one document is valid
|
|
if validDocuments == 0:
|
|
logger.error("Unified validation failed: no valid documents found")
|
|
return False
|
|
|
|
logger.info(f"Unified validation passed: {validDocuments}/{len(documents)} documents valid")
|
|
return True
|
|
|
|
elif hasSections and isSectionsList:
|
|
# Single-file structure - convert to multi-file format
|
|
logger.info("Converting single-file structure to multi-file format")
|
|
sections = response.get("sections", [])
|
|
if not sections:
|
|
logger.warning("Unified validation failed: sections array is empty")
|
|
return False
|
|
|
|
# Convert to documents array format
|
|
response["documents"] = [{
|
|
"id": "document_1",
|
|
"title": response.get("metadata", {}).get("title", "Generated Document"),
|
|
"filename": "document_1",
|
|
"sections": sections
|
|
}]
|
|
|
|
logger.info("Successfully converted single-file structure to multi-file format")
|
|
return True
|
|
|
|
else:
|
|
# No valid structure found - fail with clear error details
|
|
logger.error("Unified validation failed: No valid structure found")
|
|
logger.error(f"Response type: {type(response)}")
|
|
logger.error(f"Available keys: {list(response.keys()) if isinstance(response, dict) else 'Not a dict'}")
|
|
logger.error(f"hasDocuments={hasDocuments}, isDocumentsList={isDocumentsList}")
|
|
logger.error(f"hasSections={hasSections}, isSectionsList={isSectionsList}")
|
|
logger.error(f"Full response: {response}")
|
|
return False
|
|
|
|
except Exception as e:
|
|
logger.warning(f"Unified response validation failed with exception: {str(e)}")
|
|
return False
|
|
|
|
def _validateDocumentStructure(self, document: Dict[str, Any], documentIndex: int) -> bool:
|
|
"""
|
|
Validate individual document structure.
|
|
Returns True if document is valid, False otherwise.
|
|
Does not fail the entire process if one document is invalid.
|
|
"""
|
|
try:
|
|
if not isinstance(document, dict):
|
|
logger.error(f"Document {documentIndex} validation failed: not a dict, got {type(document)}")
|
|
logger.error(f"Document {documentIndex} content: {document}")
|
|
return False
|
|
|
|
# Check for required fields
|
|
hasTitle = "title" in document
|
|
hasSections = "sections" in document
|
|
isSectionsList = isinstance(document.get("sections"), list)
|
|
|
|
logger.debug(f"Document {documentIndex} structure check:")
|
|
logger.debug(f" - hasTitle: {hasTitle}")
|
|
logger.debug(f" - hasSections: {hasSections}")
|
|
logger.debug(f" - isSectionsList: {isSectionsList}")
|
|
logger.debug(f" - available keys: {list(document.keys())}")
|
|
|
|
if not (hasTitle and hasSections and isSectionsList):
|
|
logger.error(f"Document {documentIndex} validation failed:")
|
|
logger.error(f" - title present: {hasTitle}")
|
|
logger.error(f" - sections present: {hasSections}")
|
|
logger.error(f" - sections is list: {isSectionsList}")
|
|
logger.error(f" - document content: {document}")
|
|
return False
|
|
|
|
sections = document.get("sections", [])
|
|
if not sections:
|
|
logger.error(f"Document {documentIndex} validation failed: sections array is empty")
|
|
logger.error(f" - document content: {document}")
|
|
return False
|
|
|
|
logger.info(f"Document {documentIndex} validation passed")
|
|
return True
|
|
|
|
except Exception as e:
|
|
logger.error(f"Document {documentIndex} validation failed with exception: {str(e)}")
|
|
logger.error(f" - document content: {document}")
|
|
return False
|
|
|
|
async def _buildUnifiedResult(
|
|
self,
|
|
aiResponse: Dict[str, Any],
|
|
outputFormat: str,
|
|
title: str,
|
|
promptAnalysis: Dict[str, Any]
|
|
) -> Dict[str, Any]:
|
|
"""
|
|
Build unified result structure that always returns array-based format.
|
|
Content is always a multi-document structure.
|
|
"""
|
|
try:
|
|
# Process all documents uniformly
|
|
generatedDocuments = []
|
|
documents = aiResponse.get("documents", [])
|
|
|
|
for i, docData in enumerate(documents):
|
|
try:
|
|
processedDocument = await self._processDocument(
|
|
docData, outputFormat, title, i
|
|
)
|
|
generatedDocuments.append(processedDocument)
|
|
except Exception as e:
|
|
logger.warning(f"Failed to process document {i}: {str(e)}, skipping")
|
|
continue
|
|
|
|
if not generatedDocuments:
|
|
raise Exception("No documents could be processed successfully")
|
|
|
|
# Build unified result
|
|
result = {
|
|
"success": True,
|
|
"content": aiResponse, # Always multi-document structure
|
|
"documents": generatedDocuments, # Always array
|
|
"is_multi_file": len(generatedDocuments) > 1,
|
|
"format": outputFormat,
|
|
"title": title,
|
|
"split_strategy": promptAnalysis.get("strategy", "single"),
|
|
"total_documents": len(generatedDocuments),
|
|
"processed_documents": len(generatedDocuments)
|
|
}
|
|
|
|
return result
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error building unified result: {str(e)}")
|
|
return self._buildErrorResult(str(e), outputFormat, title)
|
|
|
|
    async def _processDocument(
        self,
        docData: Dict[str, Any],
        outputFormat: str,
        title: str,
        documentIndex: int
    ) -> Dict[str, Any]:
        """
        Process individual document with content enhancement and rendering.

        Pipeline: (1) optionally ask the AI to enhance the extracted JSON,
        falling back to the original docData on any enhancement failure;
        (2) render the JSON into the target format; (3) derive a filename
        with the proper extension.

        Args:
            docData: Extracted document dict; reads optional "sections",
                "title" and "filename" keys.
            outputFormat: Target format identifier, e.g. "html", "pdf", "docx".
            title: Fallback title; also forwarded as the user prompt to the
                generation service.
            documentIndex: Zero-based position, used for logging and default
                filenames.

        Returns:
            Dict with documentName, documentData (rendered output), mimeType,
            title and documentIndex.

        Raises:
            Exception: re-raised when rendering (or any non-enhancement step)
            fails; enhancement failures are swallowed and logged instead.
        """
        try:
            # Get generation service (local import -- presumably avoids a
            # circular import at module load; verify)
            from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
            generationService = GenerationService(self.services)

            # Use AI generation to enhance the extracted JSON before rendering
            enhancedContent = docData  # Default to original
            if docData.get("sections"):
                try:
                    # Get generation prompt
                    generationPrompt = await generationService.getGenerationPrompt(
                        outputFormat=outputFormat,
                        userPrompt=title,
                        title=docData.get("title", title),
                        aiService=self
                    )

                    # Prepare the AI call
                    from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum
                    requestOptions = AiCallOptions()
                    requestOptions.operationType = OperationTypeEnum.GENERAL

                    # Create context with the extracted JSON content
                    context = f"Extracted JSON content:\n{json.dumps(docData, indent=2)}"

                    request = AiCallRequest(
                        prompt=generationPrompt,
                        context=context,
                        options=requestOptions
                    )

                    # Call AI to enhance the content
                    response = await self.aiObjects.call(request)

                    if response and response.content:
                        # Parse the AI response as JSON
                        try:
                            result = response.content.strip()

                            # Extract JSON from markdown if present: first a
                            # complete ```json fenced block, then an
                            # unterminated ```json prefix, then bare ``` fences
                            jsonMatch = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL)
                            if jsonMatch:
                                result = jsonMatch.group(1).strip()
                            elif result.startswith('```json'):
                                result = re.sub(r'^```json\s*', '', result)
                                result = re.sub(r'\s*```$', '', result)
                            elif result.startswith('```'):
                                result = re.sub(r'^```\s*', '', result)
                                result = re.sub(r'\s*```$', '', result)

                            # Try to parse JSON
                            enhancedContent = json.loads(result)
                            logger.info(f"AI enhanced JSON content successfully for document {documentIndex}")

                        except json.JSONDecodeError as e:
                            logger.warning(f"AI generation returned invalid JSON for document {documentIndex}: {str(e)}, using original content")
                            enhancedContent = docData
                    else:
                        logger.warning(f"AI generation returned empty response for document {documentIndex}, using original content")
                        enhancedContent = docData

                except Exception as e:
                    # Enhancement is best-effort: fall back to the raw extraction
                    logger.warning(f"AI generation failed for document {documentIndex}: {str(e)}, using original content")
                    enhancedContent = docData

            # Render the enhanced JSON content
            renderedContent, mimeType = await generationService.renderReport(
                extractedContent=enhancedContent,
                outputFormat=outputFormat,
                title=docData.get("title", title),
                userPrompt=title,
                aiService=self
            )

            # Generate proper filename: strip any existing extension first
            baseFilename = docData.get("filename", f"document_{documentIndex + 1}")
            if '.' in baseFilename:
                baseFilename = baseFilename.rsplit('.', 1)[0]

            # Add proper extension based on output format.
            # NOTE(review): the else branch keeps outputFormat's original
            # casing, so e.g. "TXT" yields ".TXT" while "DOCX" yields ".docx"
            # -- confirm this asymmetry is intended.
            if outputFormat.lower() == "docx":
                filename = f"{baseFilename}.docx"
            elif outputFormat.lower() == "pdf":
                filename = f"{baseFilename}.pdf"
            elif outputFormat.lower() == "html":
                filename = f"{baseFilename}.html"
            else:
                filename = f"{baseFilename}.{outputFormat}"

            return {
                "documentName": filename,
                "documentData": renderedContent,
                "mimeType": mimeType,
                "title": docData.get("title", title),
                "documentIndex": documentIndex
            }

        except Exception as e:
            logger.error(f"Error processing document {documentIndex}: {str(e)}")
            raise
|
|
|
|
def _buildErrorResult(self, errorMessage: str, outputFormat: str, title: str) -> Dict[str, Any]:
|
|
"""
|
|
Build error result with unified structure.
|
|
"""
|
|
return {
|
|
"success": False,
|
|
"error": errorMessage,
|
|
"content": {},
|
|
"documents": [],
|
|
"is_multi_file": False,
|
|
"format": outputFormat,
|
|
"title": title,
|
|
"split_strategy": "error",
|
|
"total_documents": 0,
|
|
"processed_documents": 0
|
|
}
|
|
|
|
async def _callAiJson(
|
|
self,
|
|
prompt: str,
|
|
documents: Optional[List[ChatDocument]],
|
|
options: AiCallOptions
|
|
) -> Dict[str, Any]:
|
|
"""
|
|
Handle AI calls with document processing for JSON output.
|
|
Returns structured JSON document instead of text.
|
|
"""
|
|
# Process documents with JSON merging
|
|
return await self.documentProcessor.processDocumentsPerChunkJson(documents, prompt, options)
|
|
|
|
    async def _analyzePromptIntent(self, prompt: str, ai_service=None) -> Dict[str, Any]:
        """Use AI to analyze user prompt and determine processing requirements.

        Args:
            prompt: Raw user request; sanitized via services.ai before being
                embedded in the analysis prompt.
            ai_service: Object exposing ``aiObjects.call``; callers pass the
                owning service (``self``). When falsy, analysis is skipped.
                NOTE(review): sanitization still goes through ``self.services``
                rather than ``ai_service`` -- confirm that is intentional.

        Returns:
            Dict with at least is_multi_file, strategy and criteria. On
            success the model's JSON is returned as-is (which also carries
            file_naming_pattern and reasoning); any failure falls back to the
            single-file default.
        """
        if not ai_service:
            # No AI available: assume the simplest (single-file) case
            return {"is_multi_file": False, "strategy": "single", "criteria": None}

        try:
            analysis_prompt = f"""
Analyze this user request and determine if it requires multiple file output or single file output.

User request: "{self.services.ai.sanitizePromptContent(prompt, 'userinput')}"

Respond with JSON only in this exact format:
{{
"is_multi_file": true/false,
"strategy": "single|per_entity|by_section|by_criteria|custom",
"criteria": "description of how to split content",
"file_naming_pattern": "suggested pattern for filenames",
"reasoning": "brief explanation of the analysis"
}}

Consider:
- Does the user want separate files for different entities (customers, products, etc.)?
- Does the user want to split content into multiple documents?
- What would be the most logical way to organize the content?
- What language is the request in? (analyze in the original language)

Return only the JSON response.
"""

            # Local import -- presumably avoids a circular import; verify
            from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum
            request_options = AiCallOptions()
            request_options.operationType = OperationTypeEnum.GENERAL

            request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
            response = await ai_service.aiObjects.call(request)

            if response and response.content:
                # Extract JSON from response: grab the outermost {...} span so
                # surrounding prose or markdown fences are discarded
                result = response.content.strip()
                json_match = re.search(r'\{.*\}', result, re.DOTALL)
                if json_match:
                    result = json_match.group(0)

                # A parse failure here is caught by the outer except below
                analysis = json.loads(result)
                return analysis
            else:
                return {"is_multi_file": False, "strategy": "single", "criteria": None}

        except Exception as e:
            logger.warning(f"AI prompt analysis failed: {str(e)}, defaulting to single file")
            return {"is_multi_file": False, "strategy": "single", "criteria": None}
|
|
|
|
async def _postRawDataChatMessage(self, payload: Any, label: str = "raw_extraction") -> None:
|
|
"""
|
|
Create a ChatMessage with the extracted raw JSON attached as a file so the user
|
|
has access to the data even if downstream processing fails.
|
|
"""
|
|
try:
|
|
services = self.services
|
|
workflow = services.currentWorkflow
|
|
|
|
# Serialize payload
|
|
ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
|
|
content_text = json.dumps(payload, ensure_ascii=False, indent=2)
|
|
content_bytes = content_text.encode('utf-8')
|
|
|
|
# Store as file via component storage
|
|
file_name = f"{label}_{ts}.json"
|
|
file_item = services.interfaceDbComponent.createFile(
|
|
name=file_name,
|
|
mimeType="application/json",
|
|
content=content_bytes
|
|
)
|
|
services.interfaceDbComponent.createFileData(file_item.id, content_bytes)
|
|
|
|
# Lookup file info for ChatDocument
|
|
file_info = services.workflow.getFileInfo(file_item.id)
|
|
doc = ChatDocument(
|
|
messageId="", # set after message creation
|
|
fileId=file_item.id,
|
|
fileName=file_info.get("fileName", file_name) if file_info else file_name,
|
|
fileSize=file_info.get("size", len(content_bytes)) if file_info else len(content_bytes),
|
|
mimeType=file_info.get("mimeType", "application/json") if file_info else "application/json"
|
|
)
|
|
|
|
# Create message referencing the file - include document in initial call
|
|
messageData = {
|
|
"workflowId": workflow.id,
|
|
"role": "assistant",
|
|
"message": "Raw extraction data saved",
|
|
"status": "data",
|
|
"sequenceNr": len(getattr(workflow, 'messages', []) or []) + 1,
|
|
"publishedAt": services.utils.timestampGetUtc(),
|
|
"documentsLabel": label,
|
|
"documents": []
|
|
}
|
|
|
|
# Store message with document included from the start
|
|
services.workflow.storeMessageWithDocuments(services.workflow.workflow, messageData, [doc])
|
|
except Exception:
|
|
# Non-fatal; ignore if storage or chat creation fails
|
|
return |