# gateway/modules/services/serviceAi/subDocumentGeneration.py

import logging
from typing import Dict, Any, List, Optional, Tuple, Union
from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType

logger = logging.getLogger(__name__)


class SubDocumentGeneration:
    """Document generation operations including single-file and multi-file generation."""

    def __init__(self, services, aiObjects, documentProcessor):
        """Store the collaborators used by the generation methods.

        Args:
            services: Service center instance for accessing other services
            aiObjects: Initialized AiObjects instance
            documentProcessor: Document processing service instance
        """
        # Plain attribute storage; nothing is validated or called here.
        self.services, self.aiObjects, self.documentProcessor = (
            services,
            aiObjects,
            documentProcessor,
        )

    async def callAiWithDocumentGeneration(
        self,
        prompt: str,
        documents: Optional[List[ChatDocument]],
        options: AiCallOptions,
        outputFormat: str,
        title: Optional[str]
        ) -> Dict[str, Any]:
        """
        Generate one or several documents in the requested output format.

        The prompt is first classified through ``_analyzePromptIntent``. The
        ``is_multi_file`` entry of that analysis selects between the multi-file
        and the single-file generation routine.

        Args:
            prompt: The main prompt for the AI call
            documents: Optional list of documents to process
            options: AI call configuration options
            outputFormat: Target output format (html, pdf, docx, txt, md, json, csv, xlsx)
            title: Optional title for generated documents

        Returns:
            The dict produced by the selected generation routine. If anything
            raises on the way, the error is logged and a dict with ``success``
            set to False and the error text is returned instead.
        """
        try:
            # Let the AI decide whether the request asks for several files
            intent = await self._analyzePromptIntent(prompt, self)
            logger.info(f"Prompt analysis result: {intent}")

            wantsMultipleFiles = intent.get("is_multi_file", False)
            if not wantsMultipleFiles:
                return await self._callAiWithSingleFileGeneration(
                    prompt, documents, options, outputFormat, title
                )

            return await self._callAiWithMultiFileGeneration(
                prompt, documents, options, outputFormat, title, intent
            )

        except Exception as e:
            errorText = str(e)
            logger.error(f"Error in document generation: {errorText}")

            # Error payload mirrors the single-file result layout with empty content
            failure: Dict[str, Any] = {
                "success": False,
                "error": errorText,
                "content": "",
                "rendered_content": "",
                "mime_type": "text/plain",
                "filename": f"error_{outputFormat}",
                "format": outputFormat,
                "title": title or "Error",
                "documents": []
            }
            return failure

    async def _callAiWithSingleFileGeneration(
        self,
        prompt: str,
        documents: Optional[List[ChatDocument]],
        options: AiCallOptions,
        outputFormat: str,
        title: Optional[str],
        generationPrompt: Optional[str] = None
        ) -> Dict[str, Any]:
        """Generate one document: extract JSON, optionally enhance it, render it.

        Flow:
            1. Request a format-specific extraction prompt from GenerationService
               and run it over the documents via ``_callAiJson``.
            2. Require the extraction result to be a dict with a "sections" key.
            3. When ``prompt`` is non-empty, ask the AI to enhance the extracted
               JSON. Every failure in this step is logged and the extracted
               JSON is rendered instead.
            4. Render through ``GenerationService.renderReport`` and build the
               result dict.

        Args:
            prompt: User prompt; also gates the enhancement step.
            documents: Optional list of documents to process.
            options: AI call configuration options for the extraction call.
            outputFormat: Target output format, also used as file extension.
            title: Optional title; defaults to "AI Generated Document" and is
                replaced by the extracted metadata title when that is a
                non-empty string.
            generationPrompt: Optional ready-made prompt for the enhancement
                step. When omitted, it is requested from GenerationService.

        Returns:
            Dict with "success", "content" (the extracted JSON),
            "rendered_content", "mime_type", "filename", "format", "title",
            "documents" (one entry) and "is_multi_file" set to False.

        Raises:
            Exception: Any error outside the enhancement step is logged, the
                progress operation is completed as failed, and the error is
                re-raised.
        """
        import json
        import re
        import time

        # Create progress logger
        workflow = self.services.currentWorkflow
        progressLogger = self.services.workflow.createProgressLogger(workflow)
        operationId = f"docGenSingle_{workflow.id}_{int(time.time())}"

        try:
            # Start progress tracking
            progressLogger.startOperation(
                operationId,
                "Generate",
                "Single-file Generation",
                f"Processing {len(documents) if documents else 0} documents"
            )

            # Imported here (inside the try) so an import failure is reported
            # through the same failure path as any other generation error
            from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
            generation_service = GenerationService(self.services)

            # Use default title if not provided
            if not title:
                title = "AI Generated Document"

            progressLogger.updateProgress(operationId, 0.1, "Generating prompt")
            extractionPrompt = await generation_service.getExtractionPrompt(
                outputFormat=outputFormat,
                userPrompt=prompt,
                title=title,
                aiService=self
            )

            # Run the extraction in JSON mode so the result is structured
            progressLogger.updateProgress(operationId, 0.3, "AI processing")
            aiResponseJson = await self._callAiJson(extractionPrompt, documents, options)
            progressLogger.updateProgress(operationId, 0.6, "Processing done")

            # Validate JSON response
            if not isinstance(aiResponseJson, dict) or "sections" not in aiResponseJson:
                raise Exception("AI response is not valid JSON document structure")

            # Emit raw extracted data as a chat message attachment before rendering (best effort)
            try:
                await self._postRawDataChatMessage(aiResponseJson, label="raw_extraction_single")
            except Exception:
                logger.warning("Failed to emit raw extraction chat message (single-file)")

            # Derive title and filename from the extracted metadata. Only a
            # non-empty string is adopted, so a malformed title cannot break
            # the filename building further down.
            parsedFilename = None
            try:
                metadataTitle = aiResponseJson.get("metadata", {}).get("title")
                if metadataTitle and isinstance(metadataTitle, str):
                    title = metadataTitle
                    parsed = re.sub(r"[^a-zA-Z0-9._-]", "-", title)
                    parsed = re.sub(r"-+", "-", parsed).strip('-')
                    if parsed:
                        parsedFilename = f"{parsed}.{outputFormat}"
            except Exception:
                parsedFilename = None

            # Use AI generation to enhance the extracted JSON before rendering
            enhancedContent = aiResponseJson  # Default to original
            if prompt:
                try:
                    from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType

                    # Honor a caller-supplied generation prompt; fetch one otherwise
                    if not generationPrompt:
                        generationPrompt = await generation_service.getGenerationPrompt(
                            outputFormat=outputFormat,
                            userPrompt=prompt,
                            title=title,
                            aiService=self
                        )

                    request_options = AiCallOptions()
                    request_options.operationType = OperationType.GENERAL

                    # The extracted JSON is handed to the AI as context
                    context = f"Extracted JSON content:\n{json.dumps(aiResponseJson, indent=2)}"
                    request = AiCallRequest(
                        prompt=generationPrompt,
                        context=context,
                        options=request_options
                    )
                    response = await self.aiObjects.call(request)

                    # Save generation prompt and response to debug (best effort)
                    try:
                        from modules.shared.debugLogger import writeDebugFile
                        debugData = {
                            "output_format": outputFormat,
                            "title": title,
                            "context_length": len(context),
                            "extracted_content_keys": list(aiResponseJson.keys())
                        }
                        writeDebugFile(generationPrompt, "generation_single", debugData)
                        writeDebugFile(response.content or '', "generation_single_response")
                    except Exception:
                        logger.debug("Failed to write generation debug files (single-file)", exc_info=True)

                    if response and response.content:
                        parsedReply = self._parseEnhancedJsonReply(response.content)
                        if parsedReply is not None:
                            enhancedContent = parsedReply
                    else:
                        logger.warning("AI generation returned empty response, using original content")

                except Exception as e:
                    logger.warning(f"AI generation failed: {str(e)}, using original content")
                    enhancedContent = aiResponseJson

            # Render the enhanced JSON content
            renderedContent, mimeType = await generation_service.renderReport(
                extractedContent=enhancedContent,
                outputFormat=outputFormat,
                title=title,
                userPrompt=prompt,
                aiService=self
            )

            # Generate meaningful filename (metadata-derived if valid, else title + timestamp)
            from datetime import datetime, UTC
            timestamp = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
            if parsedFilename and parsedFilename.lower().endswith(f".{outputFormat.lower()}"):
                filename = parsedFilename
            else:
                safeTitle = ''.join(c if c.isalnum() else '-' for c in (title or 'document')).strip('-')
                filename = f"{safeTitle or 'document'}-{timestamp}.{outputFormat}"

            progressLogger.updateProgress(operationId, 0.9, "Rendering")

            generationResult = {
                "success": True,
                "content": aiResponseJson,  # Structured JSON document
                "rendered_content": renderedContent,  # Formatted content
                "mime_type": mimeType,
                "filename": filename,
                "format": outputFormat,
                "title": title,
                "documents": [{
                    "documentName": filename,
                    "documentData": renderedContent,
                    "mimeType": mimeType
                }],
                "is_multi_file": False
            }

            # Complete progress tracking
            progressLogger.completeOperation(operationId, True)

            return generationResult

        except Exception as e:
            logger.error(f"Error in single-file document generation: {str(e)}")
            # Complete progress tracking with failure
            progressLogger.completeOperation(operationId, False)
            raise

    def _parseEnhancedJsonReply(self, replyText: str) -> Optional[Dict[str, Any]]:
        """Parse the AI enhancement reply into a JSON object.

        Strips surrounding whitespace and a markdown code fence, parses the
        remainder as JSON and, on a decode error, retries once with the output
        of ``_attemptJsonFix``.

        Args:
            replyText: Raw text content of the AI response.

        Returns:
            The parsed dict, or None when the reply is empty, not valid JSON,
            or valid JSON that is not an object. A warning is logged for each
            None case so the caller can silently keep the original content.
        """
        import json
        import re

        result = replyText.strip()
        if not result:
            logger.warning("AI generation returned empty content after stripping, using original content")
            return None

        # Extract JSON from markdown if present
        json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL)
        if json_match:
            result = json_match.group(1).strip()
        elif result.startswith('```json'):
            result = re.sub(r'^```json\s*', '', result)
            result = re.sub(r'\s*```$', '', result)
        elif result.startswith('```'):
            result = re.sub(r'^```\s*', '', result)
            result = re.sub(r'\s*```$', '', result)

        if not result:
            logger.warning("AI generation returned empty content after markdown extraction, using original content")
            return None

        wasFixed = False
        try:
            parsed = json.loads(result)
        except json.JSONDecodeError as jsonError:
            # Try to fix common JSON issues before giving up
            fixed_result = self._attemptJsonFix(result)
            if fixed_result == result:
                logger.warning(f"AI generation returned invalid JSON: {str(jsonError)}, using original content")
                return None
            try:
                parsed = json.loads(fixed_result)
            except json.JSONDecodeError:
                logger.warning(f"AI generation returned invalid JSON even after fixing: {str(jsonError)}, using original content")
                return None
            wasFixed = True

        # Lists, strings, numbers or null are valid JSON but not a document
        if not isinstance(parsed, dict):
            logger.warning("AI generation returned JSON that is not an object, using original content")
            return None

        if wasFixed:
            logger.info("AI enhanced JSON content successfully after fixing")
        else:
            logger.info("AI enhanced JSON content successfully")
        return parsed

    async def _callAiWithMultiFileGeneration(
        self,
        prompt: str,
        documents: Optional[List[ChatDocument]],
        options: AiCallOptions,
        outputFormat: str,
        title: Optional[str],
        prompt_analysis: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Generate several documents from one AI extraction pass.

        Flow:
            1. Request an adaptive extraction prompt from GenerationService and
               run it through ``documentProcessor.processDocumentsPerChunkJsonWithPrompt``.
            2. Check the response with ``_validateResponseStructure``.
            3. For every entry of ``ai_response["documents"]``: build the
               renderer structure, optionally let the AI enhance it (failures
               fall back to the un-enhanced structure), render it and derive
               the file name.
            4. Optionally dump the rendered documents to a debug directory.

        Any failure, including an invalid response structure, completes the
        progress operation as failed and falls back to
        ``_callAiWithSingleFileGeneration`` (exactly one fallback attempt).

        Args:
            prompt: User prompt; also gates the enhancement step.
            documents: Optional list of documents to process.
            options: AI call configuration options for the extraction call.
            outputFormat: Target output format, also used as file extension.
            title: Optional overall title; defaults to "AI Generated Documents".
            prompt_analysis: Result of the prompt intent analysis; its
                "strategy" entry is copied into metadata and the result.

        Returns:
            Dict with "success", "content" (the AI response, left unmodified),
            "documents" (one entry per generated file), "is_multi_file" set to
            True and "split_strategy"; or the single-file result on fallback.
        """
        import json
        import re
        import time

        # Create progress logger
        workflow = self.services.currentWorkflow
        progressLogger = self.services.workflow.createProgressLogger(workflow)
        operationId = f"docGen_{workflow.id}_{int(time.time())}"

        try:
            # Start progress tracking
            progressLogger.startOperation(
                operationId,
                "Generate",
                "Multi-file Generation",
                f"Processing {len(documents) if documents else 0} documents"
            )

            # Imported inside the try so an import failure also triggers the fallback
            from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
            generation_service = GenerationService(self.services)

            # Use default title if not provided
            if not title:
                title = "AI Generated Documents"

            progressLogger.updateProgress(operationId, 0.1, "Generating prompt")

            # Get adaptive extraction prompt
            extraction_prompt = await generation_service.getAdaptiveExtractionPrompt(
                outputFormat=outputFormat,
                userPrompt=prompt,
                title=title,
                promptAnalysis=prompt_analysis,
                aiService=self
            )

            logger.info(f"Adaptive extraction prompt length: {len(extraction_prompt)} characters")
            logger.debug(f"Adaptive extraction prompt preview: {extraction_prompt[:500]}...")

            progressLogger.updateProgress(operationId, 0.2, "Processing docs")

            logger.info(f"Using adaptive prompt with existing pipeline: {len(extraction_prompt)} chars")
            logger.debug(f"Processing documents: {len(documents) if documents else 0} documents")

            # Use the existing pipeline but replace the prompt with our adaptive one
            ai_response = await self.documentProcessor.processDocumentsPerChunkJsonWithPrompt(documents, extraction_prompt, options)

            logger.info(f"AI response type: {type(ai_response)}")
            logger.info(f"AI response keys: {list(ai_response.keys()) if isinstance(ai_response, dict) else 'Not a dict'}")
            logger.debug(f"AI response preview: {str(ai_response)[:500]}...")

            # Validate response structure
            if not self._validateResponseStructure(ai_response, prompt_analysis):
                logger.warning(f"Multi-file processing failed - Invalid response structure. Expected multi-file but got: {list(ai_response.keys()) if isinstance(ai_response, dict) else type(ai_response)}")
                logger.warning(f"Prompt analysis: {prompt_analysis}")
                logger.warning("Falling back to single-file generation")
                # Route through the common failure handler below so the started
                # progress operation is completed and the fallback runs once
                raise ValueError("Invalid multi-file response structure")

            # Emit raw extracted data as a chat message attachment before transformation/rendering
            try:
                await self._postRawDataChatMessage(ai_response, label="raw_extraction_multi")
            except Exception:
                logger.warning("Failed to emit raw extraction chat message (multi-file)")

            # The three known formats get a lower-cased extension, anything
            # else keeps the format string as given
            normalizedFormat = outputFormat.lower()
            extension = normalizedFormat if normalizedFormat in ("docx", "pdf", "html") else outputFormat

            # Process multiple documents
            generated_documents = []
            for i, doc_data in enumerate(ai_response.get("documents", [])):
                # Tolerate a missing/empty title or filename instead of aborting all documents
                docTitle = doc_data.get("title") or f"{title} {i + 1}"
                rawFilename = str(doc_data.get("filename") or f"document_{i+1}")

                # Create complete document structure for rendering
                complete_document = {
                    "metadata": {
                        "title": docTitle,
                        "source_document": "multi_file_generation",
                        "document_id": doc_data.get("id", f"doc_{i+1}"),
                        "filename": rawFilename,
                        "split_strategy": prompt_analysis.get("strategy", "custom")
                    },
                    "sections": self._buildRendererSections(doc_data.get("sections") or []),
                    "summary": f"Generated document: {docTitle}",
                    "tags": ["multi_file", "ai_generated"]
                }

                # Use AI generation to enhance the extracted JSON before rendering
                enhancedContent = complete_document  # Default to original
                if prompt:
                    try:
                        from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType

                        # Get generation prompt
                        generationPrompt = await generation_service.getGenerationPrompt(
                            outputFormat=outputFormat,
                            userPrompt=prompt,
                            title=docTitle,
                            aiService=self
                        )

                        request_options = AiCallOptions()
                        request_options.operationType = OperationType.GENERAL

                        # The document structure is handed to the AI as context
                        context = f"Extracted JSON content:\n{json.dumps(complete_document, indent=2)}"
                        request = AiCallRequest(
                            prompt=generationPrompt,
                            context=context,
                            options=request_options
                        )

                        # Call AI to enhance the content
                        response = await self.aiObjects.call(request)

                        # Save generation prompt and response to debug (best effort)
                        try:
                            from modules.shared.debugLogger import writeDebugFile
                            debugData = {
                                "output_format": outputFormat,
                                "title": docTitle,
                                "document_index": i,
                                "context_length": len(context),
                                "extracted_content_keys": list(complete_document.keys())
                            }
                            writeDebugFile(generationPrompt, f"generation_multi_doc_{i}", debugData)
                            writeDebugFile(response.content or '', f"generation_multi_doc_{i}_response")
                        except Exception:
                            logger.debug(f"Failed to write generation debug files for document {i}", exc_info=True)

                        if response and response.content:
                            replyText = response.content.strip()

                            # Extract JSON from markdown if present
                            json_match = re.search(r'```json\s*\n(.*?)\n```', replyText, re.DOTALL)
                            if json_match:
                                replyText = json_match.group(1).strip()
                            elif replyText.startswith('```json'):
                                replyText = re.sub(r'^```json\s*', '', replyText)
                                replyText = re.sub(r'\s*```$', '', replyText)
                            elif replyText.startswith('```'):
                                replyText = re.sub(r'^```\s*', '', replyText)
                                replyText = re.sub(r'\s*```$', '', replyText)

                            try:
                                enhancedContent = json.loads(replyText)
                                logger.info(f"AI enhanced JSON content successfully")
                            except json.JSONDecodeError as e:
                                logger.warning(f"AI generation returned invalid JSON: {str(e)}, attempting to repair...")
                                # Try to repair common JSON issues
                                try:
                                    repaired_result = self._repairJson(replyText)
                                    enhancedContent = json.loads(repaired_result)
                                    logger.info(f"Successfully repaired JSON content")
                                except Exception as repair_error:
                                    logger.warning(f"JSON repair failed: {str(repair_error)}, trying AI repair...")
                                    # Try AI-powered JSON repair as last resort
                                    try:
                                        ai_repaired = await self._repairJsonWithAI(replyText)
                                        enhancedContent = json.loads(ai_repaired)
                                        logger.info(f"AI successfully repaired JSON content")
                                    except Exception as ai_repair_error:
                                        logger.warning(f"AI JSON repair also failed: {str(ai_repair_error)}, using original content")
                                        enhancedContent = complete_document
                        else:
                            logger.warning("AI generation returned empty response, using original content")
                            enhancedContent = complete_document

                    except Exception as e:
                        logger.warning(f"AI generation failed: {str(e)}, using original content")
                        enhancedContent = complete_document

                # Lists, strings, numbers or null are valid JSON but not a document
                if not isinstance(enhancedContent, dict):
                    logger.warning("AI generation returned JSON that is not an object, using original content")
                    enhancedContent = complete_document

                # Render the enhanced JSON content
                rendered_content, mime_type = await generation_service.renderReport(
                    extractedContent=enhancedContent,
                    outputFormat=outputFormat,
                    title=docTitle,
                    userPrompt=prompt,
                    aiService=self
                )

                # Remove any existing extension and add the correct one
                base_filename = rawFilename
                if '.' in base_filename:
                    base_filename = base_filename.rsplit('.', 1)[0]
                filename = f"{base_filename}.{extension}"

                generated_documents.append({
                    "documentName": filename,
                    "documentData": rendered_content,
                    "mimeType": mime_type
                })

            # Save debug files for multi-file generation - only if debug enabled
            debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
            if debug_enabled:
                try:
                    import base64
                    import os
                    from datetime import datetime, UTC
                    ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
                    debug_root = "./test-chat/ai"
                    debug_dir = os.path.join(debug_root, f"multifile_output_{ts}")
                    os.makedirs(debug_dir, exist_ok=True)

                    # Save metadata
                    with open(os.path.join(debug_dir, "metadata.txt"), "w", encoding="utf-8") as f:
                        f.write(f"title: {title}\n")
                        f.write(f"format: {outputFormat}\n")
                        f.write(f"documents_count: {len(generated_documents)}\n")
                        f.write(f"split_strategy: {prompt_analysis.get('strategy', 'custom')}\n")
                        f.write(f"prompt_analysis: {prompt_analysis}\n")

                    isTextFormat = normalizedFormat in ("md", "txt", "html", "json", "csv")

                    # Save each generated document
                    for docNumber, savedDoc in enumerate(generated_documents, start=1):
                        # File names originate from the AI; keep only the last
                        # path component so the file lands inside debug_dir
                        safeName = os.path.basename(str(savedDoc["documentName"]).replace("\\", "/"))
                        output_path = os.path.join(debug_dir, f"document_{docNumber}_{safeName}")
                        payload = savedDoc["documentData"]

                        if isinstance(payload, (bytes, bytearray)):
                            # Already binary, write as-is
                            with open(output_path, 'wb') as f:
                                f.write(payload)
                        elif isTextFormat:
                            # Text-based formats
                            with open(output_path, 'w', encoding='utf-8') as f:
                                f.write(payload)
                        else:
                            # Binary formats - decode from base64 if needed
                            try:
                                doc_bytes = base64.b64decode(payload)
                                with open(output_path, 'wb') as f:
                                    f.write(doc_bytes)
                            except Exception:
                                # If not base64, save as text
                                with open(output_path, 'w', encoding='utf-8') as f:
                                    f.write(payload)

                        logger.info(f"💾 Debug: Saved multi-file document {docNumber}: {output_path}")

                    logger.info(f"💾 Debug: Multi-file output saved to: {debug_dir}")

                except Exception as e:
                    logger.warning(f"Failed to save multi-file debug output: {e}")

            progressLogger.updateProgress(operationId, 0.9, "Rendering")

            generationResult = {
                "success": True,
                "content": ai_response,
                "rendered_content": None,  # Not applicable for multi-file
                "mime_type": None,  # Not applicable for multi-file
                "filename": None,  # Not applicable for multi-file
                "format": outputFormat,
                "title": title,
                "documents": generated_documents,
                "is_multi_file": True,
                "split_strategy": prompt_analysis.get("strategy", "custom")
            }

            # Complete progress tracking
            progressLogger.completeOperation(operationId, True)

            return generationResult

        except Exception as e:
            logger.error(f"Error in multi-file document generation: {str(e)}")
            # Complete progress tracking with failure
            progressLogger.completeOperation(operationId, False)
            # Fallback to single-file
            return await self._callAiWithSingleFileGeneration(
                prompt, documents, options, outputFormat, title
            )

    def _buildRendererSections(self, rawSections: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Convert AI-generated sections into the structure handed to the renderer.

        Each section gets "id", "content_type", "elements" and "order", with
        position-based defaults for "id" and "order". For sections whose raw
        "content_type" is "paragraph" or "heading", the "text" values of all
        elements are joined with newlines and stored on the first element (or
        on a new element when the section has none).

        Element dicts are shallow-copied first, so the AI response passed in
        is not modified.

        Args:
            rawSections: Section dicts as found in one AI response document.

        Returns:
            List of renderer section dicts, in input order.
        """
        rendererSections: List[Dict[str, Any]] = []
        for section in rawSections:
            position = len(rendererSections) + 1
            elements = [
                dict(element) if isinstance(element, dict) else element
                for element in (section.get("elements") or [])
            ]

            # The check deliberately uses the raw value: a section without
            # content_type is labelled "paragraph" below but not text-merged
            if section.get("content_type") in ["paragraph", "heading"]:
                joinedText = "\n".join(
                    element["text"]
                    for element in elements
                    if isinstance(element, dict) and "text" in element
                )
                # NOTE(review): elements after the first keep their own "text",
                # so a renderer that prints every element shows it twice — confirm intent
                if not elements:
                    elements = [{"text": joinedText}]
                elif isinstance(elements[0], dict):
                    elements[0]["text"] = joinedText

            rendererSections.append({
                "id": section.get("id", f"section_{position}"),
                "content_type": section.get("content_type", "paragraph"),
                "elements": elements,
                "order": section.get("order", position)
            })
        return rendererSections

    async def _callAiJson(
        self,
        prompt: str,
        documents: Optional[List[ChatDocument]],
        options: AiCallOptions
        ) -> Dict[str, Any]:
        """
        Run the document processor in JSON mode for the given prompt.

        Delegates to ``documentProcessor.processDocumentsPerChunkJson`` and
        hands back its result unchanged, so callers receive a structured JSON
        document instead of text.
        """
        processor = self.documentProcessor
        mergedJson = await processor.processDocumentsPerChunkJson(documents, prompt, options)
        return mergedJson

    async def _analyzePromptIntent(self, prompt: str, ai_service=None) -> Dict[str, Any]:
        """Use AI to analyze user prompt and determine processing requirements.

        Sends a classification prompt through ``ai_service.aiObjects.call`` and
        parses the first ``{...}`` span of the reply as JSON.

        Args:
            prompt: The user request to classify.
            ai_service: Object exposing ``aiObjects.call``; when falsy, no AI
                call is made.

        Returns:
            The parsed analysis dict (the prompt asks for "is_multi_file",
            "strategy", "criteria", "file_naming_pattern", "reasoning"). A
            string "is_multi_file" value is converted to a bool. The
            single-file default
            ``{"is_multi_file": False, "strategy": "single", "criteria": None}``
            is returned when there is no service, the reply is empty, the
            reply is not a JSON object, or any error occurs.
        """
        singleFileDefault = {"is_multi_file": False, "strategy": "single", "criteria": None}
        if not ai_service:
            return singleFileDefault

        try:
            analysis_prompt = f"""
Analyze this user request and determine if it requires multiple file output or single file output.

User request: "{prompt}"

Respond with JSON only in this exact format:
{{
    "is_multi_file": true/false,
    "strategy": "single|per_entity|by_section|by_criteria|custom",
    "criteria": "description of how to split content",
    "file_naming_pattern": "suggested pattern for filenames",
    "reasoning": "brief explanation of the analysis"
}}

Consider:
- Does the user want separate files for different entities (customers, products, etc.)?
- Does the user want to split content into multiple documents?
- What would be the most logical way to organize the content?
- What language is the request in? (analyze in the original language)

Return only the JSON response.
"""

            from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
            request_options = AiCallOptions()
            request_options.operationType = OperationType.GENERAL

            request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
            response = await ai_service.aiObjects.call(request)

            if not (response and response.content):
                return singleFileDefault

            import json
            import re

            # Extract JSON from response (greedy: first "{" to last "}")
            result = response.content.strip()
            json_match = re.search(r'\{.*\}', result, re.DOTALL)
            if json_match:
                result = json_match.group(0)

            analysis = json.loads(result)

            # Callers use dict access on the analysis; any other JSON value
            # (list, string, number, null) is treated as "no usable analysis"
            if not isinstance(analysis, dict):
                logger.warning("AI prompt analysis did not return a JSON object, defaulting to single file")
                return singleFileDefault

            # A quoted flag such as "false" would otherwise count as truthy
            multiFileFlag = analysis.get("is_multi_file", False)
            if isinstance(multiFileFlag, str):
                analysis["is_multi_file"] = multiFileFlag.strip().lower() in ("true", "yes", "1")

            return analysis

        except Exception as e:
            logger.warning(f"AI prompt analysis failed: {str(e)}, defaulting to single file")
            return singleFileDefault

    def _validateResponseStructure(self, response: Dict[str, Any], prompt_analysis: Dict[str, Any]) -> bool:
        """Validate that AI response matches the expected structure."""
        try:
            if not isinstance(response, dict):
                logger.warning(f"Response validation failed: Response is not a dict, got {type(response)}")
                return False

            # Check for multi-file structure
            if prompt_analysis.get("is_multi_file", False):
                has_documents = "documents" in response
                is_documents_list = isinstance(response.get("documents"), list)
                logger.info(f"Multi-file validation: has_documents={has_documents}, is_documents_list={is_documents_list}")
                if has_documents and is_documents_list:
                    logger.info(f"Multi-file validation passed: {len(response['documents'])} documents found")
                else:
                    logger.warning(f"Multi-file validation failed: documents key present={has_documents}, documents is list={is_documents_list}")
                    logger.warning(f"Available keys: {list(response.keys())}")
                return has_documents and is_documents_list
            else:
                has_sections = "sections" in response
                is_sections_list = isinstance(response.get("sections"), list)
                logger.info(f"Single-file validation: has_sections={has_sections}, is_sections_list={is_sections_list}")
                return has_sections and is_sections_list
        except Exception as e:
            logger.warning(f"Response validation failed with exception: {str(e)}")
            return False

    async def _postRawDataChatMessage(self, payload: Any, label: str = "raw_extraction") -> None:
        """
        Create a ChatMessage with the extracted raw JSON attached as a file so the user
        has access to the data even if downstream processing fails.

        The payload is serialized to pretty-printed UTF-8 JSON, stored as a file named
        ``<label>_<UTC timestamp>.json``, wrapped in a ChatDocument and stored together
        with an assistant message on the workflow.

        Args:
            payload: JSON-serializable data to attach.
            label: Prefix for the file name; also used as the message's documents label.

        Returns:
            None. This is best-effort: any failure is logged as a warning and
            swallowed so that the caller's main flow is never interrupted.
        """
        try:
            import json as _json
            # timezone.utc is equivalent to datetime.UTC but also exists before Python 3.11.
            from datetime import datetime, timezone

            services = self.services
            workflow = services.currentWorkflow

            # Serialize payload
            ts = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
            content_text = _json.dumps(payload, ensure_ascii=False, indent=2)
            content_bytes = content_text.encode('utf-8')

            # Store as file via component storage
            file_name = f"{label}_{ts}.json"
            file_item = services.interfaceDbComponent.createFile(
                name=file_name,
                mimeType="application/json",
                content=content_bytes
            )
            services.interfaceDbComponent.createFileData(file_item.id, content_bytes)

            # Lookup file info for ChatDocument; fall back to locally known values
            # when the lookup returns nothing or lacks a field.
            file_info = services.workflow.getFileInfo(file_item.id)
            doc = ChatDocument(
                messageId="",  # set after message creation
                fileId=file_item.id,
                fileName=file_info.get("fileName", file_name) if file_info else file_name,
                fileSize=file_info.get("size", len(content_bytes)) if file_info else len(content_bytes),
                mimeType=file_info.get("mimeType", "application/json") if file_info else "application/json"
            )

            # Message payload; the document itself is passed separately to the store call below.
            messageData = {
                "workflowId": workflow.id,
                "role": "assistant",
                "message": "Raw extraction data saved",
                "status": "data",
                "sequenceNr": len(getattr(workflow, 'messages', []) or []) + 1,
                "publishedAt": services.utils.getUtcTimestamp(),
                "documentsLabel": label,
                "documents": []
            }

            # Store message with document included from the start.
            # NOTE(review): messageData is built from services.currentWorkflow while this
            # call passes services.workflow.workflow - confirm both refer to the same workflow.
            services.workflow.storeMessageWithDocuments(services.workflow.workflow, messageData, [doc])
        except Exception as e:
            # Non-fatal by design, but leave a trace so a missing raw-data
            # attachment can be diagnosed instead of vanishing silently.
            logger.warning(f"Failed to post raw data chat message '{label}': {str(e)}")
            return

    def _repairJson(self, json_string: str) -> str:
        """Repair common JSON syntax errors efficiently for large JSON."""
        try:
            import re
            import json

            # Remove any leading/trailing whitespace
            json_string = json_string.strip()

            # For large JSON, skip substring extraction and go straight to targeted repairs
            logger.info(f"Attempting JSON repair for {len(json_string)} characters...")

            # Try to parse first to see what specific error we get
            try:
                json.loads(json_string)
                return json_string  # Already valid
            except json.JSONDecodeError as e:
                error_msg = str(e)
                logger.info(f"JSON error: {error_msg}")

                # Apply targeted fixes based on the specific error
                if "Expecting ',' delimiter" in error_msg:
                    # Fix missing commas between array elements
                    json_string = re.sub(r'\]\s*\[', '], [', json_string)
                    json_string = re.sub(r'\}\s*\{', '}, {', json_string)
                    # Fix missing commas between object properties
                    json_string = re.sub(r'("\s*:\s*[^,}]+)\s*(")', r'\1, \2', json_string)

                if "Expecting value" in error_msg:
                    # Fix missing values (replace empty with null)
                    json_string = re.sub(r':\s*,', ': null,', json_string)
                    json_string = re.sub(r':\s*}', ': null}', json_string)

                if "Expecting property name" in error_msg:
                    # Fix unquoted property names
                    json_string = re.sub(r'(\w+):', r'"\1":', json_string)

                # Fix trailing commas before closing brackets/braces
                json_string = re.sub(r',(\s*[}\]])', r'\1', json_string)

                # Fix missing closing brackets/braces (only if reasonable)
                open_braces = json_string.count('{')
                close_braces = json_string.count('}')
                open_brackets = json_string.count('[')
                close_brackets = json_string.count(']')

                # Only add missing brackets if the difference is small (avoid runaway)
                if 0 < (open_braces - close_braces) <= 5:
                    missing_braces = open_braces - close_braces
                    json_string += '}' * missing_braces

                if 0 < (open_brackets - close_brackets) <= 5:
                    missing_brackets = open_brackets - close_brackets
                    json_string += ']' * missing_brackets

                # Try to parse again
                try:
                    json.loads(json_string)
                    logger.info("JSON repair successful")
                    return json_string
                except json.JSONDecodeError:
                    logger.warning("JSON repair failed - will try AI repair")
                    return json_string

        except Exception as e:
            logger.warning(f"JSON repair failed: {str(e)}")
            return json_string

    async def _repairJsonWithAI(self, malformed_json: str) -> str:
        """Use AI to repair malformed JSON efficiently for large files."""
        try:
            # Limit JSON size for AI processing (max 50KB to avoid token limits)
            max_json_size = 50000
            json_to_repair = malformed_json

            if len(malformed_json) > max_json_size:
                logger.warning(f"JSON too large ({len(malformed_json)} chars), truncating to {max_json_size} chars for AI repair")
                # Try to find a good truncation point (end of a complete object/array)
                truncate_at = max_json_size
                for i in range(max_json_size, max(0, max_json_size - 1000), -1):
                    if malformed_json[i] in ['}', ']']:
                        truncate_at = i + 1
                        break
                json_to_repair = malformed_json[:truncate_at] + "..."

            repair_prompt = f"""
You are a JSON repair expert. Fix the following malformed JSON and return ONLY the corrected JSON, no explanations.

Malformed JSON:
{json_to_repair}

Return only the valid JSON:
"""

            # Use AI to repair the JSON
            repaired_json = await self.services.ai.callAi(
                prompt=repair_prompt,
                documents=None,
                options={
                    "process_type": "text",
                    "operation_type": "generate_content",
                    "priority": "speed",
                    "max_cost": 0.01
                }
            )

            # Clean up the response (remove any markdown formatting)
            repaired_json = repaired_json.strip()
            if repaired_json.startswith('```json'):
                repaired_json = repaired_json[7:]
            if repaired_json.endswith('```'):
                repaired_json = repaired_json[:-3]
            repaired_json = repaired_json.strip()

            # Validate the repaired JSON
            import json
            json.loads(repaired_json)
            logger.info("AI JSON repair successful")
            return repaired_json

        except Exception as e:
            logger.warning(f"AI JSON repair failed: {str(e)}")
            return malformed_json

    def _attemptJsonFix(self, json_string: str) -> str:
        """Attempt to fix common JSON issues"""
        try:
            # Remove any trailing commas before closing braces/brackets
            import re
            fixed = re.sub(r',(\s*[}\]])', r'\1', json_string)

            # Try to fix unterminated strings by adding quotes at the end
            if '"' in fixed and not fixed.strip().endswith('"'):
                # Count quotes to see if we have an odd number (unterminated string)
                quote_count = fixed.count('"')
                if quote_count % 2 == 1:
                    # Find the last quote and add a closing quote
                    last_quote_pos = fixed.rfind('"')
                    if last_quote_pos != -1:
                        # Check if there's content after the last quote that needs to be quoted
                        after_quote = fixed[last_quote_pos + 1:].strip()
                        if after_quote and not after_quote.startswith(','):
                            # Add closing quote before any trailing content
                            fixed = fixed[:last_quote_pos + 1] + '"' + after_quote

            return fixed
        except Exception:
            return json_string