# gateway/modules/workflows/methods/methodDocument.py
# (listing metadata: 2025-10-03 11:23:48 +02:00, 538 lines, 25 KiB, Python)
"""
Document processing method module.
Handles document operations using the document service.
"""
import logging
import os
from typing import Dict, Any, List, Optional
from datetime import datetime, UTC
from modules.workflows.methods.methodBase import MethodBase, action
from modules.datamodels.datamodelWorkflow import ActionResult, ActionDocument
from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelAi import AiCallOptions, OperationType, Priority
logger = logging.getLogger(__name__)
class MethodDocument(MethodBase):
    """Workflow method handling document operations (AI extraction, report generation)."""

    def __init__(self, services):
        """Register this method's identity on top of the shared base.

        Args:
            services: Service container exposing workflow, extraction, and AI services.
        """
        super().__init__(services)
        # Identity used by the workflow planner to select this method.
        self.description = "Handle document operations like extraction and analysis"
        self.name = "document"
def _format_timestamp_for_filename(self) -> str:
"""Format current timestamp as YYYYMMDD-hhmmss for filenames."""
return datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
@action
async def extract(self, parameters: Dict[str, Any]) -> ActionResult:
"""
Extract content from any document using AI prompt.
Parameters:
documentList (list): Document list reference(s) - List of document references to extract content from
prompt (str): AI prompt for extraction - Specific prompt describing what content to extract and how to process it
operationType (str, optional): Type of operation - Use 'extract_content', 'analyze_document', 'summarize_content', etc. (default: 'extract_content')
processDocumentsIndividually (bool, optional): Process each document separately - Set to True for individual processing, False for batch processing (default: True)
chunkAllowed (bool, optional): Allow content chunking - Set to True to allow AI service to chunk large content, False to process as-is (default: True)
mergeStrategy (dict, optional): Strategy for merging results - Specify how to merge chunked content: groupBy, orderBy, mergeType (default: concatenate)
expectedDocumentFormats (list, optional): Expected output formats - List of format specifications with extension, mimeType, description
includeMetadata (bool, optional): Include document metadata - Set to True to include file metadata in results (default: True)
"""
try:
documentList = parameters.get("documentList")
if isinstance(documentList, str):
documentList = [documentList]
prompt = parameters.get("prompt")
operationType = parameters.get("operationType", "extract_content")
processDocumentsIndividually = parameters.get("processDocumentsIndividually", True)
chunkAllowed = parameters.get("chunkAllowed", True)
mergeStrategy = parameters.get("mergeStrategy", {
"groupBy": "typeGroup",
"orderBy": "id",
"mergeType": "concatenate"
})
expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
includeMetadata = parameters.get("includeMetadata", True)
if not documentList:
return ActionResult.isFailure(
error="Document list reference is required"
)
if not prompt:
return ActionResult.isFailure(
error="Prompt is required"
)
chatDocuments = self.services.workflow.getChatDocumentsFromDocumentList(documentList)
if not chatDocuments:
return ActionResult.isFailure(
error="No documents found for the provided reference"
)
# Use new extraction service with ChatDocument objects
try:
# Build extraction options directly from AI planner parameters
extraction_options = {
"prompt": prompt,
"operationType": operationType,
"processDocumentsIndividually": processDocumentsIndividually,
"chunkAllowed": chunkAllowed,
"mergeStrategy": mergeStrategy
}
# Add format instructions to prompt if expected formats are provided
enhanced_prompt = prompt
if expectedDocumentFormats:
format_instructions = []
for fmt in expectedDocumentFormats:
extension = fmt.get("extension", ".txt")
mime_type = fmt.get("mimeType", "text/plain")
description = fmt.get("description", "")
format_instructions.append(f"- {extension} ({mime_type}): {description}")
if format_instructions:
enhanced_prompt += f"\n\nPlease format the output as: {', '.join([fmt.get('extension', '.txt') for fmt in expectedDocumentFormats])}"
enhanced_prompt += f"\nExpected formats:\n" + "\n".join(format_instructions)
extraction_options["expectedDocumentFormats"] = expectedDocumentFormats
extraction_options["prompt"] = enhanced_prompt
if not includeMetadata:
extraction_options["includeMetadata"] = False
# Use new extraction service API
all_extracted_content = self.services.extraction.extractContent(
documents=chatDocuments,
options=extraction_options
)
logger.info(f"Extraction completed: {len(all_extracted_content)} documents processed")
except Exception as e:
logger.error(f"Extraction failed: {str(e)}")
all_extracted_content = []
if not all_extracted_content:
return ActionResult.isFailure(
error="No content could be extracted from any documents"
)
# Process each document individually with its own format conversion
action_documents = []
for i, chatDocument in enumerate(chatDocuments):
# Extract text content from this document using new ExtractedContent structure
text_content = ""
try:
ec = all_extracted_content[i] if i < len(all_extracted_content) else None
if ec and hasattr(ec, 'parts'):
text_parts = []
for part in ec.parts:
try:
if part.typeGroup in ("text", "table", "structure") and part.data:
text_parts.append(part.data)
except Exception:
continue
text_content = "\n".join(text_parts)
else:
text_content = ""
except Exception:
text_content = ""
# Use the extracted content directly - format conversion is handled by extraction service
final_content = text_content
final_mime_type = "text/plain"
final_extension = ".txt"
# Create meaningful output fileName with workflow context
original_fileName = chatDocument.fileName
base_name = original_fileName.rsplit('.', 1)[0] if '.' in original_fileName else original_fileName
extension = final_extension.lstrip('.') # Remove leading dot for meaningful naming
output_fileName = self._generateMeaningfulFileName(
base_name=f"{base_name}_extracted",
extension=extension,
action_name="extract"
)
logger.info(f"Created output document: {output_fileName} with {len(final_content)} characters")
# Create proper ActionDocument object
action_documents.append(ActionDocument(
documentName=output_fileName,
documentData=final_content,
mimeType=final_mime_type
))
return ActionResult.isSuccess(
documents=action_documents
)
except Exception as e:
logger.error(f"Error extracting content: {str(e)}")
return ActionResult.isFailure(
error=str(e)
)
@action
async def generateReport(self, parameters: Dict[str, Any]) -> ActionResult:
"""
Generate report from multiple documents using AI.
Parameters:
documentList (list): Document list reference(s) - List of document references to include in report
prompt (str): AI prompt for report generation - Specific prompt describing what kind of report to generate
title (str, optional): Report title - Title for the generated report (default: "Summary Report")
outputFormat (str, optional): Output format extension - Specify the desired output format: 'html', 'pdf', 'docx', 'txt', 'md', 'json', 'csv', 'xlsx' (default: 'html')
operationType (str, optional): Type of operation - Use 'generate_report', 'analyze_documents', etc. (default: 'generate_report')
processDocumentsIndividually (bool, optional): Process each document separately - Set to True for individual processing (default: True)
chunkAllowed (bool, optional): Allow content chunking - Set to True to allow AI service to chunk large content (default: True)
mergeStrategy (dict, optional): Strategy for merging results - Specify how to merge content for report generation (default: concatenate)
includeMetadata (bool, optional): Include document metadata - Set to True to include file metadata in results (default: True)
"""
try:
documentList = parameters.get("documentList")
if isinstance(documentList, str):
documentList = [documentList]
prompt = parameters.get("prompt")
title = parameters.get("title", "Summary Report")
outputFormat = parameters.get("outputFormat", "html")
operationType = parameters.get("operationType", "generate_report")
processDocumentsIndividually = parameters.get("processDocumentsIndividually", True)
chunkAllowed = parameters.get("chunkAllowed", True)
mergeStrategy = parameters.get("mergeStrategy", {
"groupBy": "typeGroup",
"orderBy": "id",
"mergeType": "concatenate"
})
includeMetadata = parameters.get("includeMetadata", True)
if not documentList:
return ActionResult.isFailure(
error="Document list reference is required"
)
if not prompt:
return ActionResult.isFailure(
error="Prompt is required to specify what kind of report to generate"
)
chatDocuments = self.services.workflow.getChatDocumentsFromDocumentList(documentList)
logger.info(f"Retrieved {len(chatDocuments)} chat documents for report generation")
if not chatDocuments:
return ActionResult.isFailure(
error="No documents found for the provided reference"
)
# Generate report using the new format handling system
report_content, mime_type = await self._generateReport(
chatDocuments, title, outputFormat, includeMetadata, prompt
)
# Create meaningful output fileName with workflow context
output_fileName = self._generateMeaningfulFileName(
base_name="report",
extension=outputFormat,
action_name="generate"
)
logger.info(f"Generated {outputFormat.upper()} report: {output_fileName} with {len(report_content)} characters")
return ActionResult.isSuccess(
documents=[ActionDocument(
documentName=output_fileName,
documentData=report_content,
mimeType=mime_type
)]
)
except Exception as e:
logger.error(f"Error generating report: {str(e)}")
return ActionResult.isFailure(
error=str(e)
)
async def _generateReport(self, chatDocuments: List[Any], title: str, outputFormat: str, includeMetadata: bool, prompt: str) -> tuple[str, str]:
"""
Generate a report in the specified format using format-specific extraction:
1. Get format-specific extraction prompt from renderer
2. Extract content using AI with format-specific prompt
3. Clean and return the formatted content
"""
try:
# Get format-specific extraction prompt
from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
generation_service = GenerationService(self.services)
extraction_prompt = generation_service.getExtractionPrompt(
output_format=outputFormat,
user_prompt=prompt,
title=title
)
# Extract content using format-specific prompt
extracted_content = await self._extractContentWithPrompt(
chatDocuments, extraction_prompt, includeMetadata
)
# Render the extracted content (mostly just cleaning)
rendered_content, mime_type = await generation_service.renderReport(
extracted_content=extracted_content,
output_format=outputFormat,
title=title
)
return rendered_content, mime_type
except Exception as e:
logger.error(f"Error generating report: {str(e)}")
# Fallback to simple text format
fallback_content = f"# {title}\n\nError generating report: {str(e)}"
return fallback_content, "text/plain"
    async def _extractContentWithPrompt(self, chatDocuments: List[Any], extraction_prompt: str, includeMetadata: bool) -> str:
        """
        Extract content from documents using a specific extraction prompt.

        Two-phase flow:
        1. Run the extraction service over all documents and join the textual
           parts. NOTE(review): this combined text is only used to validate
           that readable content exists (and for logging) — it is NOT what is
           sent to the AI; confirm this is intentional.
        2. Call the AI service with the *original* ChatDocument objects and the
           format-specific prompt; the stripped AI response (minus surrounding
           markdown code fences) is returned.

        Returns the generated content string. Any failure — including the
        deliberate raises below, which are caught by this method's own broad
        except — is converted into an "Error extracting content: ..." string
        rather than propagated.
        """
        try:
            # Phase 1: verify the documents contain readable content.
            logger.info(f"Extracting content with format-specific prompt for {len(chatDocuments)} documents")
            # Build extraction options for report generation (fixed defaults;
            # callers cannot override these here).
            extraction_options = {
                "prompt": extraction_prompt,
                "operationType": "generate_report",
                "processDocumentsIndividually": True,
                "chunkAllowed": True,
                "mergeStrategy": {
                    "groupBy": "typeGroup",
                    "orderBy": "id",
                    "mergeType": "concatenate"
                }
            }
            if not includeMetadata:
                extraction_options["includeMetadata"] = False
            # Extract content using extraction service with format-specific prompt
            extracted_list = self.services.extraction.extractContent(
                documents=chatDocuments,
                options=extraction_options
            )
            if not extracted_list:
                logger.warning("No content extracted from documents")
                return "No readable content found in documents"
            # Collect only the textual parts (text/table/structure) from each
            # extraction result; malformed parts are skipped silently.
            all_extracted_content = []
            for ec in extracted_list:
                if ec and hasattr(ec, 'parts'):
                    for part in ec.parts:
                        try:
                            if part.typeGroup in ("text", "table", "structure") and part.data:
                                all_extracted_content.append(part.data)
                        except Exception:
                            continue
            if not all_extracted_content:
                logger.warning("No readable content found in extracted results")
                return "No readable content found in documents"
            # Join all extracted content (validation/logging only — see docstring).
            combined_content = "\n\n".join(all_extracted_content)
            if not combined_content or combined_content.strip() == "":
                logger.error("No content extracted from documents")
                raise Exception("No content extracted from documents")
            # Phase 2: AI call with the original documents and the
            # format-specific prompt.
            logger.info(f"Calling AI service to process {len(combined_content)} characters with prompt")
            aiResponse = await self.services.ai.callAi(
                prompt=extraction_prompt,
                documents=chatDocuments,  # Pass the original ChatDocument objects
                options=AiCallOptions(operationType=OperationType.GENERATE_CONTENT)
            )
            if not aiResponse or aiResponse.strip() == "":
                logger.error("AI content generation failed")
                raise Exception("AI content generation failed")
            # Clean up the AI response
            content = aiResponse.strip()
            # Remove a surrounding markdown code fence if present (first and
            # last line dropped).
            if content.startswith("```") and content.endswith("```"):
                lines = content.split('\n')
                if len(lines) >= 2:
                    content = '\n'.join(lines[1:-1]).strip()
            logger.info(f"Successfully generated format-specific content: {len(content)} characters")
            return content
        except Exception as e:
            logger.error(f"Error extracting content with prompt: {str(e)}")
            # Return minimal fallback content instead of raising — callers
            # receive the error text as the report body.
            return f"Error extracting content: {str(e)}"
async def _generateHtmlReport(self, chatDocuments: List[Any], title: str, includeMetadata: bool, prompt: str) -> str:
"""
Generate a comprehensive HTML report using AI from all input documents.
"""
try:
# Filter out empty documents and collect content
validDocuments = []
allContent = []
for doc in chatDocuments:
content = ""
logger.info(f"Processing document: type={type(doc)}")
# Use new extraction service for each document
try:
# Build extraction options for report generation from AI planner parameters
extraction_options = {
"prompt": prompt,
"operationType": operationType,
"processDocumentsIndividually": processDocumentsIndividually,
"chunkAllowed": chunkAllowed,
"mergeStrategy": mergeStrategy
}
# Add optional parameters if provided by AI planner
if not includeMetadata:
extraction_options["includeMetadata"] = False
# Extract content using new service
extracted_list = self.services.extraction.extractContent(
documents=[doc],
options=extraction_options
)
ec = extracted_list[0] if extracted_list else None
if ec and hasattr(ec, 'parts'):
for part in ec.parts:
try:
if part.typeGroup in ("text", "table", "structure") and part.data:
content += part.data + " "
except Exception:
continue
if content.strip():
logger.info(f" Retrieved content from file: {len(content)} characters")
else:
logger.info(f" No readable text content found (binary file)")
else:
logger.info(f" No content extracted (binary file)")
except Exception as e:
logger.info(f" Could not extract content (binary file): {str(e)}")
# Skip empty documents
if content and content.strip():
validDocuments.append(doc)
allContent.append(f"Document: {doc.fileName}\n{content}\n")
logger.info(f" Added document to valid documents list")
else:
logger.info(f" Skipping document with no readable text content")
if not validDocuments:
# No readable content; return a minimal valid HTML document
timestamp = int(self.services.utils.getUtcTimestamp())
return f"<!DOCTYPE html><html><head><meta charset=\"UTF-8\"><title>{title}</title></head><body><h1>{title}</h1><p>Keine auswertbaren Inhalte gefunden.</p><p>Generated: {timestamp}</p></body></html>"
# Create AI prompt for comprehensive report generation using user's prompt
combinedContent = "\n\n".join(allContent)
aiPrompt = f"""
{prompt}
Report Title: {title}
OUTPUT POLICY:
- Return ONLY a complete, raw HTML document.
- Start with: <!DOCTYPE html>
- Must include: <html>, <head> (with <meta charset="UTF-8"> and <title>), and <body>.
- The response must be valid, self-contained HTML suitable for saving as .html.
Structure:
- Title and short subtitle
- Executive summary
- Sections with clear headings
- Use tables for structured data when helpful
- Key findings and recommendations
- Generation date and number of documents
Quality and design requirements:
- Use clear, professional, and accessible styling in a <style> block
- Apply clean layout, spacing, and visual hierarchy for headings
- Keep HTML and CSS standards-compliant and lightweight
SOURCE DOCUMENT CONTENT:
---START---
{combinedContent}
---END---
"""
# Call AI to generate the report
logger.info(f"Generating AI report for {len(validDocuments)} documents")
# Build ChatDocument list from chatDocuments
documents = []
try:
for d in validDocuments:
try:
data = self.services.workflow.getFileData(d.fileId) if hasattr(d, 'fileId') else None
if data:
documents.append(ChatDocument(fileData=data, fileName=d.fileName, mimeType=d.mimeType))
except Exception:
continue
except Exception:
documents = None
aiReport = await self.services.ai.callAi(
prompt=aiPrompt,
documents=documents or None,
options=AiCallOptions(
operationType=OperationType.GENERATE_CONTENT, # Using GENERATE_CONTENT for report generation
priority=Priority.QUALITY,
compressPrompt=False,
compressContext=True,
processDocumentsIndividually=True,
resultFormat="html",
processingMode="detailed",
maxCost=0.08,
maxProcessingTime=90
)
)
# If AI call fails, return error - AI is crucial for report generation
if not aiReport or aiReport.strip() == "":
logger.error("AI report generation failed - AI is crucial for this action")
raise Exception("AI report generation failed - AI is required for report generation")
# Clean up the AI response and ensure it's valid HTML
aiReport = aiReport.strip()
# Normalize: strip code fences if present
if aiReport.startswith("```") and aiReport.endswith("```"):
lines = aiReport.split('\n')
if len(lines) >= 2:
aiReport = '\n'.join(lines[1:-1]).strip()
cleaned = aiReport.strip()
# Return exactly what we have (no wrapping)
return cleaned
except Exception as e:
logger.error(f"Error generating AI report: {str(e)}")
# Re-raise the error - AI is crucial for report generation
raise