"""
|
|
Document processing method module.
|
|
Handles document operations using the document service.
|
|
"""
|
|
|
|
import logging
|
|
import os
|
|
from typing import Dict, Any, List, Optional
|
|
from datetime import datetime, UTC
|
|
|
|
from modules.workflows.methods.methodBase import MethodBase, action
|
|
from modules.datamodels.datamodelChat import ActionResult, ActionDocument
|
|
from modules.datamodels.datamodelChat import ChatDocument
|
|
from modules.datamodels.datamodelAi import AiCallOptions, OperationType, Priority
|
|
|
|
# Module-level logger named after this module, per the stdlib logging convention.
logger = logging.getLogger(__name__)
|
|
|
|
class MethodDocument(MethodBase):
    """Document method implementation for document operations.

    Exposes workflow actions (``extract``, ``generate``) that delegate the
    heavy lifting to ``self.services.workflow`` (document resolution) and
    ``self.services.ai`` (AI calls).
    """

    def __init__(self, services):
        """Initialize the document method.

        Args:
            services: Service container providing at least the ``workflow``
                and ``ai`` services used by the actions below.
        """
        super().__init__(services)
        self.name = "document"
        self.description = "Handle document operations like extraction and analysis"

    def _format_timestamp_for_filename(self) -> str:
        """Format current timestamp as YYYYMMDD-hhmmss for filenames."""
        # Timezone-aware UTC, so generated filenames are stable across hosts.
        return datetime.now(UTC).strftime("%Y%m%d-%H%M%S")

    @staticmethod
    def _augment_prompt_with_formats(prompt: str, expectedDocumentFormats: List[Dict[str, Any]]) -> str:
        """Append output-format instructions to *prompt* when formats are given.

        Args:
            prompt: The base extraction instruction.
            expectedDocumentFormats: Format specs, each a dict with optional
                ``extension``, ``mimeType`` and ``description`` keys.

        Returns:
            The prompt unchanged when no formats are given, otherwise the
            prompt with a format summary and per-format detail lines appended.
        """
        if not expectedDocumentFormats:
            return prompt
        format_instructions = [
            f"- {fmt.get('extension', '.txt')} ({fmt.get('mimeType', 'text/plain')}): {fmt.get('description', '')}"
            for fmt in expectedDocumentFormats
        ]
        extensions = ", ".join(fmt.get("extension", ".txt") for fmt in expectedDocumentFormats)
        enhanced = prompt
        enhanced += f"\n\nPlease format the output as: {extensions}"
        enhanced += "\nExpected formats:\n" + "\n".join(format_instructions)
        return enhanced

    @action
    async def extract(self, parameters: Dict[str, Any]) -> ActionResult:
        """
        GENERAL:
        - Purpose: Extract and analyze content from existing documents using AI.
        - Input requirements: documentList (required); prompt (required).
        - Output format: Plain text per source document (.txt by default).

        Parameters:
        - documentList (list, required): Document reference(s) to extract from.
        - prompt (str, required): Instruction describing what to extract.
        - operationType (str, optional): extract_content | analyze_document | summarize_content. Default: extract_content.
        - processDocumentsIndividually (bool, optional): Process each document separately. Default: True.
        - chunkAllowed (bool, optional): Allow chunking for large inputs. Default: True.
        - mergeStrategy (dict, optional): Merge strategy for chunked content.
        - expectedDocumentFormats (list, optional): Desired output format specs.
        - includeMetadata (bool, optional): Include file metadata. Default: True.
        """
        try:
            documentList = parameters.get("documentList")
            if isinstance(documentList, str):
                # Accept a single reference string as a one-element list.
                documentList = [documentList]
            prompt = parameters.get("prompt")
            operationType = parameters.get("operationType", "extract_content")
            processDocumentsIndividually = parameters.get("processDocumentsIndividually", True)
            chunkAllowed = parameters.get("chunkAllowed", True)
            expectedDocumentFormats = parameters.get("expectedDocumentFormats", [])
            # NOTE(review): mergeStrategy and includeMetadata are documented and
            # accepted, but nothing below consumes them; they are intentionally
            # left unread until they are wired into the AI call.

            if not documentList:
                return ActionResult.isFailure(
                    error="Document list reference is required"
                )

            if not prompt:
                return ActionResult.isFailure(
                    error="Prompt is required"
                )

            chatDocuments = self.services.workflow.getChatDocumentsFromDocumentList(documentList)
            if not chatDocuments:
                return ActionResult.isFailure(
                    error="No documents found for the provided reference"
                )

            # Use enhanced AI service with integrated extraction
            try:
                ai_options = AiCallOptions(
                    operationType=operationType,
                    processDocumentsIndividually=processDocumentsIndividually,
                    # When chunking is disallowed, compress the context instead.
                    compressContext=not chunkAllowed
                )

                enhanced_prompt = self._augment_prompt_with_formats(prompt, expectedDocumentFormats)

                ai_response = await self.services.ai.callAi(
                    prompt=enhanced_prompt,
                    documents=chatDocuments,
                    options=ai_options
                )

                logger.info("AI extraction completed: %d characters", len(ai_response))

            except Exception as e:
                # Best-effort: fall through to the empty-response failure below.
                logger.error("AI extraction failed: %s", str(e))
                ai_response = ""

            if not ai_response or ai_response.strip() == "":
                return ActionResult.isFailure(
                    error="No content could be extracted from any documents"
                )

            # Build one output document per source document.
            # NOTE(review): each output currently carries the SAME full AI
            # response; if per-document content is intended, the AI service
            # must return per-document results — confirm the service contract.
            action_documents = []

            for chatDocument in chatDocuments:
                final_content = ai_response
                final_mime_type = "text/plain"
                final_extension = ".txt"

                # Create meaningful output fileName with workflow context
                original_fileName = chatDocument.fileName
                base_name = original_fileName.rsplit('.', 1)[0] if '.' in original_fileName else original_fileName
                extension = final_extension.lstrip('.')  # naming helper expects no leading dot
                output_fileName = self._generateMeaningfulFileName(
                    base_name=f"{base_name}_extracted",
                    extension=extension,
                    action_name="extract"
                )

                logger.info("Created output document: %s with %d characters", output_fileName, len(final_content))

                action_documents.append(ActionDocument(
                    documentName=output_fileName,
                    documentData=final_content,
                    mimeType=final_mime_type
                ))

            return ActionResult.isSuccess(
                documents=action_documents
            )
        except Exception as e:
            logger.error("Error extracting content: %s", str(e))
            return ActionResult.isFailure(
                error=str(e)
            )

    @action
    async def generate(self, parameters: Dict[str, Any]) -> ActionResult:
        """
        GENERAL:
        - Purpose: Generate formatted documents and reports from source documents.
        - Input requirements: documentList (required); prompt (required); optional title and outputFormat.
        - Any output format, e.g.: html | pdf | docx | txt | md | json | csv | xlsx

        Parameters:
        - documentList (list, required): Document reference(s) to include as context.
        - prompt (str, required): Instruction describing the desired document/report.
        - title (str, optional): Title for the generated document. Default: "Summary Report".
        - outputFormat (str, optional): html | pdf | docx | txt | md | json | csv | xlsx. Default: html.
        - operationType (str, optional): generate_report | analyze_documents. Default: generate_report.
        - processDocumentsIndividually (bool, optional): Process per document. Default: True.
        - chunkAllowed (bool, optional): Allow chunking for large inputs. Default: True.
        - mergeStrategy (dict, optional): Merging rules for multi-part generation.
        - includeMetadata (bool, optional): Include file metadata. Default: True.
        """
        try:
            documentList = parameters.get("documentList")
            if isinstance(documentList, str):
                # Accept a single reference string as a one-element list.
                documentList = [documentList]
            prompt = parameters.get("prompt")
            title = parameters.get("title", "Summary Report")
            outputFormat = parameters.get("outputFormat", "html")
            operationType = parameters.get("operationType", "generate_report")
            processDocumentsIndividually = parameters.get("processDocumentsIndividually", True)
            chunkAllowed = parameters.get("chunkAllowed", True)
            # NOTE(review): mergeStrategy and includeMetadata are documented and
            # accepted, but nothing below consumes them; they are intentionally
            # left unread until they are wired into the AI call.

            if not documentList:
                return ActionResult.isFailure(
                    error="Document list reference is required"
                )

            if not prompt:
                return ActionResult.isFailure(
                    error="Prompt is required to specify what kind of report to generate"
                )

            chatDocuments = self.services.workflow.getChatDocumentsFromDocumentList(documentList)
            logger.info("Retrieved %d chat documents for report generation", len(chatDocuments))

            if not chatDocuments:
                return ActionResult.isFailure(
                    error="No documents found for the provided reference"
                )

            # Use enhanced AI service with document generation
            try:
                ai_options = AiCallOptions(
                    operationType=operationType,
                    processDocumentsIndividually=processDocumentsIndividually,
                    # When chunking is disallowed, compress the context instead.
                    compressContext=not chunkAllowed
                )

                result = await self.services.ai.callAi(
                    prompt=prompt,
                    documents=chatDocuments,
                    options=ai_options,
                    outputFormat=outputFormat,
                    title=title
                )

                if isinstance(result, dict) and result.get("success"):
                    # Extract document information from result
                    documents = result.get("documents", [])
                    if documents:
                        # Convert to ActionDocument format
                        action_documents = [
                            ActionDocument(
                                documentName=doc["documentName"],
                                documentData=doc["documentData"],
                                mimeType=doc["mimeType"]
                            )
                            for doc in documents
                        ]

                        logger.info("Generated %s report: %d documents", outputFormat.upper(), len(action_documents))
                        return ActionResult.isSuccess(documents=action_documents)
                    else:
                        return ActionResult.isFailure(error="No documents generated")
                else:
                    error_msg = result.get("error", "Unknown error") if isinstance(result, dict) else "AI generation failed"
                    return ActionResult.isFailure(error=error_msg)

            except Exception as e:
                logger.error("AI generation failed: %s", str(e))
                return ActionResult.isFailure(error=str(e))

        except Exception as e:
            logger.error("Error generating report: %s", str(e))
            return ActionResult.isFailure(
                error=str(e)
            )