# gateway/modules/workflows/methods/methodContext.py.old

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Context and workflow information method module.
Handles workflow context queries and document indexing.
"""

import time
import json
import logging
import aiohttp
from typing import Dict, Any, List
from datetime import datetime, UTC

from modules.workflows.methods.methodBase import MethodBase, action
from modules.datamodels.datamodelChat import ActionResult, ActionDocument
from modules.datamodels.datamodelExtraction import ExtractionOptions, MergeStrategy
from modules.shared.configuration import APP_CONFIG

logger = logging.getLogger(__name__)

class MethodContext(MethodBase):
    """Context and workflow information methods."""

    def __init__(self, services) -> None:
        """
        Initialize the context method group.

        Parameters:
        - services: Service container, forwarded unchanged to MethodBase.__init__.
          The actions of this class access it as self.services (workflow, chat,
          extraction); presumably MethodBase stores it under that name — confirm.
        """
        super().__init__(services)
        # Identifier and human-readable description of this method group.
        # NOTE(review): presumably consumed by MethodBase / the action registry — confirm.
        self.name = "context"
        self.description = "Context and workflow information methods"

    @action
    async def getDocumentIndex(self, parameters: Dict[str, Any]) -> ActionResult:
        """
        GENERAL:
        - Purpose: Generate a comprehensive index of all documents available in the current workflow, including documents from all rounds and tasks.
        - Input requirements: No input documents required. Optional resultType parameter.
        - Output format: Structured document index in JSON format (default) or text format, listing all documents with their references, metadata, and organization by rounds/tasks.

        Parameters:
        - resultType (str, optional): Output format (json, txt, md). Default: json. Unsupported or empty values fall back to json.

        Returns:
        - ActionResult with a single ActionDocument holding the rendered index, or a failure result if no workflow is available or index generation raises.
        """
        supportedTypes = ("json", "txt", "md")
        # Sentinel strings the chat service is known to return when there is nothing to index.
        emptyMarkers = (
            "No documents available",
            "NO DOCUMENTS AVAILABLE - This workflow has no documents to process.",
        )

        try:
            workflow = self.services.workflow
            if not workflow:
                return ActionResult.isFailure(
                    error="No workflow available"
                )

            # Normalise once so content, filename, MIME type and metadata always agree.
            # `or "json"` also covers an explicit None / empty value.
            requestedType = parameters.get("resultType") or "json"
            resultType = str(requestedType).lower().strip().lstrip('.')
            if resultType not in supportedTypes:
                logger.warning(f"Unsupported resultType '{requestedType}', falling back to json")
                resultType = "json"

            workflowId = getattr(workflow, 'id', 'unknown')

            # Get available documents index from chat service
            documentsIndex = self.services.chat.getAvailableDocuments(workflow)
            hasDocuments = bool(documentsIndex) and documentsIndex not in emptyMarkers

            if hasDocuments:
                # Parse the document index string to extract structured information
                indexData = self._parseDocumentIndex(documentsIndex, workflow)

                if resultType == "json":
                    indexContent = json.dumps(indexData, indent=2, ensure_ascii=False)
                elif resultType == "md":
                    indexContent = self._formatAsMarkdown(indexData)
                else:  # txt
                    indexContent = self._formatAsText(indexData, documentsIndex)
            else:
                # Always build the empty structure: the metadata below reads it for
                # every result type, not only for json.
                indexData = {
                    "workflowId": workflowId,
                    "totalDocuments": 0,
                    "rounds": [],
                    "documentReferences": []
                }
                if resultType == "json":
                    indexContent = json.dumps(indexData, indent=2, ensure_ascii=False)
                else:
                    indexContent = "Document Index\n==============\n\nNo documents available in this workflow.\n"

            # Generate meaningful filename
            workflowContext = self.services.chat.getWorkflowContext()
            filename = self._generateMeaningfulFileName(
                "document_index",
                resultType,
                workflowContext,
                "getDocumentIndex"
            )

            validationMetadata = {
                "actionType": "context.getDocumentIndex",
                "resultType": resultType,
                "workflowId": workflowId,
                "totalDocuments": indexData.get("totalDocuments", 0) if isinstance(indexData, dict) else 0
            }

            # Create ActionDocument
            document = ActionDocument(
                documentName=filename,
                documentData=indexContent,
                mimeType="application/json" if resultType == "json" else "text/plain",
                validationMetadata=validationMetadata
            )

            return ActionResult.isSuccess(documents=[document])

        except Exception as e:
            logger.error(f"Error generating document index: {str(e)}")
            return ActionResult.isFailure(
                error=f"Failed to generate document index: {str(e)}"
            )

    def _parseDocumentIndex(self, documentsIndex: str, workflow: Any) -> Dict[str, Any]:
        """Parse the document index string into structured data."""
        try:
            indexData = {
                "workflowId": getattr(workflow, 'id', 'unknown'),
                "generatedAt": datetime.now(UTC).isoformat(),
                "totalDocuments": 0,
                "rounds": [],
                "documentReferences": []
            }

            # Extract document references from the index string
            lines = documentsIndex.split('\n')
            currentRound = None
            currentDocList = None

            for line in lines:
                line = line.strip()
                if not line:
                    continue

                # Check for round headers
                if "Current round documents:" in line:
                    currentRound = "current"
                    continue
                elif "Past rounds documents:" in line:
                    currentRound = "past"
                    continue

                # Check for document list references (docList:...)
                if line.startswith("- docList:"):
                    docListRef = line.replace("- docList:", "").strip()
                    currentDocList = {
                        "reference": docListRef,
                        "round": currentRound,
                        "documents": []
                    }
                    indexData["rounds"].append(currentDocList)
                    continue

                # Check for individual document references (docItem:...)
                if line.startswith("  - docItem:") or line.startswith("- docItem:"):
                    docItemRef = line.replace("  - docItem:", "").replace("- docItem:", "").strip()
                    indexData["documentReferences"].append({
                        "reference": docItemRef,
                        "round": currentRound,
                        "docList": currentDocList["reference"] if currentDocList else None
                    })
                    indexData["totalDocuments"] += 1
                    if currentDocList:
                        currentDocList["documents"].append(docItemRef)

            return indexData

        except Exception as e:
            logger.error(f"Error parsing document index: {str(e)}")
            return {
                "workflowId": getattr(workflow, 'id', 'unknown'),
                "error": f"Failed to parse document index: {str(e)}",
                "rawIndex": documentsIndex
            }

    def _formatAsMarkdown(self, indexData: Dict[str, Any]) -> str:
        """Format document index as Markdown."""
        try:
            md = f"# Document Index\n\n"
            md += f"**Workflow ID:** {indexData.get('workflowId', 'unknown')}\n\n"
            md += f"**Generated At:** {indexData.get('generatedAt', 'unknown')}\n\n"
            md += f"**Total Documents:** {indexData.get('totalDocuments', 0)}\n\n"

            if indexData.get('rounds'):
                md += "## Documents by Round\n\n"
                for roundInfo in indexData['rounds']:
                    roundLabel = roundInfo.get('round', 'unknown').title()
                    md += f"### {roundLabel} Round\n\n"
                    md += f"**Document List:** `{roundInfo.get('reference', 'unknown')}`\n\n"
                    if roundInfo.get('documents'):
                        md += "**Documents:**\n\n"
                        for docRef in roundInfo['documents']:
                            md += f"- `{docRef}`\n"
                        md += "\n"

            if indexData.get('documentReferences'):
                md += "## All Document References\n\n"
                for docRef in indexData['documentReferences']:
                    md += f"- `{docRef.get('reference', 'unknown')}`\n"

            return md

        except Exception as e:
            logger.error(f"Error formatting as Markdown: {str(e)}")
            return f"# Document Index\n\nError formatting index: {str(e)}\n"

    def _formatAsText(self, indexData: Dict[str, Any], rawIndex: str) -> str:
        """Format document index as plain text."""
        try:
            text = "Document Index\n"
            text += "=" * 50 + "\n\n"
            text += f"Workflow ID: {indexData.get('workflowId', 'unknown')}\n"
            text += f"Generated At: {indexData.get('generatedAt', 'unknown')}\n"
            text += f"Total Documents: {indexData.get('totalDocuments', 0)}\n\n"

            # Include the raw formatted index for readability
            text += rawIndex

            return text

        except Exception as e:
            logger.error(f"Error formatting as text: {str(e)}")
            return f"Document Index\n\nError formatting index: {str(e)}\n\nRaw index:\n{rawIndex}\n"

    @action
    async def extractContent(self, parameters: Dict[str, Any]) -> ActionResult:
        """
        Extract content from documents (separate from AI calls).

        This action performs pure content extraction without AI processing.
        The extracted ContentParts can then be used by subsequent AI processing actions.

        Parameters:
        - documentList (list, required): Document reference(s) to extract content from.
        - extractionOptions (dict, optional): Extraction options (if not provided, defaults are used).

        Returns:
        - ActionResult with ActionDocument containing ContentExtracted objects
        - ContentExtracted.parts contains List[ContentPart] (already chunked if needed)
        """
        # Initialised before the try block so the error handler can always tell
        # whether progress tracking was started for this operation.
        operationId = None
        progressStarted = False

        try:
            # Init progress logger
            workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
            operationId = f"context_extract_{workflowId}_{int(time.time())}"

            # Extract documentList from parameters dict
            from modules.datamodels.datamodelDocref import DocumentReferenceList
            documentListParam = parameters.get("documentList")
            if not documentListParam:
                return ActionResult.isFailure(error="documentList is required")

            # Convert to DocumentReferenceList if needed
            if isinstance(documentListParam, DocumentReferenceList):
                documentList = documentListParam
            elif isinstance(documentListParam, str):
                documentList = DocumentReferenceList.from_string_list([documentListParam])
            elif isinstance(documentListParam, list):
                documentList = DocumentReferenceList.from_string_list(documentListParam)
            else:
                return ActionResult.isFailure(error=f"Invalid documentList type: {type(documentListParam)}")

            # Start progress tracking. The flag is set before the call so a start
            # that fails half-way is still followed by a finish attempt.
            parentOperationId = parameters.get('parentOperationId')
            progressStarted = True
            self.services.chat.progressLogStart(
                operationId,
                "Extracting content from documents",
                "Content Extraction",
                f"Documents: {len(documentList.references)}",
                parentOperationId=parentOperationId
            )

            # Get ChatDocuments from documentList
            self.services.chat.progressLogUpdate(operationId, 0.2, "Loading documents")
            chatDocuments = self.services.chat.getChatDocumentsFromDocumentList(documentList)

            if not chatDocuments:
                self.services.chat.progressLogFinish(operationId, False)
                return ActionResult.isFailure(error="No documents found in documentList")

            logger.info(f"Extracting content from {len(chatDocuments)} documents")

            # Prepare extraction options
            self.services.chat.progressLogUpdate(operationId, 0.3, "Preparing extraction options")
            extractionOptionsParam = parameters.get("extractionOptions")

            # Convert dict to ExtractionOptions object if needed; anything else
            # (missing, empty or of an unexpected type) falls back to the defaults.
            extractionOptions = None
            if extractionOptionsParam:
                if isinstance(extractionOptionsParam, dict):
                    extractionOptions = ExtractionOptions(**extractionOptionsParam)
                elif isinstance(extractionOptionsParam, ExtractionOptions):
                    extractionOptions = extractionOptionsParam
                else:
                    logger.warning(
                        f"Ignoring extractionOptions of unsupported type {type(extractionOptionsParam)}; using defaults"
                    )

            if not extractionOptions:
                # Default extraction options for pure content extraction (no AI processing)
                extractionOptions = ExtractionOptions(
                    prompt="Extract all content from the document",
                    mergeStrategy=MergeStrategy(
                        mergeType="concatenate",
                        groupBy="typeGroup",
                        orderBy="id"
                    ),
                    processDocumentsIndividually=True
                )

            # Call extraction service with hierarchical progress logging
            self.services.chat.progressLogUpdate(operationId, 0.4, "Initiating")
            self.services.chat.progressLogUpdate(operationId, 0.5, f"Extracting content from {len(chatDocuments)} documents")
            # Pass operationId for hierarchical per-document progress logging
            extractedResults = self.services.extraction.extractContent(chatDocuments, extractionOptions, operationId=operationId)

            # Build ActionDocuments from ContentExtracted results
            self.services.chat.progressLogUpdate(operationId, 0.8, "Building result documents")
            actionDocuments = []
            # Results are matched to the input documents by position.
            # NOTE(review): assumes the extraction service preserves input order — confirm.
            for i, extracted in enumerate(extractedResults):
                originalDoc = chatDocuments[i] if i < len(chatDocuments) else None
                originalFileName = getattr(originalDoc, 'fileName', None) if originalDoc else None

                if originalFileName:
                    # Original filename without its last extension, plus "_extracted_<id>"
                    baseName = originalFileName.rsplit('.', 1)[0]
                    documentName = f"{baseName}_extracted_{extracted.id}.json"
                else:
                    # Fallback to generic name with index
                    documentName = f"document_{i+1:03d}_extracted_{extracted.id}.json"

                # Store ContentExtracted object in ActionDocument.documentData
                validationMetadata = {
                    "actionType": "context.extractContent",
                    "documentIndex": i,
                    "extractedId": extracted.id,
                    "partCount": len(extracted.parts) if extracted.parts else 0,
                    "originalFileName": originalFileName
                }
                actionDocuments.append(ActionDocument(
                    documentName=documentName,
                    documentData=extracted,  # ContentExtracted object
                    mimeType="application/json",
                    validationMetadata=validationMetadata
                ))

            self.services.chat.progressLogFinish(operationId, True)

            return ActionResult.isSuccess(documents=actionDocuments)

        except Exception as e:
            logger.error(f"Error in content extraction: {str(e)}")

            # Complete progress tracking with failure, but only for an operation
            # that was actually started. Progress logging stays best-effort: a
            # failure here must not mask the original error.
            if progressStarted:
                try:
                    self.services.chat.progressLogFinish(operationId, False)
                except Exception as finishError:
                    logger.debug(f"Could not finish progress log for {operationId}: {finishError}")

            return ActionResult.isFailure(error=str(e))

    @action
    async def triggerPreprocessingServer(self, parameters: Dict[str, Any]) -> ActionResult:
        """
        Trigger preprocessing server at customer tenant to update database with configuration.

        This action makes a POST request to the preprocessing server endpoint with the provided
        configuration JSON. The authorization secret is retrieved from APP_CONFIG using the provided config key.

        Parameters:
        - endpoint (str, required): The full URL endpoint for the preprocessing server API.
        - configJson (dict or str, required): Configuration JSON object to send to the preprocessing server. Can be provided as a dict or as a JSON string that will be parsed.
        - authSecretConfigKey (str, required): The APP_CONFIG key name to retrieve the authorization secret from.

        Returns:
        - ActionResult with ActionDocument containing "ok" on success, or error message on failure.
        """
        try:
            endpoint = parameters.get("endpoint")
            if not endpoint:
                return ActionResult.isFailure(error="endpoint parameter is required")

            configJsonParam = parameters.get("configJson")
            if not configJsonParam:
                return ActionResult.isFailure(error="configJson parameter is required")

            authSecretConfigKey = parameters.get("authSecretConfigKey")
            if not authSecretConfigKey:
                return ActionResult.isFailure(error="authSecretConfigKey parameter is required")

            # Handle configJson as either dict or JSON string
            if isinstance(configJsonParam, str):
                try:
                    configJson = json.loads(configJsonParam)
                except json.JSONDecodeError as e:
                    return ActionResult.isFailure(error=f"configJson is not valid JSON: {str(e)}")
            elif isinstance(configJsonParam, dict):
                configJson = configJsonParam
            else:
                return ActionResult.isFailure(error=f"configJson must be a dict or JSON string, got {type(configJsonParam)}")

            # Get authorization secret from APP_CONFIG using the provided config key.
            # NOTE(review): both the endpoint and the config key come from action
            # parameters, so a caller can have any APP_CONFIG value sent to an
            # arbitrary URL. Consider restricting endpoints and/or keys to an allowlist.
            authSecret = APP_CONFIG.get(authSecretConfigKey)
            if not authSecret:
                errorMsg = f"{authSecretConfigKey} not found in APP_CONFIG"
                logger.error(errorMsg)
                return ActionResult.isFailure(error=errorMsg)

            # Prepare headers with authorization (default headers as in original function)
            headers = {
                "X-PP-API-Key": authSecret,
                "Content-Type": "application/json"
            }

            # Make POST request; the whole exchange is limited to 60 seconds.
            timeout = aiohttp.ClientTimeout(total=60)
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.post(
                    endpoint,
                    headers=headers,
                    json=configJson
                ) as response:
                    responseText = await response.text()

                    # Only 200 and 201 count as success; everything else is reported
                    # together with the response body.
                    if response.status not in (200, 201):
                        errorMsg = f"Preprocessing server trigger failed: {response.status} - {responseText}"
                        logger.error(errorMsg)
                        return ActionResult.isFailure(error=errorMsg)

                    logger.info(f"Preprocessing server trigger successful: {response.status}")
                    logger.debug(f"Response: {responseText}")

                    # Generate meaningful filename
                    workflowContext = self.services.chat.getWorkflowContext() if hasattr(self.services, 'chat') else None
                    filename = self._generateMeaningfulFileName(
                        "preprocessing_result",
                        "txt",
                        workflowContext,
                        "triggerPreprocessingServer"
                    )

                    # Create validation metadata
                    validationMetadata = self._createValidationMetadata(
                        "triggerPreprocessingServer",
                        endpoint=endpoint,
                        statusCode=response.status,
                        responseText=responseText
                    )

                    # Return success with "ok" document
                    document = ActionDocument(
                        documentName=filename,
                        documentData="ok",
                        mimeType="text/plain",
                        validationMetadata=validationMetadata
                    )

                    return ActionResult.isSuccess(documents=[document])

        except Exception as e:
            # Some exceptions (notably request timeouts) have an empty str();
            # fall back to the exception class name so the error stays diagnosable.
            errorDetail = str(e) or type(e).__name__
            errorMsg = f"Error triggering preprocessing server: {errorDetail}"
            logger.error(errorMsg)
            return ActionResult.isFailure(error=errorMsg)