gateway/modules/workflows/methods/methodAi/actions/convert.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.

"""
Convert action for AI operations.
Converts documents/data between different formats with specific formatting options.
"""

import logging
import json
from typing import Dict, Any
from modules.workflows.methods.methodBase import action
from modules.datamodels.datamodelChat import ActionResult, ActionDocument
from modules.datamodels.datamodelDocref import DocumentReferenceList

logger = logging.getLogger(__name__)

def _normalizeFormat(rawFormat: Any) -> str:
    """Return a format name trimmed, without leading dots, and lower-cased (e.g. ' .CSV' -> 'csv')."""
    return str(rawFormat).strip().lstrip('.').lower()


def _buildConversionPrompt(
    inputFormat: str,
    outputFormat: str,
    columnsPerRow: Any,
    delimiter: str,
    includeHeader: bool,
    language: Any,
) -> str:
    """Build the instruction text for the AI-based conversion fallback."""
    promptParts = [
        f"Convert the provided document(s) from {inputFormat.upper()} format to {outputFormat.upper()} format."
    ]

    # CSV-specific formatting instructions only make sense for CSV output.
    if outputFormat == "csv":
        promptParts.append(f" Use '{delimiter}' as the delimiter character.")
        if columnsPerRow:
            promptParts.append(f" Format the output with {columnsPerRow} columns per row.")
        if not includeHeader:
            promptParts.append(" Do not include a header row.")
        else:
            promptParts.append(" Include a header row with column names.")

    # English is the default, so it is not mentioned explicitly.
    if language and language != "en":
        promptParts.append(f" Use language: {language}.")

    promptParts.append(" Preserve all data and ensure accurate conversion. Maintain data integrity and structure.")
    return "".join(promptParts)


@action
async def convert(self, parameters: Dict[str, Any]) -> ActionResult:
    """
    GENERAL:
    - Purpose: Convert documents/data between different formats with specific formatting options (e.g., JSON→CSV with custom columns, delimiters).
    - Input requirements: documentList (required); inputFormat and outputFormat (required).
    - Output format: Document in target format with specified formatting options.
    - CRITICAL: If input is already in standardized JSON format, uses automatic rendering system (no AI call needed).

    Parameters:
    - documentList (list, required): Document reference(s) to convert.
    - inputFormat (str, required): Source format (json, csv, xlsx, txt, etc.).
    - outputFormat (str, required): Target format (csv, json, xlsx, txt, etc.).
    - columnsPerRow (int, optional): For CSV output, number of columns per row. Default: auto-detect.
    - delimiter (str, optional): For CSV output, delimiter character. Default: comma (,).
    - includeHeader (bool, optional): For CSV output, whether to include header row. Default: True.
    - language (str, optional): Language for output (e.g., 'de', 'en', 'fr'). Default: 'en'.
    """
    # NOTE(review): the docstring above is kept verbatim; presumably the @action
    # framework surfaces it at runtime - confirm before rewording it.
    #
    # Flow:
    #   1. Validate parameters and resolve the referenced chat documents.
    #   2. If the single input document is JSON in the standardized
    #      "documents"/"sections" layout, render it directly (no AI call).
    #   3. Otherwise, or if direct rendering fails, delegate to self.process
    #      with a generated conversion prompt.
    documentList = parameters.get("documentList", [])
    if not documentList:
        return ActionResult.isFailure(error="documentList is required")

    inputFormat = parameters.get("inputFormat")
    outputFormat = parameters.get("outputFormat")
    if not inputFormat or not outputFormat:
        return ActionResult.isFailure(error="inputFormat and outputFormat are required")

    # Normalize formats (remove leading dot if present). Validate again afterwards:
    # values such as "." or " " are truthy but normalize to an empty format name.
    normalizedInputFormat = _normalizeFormat(inputFormat)
    normalizedOutputFormat = _normalizeFormat(outputFormat)
    if not normalizedInputFormat or not normalizedOutputFormat:
        return ActionResult.isFailure(error="inputFormat and outputFormat are required")

    # Resolve CSV options once so both conversion paths agree. An explicit None
    # is treated the same as "not provided" and falls back to the documented default.
    columnsPerRow = parameters.get("columnsPerRow")
    delimiter = parameters.get("delimiter") or ","
    includeHeader = parameters.get("includeHeader")
    if includeHeader is None:
        includeHeader = True

    # Get documents
    if isinstance(documentList, DocumentReferenceList):
        docRefList = documentList
    elif isinstance(documentList, list):
        docRefList = DocumentReferenceList.from_string_list(documentList)
    else:
        docRefList = DocumentReferenceList.from_string_list([documentList])

    chatDocuments = self.services.chat.getChatDocumentsFromDocumentList(docRefList)
    if not chatDocuments:
        return ActionResult.isFailure(error="No documents found in documentList")

    # Check if input is standardized JSON format - if so, use direct rendering
    if normalizedInputFormat == "json" and len(chatDocuments) == 1:
        try:
            doc = chatDocuments[0]
            # ChatDocument doesn't have documentData - need to load file content using fileId
            docBytes = self.services.chat.getFileData(doc.fileId)
            if not docBytes:
                raise ValueError(f"No file data found for fileId={doc.fileId}")

            jsonData = json.loads(docBytes.decode('utf-8'))

            # Standardized JSON is a dict carrying "documents" or "sections".
            if isinstance(jsonData, dict) and ("documents" in jsonData or "sections" in jsonData):
                # Use direct rendering - no AI call needed!
                from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
                generationService = GenerationService(self.services)

                # documentName may be missing; use one fallback for both the
                # title and the output file name.
                sourceName = doc.documentName or "Converted Document"

                # Read the title BEFORE re-wrapping: the wrap below moves
                # "metadata" into documents[0], so a later top-level lookup
                # would never find the title of a sections-only payload.
                metadata = jsonData.get("metadata")
                if not isinstance(metadata, dict):
                    metadata = {}
                title = metadata.get("title", sourceName)

                # Ensure format is "documents" array
                if "documents" not in jsonData:
                    jsonData = {"documents": [{"sections": jsonData.get("sections", []), "metadata": jsonData.get("metadata", {})}]}

                # Render with options
                renderOptions = {}
                if normalizedOutputFormat == "csv":
                    renderOptions["delimiter"] = delimiter
                    renderOptions["columnsPerRow"] = columnsPerRow
                    renderOptions["includeHeader"] = includeHeader

                rendered_content, mime_type = await generationService.renderReport(
                    jsonData, normalizedOutputFormat, title, None, None
                )

                # The CSV options are applied to the rendered output as a post-processing step.
                if normalizedOutputFormat == "csv" and renderOptions:
                    rendered_content = self.csvProcessing.applyCsvOptions(rendered_content, renderOptions)

                validationMetadata = {
                    "actionType": "ai.convert",
                    "inputFormat": normalizedInputFormat,
                    "outputFormat": normalizedOutputFormat,
                    "hasSourceJson": True,
                    "conversionType": "direct_rendering"
                }
                # Swap the source extension (text after the last dot, if any) for the target format.
                baseName = sourceName.rsplit('.', 1)[0]
                actionDoc = ActionDocument(
                    documentName=f"{baseName}.{normalizedOutputFormat}",
                    documentData=rendered_content,
                    mimeType=mime_type,
                    sourceJson=jsonData,  # Preserve source JSON for structure validation
                    validationMetadata=validationMetadata
                )

                return ActionResult.isSuccess(documents=[actionDoc])

        except Exception as e:
            # Deliberate best-effort: any failure on the direct path (unreadable
            # file, invalid JSON, renderer error) falls through to AI conversion.
            logger.warning("Direct rendering failed, falling back to AI conversion: %s", e)

    # Fallback: Use AI for conversion (for non-JSON inputs or complex conversions)
    language = parameters.get("language", "en")
    aiPrompt = _buildConversionPrompt(
        normalizedInputFormat,
        normalizedOutputFormat,
        columnsPerRow,
        delimiter,
        includeHeader,
        language,
    )

    return await self.process({
        "aiPrompt": aiPrompt,
        "documentList": documentList,
        "resultType": normalizedOutputFormat
    })