# gateway/modules/workflows/methods/methodJira/actions/parseExcelContent.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.

"""
Parse Excel Content action for JIRA operations.
Parses Excel content with custom headers.
"""

import logging
import json
import pandas as pd
from io import BytesIO
from typing import Dict, Any
from modules.workflows.methods.methodBase import action
from modules.datamodels.datamodelChat import ActionResult, ActionDocument

logger = logging.getLogger(__name__)

@action
async def parseExcelContent(self, parameters: Dict[str, Any]) -> ActionResult:
    """
    Parse Excel content with custom headers.

    The workbook is read without a header row (every sheet row becomes a data
    row), then split into optional custom header rows, a table header row and
    the data rows.

    Parameters:
    - excelContent (str, required): Document reference containing Excel file content as bytes
    - skipRows (int, optional): Number of header rows to skip (default: 3).
      Numeric strings are accepted; negative or non-numeric values are rejected.
    - hasCustomHeaders (bool, optional): Whether Excel has custom header rows (default: true).
      When true and the sheet has at least 3 rows, rows 0 and 1 are returned as
      comma-joined "header1"/"header2" strings and row 2 supplies the column names.
      Otherwise the row directly above the data (skipRows - 1) supplies the
      column names, if the sheet has more than skipRows rows.

    Returns:
    - ActionResult with ActionDocument containing parsed data and headers as JSON
      (keys: "data", "headers", "rowCount", "columnCount"). Empty cells are
      emitted as empty strings; date/time cells are emitted as ISO-8601 strings.
      Any error is reported as a failure result rather than raised.
    """
    try:
        excelContentParam = parameters.get("excelContent")
        if not excelContentParam:
            return ActionResult.isFailure(error="excelContent parameter is required")

        # Treat an explicit None like "not supplied" and accept numeric strings.
        # A negative value must be rejected: iloc would slice from the END of
        # the sheet and silently return the wrong rows.
        rawSkipRows = parameters.get("skipRows")
        try:
            skipRows = 3 if rawSkipRows is None else int(rawSkipRows)
        except (TypeError, ValueError):
            return ActionResult.isFailure(error="skipRows must be a non-negative integer")
        if skipRows < 0:
            return ActionResult.isFailure(error="skipRows must be a non-negative integer")

        hasCustomHeaders = parameters.get("hasCustomHeaders", True)

        # Get Excel content from document
        excelBytes = self.documentParsing.getDocumentData(excelContentParam)
        if excelBytes is None:
            return ActionResult.isFailure(error="Could not get Excel content from document reference")

        # Convert to bytes if needed
        if isinstance(excelBytes, str):
            # latin-1 maps code points 0-255 one-to-one onto bytes, so binary
            # data that was carried in a str survives the round trip.
            excelBytes = excelBytes.encode('latin-1')
        elif isinstance(excelBytes, (bytearray, memoryview)):
            excelBytes = bytes(excelBytes)
        elif not isinstance(excelBytes, bytes):
            return ActionResult.isFailure(error="Excel content must be bytes or string")

        # Parse Excel; header=None keeps every sheet row so the header rows
        # can be picked out by position below.
        df = pd.read_excel(BytesIO(excelBytes), engine='openpyxl', header=None)

        # Extract headers if hasCustomHeaders
        headers = {"header1": "Header 1", "header2": "Header 2"}
        if hasCustomHeaders and len(df) >= 3:
            dfData = df.iloc[skipRows:].copy()
            dfData.columns = df.iloc[2]  # third sheet row holds the table column names

            headers = {
                "header1": ",".join("" if pd.isna(cell) else str(cell) for cell in df.iloc[0].tolist()),
                "header2": ",".join("" if pd.isna(cell) else str(cell) for cell in df.iloc[1].tolist()),
            }
        elif skipRows > 0:
            # No custom headers, use standard parsing: the row directly above
            # the data provides the column names.
            dfData = df.iloc[skipRows:].copy()
            if len(df) > skipRows:
                dfData.columns = df.iloc[skipRows - 1]
        else:
            dfData = df.copy()

        # JSON object keys must be str/int/float/bool/None; convert any other
        # column label (e.g. a date-typed header cell) to a string up front.
        dfData.columns = [
            label if label is None or isinstance(label, (str, int, float, bool)) else _jsonDefault(label)
            for label in dfData.columns
        ]

        # Reset index and clean data. Done frame-wide instead of per column so
        # duplicated or empty header cells (non-unique labels) do not break
        # column selection.
        dfData = dfData.reset_index(drop=True).astype('object').fillna('')

        data = dfData.to_dict(orient='records')
        rowCount = len(data)
        columnCount = len(dfData.columns)

        logger.info("Parsed Excel: %s rows, %s columns", rowCount, columnCount)

        # Generate filename
        workflowContext = self.services.chat.getWorkflowContext() if hasattr(self.services, 'chat') else None
        filename = self._generateMeaningfulFileName(
            "parsed_excel_data",
            "json",
            workflowContext,
            "parseExcelContent"
        )

        result = {
            "data": data,
            "headers": headers,
            "rowCount": rowCount,
            "columnCount": columnCount
        }

        validationMetadata = self._createValidationMetadata(
            "parseExcelContent",
            rowCount=rowCount,
            columnCount=columnCount,
            skipRows=skipRows
        )

        document = ActionDocument(
            documentName=filename,
            # default= handles date/time cells, which json cannot encode natively
            documentData=json.dumps(result, indent=2, ensure_ascii=False, default=_jsonDefault),
            mimeType="application/json",
            validationMetadata=validationMetadata
        )

        return ActionResult.isSuccess(documents=[document])

    except Exception as e:
        # Action boundary: report every failure as a result object, but keep
        # the traceback in the log for diagnosis.
        errorMsg = f"Error parsing Excel content: {str(e)}"
        logger.exception(errorMsg)
        return ActionResult.isFailure(error=errorMsg)


def _jsonDefault(value: Any) -> Any:
    """Convert date/time cell values into JSON-encodable strings.

    Used as the ``default`` hook of ``json.dumps``. Raises TypeError for any
    other type so unexpected values are not silently stringified.
    """
    import datetime  # local import keeps this block independent of the file's import list

    if isinstance(value, (datetime.datetime, datetime.date, datetime.time)):
        return value.isoformat()
    if isinstance(value, datetime.timedelta):
        return str(value)
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")