# gateway/modules/workflows/methods/methodJira/actions/parseExcelContent.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.

import logging
import json
import pandas as pd
from io import BytesIO
from typing import Dict, Any
from modules.datamodels.datamodelChat import ActionResult, ActionDocument

logger = logging.getLogger(__name__)

def _uniqueColumnLabels(labels) -> list:
    """Return non-empty, unique string labels for a sequence of header cells.

    Blank cells (NaN/None/empty string) become ``Unnamed: <position>`` and a
    repeated name gets a ``.1``, ``.2``, ... suffix, following the naming pandas
    applies itself when it reads a header row. Unique labels are required
    because ``DataFrame.to_dict(orient='records')`` keeps only one column per
    label.
    """
    used = set()
    result = []
    for position, label in enumerate(labels):
        base = "" if pd.isna(label) else str(label)
        if not base:
            base = f"Unnamed: {position}"
        candidate = base
        suffix = 0
        while candidate in used:
            suffix += 1
            candidate = f"{base}.{suffix}"
        used.add(candidate)
        result.append(candidate)
    return result


def _jsonFallback(value: Any) -> Any:
    """``json.dumps`` ``default`` hook for cell values json cannot encode.

    Date/time-like objects (anything exposing ``isoformat``) are written as
    ISO-8601 strings and numpy-style scalars (anything exposing ``item``) are
    unboxed. Every other type is rejected with ``TypeError``, exactly as
    ``json.dumps`` would do without a hook.
    """
    isoformat = getattr(value, "isoformat", None)
    if callable(isoformat):
        return isoformat()
    item = getattr(value, "item", None)
    if callable(item):
        return item()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


async def parseExcelContent(self, parameters: Dict[str, Any]) -> ActionResult:
    """Parse an Excel workbook into a JSON document of row records.

    Parameters read from ``parameters``:
        excelContent: Document reference passed to
            ``self.documentParsing.getDocumentData``. Required.
        skipRows: Number of leading rows that are not data (default 3).
            Coerced to ``int``; negative values are treated as 0.
        hasCustomHeaders: When truthy (default) and the sheet has at least
            three rows, rows 0 and 1 are returned as the comma-joined strings
            ``header1``/``header2`` and row 2 supplies the column names.
            Otherwise the row directly above the data (``skipRows - 1``)
            supplies the column names when the sheet has more than
            ``skipRows`` rows.

    Returns:
        ``ActionResult.isSuccess`` carrying one ``application/json``
        ``ActionDocument`` with the keys ``data``, ``headers``, ``rowCount``
        and ``columnCount``; ``ActionResult.isFailure`` with an error message
        when input is missing/invalid or parsing raises.
    """
    try:
        excelContentParam = parameters.get("excelContent")
        if not excelContentParam:
            return ActionResult.isFailure(error="excelContent parameter is required")

        rawSkipRows = parameters.get("skipRows", 3)
        try:
            # An explicit None falls back to the default instead of failing later.
            skipRows = 3 if rawSkipRows is None else int(rawSkipRows)
        except (TypeError, ValueError):
            return ActionResult.isFailure(error="skipRows must be an integer")
        # A negative value would slice from the end of the sheet; treat it as
        # "skip nothing", like the no-header branch below already does.
        skipRows = max(skipRows, 0)
        hasCustomHeaders = parameters.get("hasCustomHeaders", True)

        # Get Excel content from document
        excelBytes = self.documentParsing.getDocumentData(excelContentParam)
        if excelBytes is None:
            return ActionResult.isFailure(error="Could not get Excel content from document reference")

        # Convert to bytes if needed
        if isinstance(excelBytes, str):
            excelBytes = excelBytes.encode('latin-1')  # Excel might have binary data
        elif isinstance(excelBytes, (bytearray, memoryview)):
            excelBytes = bytes(excelBytes)
        elif not isinstance(excelBytes, bytes):
            return ActionResult.isFailure(error="Excel content must be bytes or string")

        # Parse Excel without a header row so every sheet row stays addressable
        df = pd.read_excel(BytesIO(excelBytes), engine='openpyxl', header=None)

        # Extract headers if hasCustomHeaders
        headers = {"header1": "Header 1", "header2": "Header 2"}
        if hasCustomHeaders and len(df) >= 3:
            headers = {
                name: ",".join(
                    "" if pd.isna(cell) else str(cell)
                    for cell in df.iloc[rowIndex].tolist()
                )
                for name, rowIndex in (("header1", 0), ("header2", 1))
            }
            columnLabels = df.iloc[2].tolist()
            dfData = df.iloc[skipRows:].copy()
        elif skipRows > 0:
            # No custom headers: the row just above the data names the columns
            dfData = df.iloc[skipRows:].copy()
            if len(df) > skipRows:
                columnLabels = df.iloc[skipRows - 1].tolist()
            else:
                columnLabels = dfData.columns.tolist()
        else:
            dfData = df.copy()
            columnLabels = dfData.columns.tolist()

        # Reset index and clean data. Labels must be unique strings, otherwise
        # to_dict(orient='records') silently drops duplicated columns.
        dfData = dfData.reset_index(drop=True)
        dfData.columns = _uniqueColumnLabels(columnLabels)
        dfData = dfData.astype('object').fillna('')

        data = dfData.to_dict(orient='records')
        rowCount = len(data)
        columnCount = len(dfData.columns)

        logger.info(f"Parsed Excel: {rowCount} rows, {columnCount} columns")

        # Generate filename
        workflowContext = self.services.chat.getWorkflowContext() if hasattr(self.services, 'chat') else None
        filename = self._generateMeaningfulFileName(
            "parsed_excel_data",
            "json",
            workflowContext,
            "parseExcelContent"
        )

        result = {
            "data": data,
            "headers": headers,
            "rowCount": rowCount,
            "columnCount": columnCount
        }

        validationMetadata = self._createValidationMetadata(
            "parseExcelContent",
            rowCount=rowCount,
            columnCount=columnCount,
            skipRows=skipRows
        )

        document = ActionDocument(
            documentName=filename,
            # Date/time cells are not JSON-native; the hook writes them as ISO strings.
            documentData=json.dumps(result, indent=2, ensure_ascii=False, default=_jsonFallback),
            mimeType="application/json",
            validationMetadata=validationMetadata
        )

        return ActionResult.isSuccess(documents=[document])

    except Exception as e:
        errorMsg = f"Error parsing Excel content: {str(e)}"
        # logger.exception keeps the traceback alongside the same message.
        logger.exception(errorMsg)
        return ActionResult.isFailure(error=errorMsg)