gateway/modules/workflows/methods/methodJira/actions/parseExcelContent.py
2025-12-17 10:45:09 +01:00

121 lines
4.4 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Parse Excel Content action for JIRA operations.
Parses Excel content with custom headers.
"""
import logging
import json
import pandas as pd
from io import BytesIO
from typing import Dict, Any
from modules.workflows.methods.methodBase import action
from modules.datamodels.datamodelChat import ActionResult, ActionDocument
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
def _parseSkipRows(rawValue: Any, default: int = 3) -> int:
    """
    Normalise the skipRows parameter to a non-negative int.
    None falls back to the default; anything int() cannot convert raises
    TypeError/ValueError/OverflowError; negative values raise ValueError.
    """
    if rawValue is None:
        return default
    skipRows = int(rawValue)
    if skipRows < 0:
        # A negative value would make iloc slice from the end of the sheet.
        raise ValueError("skipRows must not be negative")
    return skipRows


def _joinHeaderRow(row: "pd.Series") -> str:
    """Join the cells of one header row with commas, rendering empty cells as ''."""
    return ",".join(str(cell) if pd.notna(cell) else "" for cell in row.tolist())


def _jsonDefault(value: Any) -> Any:
    """
    json.dumps fallback for date/time cell values, which json cannot encode natively.
    Dates, datetimes (including pandas Timestamp) and times become ISO-8601 strings,
    durations become their str() form. Any other type still raises TypeError.
    """
    # Local import: only this helper needs these names.
    from datetime import date, time, timedelta

    if isinstance(value, (date, time)):  # datetime and pd.Timestamp are date subclasses
        return value.isoformat()
    if isinstance(value, timedelta):
        return str(value)
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


@action
async def parseExcelContent(self, parameters: Dict[str, Any]) -> ActionResult:
    """
    Parse Excel content with custom headers.
    Parameters:
    - excelContent (str, required): Document reference containing Excel file content as bytes
    - skipRows (int, optional): Number of header rows to skip (default: 3)
    - hasCustomHeaders (bool, optional): Whether Excel has custom header rows (default: true)
    Returns:
    - ActionResult with ActionDocument containing parsed data and headers as JSON
    """
    try:
        excelContentParam = parameters.get("excelContent")
        if not excelContentParam:
            return ActionResult.isFailure(error="excelContent parameter is required")

        # Validate skipRows up front so a bad value gives a clear error instead of
        # an obscure slicing failure later on.
        try:
            skipRows = _parseSkipRows(parameters.get("skipRows"))
        except (TypeError, ValueError, OverflowError):
            return ActionResult.isFailure(error="skipRows must be a non-negative integer")

        hasCustomHeaders = parameters.get("hasCustomHeaders", True)
        if isinstance(hasCustomHeaders, str):
            # A string such as "false" is truthy in Python; interpret it explicitly.
            hasCustomHeaders = hasCustomHeaders.strip().lower() not in ("", "false", "0", "no", "off")

        # Get Excel content from document
        excelBytes = self.documentParsing.getDocumentData(excelContentParam)
        if excelBytes is None:
            return ActionResult.isFailure(error="Could not get Excel content from document reference")

        # Convert to bytes if needed
        if isinstance(excelBytes, str):
            excelBytes = excelBytes.encode('latin-1')  # Excel might have binary data
        elif isinstance(excelBytes, (bytearray, memoryview)):
            excelBytes = bytes(excelBytes)
        elif not isinstance(excelBytes, bytes):
            return ActionResult.isFailure(error="Excel content must be bytes or string")

        # Parse Excel without a header row so the header rows can be picked out manually
        df = pd.read_excel(BytesIO(excelBytes), engine='openpyxl', header=None)

        # Extract headers if hasCustomHeaders
        headers = {"header1": "Header 1", "header2": "Header 2"}
        if hasCustomHeaders and len(df) >= 3:
            # Rows 0 and 1 are the custom header lines, row 2 holds the column names.
            dfData = df.iloc[skipRows:].copy()
            dfData.columns = df.iloc[2]
            headers = {
                "header1": _joinHeaderRow(df.iloc[0]),
                "header2": _joinHeaderRow(df.iloc[1]),
            }
        elif skipRows > 0:
            # No custom headers: the row just before the data supplies the column names.
            dfData = df.iloc[skipRows:].copy()
            if len(df) > skipRows:
                dfData.columns = df.iloc[skipRows - 1]
        else:
            dfData = df.copy()

        # Reset index and blank out missing cells across the whole frame at once
        dfData = dfData.reset_index(drop=True)
        dfData = dfData.astype('object').fillna('')
        data = dfData.to_dict(orient='records')
        columnCount = len(dfData.columns)
        logger.info(f"Parsed Excel: {len(data)} rows, {columnCount} columns")

        # Generate filename
        workflowContext = self.services.chat.getWorkflowContext() if hasattr(self.services, 'chat') else None
        filename = self._generateMeaningfulFileName(
            "parsed_excel_data",
            "json",
            workflowContext,
            "parseExcelContent"
        )
        result = {
            "data": data,
            "headers": headers,
            "rowCount": len(data),
            "columnCount": columnCount
        }
        validationMetadata = self._createValidationMetadata(
            "parseExcelContent",
            rowCount=len(data),
            columnCount=columnCount,
            skipRows=skipRows
        )
        document = ActionDocument(
            documentName=filename,
            # default= lets date/time cells serialise instead of failing the whole action
            documentData=json.dumps(result, indent=2, ensure_ascii=False, default=_jsonDefault),
            mimeType="application/json",
            validationMetadata=validationMetadata
        )
        return ActionResult.isSuccess(documents=[document])
    except Exception as e:
        # Top-level boundary of the action: report a failure result instead of raising.
        errorMsg = f"Error parsing Excel content: {str(e)}"
        logger.exception(errorMsg)  # keeps the traceback in the log
        return ActionResult.isFailure(error=errorMsg)