gateway/modules/workflows/methods/methodJira/actions/parseExcelContent.py
2026-01-20 00:55:39 +01:00

103 lines
3.8 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
import logging
import json
import pandas as pd
from io import BytesIO
from typing import Dict, Any
from modules.datamodels.datamodelChat import ActionResult, ActionDocument
logger = logging.getLogger(__name__)
def _jsonFallback(value: Any) -> Any:
    """Convert a cell value that ``json`` cannot encode natively.

    Objects exposing ``isoformat()`` (``datetime``, ``date``, ``time``,
    ``pandas.Timestamp``) are rendered as ISO-8601 strings; anything else
    falls back to ``str(value)``.
    """
    isoformat = getattr(value, "isoformat", None)
    if callable(isoformat):
        return isoformat()
    return str(value)


async def parseExcelContent(self, parameters: Dict[str, Any]) -> ActionResult:
    """Parse an Excel workbook into a JSON document of row records.

    Parameters read from ``parameters``:
        excelContent: Document reference handed to
            ``self.documentParsing.getDocumentData`` (required).
        skipRows: Index of the first data row (default 3). Coerced with
            ``int()``; ``None`` means "use the default"; negative or
            non-numeric values produce a failure result.
        hasCustomHeaders: When truthy (default True) and the sheet has at
            least three rows, rows 0 and 1 are returned as comma-joined
            header strings and row 2 supplies the column names.

    Returns:
        A success ``ActionResult`` carrying one JSON ``ActionDocument`` with
        the keys ``data``, ``headers``, ``rowCount`` and ``columnCount``, or
        a failure ``ActionResult`` describing what went wrong.
    """
    try:
        excelContentParam = parameters.get("excelContent")
        if not excelContentParam:
            return ActionResult.isFailure(error="excelContent parameter is required")

        # Workflow parameters may arrive as strings (e.g. "3"); normalise once
        # so slicing and comparisons below always see a non-negative int.
        skipRowsParam = parameters.get("skipRows", 3)
        if skipRowsParam is None:
            skipRowsParam = 3
        try:
            skipRows = int(skipRowsParam)
        except (TypeError, ValueError):
            return ActionResult.isFailure(error="skipRows must be a non-negative integer")
        if skipRows < 0:
            # A negative value would make iloc slice from the end of the sheet.
            return ActionResult.isFailure(error="skipRows must be a non-negative integer")

        hasCustomHeaders = parameters.get("hasCustomHeaders", True)

        # Get Excel content from document
        excelBytes = self.documentParsing.getDocumentData(excelContentParam)
        if excelBytes is None:
            return ActionResult.isFailure(error="Could not get Excel content from document reference")

        # Convert to bytes if needed
        if isinstance(excelBytes, str):
            # latin-1 maps code points 0-255 one-to-one onto bytes.
            # NOTE(review): assumes the str was produced by a latin-1 style
            # decode of the raw workbook bytes - confirm against the producer.
            excelBytes = excelBytes.encode('latin-1')
        elif isinstance(excelBytes, (bytearray, memoryview)):
            excelBytes = bytes(excelBytes)
        elif not isinstance(excelBytes, bytes):
            return ActionResult.isFailure(error="Excel content must be bytes or string")

        # Parse Excel without letting pandas pick a header row itself
        df = pd.read_excel(BytesIO(excelBytes), engine='openpyxl', header=None)

        # Extract headers if hasCustomHeaders
        headers = {"header1": "Header 1", "header2": "Header 2"}
        if hasCustomHeaders and len(df) >= 3:
            headerRow1 = df.iloc[0]
            headerRow2 = df.iloc[1]
            tableHeaders = df.iloc[2]
            dfData = df.iloc[skipRows:].copy()
            dfData.columns = tableHeaders
            headers = {
                "header1": ",".join(str(x) if pd.notna(x) else "" for x in headerRow1.tolist()),
                "header2": ",".join(str(x) if pd.notna(x) else "" for x in headerRow2.tolist()),
            }
        elif skipRows > 0:
            # No custom headers: the row just above the data names the columns
            dfData = df.iloc[skipRows:].copy()
            if len(df) > skipRows:
                dfData.columns = df.iloc[skipRows - 1]
        else:
            dfData = df.copy()

        # Reset index and blank out missing cells in one frame-level pass
        dfData = dfData.reset_index(drop=True)
        dfData = dfData.astype(object).fillna('')

        data = dfData.to_dict(orient='records')
        rowCount = len(data)
        columnCount = len(dfData.columns)
        logger.info("Parsed Excel: %s rows, %s columns", rowCount, columnCount)

        # Generate filename
        workflowContext = self.services.chat.getWorkflowContext() if hasattr(self.services, 'chat') else None
        filename = self._generateMeaningfulFileName(
            "parsed_excel_data",
            "json",
            workflowContext,
            "parseExcelContent"
        )

        result = {
            "data": data,
            "headers": headers,
            "rowCount": rowCount,
            "columnCount": columnCount
        }
        validationMetadata = self._createValidationMetadata(
            "parseExcelContent",
            rowCount=rowCount,
            columnCount=columnCount,
            skipRows=skipRows
        )
        document = ActionDocument(
            documentName=filename,
            # Date/time cells come back as datetime-like objects, which json
            # cannot encode natively; the fallback renders them as strings.
            documentData=json.dumps(result, indent=2, ensure_ascii=False, default=_jsonFallback),
            mimeType="application/json",
            validationMetadata=validationMetadata
        )
        return ActionResult.isSuccess(documents=[document])
    except Exception as e:
        errorMsg = f"Error parsing Excel content: {str(e)}"
        # logger.exception keeps the traceback alongside the message
        logger.exception(errorMsg)
        return ActionResult.isFailure(error=errorMsg)