gateway/modules/workflows/methods/methodSharepoint/actions/readDocuments.py
2026-03-22 19:46:50 +01:00

311 lines
16 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
import logging
import time
import json
import base64
from typing import Dict, Any
from modules.datamodels.datamodelChat import ActionResult, ActionDocument
logger = logging.getLogger(__name__)
# Fallback MIME types keyed by (lower-case) file extension. Only consulted when
# the SharePoint file info did not supply a MIME type.
_EXTENSION_MIME_TYPES = {
    ".pdf": "application/pdf",
    ".txt": "text/plain",
    ".json": "application/json",
}
_DEFAULT_MIME_TYPE = "application/octet-stream"
_ACTION_TYPE = "sharepoint.readDocuments"


def _guessMimeType(fileName: str) -> str:
    """Return a MIME type derived from the extension of *fileName*.

    The comparison is case-insensitive, so ``REPORT.PDF`` is recognised as a
    PDF. Unknown extensions and empty/missing names yield
    ``application/octet-stream``.
    """
    if not fileName:
        return _DEFAULT_MIME_TYPE
    loweredName = fileName.lower()
    for extension, mimeType in _EXTENSION_MIME_TYPES.items():
        if loweredName.endswith(extension):
            return mimeType
    return _DEFAULT_MIME_TYPE


def _buildValidationMetadata(
    fileName: Any,
    sharepointFileId: Any,
    siteName: Any,
    mimeType: str,
    contentType: str,
    includeMetadata: Any,
    size: Any = None,
) -> Dict[str, Any]:
    """Build the ``validationMetadata`` dict attached to every ActionDocument.

    ``size`` is only emitted when given (binary documents); key order matches
    the order in which the fields were originally written.
    """
    metadata = {
        "actionType": _ACTION_TYPE,
        "fileName": fileName,
        "sharepointFileId": sharepointFileId,
        "siteName": siteName,
        "mimeType": mimeType,
        "contentType": contentType,
    }
    if size is not None:
        metadata["size"] = size
    metadata["includeMetadata"] = includeMetadata
    return metadata


def _buildBinaryDocument(
    fileName: Any,
    fileContent: bytes,
    mimeType: str,
    sharepointFileId: Any,
    siteName: Any,
    includeMetadata: Any,
) -> ActionDocument:
    """Wrap raw bytes in an ActionDocument whose documentData is a Base64 string."""
    base64Content = base64.b64encode(fileContent).decode("utf-8")
    return ActionDocument(
        documentName=fileName,
        documentData=base64Content,  # Base64 string for binary files
        mimeType=mimeType,
        validationMetadata=_buildValidationMetadata(
            fileName,
            sharepointFileId,
            siteName,
            mimeType,
            "binary",
            includeMetadata,
            size=len(fileContent),
        ),
    )


def _buildActionDocument(resultItem: Dict[str, Any], includeMetadata: Any) -> ActionDocument:
    """Convert one read result (as produced by ``_readSharePointFile``) into an ActionDocument.

    Three shapes are produced, depending on the downloaded content:
    * ``bytes``   -> Base64-encoded documentData, contentType ``binary``
    * other truthy content -> stored as text, contentType ``text``
    * no content  -> JSON dump of the file's descriptive fields,
      contentType ``metadata_only``
    """
    fileContent = resultItem.get("content")
    fileName = resultItem.get("fileName", f"file_{resultItem.get('fileId')}")
    sharepointFileId = resultItem.get("sharepointFileId")
    siteName = resultItem.get("siteName")

    # Prefer the MIME type reported by SharePoint; fall back to the extension.
    mimeType = resultItem.get("metadata", {}).get("mimeType") or _guessMimeType(fileName)

    if fileContent and isinstance(fileContent, bytes):
        actionDoc = _buildBinaryDocument(
            fileName, fileContent, mimeType, sharepointFileId, siteName, includeMetadata
        )
        logger.info(f"Stored binary file {fileName} ({len(fileContent)} bytes) as Base64 in ActionDocument")
        return actionDoc

    if fileContent:
        # Text content - store directly in documentData
        return ActionDocument(
            documentName=fileName,
            documentData=fileContent if isinstance(fileContent, str) else str(fileContent),
            mimeType=mimeType,
            validationMetadata=_buildValidationMetadata(
                fileName, sharepointFileId, siteName, mimeType, "text", includeMetadata
            ),
        )

    # No content - store the descriptive fields only
    docData = {
        "fileName": fileName,
        "sharepointFileId": sharepointFileId,
        "siteName": siteName,
        "siteUrl": resultItem.get("siteUrl"),
        "size": resultItem.get("size"),
        "createdDateTime": resultItem.get("createdDateTime"),
        "lastModifiedDateTime": resultItem.get("lastModifiedDateTime"),
        "webUrl": resultItem.get("webUrl"),
    }
    if resultItem.get("metadata"):
        docData["metadata"] = resultItem["metadata"]
    return ActionDocument(
        documentName=fileName,
        documentData=json.dumps(docData, indent=2),
        mimeType=mimeType,
        validationMetadata=_buildValidationMetadata(
            fileName, sharepointFileId, siteName, mimeType, "metadata_only", includeMetadata
        ),
    )


async def _readSharePointFile(self, siteId: Any, site: Dict[str, Any], fileId: Any, includeMetadata: Any):
    """Fetch file info and content for one drive item.

    Returns the result dict consumed by ``_buildActionDocument``, or ``None``
    when the Graph API response contains an ``"error"`` key (a warning is
    logged in that case). Other failures propagate to the caller.
    """
    fileInfo = await self.apiClient.makeGraphApiCall(f"sites/{siteId}/drive/items/{fileId}")
    if "error" in fileInfo:
        logger.warning(f"Failed to get file info for {fileId}: {fileInfo['error']}")
        return None

    fileName = fileInfo.get("name", f"file_{fileId}")
    # Content is fetched through the SharePoint service rather than the Graph client
    fileContent = await self.services.sharepoint.downloadFile(siteId, fileId)

    resultItem = {
        "fileId": fileId,
        "fileName": fileName,
        "sharepointFileId": fileId,
        "siteName": site['displayName'],
        "siteUrl": site['webUrl'],
        "size": fileInfo.get("size", 0),
        "createdDateTime": fileInfo.get("createdDateTime"),
        "lastModifiedDateTime": fileInfo.get("lastModifiedDateTime"),
        "webUrl": fileInfo.get("webUrl"),
    }
    # Empty/missing content is left out so the item becomes "metadata_only" later
    if fileContent:
        resultItem["content"] = fileContent
    if includeMetadata:
        resultItem["metadata"] = {
            "mimeType": fileInfo.get("file", {}).get("mimeType"),
            "downloadUrl": fileInfo.get("@microsoft.graph.downloadUrl"),
            "createdBy": fileInfo.get("createdBy", {}),
            "lastModifiedBy": fileInfo.get("lastModifiedBy", {}),
            "parentReference": fileInfo.get("parentReference", {}),
        }
    return resultItem


async def readDocuments(self, parameters: Dict[str, Any]) -> ActionResult:
    """Read one or more SharePoint documents and return them as ActionDocuments.

    Keys read from ``parameters``:
        connectionReference: required; resolved via
            ``self.connection.getMicrosoftConnection``.
        documentList: optional; parsed with
            ``self.documentParsing.parseDocumentListForFoundDocuments`` to
            obtain file entries and sites. Entries with ``type == "file"`` are
            read by ID from the first returned site.
        pathQuery: optional (default ``"*"``); used when no file IDs are
            available. Blank, ``"*"`` or non-string values count as "not given".
        includeMetadata: optional (default ``True``); adds a ``metadata``
            section to each file read by ID.
        parentOperationId: optional; forwarded to the progress log.

    Returns:
        ``ActionResult.isSuccess(documents=[...])`` on success, otherwise a
        failure ActionResult carrying an error message. Exceptions are caught,
        logged and converted into a failure result; the progress log is always
        finished.
    """
    operationId = None

    def fail(message: str) -> ActionResult:
        # Single place that closes the progress log and builds the failure result.
        self.services.chat.progressLogFinish(operationId, False)
        return ActionResult.isFailure(error=message)

    try:
        # Init progress logger
        workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
        operationId = f"sharepoint_read_{workflowId}_{int(time.time())}"

        # Start progress tracking
        self.services.chat.progressLogStart(
            operationId,
            "Read Documents",
            "SharePoint Document Reading",
            "Processing document list",
            parentOperationId=parameters.get('parentOperationId'),
        )

        documentList = parameters.get("documentList")
        pathQuery = parameters.get("pathQuery", "*")
        connectionReference = parameters.get("connectionReference")
        includeMetadata = parameters.get("includeMetadata", True)

        # A usable pathQuery is a non-blank string other than the "*" wildcard.
        # The isinstance guard keeps a non-string value from crashing on .strip().
        hasPathQuery = isinstance(pathQuery, str) and pathQuery.strip() not in ("", "*")

        if not connectionReference:
            return fail("Connection reference is required")
        if not documentList and not hasPathQuery:
            return fail("Either documentList or pathQuery is required")

        # The connection object itself is not used below; it is only checked for existence.
        self.services.chat.progressLogUpdate(operationId, 0.2, "Getting Microsoft connection")
        connection = self.connection.getMicrosoftConnection(connectionReference)
        if not connection:
            return fail("No valid Microsoft connection found for the provided connection reference")

        # Parse documentList to extract foundDocuments and site information
        sharePointFileIds = None
        sites = None
        if documentList:
            foundDocuments, sites, errorMsg = await self.documentParsing.parseDocumentListForFoundDocuments(documentList)
            if errorMsg:
                return fail(errorMsg)
            if foundDocuments:
                sharePointFileIds = [doc.get("id") for doc in foundDocuments if doc.get("type") == "file"]
                if not sharePointFileIds:
                    return fail("No files found in documentList from findDocumentPath result")
                logger.info(f"Extracted {len(sharePointFileIds)} SharePoint file IDs from documentList")

        # Path 1: file IDs from documentList -> read each item directly by ID
        if sharePointFileIds and sites:
            site = sites[0]
            siteId = site['id']
            totalFiles = len(sharePointFileIds)
            self.services.chat.progressLogUpdate(operationId, 0.5, f"Reading {totalFiles} file(s) from SharePoint")

            readResults = []
            for idx, fileId in enumerate(sharePointFileIds):
                # One unreadable file must not abort the whole batch.
                try:
                    self.services.chat.progressLogUpdate(
                        operationId,
                        0.5 + (idx * 0.3 / totalFiles),
                        f"Reading file {idx + 1}/{totalFiles}",
                    )
                    resultItem = await _readSharePointFile(self, siteId, site, fileId, includeMetadata)
                except Exception as e:
                    logger.error(f"Error reading file {fileId}: {str(e)}")
                    continue
                if resultItem is not None:
                    readResults.append(resultItem)

            if not readResults:
                return fail("No files could be read from documentList")

            self.services.chat.progressLogUpdate(operationId, 0.8, f"Processing {len(readResults)} document(s)")
            actionDocuments = [_buildActionDocument(item, includeMetadata) for item in readResults]

            self.services.chat.progressLogUpdate(operationId, 0.9, f"Read {len(actionDocuments)} document(s)")
            self.services.chat.progressLogFinish(operationId, True)
            return ActionResult.isSuccess(documents=actionDocuments)

        # If no sites from documentList, try pathQuery fallback
        if not sites and hasPathQuery:
            sites, errorMsg = await self.siteDiscovery.resolveSitesFromPathQuery(pathQuery)
            if errorMsg:
                return fail(errorMsg)

        if not sites:
            return fail("Either documentList must contain findDocumentPath result with file information, or pathQuery must be provided. Use findDocumentPath first to get file paths, or provide pathQuery directly.")

        # Path 2: pathQuery + sites but no file IDs (e.g. user selected from browse tree)
        # -> download the single file by its path inside the site
        if hasPathQuery and sites and not sharePointFileIds:
            parsedPath = self.services.sharepoint.extractSiteFromStandardPath(pathQuery)
            if parsedPath:
                innerPath = (parsedPath.get("innerPath") or "").strip()
                if not innerPath:
                    return fail("pathQuery must include a file path (e.g. /sites/SiteName/Shared Documents/file.pdf)")

                siteId = sites[0].get("id")
                if not siteId:
                    return fail("Could not resolve site ID from pathQuery")

                self.services.chat.progressLogUpdate(operationId, 0.5, f"Reading file from path: {innerPath}")
                fileContent = await self.services.sharepoint.downloadFileByPath(siteId=siteId, filePath=innerPath)
                if fileContent is None:
                    return fail(f"File not found or could not be downloaded: {innerPath}")

                # Last path segment is the file name
                fileName = innerPath.rsplit("/", 1)[-1]
                actionDoc = _buildBinaryDocument(
                    fileName,
                    fileContent,
                    _guessMimeType(fileName),
                    None,  # no SharePoint item ID is known on this path
                    sites[0].get("displayName"),
                    includeMetadata,
                )
                self.services.chat.progressLogUpdate(operationId, 0.9, "Read 1 document(s)")
                self.services.chat.progressLogFinish(operationId, True)
                return ActionResult.isSuccess(documents=[actionDoc])

        return fail("Unexpected error: could not process documentList or pathQuery")
    except Exception as e:
        # Top-level boundary: log with traceback and report failure instead of raising.
        logger.exception(f"Error reading SharePoint documents: {str(e)}")
        if operationId:
            try:
                self.services.chat.progressLogFinish(operationId, False)
            except Exception:
                # Don't fail on progress logging errors, but don't hide them either
                logger.warning(f"Could not finish progress log for {operationId}", exc_info=True)
        return ActionResult(
            success=False,
            error=str(e)
        )