262 lines
13 KiB
Python
262 lines
13 KiB
Python
# Copyright (c) 2025 Patrick Motsch
|
|
# All rights reserved.
|
|
|
|
import logging
|
|
import time
|
|
import json
|
|
import base64
|
|
from typing import Dict, Any
|
|
from modules.datamodels.datamodelChat import ActionResult, ActionDocument
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Extension -> MIME type fallback, used only when the Graph metadata attached to
# a read result carries no mimeType. Matching is case-insensitive.
_EXTENSION_MIME_TYPES = (
    (".pdf", "application/pdf"),
    (".txt", "text/plain"),
    (".json", "application/json"),
)


def _resolveMimeType(resultItem: Dict[str, Any], fileName: str) -> str:
    """Return the MIME type for a read result.

    Prefers ``resultItem["metadata"]["mimeType"]`` when present, then falls
    back to the file extension, and finally to ``application/octet-stream``.
    """
    metadataMimeType = (resultItem.get("metadata") or {}).get("mimeType")
    if metadataMimeType:
        return metadataMimeType

    # Lower-case the name so that "REPORT.PDF" is recognised like "report.pdf".
    loweredName = (fileName or "").lower()
    for suffix, mimeType in _EXTENSION_MIME_TYPES:
        if loweredName.endswith(suffix):
            return mimeType
    return "application/octet-stream"


def _buildActionDocument(resultItem: Dict[str, Any], includeMetadata: bool):
    """Convert one read result dict into an ``ActionDocument``.

    Args:
        resultItem: Dict produced by ``readDocuments`` for a single file
            (keys such as ``fileName``, ``sharepointFileId``, ``siteName``,
            optional ``content`` and ``metadata``).
        includeMetadata: Echoed into ``validationMetadata``.

    Returns:
        An ``ActionDocument`` whose ``documentData`` is
        * a Base64 string when the content is binary (bytes-like),
        * the text itself when the content is any other truthy value,
        * a JSON dump of the file's descriptive fields when there is no content.
    """
    fileContent = resultItem.get("content")
    fileName = resultItem.get("fileName", f"file_{resultItem.get('fileId')}")
    mimeType = _resolveMimeType(resultItem, fileName)

    # Treat every bytes-like payload as binary; otherwise a bytearray would be
    # stored as its repr ("bytearray(b'...')") via the text branch.
    if isinstance(fileContent, (bytearray, memoryview)):
        fileContent = bytes(fileContent)

    if fileContent and isinstance(fileContent, bytes):
        contentType = "binary"
        # Binary files are stored Base64-encoded directly in documentData.
        documentData = base64.b64encode(fileContent).decode('utf-8')
    elif fileContent:
        contentType = "text"
        documentData = fileContent if isinstance(fileContent, str) else str(fileContent)
    else:
        contentType = "metadata_only"
        docData = {
            "fileName": fileName,
            "sharepointFileId": resultItem.get("sharepointFileId"),
            "siteName": resultItem.get("siteName"),
            "siteUrl": resultItem.get("siteUrl"),
            "size": resultItem.get("size"),
            "createdDateTime": resultItem.get("createdDateTime"),
            "lastModifiedDateTime": resultItem.get("lastModifiedDateTime"),
            "webUrl": resultItem.get("webUrl")
        }
        if resultItem.get("metadata"):
            docData["metadata"] = resultItem["metadata"]
        # NOTE(review): the document keeps the file's MIME type although
        # documentData is JSON here - confirm consumers expect this.
        documentData = json.dumps(docData, indent=2)

    validationMetadata = {
        "actionType": "sharepoint.readDocuments",
        "fileName": fileName,
        "sharepointFileId": resultItem.get("sharepointFileId"),
        "siteName": resultItem.get("siteName"),
        "mimeType": mimeType,
        "contentType": contentType,
    }
    if contentType == "binary":
        validationMetadata["size"] = len(fileContent)
    validationMetadata["includeMetadata"] = includeMetadata

    actionDoc = ActionDocument(
        documentName=fileName,
        documentData=documentData,
        mimeType=mimeType,
        validationMetadata=validationMetadata
    )
    if contentType == "binary":
        logger.info(f"Stored binary file {fileName} ({len(fileContent)} bytes) as Base64 in ActionDocument")
    return actionDoc


async def readDocuments(self, parameters: Dict[str, Any]) -> ActionResult:
    """Read SharePoint files referenced by a documentList and return them as documents.

    Keys read from ``parameters``:
        parentOperationId: Optional id passed through to the progress logger.
        documentList: Parsed via ``self.documentParsing`` to obtain the found
            documents and their sites.
        pathQuery: Defaults to ``"*"``; an empty value or ``"*"`` counts as
            "not provided". Used only to resolve sites when documentList
            yielded none.
        connectionReference: Required; resolved via ``self.connection``.
        includeMetadata: Defaults to True; attaches a ``metadata`` dict
            (mimeType, downloadUrl, createdBy, ...) to each read result.

    Returns:
        ``ActionResult.isSuccess(documents=[...])`` when at least one file was
        read, otherwise a failure ``ActionResult`` carrying an error message.
        Exceptions are not propagated; they are logged and converted into a
        failure result.
    """
    operationId = None

    def _fail(message: str):
        # Close the progress entry and build the failure result in one place.
        if operationId:
            self.services.chat.progressLogFinish(operationId, False)
        return ActionResult.isFailure(error=message)

    try:
        # Init progress logger
        workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
        operationId = f"sharepoint_read_{workflowId}_{int(time.time())}"

        # Start progress tracking
        self.services.chat.progressLogStart(
            operationId,
            "Read Documents",
            "SharePoint Document Reading",
            "Processing document list",
            parentOperationId=parameters.get('parentOperationId')
        )

        documentList = parameters.get("documentList")
        pathQuery = parameters.get("pathQuery", "*")
        connectionReference = parameters.get("connectionReference")
        includeMetadata = parameters.get("includeMetadata", True)

        # A usable pathQuery is a non-blank string other than the "*" wildcard.
        # The isinstance check keeps a non-string value from crashing on .strip().
        hasPathQuery = isinstance(pathQuery, str) and pathQuery.strip() not in ("", "*")

        # Validate connection reference
        if not connectionReference:
            return _fail("Connection reference is required")

        # Require either documentList or pathQuery
        if not documentList and not hasPathQuery:
            return _fail("Either documentList or pathQuery is required")

        # Get connection first
        self.services.chat.progressLogUpdate(operationId, 0.2, "Getting Microsoft connection")
        connection = self.connection.getMicrosoftConnection(connectionReference)
        if not connection:
            return _fail("No valid Microsoft connection found for the provided connection reference")

        # Parse documentList to extract foundDocuments and site information
        sharePointFileIds = None
        sites = None

        if documentList:
            foundDocuments, sites, errorMsg = await self.documentParsing.parseDocumentListForFoundDocuments(documentList)
            if errorMsg:
                return _fail(errorMsg)

            if foundDocuments:
                # Keep only file entries that actually carry an id; an entry
                # without one would otherwise become a request for "items/None".
                sharePointFileIds = [
                    doc.get("id")
                    for doc in foundDocuments
                    if doc.get("type") == "file" and doc.get("id")
                ]
                if not sharePointFileIds:
                    return _fail("No files found in documentList from findDocumentPath result")
                logger.info(f"Extracted {len(sharePointFileIds)} SharePoint file IDs from documentList")

        # If we have SharePoint file IDs from documentList (findDocumentPath result), read them directly
        if sharePointFileIds and sites:
            readResults = []
            # All files are read from the first site returned by the parser.
            site = sites[0]
            siteId = site['id']
            fileCount = len(sharePointFileIds)

            self.services.chat.progressLogUpdate(operationId, 0.5, f"Reading {fileCount} file(s) from SharePoint")
            for idx, fileId in enumerate(sharePointFileIds):
                # A failure on one file is logged and skipped so the remaining
                # files are still read.
                try:
                    # Progress moves from 0.5 towards 0.8 across the file list.
                    self.services.chat.progressLogUpdate(operationId, 0.5 + (idx * 0.3 / fileCount), f"Reading file {idx + 1}/{fileCount}")

                    # Get file info from SharePoint
                    fileInfo = await self.apiClient.makeGraphApiCall(f"sites/{siteId}/drive/items/{fileId}")
                    if "error" in fileInfo:
                        logger.warning(f"Failed to get file info for {fileId}: {fileInfo['error']}")
                        continue

                    # Get file content using SharePoint service (handles binary data correctly)
                    fileContent = await self.services.sharepoint.downloadFile(siteId, fileId)

                    resultItem = {
                        "fileId": fileId,
                        # Fall back to a synthetic name when "name" is missing or None.
                        "fileName": fileInfo.get("name") or f"file_{fileId}",
                        "sharepointFileId": fileId,
                        "siteName": site['displayName'],
                        "siteUrl": site['webUrl'],
                        "size": fileInfo.get("size", 0),
                        "createdDateTime": fileInfo.get("createdDateTime"),
                        "lastModifiedDateTime": fileInfo.get("lastModifiedDateTime"),
                        "webUrl": fileInfo.get("webUrl")
                    }

                    # Add content if available
                    if fileContent:
                        resultItem["content"] = fileContent

                    # Add metadata if requested
                    if includeMetadata:
                        resultItem["metadata"] = {
                            "mimeType": fileInfo.get("file", {}).get("mimeType"),
                            "downloadUrl": fileInfo.get("@microsoft.graph.downloadUrl"),
                            "createdBy": fileInfo.get("createdBy", {}),
                            "lastModifiedBy": fileInfo.get("lastModifiedBy", {}),
                            "parentReference": fileInfo.get("parentReference", {})
                        }

                    readResults.append(resultItem)
                except Exception as e:
                    logger.error(f"Error reading file {fileId}: {str(e)}")
                    continue

            if not readResults:
                return _fail("No files could be read from documentList")

            # Convert read results to ActionDocument objects
            self.services.chat.progressLogUpdate(operationId, 0.8, f"Processing {len(readResults)} document(s)")
            actionDocuments = [_buildActionDocument(item, includeMetadata) for item in readResults]

            # Return success with action documents
            self.services.chat.progressLogUpdate(operationId, 0.9, f"Read {len(actionDocuments)} document(s)")
            self.services.chat.progressLogFinish(operationId, True)
            return ActionResult.isSuccess(documents=actionDocuments)

        # If no sites from documentList, try pathQuery fallback
        if not sites and hasPathQuery:
            sites, errorMsg = await self.siteDiscovery.resolveSitesFromPathQuery(pathQuery)
            if errorMsg:
                return _fail(errorMsg)

        # If still no sites, return error
        if not sites:
            return _fail("Either documentList must contain findDocumentPath result with file information, or pathQuery must be provided. Use findDocumentPath first to get file paths, or provide pathQuery directly.")

        # NOTE(review): sites resolved from pathQuery are never read from in this
        # function, so a pathQuery-only call always ends here - confirm whether
        # that path is meant to be implemented.
        return _fail("Unexpected error: could not process documentList or pathQuery")

    except Exception as e:
        # logger.exception keeps the traceback alongside the message.
        logger.exception(f"Error reading SharePoint documents: {str(e)}")
        if operationId:
            try:
                self.services.chat.progressLogFinish(operationId, False)
            except Exception as finishError:
                # Best effort: a progress-logging failure must not mask the
                # original error, but it should not vanish silently either.
                logger.warning(f"Could not finish progress log {operationId}: {finishError}")
        return ActionResult(
            success=False,
            error=str(e)
        )
|
|
|