# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
import logging
import time
import json
import base64
from typing import Dict, Any
from modules.datamodels.datamodelChatbot import ActionResult, ActionDocument

logger = logging.getLogger(__name__)

# Fallback MIME types by (lower-cased) file extension, used only when the
# Graph metadata did not supply a MIME type.
_EXTENSION_MIME_TYPES = {
    ".pdf": "application/pdf",
    ".txt": "text/plain",
    ".json": "application/json",
}


def _isUsablePathQuery(pathQuery: Any) -> bool:
    """Return True when pathQuery is non-empty and not the catch-all "*"."""
    return bool(pathQuery) and pathQuery.strip() not in ("", "*")


def _finishFailed(self, operationId: Any, errorMessage: str) -> ActionResult:
    """Mark the progress operation as failed and build the failure result."""
    if operationId:
        self.services.chat.progressLogFinish(operationId, False)
    return ActionResult.isFailure(error=errorMessage)


def _resolveMimeType(resultItem: Dict[str, Any], fileName: Any) -> str:
    """Pick the MIME type from stored metadata, else from the file extension.

    The extension match is case-insensitive so that e.g. "REPORT.PDF" is
    recognised; anything unknown falls back to "application/octet-stream".
    """
    metadataMimeType = resultItem.get("metadata", {}).get("mimeType")
    if metadataMimeType:
        return metadataMimeType
    if fileName:
        loweredName = fileName.lower()
        for extension, mimeType in _EXTENSION_MIME_TYPES.items():
            if loweredName.endswith(extension):
                return mimeType
    return "application/octet-stream"


def _buildActionDocument(resultItem: Dict[str, Any], includeMetadata: Any) -> ActionDocument:
    """Convert one read result into an ActionDocument.

    Three shapes are produced, recorded in validationMetadata["contentType"]:
    - "binary": bytes content, stored as a Base64 string in documentData
    - "text": any other truthy content, stored as str
    - "metadata_only": no content; documentData is a JSON dump of file info
    """
    fileContent = resultItem.get("content")
    fileName = resultItem.get("fileName", f"file_{resultItem.get('fileId')}")
    mimeType = _resolveMimeType(resultItem, fileName)

    # Fields shared by every variant; variant-specific keys are appended
    # below in the same order the consumers have always received them.
    validationMetadata = {
        "actionType": "sharepoint.readDocuments",
        "fileName": fileName,
        "sharepointFileId": resultItem.get("sharepointFileId"),
        "siteName": resultItem.get("siteName"),
        "mimeType": mimeType,
    }

    if fileContent and isinstance(fileContent, bytes):
        # Binary content is Base64-encoded so it can travel as a string.
        validationMetadata["contentType"] = "binary"
        validationMetadata["size"] = len(fileContent)
        validationMetadata["includeMetadata"] = includeMetadata
        actionDoc = ActionDocument(
            documentName=fileName,
            documentData=base64.b64encode(fileContent).decode('utf-8'),
            mimeType=mimeType,
            validationMetadata=validationMetadata
        )
        logger.info(f"Stored binary file {fileName} ({len(fileContent)} bytes) as Base64 in ActionDocument")
        return actionDoc

    if fileContent:
        # Text content is stored directly; non-str values are stringified.
        validationMetadata["contentType"] = "text"
        validationMetadata["includeMetadata"] = includeMetadata
        return ActionDocument(
            documentName=fileName,
            documentData=fileContent if isinstance(fileContent, str) else str(fileContent),
            mimeType=mimeType,
            validationMetadata=validationMetadata
        )

    # No content available - describe the file instead.
    # NOTE(review): mimeType still reflects the original file although
    # documentData is JSON here - confirm this is what consumers expect.
    docData = {
        "fileName": fileName,
        "sharepointFileId": resultItem.get("sharepointFileId"),
        "siteName": resultItem.get("siteName"),
        "siteUrl": resultItem.get("siteUrl"),
        "size": resultItem.get("size"),
        "createdDateTime": resultItem.get("createdDateTime"),
        "lastModifiedDateTime": resultItem.get("lastModifiedDateTime"),
        "webUrl": resultItem.get("webUrl")
    }
    if resultItem.get("metadata"):
        docData["metadata"] = resultItem["metadata"]
    validationMetadata["contentType"] = "metadata_only"
    validationMetadata["includeMetadata"] = includeMetadata
    return ActionDocument(
        documentName=fileName,
        documentData=json.dumps(docData, indent=2),
        mimeType=mimeType,
        validationMetadata=validationMetadata
    )


async def _readSharePointFile(self, site: Dict[str, Any], fileId: Any, includeMetadata: Any) -> Any:
    """Fetch info and content for one file; return a result dict or None.

    None is returned (after a warning) when the Graph response contains an
    "error" key. Exceptions are left to the caller.
    """
    siteId = site['id']
    fileInfo = await self.apiClient.makeGraphApiCall(f"sites/{siteId}/drive/items/{fileId}")
    if "error" in fileInfo:
        logger.warning(f"Failed to get file info for {fileId}: {fileInfo['error']}")
        return None

    fileName = fileInfo.get("name", f"file_{fileId}")
    fileContent = await self.services.sharepoint.downloadFile(siteId, fileId)

    resultItem = {
        "fileId": fileId,
        "fileName": fileName,
        "sharepointFileId": fileId,
        "siteName": site['displayName'],
        "siteUrl": site['webUrl'],
        "size": fileInfo.get("size", 0),
        "createdDateTime": fileInfo.get("createdDateTime"),
        "lastModifiedDateTime": fileInfo.get("lastModifiedDateTime"),
        "webUrl": fileInfo.get("webUrl")
    }
    if fileContent:
        resultItem["content"] = fileContent
    if includeMetadata:
        resultItem["metadata"] = {
            "mimeType": fileInfo.get("file", {}).get("mimeType"),
            "downloadUrl": fileInfo.get("@microsoft.graph.downloadUrl"),
            "createdBy": fileInfo.get("createdBy", {}),
            "lastModifiedBy": fileInfo.get("lastModifiedBy", {}),
            "parentReference": fileInfo.get("parentReference", {})
        }
    return resultItem


async def readDocuments(self, parameters: Dict[str, Any]) -> ActionResult:
    """Read SharePoint files referenced by a documentList and return them.

    Recognised keys in ``parameters``:
        connectionReference: required; used to look up the Microsoft connection.
        documentList: parsed for found documents and their sites; entries of
            type "file" are read by ID from the first returned site.
        pathQuery: defaults to "*"; used to resolve sites when documentList
            yielded none. Empty and "*" count as "not provided".
        includeMetadata: defaults to True; adds Graph metadata per file.
        parentOperationId: optional parent for progress logging.

    Returns:
        A success ActionResult carrying one ActionDocument per file read, or
        a failure ActionResult with an error message. Exceptions are caught,
        logged, and turned into a failure result.

    NOTE(review): when sites are resolved only through pathQuery, no file is
    read and the call ends in the "Unexpected error" failure below - the
    pathQuery read path appears to be unimplemented.
    """
    operationId = None
    try:
        # Init progress logger
        workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
        operationId = f"sharepoint_read_{workflowId}_{int(time.time())}"

        # Start progress tracking
        parentOperationId = parameters.get('parentOperationId')
        self.services.chat.progressLogStart(
            operationId,
            "Read Documents",
            "SharePoint Document Reading",
            "Processing document list",
            parentOperationId=parentOperationId
        )

        documentList = parameters.get("documentList")
        pathQuery = parameters.get("pathQuery", "*")
        connectionReference = parameters.get("connectionReference")
        includeMetadata = parameters.get("includeMetadata", True)

        # Validate connection reference
        if not connectionReference:
            return _finishFailed(self, operationId, "Connection reference is required")

        # Require either documentList or pathQuery
        if not documentList and not _isUsablePathQuery(pathQuery):
            return _finishFailed(self, operationId, "Either documentList or pathQuery is required")

        # Get connection first
        self.services.chat.progressLogUpdate(operationId, 0.2, "Getting Microsoft connection")
        connection = self.connection.getMicrosoftConnection(connectionReference)
        if not connection:
            return _finishFailed(
                self, operationId,
                "No valid Microsoft connection found for the provided connection reference"
            )

        # Parse documentList to extract foundDocuments and site information
        sharePointFileIds = None
        sites = None
        if documentList:
            foundDocuments, sites, errorMsg = await self.documentParsing.parseDocumentListForFoundDocuments(documentList)
            if errorMsg:
                return _finishFailed(self, operationId, errorMsg)
            if foundDocuments:
                # Keep only file entries that actually carry an ID; an entry
                # without one cannot be addressed through the Graph API.
                sharePointFileIds = [
                    doc.get("id") for doc in foundDocuments
                    if doc.get("type") == "file" and doc.get("id") is not None
                ]
                if not sharePointFileIds:
                    return _finishFailed(
                        self, operationId,
                        "No files found in documentList from findDocumentPath result"
                    )
                logger.info(f"Extracted {len(sharePointFileIds)} SharePoint file IDs from documentList")

        # If we have SharePoint file IDs from documentList (findDocumentPath result), read them directly
        if sharePointFileIds and sites:
            # All files are read from the first site returned by the parser.
            site = sites[0]
            site['id']  # fail fast (outer handler) if the site has no id
            totalFiles = len(sharePointFileIds)
            self.services.chat.progressLogUpdate(operationId, 0.5, f"Reading {totalFiles} file(s) from SharePoint")

            readResults = []
            for idx, fileId in enumerate(sharePointFileIds):
                # A failure on one file is logged and skipped so the
                # remaining files are still read.
                try:
                    self.services.chat.progressLogUpdate(
                        operationId,
                        0.5 + (idx * 0.3 / totalFiles),
                        f"Reading file {idx + 1}/{totalFiles}"
                    )
                    resultItem = await _readSharePointFile(self, site, fileId, includeMetadata)
                    if resultItem is not None:
                        readResults.append(resultItem)
                except Exception as e:
                    logger.error(f"Error reading file {fileId}: {str(e)}")
                    continue

            if not readResults:
                return _finishFailed(self, operationId, "No files could be read from documentList")

            # Convert read results to ActionDocument objects
            self.services.chat.progressLogUpdate(operationId, 0.8, f"Processing {len(readResults)} document(s)")
            actionDocuments = [
                _buildActionDocument(resultItem, includeMetadata)
                for resultItem in readResults
            ]

            # Return success with action documents
            self.services.chat.progressLogUpdate(operationId, 0.9, f"Read {len(actionDocuments)} document(s)")
            self.services.chat.progressLogFinish(operationId, True)
            return ActionResult.isSuccess(documents=actionDocuments)

        # If no sites from documentList, try pathQuery fallback
        if not sites and _isUsablePathQuery(pathQuery):
            sites, errorMsg = await self.siteDiscovery.resolveSitesFromPathQuery(pathQuery)
            if errorMsg:
                return _finishFailed(self, operationId, errorMsg)

        # If still no sites, return error
        if not sites:
            return _finishFailed(
                self, operationId,
                "Either documentList must contain findDocumentPath result with file information, or pathQuery must be provided. Use findDocumentPath first to get file paths, or provide pathQuery directly."
            )

        # Reached whenever sites exist but no file IDs were extracted
        # (e.g. sites resolved via pathQuery only) - see NOTE in docstring.
        return _finishFailed(
            self, operationId,
            "Unexpected error: could not process documentList or pathQuery"
        )

    except Exception as e:
        logger.error(f"Error reading SharePoint documents: {str(e)}")
        if operationId:
            try:
                self.services.chat.progressLogFinish(operationId, False)
            except Exception:
                # Don't fail on progress logging errors, but leave a trace.
                logger.warning("Could not finish progress log for %s", operationId, exc_info=True)
        return ActionResult(
            success=False,
            error=str(e)
        )