# gateway/modules/workflows/methods/methodSharepoint/actions/readDocuments.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.

import logging
import time
import json
import base64
from typing import Dict, Any
from modules.datamodels.datamodelChat import ActionResult, ActionDocument

logger = logging.getLogger(__name__)

# Extension -> MIME type fallbacks, used only when Graph reports no MIME type.
_EXTENSION_MIME_TYPES = (
    (".pdf", "application/pdf"),
    (".txt", "text/plain"),
    (".json", "application/json"),
)


def _resolveMimeType(fileName: Any, reportedMimeType: Any) -> str:
    """Return the MIME type for a file, preferring the one reported by Graph.

    Falls back to a small extension table (matched case-insensitively) and
    finally to "application/octet-stream".
    """
    if reportedMimeType:
        return reportedMimeType
    if fileName:
        loweredName = fileName.lower()
        for extension, mimeType in _EXTENSION_MIME_TYPES:
            if loweredName.endswith(extension):
                return mimeType
    return "application/octet-stream"


def _buildActionDocument(resultItem: Dict[str, Any], includeMetadata: Any) -> ActionDocument:
    """Convert one read result into an ActionDocument.

    Bytes content is stored Base64-encoded, other truthy content is stored as
    text, and a result without content is stored as a JSON description of the
    file (metadata only).
    """
    fileContent = resultItem.get("content")
    fileName = resultItem.get("fileName", f"file_{resultItem.get('fileId')}")
    mimeType = _resolveMimeType(fileName, resultItem.get("mimeType"))

    # Shared part of the validation metadata; the content-specific keys are
    # appended below in the same order the three variants always had.
    validationMetadata = {
        "actionType": "sharepoint.readDocuments",
        "fileName": fileName,
        "sharepointFileId": resultItem.get("sharepointFileId"),
        "siteName": resultItem.get("siteName"),
        "mimeType": mimeType,
    }

    isBinary = bool(fileContent) and isinstance(fileContent, bytes)
    if isBinary:
        # Binary payloads travel as a Base64 string in documentData.
        documentData = base64.b64encode(fileContent).decode('utf-8')
        validationMetadata["contentType"] = "binary"
        validationMetadata["size"] = len(fileContent)
    elif fileContent:
        documentData = fileContent if isinstance(fileContent, str) else str(fileContent)
        validationMetadata["contentType"] = "text"
    else:
        # No content could be downloaded: describe the file instead.
        docData = {
            "fileName": fileName,
            "sharepointFileId": resultItem.get("sharepointFileId"),
            "siteName": resultItem.get("siteName"),
            "siteUrl": resultItem.get("siteUrl"),
            "size": resultItem.get("size"),
            "createdDateTime": resultItem.get("createdDateTime"),
            "lastModifiedDateTime": resultItem.get("lastModifiedDateTime"),
            "webUrl": resultItem.get("webUrl")
        }
        if resultItem.get("metadata"):
            docData["metadata"] = resultItem["metadata"]
        documentData = json.dumps(docData, indent=2)
        validationMetadata["contentType"] = "metadata_only"
    validationMetadata["includeMetadata"] = includeMetadata

    actionDoc = ActionDocument(
        documentName=fileName,
        documentData=documentData,
        mimeType=mimeType,
        validationMetadata=validationMetadata
    )
    if isBinary:
        logger.info(f"Stored binary file {fileName} ({len(fileContent)} bytes) as Base64 in ActionDocument")
    return actionDoc


async def readDocuments(self, parameters: Dict[str, Any]) -> ActionResult:
    """Read SharePoint files referenced by a documentList and return them as documents.

    Recognised keys in ``parameters``:
        connectionReference: required; used to look up the Microsoft connection.
        documentList: parsed via ``self.documentParsing`` into found documents
            and sites; entries with ``type == "file"`` are read by their id.
        pathQuery: defaults to "*"; an empty or "*" value counts as not provided.
        includeMetadata: defaults to True; adds a ``metadata`` dict per file.
        parentOperationId: optional parent for progress logging.

    Returns:
        A successful ActionResult carrying one ActionDocument per file that
        could be read, or a failed ActionResult with an error message. Any
        exception is caught, logged and reported as a failed ActionResult.
    """
    operationId = None
    try:
        # Init progress logger
        workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
        operationId = f"sharepoint_read_{workflowId}_{int(time.time())}"

        # Start progress tracking
        parentOperationId = parameters.get('parentOperationId')
        self.services.chat.progressLogStart(
            operationId,
            "Read Documents",
            "SharePoint Document Reading",
            "Processing document list",
            parentOperationId=parentOperationId
        )

        def failWith(message: str) -> ActionResult:
            # Close the progress entry as failed, then build the failure result.
            self.services.chat.progressLogFinish(operationId, False)
            return ActionResult.isFailure(error=message)

        documentList = parameters.get("documentList")
        pathQuery = parameters.get("pathQuery", "*")
        connectionReference = parameters.get("connectionReference")
        includeMetadata = parameters.get("includeMetadata", True)

        # A path query is only usable when it is a non-blank string other than
        # the "*" wildcard default; non-string values count as not provided.
        hasPathQuery = isinstance(pathQuery, str) and pathQuery.strip() not in ("", "*")

        # Validate connection reference
        if not connectionReference:
            return failWith("Connection reference is required")

        # Require either documentList or pathQuery
        if not documentList and not hasPathQuery:
            return failWith("Either documentList or pathQuery is required")

        # Get connection first. The returned object is only checked for
        # presence here; it is not used again in this function.
        self.services.chat.progressLogUpdate(operationId, 0.2, "Getting Microsoft connection")
        connection = self.connection.getMicrosoftConnection(connectionReference)
        if not connection:
            return failWith("No valid Microsoft connection found for the provided connection reference")

        # Parse documentList to extract foundDocuments and site information
        sharePointFileIds = None
        sites = None

        if documentList:
            foundDocuments, sites, errorMsg = await self.documentParsing.parseDocumentListForFoundDocuments(documentList)
            if errorMsg:
                return failWith(errorMsg)

            if foundDocuments:
                # Extract SharePoint file IDs from foundDocuments
                sharePointFileIds = [doc.get("id") for doc in foundDocuments if doc.get("type") == "file"]
                if not sharePointFileIds:
                    return failWith("No files found in documentList from findDocumentPath result")
                logger.info(f"Extracted {len(sharePointFileIds)} SharePoint file IDs from documentList")

        # If we have SharePoint file IDs from documentList (findDocumentPath result), read them directly
        if sharePointFileIds and sites:
            # NOTE(review): every file is read from the first site only.
            site = sites[0]
            siteId = site['id']
            totalFiles = len(sharePointFileIds)
            readResults = []

            self.services.chat.progressLogUpdate(operationId, 0.5, f"Reading {totalFiles} file(s) from SharePoint")
            for idx, fileId in enumerate(sharePointFileIds):
                # One unreadable file must not abort the batch: log and move on.
                try:
                    self.services.chat.progressLogUpdate(operationId, 0.5 + (idx * 0.3 / totalFiles), f"Reading file {idx + 1}/{totalFiles}")
                    # Get file info from SharePoint
                    fileInfo = await self.apiClient.makeGraphApiCall(f"sites/{siteId}/drive/items/{fileId}")

                    if "error" in fileInfo:
                        logger.warning(f"Failed to get file info for {fileId}: {fileInfo['error']}")
                        continue

                    fileName = fileInfo.get("name", f"file_{fileId}")
                    fileContent = await self.services.sharepoint.downloadFile(siteId, fileId)

                    # Tolerate a missing or null "file" facet instead of
                    # dropping the whole file on an AttributeError.
                    reportedMimeType = (fileInfo.get("file") or {}).get("mimeType")

                    resultItem = {
                        "fileId": fileId,
                        "fileName": fileName,
                        "sharepointFileId": fileId,
                        "siteName": site['displayName'],
                        "siteUrl": site['webUrl'],
                        "size": fileInfo.get("size", 0),
                        "createdDateTime": fileInfo.get("createdDateTime"),
                        "lastModifiedDateTime": fileInfo.get("lastModifiedDateTime"),
                        "webUrl": fileInfo.get("webUrl"),
                        # Kept outside "metadata" so the MIME type survives
                        # includeMetadata=False.
                        "mimeType": reportedMimeType
                    }

                    # Add content if available
                    if fileContent:
                        resultItem["content"] = fileContent

                    # Add metadata if requested
                    if includeMetadata:
                        resultItem["metadata"] = {
                            "mimeType": reportedMimeType,
                            "downloadUrl": fileInfo.get("@microsoft.graph.downloadUrl"),
                            "createdBy": fileInfo.get("createdBy", {}),
                            "lastModifiedBy": fileInfo.get("lastModifiedBy", {}),
                            "parentReference": fileInfo.get("parentReference", {})
                        }

                    readResults.append(resultItem)
                except Exception as e:
                    logger.error(f"Error reading file {fileId}: {str(e)}", exc_info=True)
                    continue

            if not readResults:
                return failWith("No files could be read from documentList")

            # Convert read results to ActionDocument objects
            self.services.chat.progressLogUpdate(operationId, 0.8, f"Processing {len(readResults)} document(s)")
            actionDocuments = [_buildActionDocument(resultItem, includeMetadata) for resultItem in readResults]

            # Return success with action documents
            self.services.chat.progressLogUpdate(operationId, 0.9, f"Read {len(actionDocuments)} document(s)")
            self.services.chat.progressLogFinish(operationId, True)
            return ActionResult.isSuccess(documents=actionDocuments)

        # If no sites from documentList, try pathQuery fallback
        if not sites and hasPathQuery:
            sites, errorMsg = await self.siteDiscovery.resolveSitesFromPathQuery(pathQuery)
            if errorMsg:
                return failWith(errorMsg)

        # If still no sites, return error
        if not sites:
            return failWith("Either documentList must contain findDocumentPath result with file information, or pathQuery must be provided. Use findDocumentPath first to get file paths, or provide pathQuery directly.")

        # NOTE(review): this IS reachable. Sites resolved from pathQuery (or a
        # documentList that yields sites but no found documents) are never
        # read, because no read path exists for them in this function.
        return failWith("Unexpected error: could not process documentList or pathQuery")
    except Exception as e:
        logger.error(f"Error reading SharePoint documents: {str(e)}", exc_info=True)
        if operationId:
            try:
                self.services.chat.progressLogFinish(operationId, False)
            except Exception:
                # Progress logging is best-effort; never mask the real error.
                logger.debug("Could not finish progress log for %s", operationId, exc_info=True)
        return ActionResult(
            success=False,
            error=str(e)
        )