# gateway/modules/workflows/methods/methodSharepoint/actions/listDocuments.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.

import logging
import time
import json
import urllib.parse
from typing import Dict, Any
from modules.datamodels.datamodelChat import ActionResult, ActionDocument

logger = logging.getLogger(__name__)

# Upper bound on how many direct subfolders are expanded per listed folder.
# Each expansion costs one additional Graph API call, so the cap bounds the
# number of requests issued for a single folder listing.
_MAX_SUBFOLDERS = 10

# Substrings that mark a lone path segment as a document library name
# (English and German variants).
_LIBRARY_INDICATORS = ('document', 'dokument', 'shared', 'freigegeben', 'library', 'bibliothek')


def _looksLikeItemId(value: str) -> bool:
    """Return True when *value* is treated as a direct drive item ID instead of a path.

    The action recognises IDs by their '01' prefix (for example '01PPXICCB...').
    """
    return value.startswith('01')


def _stripLibrarySegment(path: str) -> str:
    """Remove a leading document-library segment from *path*.

    The listing endpoints used by this action are rooted at ``/drive/root``,
    so a library name at the start of the path must not be sent along.

    - Several segments: the first one is assumed to be the library name and dropped.
    - One segment: replaced by the root ('/') only if it contains a known
      library indicator.
    - Otherwise the path is returned unchanged.
    """
    segments = [segment for segment in path.split('/') if segment.strip()]
    if len(segments) > 1:
        stripped = '/' + '/'.join(segments[1:])
        logger.info(f"Removed first path segment (potential library name), path changed from '{path}' to '{stripped}'")
        return stripped
    if len(segments) == 1:
        loweredSegment = segments[0].lower()
        if any(indicator in loweredSegment for indicator in _LIBRARY_INDICATORS):
            logger.info(f"First segment '{segments[0]}' appears to be a library name, using root")
            return '/'
    return path


def _buildChildrenEndpoint(siteId: str, folderPath: str) -> str:
    """Return the Graph endpoint that lists the children of *folderPath* in a site."""
    if folderPath in ("/", "", "*"):
        # Root folder of the site's default drive
        return f"sites/{siteId}/drive/root/children"
    if _looksLikeItemId(folderPath):
        # Direct folder ID
        return f"sites/{siteId}/drive/items/{folderPath}/children"
    # Path-based addressing: drop the leading slash and URL-encode everything
    # except the path separators (spaces and special characters need encoding).
    encodedPath = urllib.parse.quote(folderPath.lstrip('/'), safe='/')
    return f"sites/{siteId}/drive/root:/{encodedPath}:/children"


def _buildItemInfo(item: Dict[str, Any], isFolder: bool, siteName: Any, siteUrl: Any,
                   parentPath: Any = None) -> Dict[str, Any]:
    """Convert a raw Graph drive item into the dictionary emitted in the result.

    Args:
        item: Raw item dictionary as returned by the Graph API call.
        isFolder: Result of the folder detection for this item.
        siteName: Display name of the site the item belongs to.
        siteUrl: Web URL of the site the item belongs to.
        parentPath: Path of the containing subfolder (string), or None for
            items listed directly in the requested folder.

    Returns:
        The item description. Key order is part of the serialized output and
        is kept stable: "parentPath" (when given) precedes the site fields,
        and "childCount" is only reported for items without a parentPath.
    """
    info = {
        "id": item.get("id"),
        "name": item.get("name"),
        "size": item.get("size", 0),
        "createdDateTime": item.get("createdDateTime"),
        "lastModifiedDateTime": item.get("lastModifiedDateTime"),
        "webUrl": item.get("webUrl"),
        "type": "folder" if isFolder else "file",
    }
    if parentPath is not None:
        info["parentPath"] = parentPath
    info["siteName"] = siteName
    info["siteUrl"] = siteUrl

    # File-specific information
    if "file" in item:
        info["mimeType"] = item["file"].get("mimeType")
        info["downloadUrl"] = item.get("@microsoft.graph.downloadUrl")

    # Folder-specific information (top-level entries only)
    if parentPath is None and "folder" in item:
        info["childCount"] = item["folder"].get("childCount", 0)

    return info


async def _collectSubfolderItems(self, siteId, siteName, siteUrl, folderPath, folders):
    """List the direct contents of each folder in *folders* (one level, no recursion).

    A subfolder whose API call reports an error is logged and skipped.

    Returns:
        Flat list of item descriptions, each carrying a "parentPath".
    """
    collected = []
    for folder in folders:
        subfolderPath = f"{folderPath.rstrip('/')}/{folder['name']}"
        subfolderEndpoint = f"sites/{siteId}/drive/items/{folder['id']}/children"

        logger.debug(f"Getting contents of subfolder: {folder['name']}")
        subfolderResult = await self.apiClient.makeGraphApiCall(subfolderEndpoint)
        if "error" in subfolderResult:
            logger.warning(f"Failed to get contents of subfolder {folder['name']}: {subfolderResult.get('error')}")
            continue

        children = subfolderResult.get("value", [])
        logger.debug(f"Found {len(children)} items in subfolder {folder['name']}")
        for child in children:
            childIsFolder = self.services.sharepoint.detectFolderType(child)
            collected.append(_buildItemInfo(child, childIsFolder, siteName, siteUrl, parentPath=subfolderPath))
    return collected


async def _listFolderForSite(self, site, folderPath, includeSubfolders):
    """List *folderPath* within one site.

    Returns:
        A site result dictionary (siteName, siteUrl, itemCount, items), or
        None when the API call for the folder itself reports an error.
    """
    siteId = site["id"]
    siteName = site["displayName"]
    siteUrl = site["webUrl"]

    logger.info(f"Listing folder {folderPath} in site: {siteName}")

    apiResult = await self.apiClient.makeGraphApiCall(_buildChildrenEndpoint(siteId, folderPath))
    if "error" in apiResult:
        logger.warning(f"Failed to list folder {folderPath} in site {siteName}: {apiResult['error']}")
        return None

    items = [
        _buildItemInfo(item, self.services.sharepoint.detectFolderType(item), siteName, siteUrl)
        for item in apiResult.get("value", [])
    ]

    if includeSubfolders:
        folders = [entry for entry in items if entry["type"] == "folder"]
        logger.info(f"Including subfolders - processing {len(folders)} folders")
        # Warn only when folders are actually left out by the cap.
        if len(folders) > _MAX_SUBFOLDERS:
            logger.warning(f"Reached maximum subfolder limit ({_MAX_SUBFOLDERS}), skipping remaining folders")
        expandedFolders = folders[:_MAX_SUBFOLDERS]
        items.extend(await _collectSubfolderItems(self, siteId, siteName, siteUrl, folderPath, expandedFolders))
        logger.info(f"Processed {len(expandedFolders)} subfolders, total items: {len(items)}")

    return {
        "siteName": siteName,
        "siteUrl": siteUrl,
        "itemCount": len(items),
        "items": items
    }


async def listDocuments(self, parameters: Dict[str, Any]) -> ActionResult:
    """List the contents of SharePoint folders across the resolved sites.

    Args:
        parameters: Action parameters. Keys read here:
            - "connectionReference": required; used to look up the Microsoft connection.
            - "documentList": string or list, parsed for folder path and site information.
            - "pathQuery": used when documentList yields no folder path
              (default "*", which counts as "not provided").
            - "includeSubfolders": when truthy, the direct contents of up to
              10 subfolders per folder are listed as well (default False).
            - "parentOperationId": optional parent for progress tracking.

    Returns:
        On success, an ActionResult with a single JSON ActionDocument holding
        the listing per folder and site. On validation errors or unexpected
        exceptions, a failed ActionResult carrying the error message.
        Per-folder exceptions are recorded inside the result and do not fail
        the whole action.
    """
    operationId = None
    try:
        # Init progress logger
        workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
        operationId = f"sharepoint_list_{workflowId}_{int(time.time())}"

        def fail(message: str) -> ActionResult:
            # Close the progress entry as failed before reporting the error.
            self.services.chat.progressLogFinish(operationId, False)
            return ActionResult.isFailure(error=message)

        # Start progress tracking
        self.services.chat.progressLogStart(
            operationId,
            "List Documents",
            "SharePoint Listing",
            "Processing document list",
            parentOperationId=parameters.get('parentOperationId')
        )

        connectionReference = parameters.get("connectionReference")
        documentList = parameters.get("documentList")
        pathQuery = parameters.get("pathQuery", "*")
        if isinstance(documentList, str):
            documentList = [documentList]
        includeSubfolders = parameters.get("includeSubfolders", False)  # Default to False for better UX

        if not connectionReference:
            return fail("Connection reference is required")

        # A usable pathQuery is a non-blank string other than the "*" placeholder.
        hasPathQuery = isinstance(pathQuery, str) and pathQuery.strip() not in ("", "*")

        # Require either documentList or pathQuery
        if not documentList and not hasPathQuery:
            return fail("Either documentList or pathQuery is required")

        # Parse documentList to extract folder path and site information
        listQuery, sites, _, errorMsg = await self.documentParsing.parseDocumentListForFolder(documentList)
        if errorMsg:
            return fail(errorMsg)

        # If no folder path found from documentList, use pathQuery if provided
        if not listQuery and hasPathQuery:
            listQuery = pathQuery
            logger.info(f"Using pathQuery for list query: {listQuery}")
            # Resolve sites from pathQuery
            sites, errorMsg = await self.siteDiscovery.resolveSitesFromPathQuery(pathQuery)
            if errorMsg:
                return fail(errorMsg)

        # Validate required parameters
        if not listQuery:
            return fail("Either documentList must contain findDocumentPath result with folder information, or pathQuery must be provided. Use findDocumentPath first to get folder path, or provide pathQuery directly.")

        if not sites:
            return fail("Site information missing. Cannot determine target site for list operation.")

        # Get connection
        self.services.chat.progressLogUpdate(operationId, 0.2, "Getting Microsoft connection")
        connection = self.connection.getMicrosoftConnection(connectionReference)
        if not connection:
            return fail("No valid Microsoft connection found for the provided connection reference")

        logger.info(f"Starting SharePoint listDocuments for listQuery: {listQuery}")
        logger.debug(f"Connection ID: {connection['id']}")

        self.services.chat.progressLogUpdate(operationId, 0.3, "Processing folder path")

        # Parse listQuery; only the path component is needed for listing.
        # From here on pathQuery holds the parsed path, which is also what
        # gets reported in the result document.
        pathQuery, _, _, _ = self.pathProcessing.parseSearchQuery(listQuery)

        if _looksLikeItemId(listQuery):
            # Direct folder ID - use it directly
            folderPaths = [listQuery]
            logger.info(f"Using direct folder ID: {listQuery}")
        else:
            # Remove site prefix from pathQuery before resolving (it's only for site filtering)
            pathQueryForResolve = pathQuery
            # Microsoft-standard path: /sites/SiteName/Path -> /Path
            if pathQuery.startswith('/sites/'):
                parsedPath = self.siteDiscovery.extractSiteFromStandardPath(pathQuery)
                innerPath = parsedPath.get("innerPath", "") if parsedPath else ""
                pathQueryForResolve = '/' + innerPath if innerPath else '/'

            pathQueryForResolve = _stripLibrarySegment(pathQueryForResolve)

            # Resolve path query into folder paths
            folderPaths = self.pathProcessing.resolvePathQuery(pathQueryForResolve)
            logger.info(f"Resolved folder paths: {folderPaths}")

        # Process each folder path across all sites
        listResults = []

        self.services.chat.progressLogUpdate(operationId, 0.5, f"Listing {len(folderPaths)} folder(s) across {len(sites)} site(s)")

        for folderPath in folderPaths:
            try:
                folderResults = []
                for site in sites:
                    siteResult = await _listFolderForSite(self, site, folderPath, includeSubfolders)
                    # None means the API reported an error for this site; it is skipped.
                    if siteResult is not None:
                        folderResults.append(siteResult)

                listResults.append({
                    "folderPath": folderPath,
                    "sitesProcessed": len(folderResults),
                    "siteResults": folderResults
                })

            except Exception as e:
                # A failure in one folder must not abort the remaining folders;
                # it is recorded in the result instead.
                logger.exception(f"Error listing folder {folderPath}: {str(e)}")
                listResults.append({
                    "folderPath": folderPath,
                    "error": str(e),
                    "sitesProcessed": 0,
                    "siteResults": []
                })

        # Create result data
        totalItems = sum(
            len(siteResult.get("items", []))
            for result in listResults
            for siteResult in result.get("siteResults", [])
        )

        resultData = {
            "listQuery": listQuery,
            "pathQuery": pathQuery,
            "totalItems": totalItems,
            "foldersProcessed": len(listResults),
            "listResults": listResults,
            "includeSubfolders": includeSubfolders,
            "timestamp": self.services.utils.timestampGetUtc()
        }

        self.services.chat.progressLogUpdate(operationId, 0.9, f"Found {totalItems} item(s) in {len(listResults)} folder(s)")

        validationMetadata = {
            "actionType": "sharepoint.listDocuments",
            "listQuery": listQuery,
            "totalItems": totalItems,
            "foldersProcessed": len(listResults),
            "includeSubfolders": includeSubfolders
        }

        self.services.chat.progressLogFinish(operationId, True)
        return ActionResult(
            success=True,
            documents=[
                ActionDocument(
                    documentName=self._generateMeaningfulFileName("sharepoint_list", "json", None, "listDocuments"),
                    documentData=json.dumps(resultData, indent=2),
                    mimeType="application/json",
                    validationMetadata=validationMetadata
                )
            ]
        )

    except Exception as e:
        logger.exception(f"Error listing SharePoint documents: {str(e)}")
        if operationId:
            # Best effort: closing the progress entry must not mask the original error.
            try:
                self.services.chat.progressLogFinish(operationId, False)
            except Exception as finishError:
                logger.debug(f"Could not finish progress log for {operationId}: {finishError}")
        return ActionResult(
            success=False,
            error=str(e)
        )