gateway/modules/workflows/methods/methodSharepoint/actions/listDocuments.py
2025-12-17 10:45:09 +01:00

345 lines
18 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
List Documents action for SharePoint operations.
Lists documents and folders in SharePoint paths across sites.
"""
import logging
import time
import json
import urllib.parse
from typing import Dict, Any
from modules.workflows.methods.methodBase import action
from modules.datamodels.datamodelChat import ActionResult, ActionDocument
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
# Maximum number of subfolders expanded per listed folder when includeSubfolders
# is enabled. Each expansion costs one additional Graph API call, so this bounds
# the request count for folders with many children.
_MAX_SUBFOLDERS = 10

# Substrings that mark a lone path segment as a document-library name.
_LIBRARY_INDICATORS = ('document', 'dokument', 'shared', 'freigegeben', 'library', 'bibliothek')


def _looksLikeFolderId(value: str) -> bool:
    """Return True when ``value`` is treated as a drive item ID instead of a path.

    Any value starting with '01' is classified as an ID; this also covers the
    longer '01PPXICCB...' form.
    NOTE(review): a relative path whose first characters happen to be '01'
    is classified as an ID too - confirm this heuristic is acceptable.
    """
    return value.startswith('01')


def _stripLibrarySegment(path: str) -> str:
    """Drop a leading document-library segment from ``path``.

    - More than one segment: the first segment is always removed (it is assumed
      to be a library name).
    - Exactly one segment: the path collapses to '/' only when the segment
      contains one of ``_LIBRARY_INDICATORS`` (case-insensitive).
    - Otherwise the path is returned unchanged.
    """
    segments = [segment for segment in path.split('/') if segment.strip()]
    if len(segments) > 1:
        stripped = '/' + '/'.join(segments[1:])
        logger.info(f"Removed first path segment (potential library name), path changed from '{path}' to '{stripped}'")
        return stripped
    if len(segments) == 1:
        firstSegmentLower = segments[0].lower()
        if any(indicator in firstSegmentLower for indicator in _LIBRARY_INDICATORS):
            logger.info(f"First segment '{segments[0]}' appears to be a library name, using root")
            return '/'
    return path


def _buildChildrenEndpoint(siteId: str, folderPath: str) -> str:
    """Build the Graph endpoint that lists the children of ``folderPath``.

    '/', '' and '*' address the drive root, values recognised by
    ``_looksLikeFolderId`` are addressed by item ID, and everything else is
    addressed by its URL-encoded path relative to the drive root.
    """
    if folderPath in ("/", "", "*"):
        return f"sites/{siteId}/drive/root/children"
    if _looksLikeFolderId(folderPath):
        return f"sites/{siteId}/drive/items/{folderPath}/children"
    # Spaces and special characters must be percent-encoded; '/' stays literal
    # because it separates path segments.
    folderPathEncoded = urllib.parse.quote(folderPath.lstrip('/'), safe='/')
    return f"sites/{siteId}/drive/root:/{folderPathEncoded}:/children"


def _buildItemInfo(item: Dict[str, Any], isFolder: bool, siteName: str, siteUrl: str, parentPath=None) -> Dict[str, Any]:
    """Map one raw item from a Graph children response to the result entry.

    ``parentPath`` is only emitted when given (used for subfolder contents).
    'mimeType'/'downloadUrl' are added when the raw item has a 'file' facet and
    'childCount' when it has a 'folder' facet. Key order is kept stable because
    the result is serialised with json.dumps.
    """
    itemInfo = {
        "id": item.get("id"),
        "name": item.get("name"),
        "size": item.get("size", 0),
        "createdDateTime": item.get("createdDateTime"),
        "lastModifiedDateTime": item.get("lastModifiedDateTime"),
        "webUrl": item.get("webUrl"),
        "type": "folder" if isFolder else "file",
    }
    if parentPath is not None:
        itemInfo["parentPath"] = parentPath
    itemInfo["siteName"] = siteName
    itemInfo["siteUrl"] = siteUrl
    if "file" in item:
        itemInfo["mimeType"] = item["file"].get("mimeType")
        itemInfo["downloadUrl"] = item.get("@microsoft.graph.downloadUrl")
    if "folder" in item:
        itemInfo["childCount"] = item["folder"].get("childCount", 0)
    return itemInfo


async def _appendSubfolderContents(method, siteId: str, siteName: str, siteUrl: str, folderPath: str, processedItems: list) -> None:
    """Append the direct contents of up to ``_MAX_SUBFOLDERS`` folders to ``processedItems``.

    Only folders already present in ``processedItems`` when this is called are
    expanded, so the listing goes exactly one level deep (no recursion).
    A subfolder whose Graph call reports an error is logged and skipped.
    """
    detectFolderType = method.services.sharepoint.detectFolderType
    # Snapshot taken before anything is appended: nested folders added below
    # are never expanded themselves.
    folderEntries = [entry for entry in processedItems if entry["type"] == "folder"]
    logger.info(f"Including subfolders - processing {len(folderEntries)} folders")

    expandedEntries = folderEntries[:_MAX_SUBFOLDERS]
    for folderEntry in expandedEntries:
        subfolderPath = f"{folderPath.rstrip('/')}/{folderEntry['name']}"
        subfolderEndpoint = f"sites/{siteId}/drive/items/{folderEntry['id']}/children"
        logger.debug(f"Getting contents of subfolder: {folderEntry['name']}")
        subfolderResult = await method.apiClient.makeGraphApiCall(subfolderEndpoint)
        if "error" in subfolderResult:
            logger.warning(f"Failed to get contents of subfolder {folderEntry['name']}: {subfolderResult.get('error')}")
            continue
        subfolderItems = subfolderResult.get("value", [])
        logger.debug(f"Found {len(subfolderItems)} items in subfolder {folderEntry['name']}")
        processedItems.extend(
            _buildItemInfo(child, detectFolderType(child), siteName, siteUrl, parentPath=subfolderPath)
            for child in subfolderItems
        )

    # Warn only when folders were actually left unexpanded.
    if len(folderEntries) > _MAX_SUBFOLDERS:
        logger.warning(f"Reached maximum subfolder limit ({_MAX_SUBFOLDERS}), skipping remaining folders")
    logger.info(f"Processed {len(expandedEntries)} subfolders, total items: {len(processedItems)}")


async def _listFolderInSite(method, site: Dict[str, Any], folderPath: str, includeSubfolders: bool):
    """List ``folderPath`` in one site.

    Returns the per-site result dict, or None when the Graph call reported an
    error (the site is then skipped by the caller). Reads the 'id',
    'displayName' and 'webUrl' keys of ``site``; a missing key raises KeyError.
    """
    siteId = site["id"]
    siteName = site["displayName"]
    siteUrl = site["webUrl"]
    logger.info(f"Listing folder {folderPath} in site: {siteName}")

    apiResult = await method.apiClient.makeGraphApiCall(_buildChildrenEndpoint(siteId, folderPath))
    if "error" in apiResult:
        logger.warning(f"Failed to list folder {folderPath} in site {siteName}: {apiResult['error']}")
        return None

    detectFolderType = method.services.sharepoint.detectFolderType
    processedItems = [
        _buildItemInfo(item, detectFolderType(item), siteName, siteUrl)
        for item in apiResult.get("value", [])
    ]
    if includeSubfolders:
        await _appendSubfolderContents(method, siteId, siteName, siteUrl, folderPath, processedItems)

    return {
        "siteName": siteName,
        "siteUrl": siteUrl,
        "itemCount": len(processedItems),
        "items": processedItems
    }


@action
async def listDocuments(self, parameters: Dict[str, Any]) -> ActionResult:
    """
    GENERAL:
    - Purpose: List documents and folders in SharePoint paths across sites.
    - Input requirements: connectionReference (required); documentList (required); includeSubfolders (optional).
    - Output format: JSON with folder items and metadata.
    Parameters:
    - connectionReference (str, required): Microsoft connection label.
    - documentList (list, required): Document list reference(s) containing findDocumentPath result.
    - includeSubfolders (bool, optional): Include one level of subfolders. Default: False.
    """
    # Besides the documented parameters the code also reads:
    #   - pathQuery (default "*"): used as the list query when documentList
    #     yields no folder path; "" and "*" count as "not provided".
    #   - parentOperationId: forwarded to the progress log.
    # On success a single JSON ActionDocument is returned; every failure path
    # returns a failed ActionResult after closing the progress log.
    operationId = None
    try:
        # Init progress logger
        workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
        operationId = f"sharepoint_list_{workflowId}_{int(time.time())}"
        parentOperationId = parameters.get('parentOperationId')
        self.services.chat.progressLogStart(
            operationId,
            "List Documents",
            "SharePoint Listing",
            "Processing document list",
            parentOperationId=parentOperationId
        )

        def _fail(message: str) -> ActionResult:
            # Close the progress entry as failed, then build the failure result.
            self.services.chat.progressLogFinish(operationId, False)
            return ActionResult.isFailure(error=message)

        connectionReference = parameters.get("connectionReference")
        documentList = parameters.get("documentList")
        pathQuery = parameters.get("pathQuery", "*")
        if isinstance(documentList, str):
            documentList = [documentList]
        includeSubfolders = parameters.get("includeSubfolders", False)  # Default to False for better UX
        # "" and "*" are placeholders, not a usable path query.
        hasPathQuery = bool(pathQuery) and pathQuery.strip() not in ("", "*")

        if not connectionReference:
            return _fail("Connection reference is required")
        # Require either documentList or pathQuery
        if not documentList and not hasPathQuery:
            return _fail("Either documentList or pathQuery is required")

        # Parse documentList to extract folder path and site information.
        # NOTE(review): this is also called when documentList is empty and only
        # pathQuery was given - confirm the parser returns no error in that case.
        listQuery, sites, _, errorMsg = await self.documentParsing.parseDocumentListForFolder(documentList)
        if errorMsg:
            return _fail(errorMsg)

        # If no folder path found from documentList, use pathQuery if provided
        if not listQuery and hasPathQuery:
            listQuery = pathQuery
            logger.info(f"Using pathQuery for list query: {listQuery}")
            sites, errorMsg = await self.siteDiscovery.resolveSitesFromPathQuery(pathQuery)
            if errorMsg:
                return _fail(errorMsg)

        # Validate required parameters
        if not listQuery:
            return _fail("Either documentList must contain findDocumentPath result with folder information, or pathQuery must be provided. Use findDocumentPath first to get folder path, or provide pathQuery directly.")
        if not sites:
            return _fail("Site information missing. Cannot determine target site for list operation.")

        # Get connection
        self.services.chat.progressLogUpdate(operationId, 0.2, "Getting Microsoft connection")
        connection = self.connection.getMicrosoftConnection(connectionReference)
        if not connection:
            return _fail("No valid Microsoft connection found for the provided connection reference")
        logger.info(f"Starting SharePoint listDocuments for listQuery: {listQuery}")
        logger.debug(f"Connection ID: {connection['id']}")

        self.services.chat.progressLogUpdate(operationId, 0.3, "Processing folder path")
        # Only the path component of the parsed query is used here; it is also
        # what the result reports under "pathQuery".
        parsedPathQuery, _, _, _ = self.pathProcessing.parseSearchQuery(listQuery)

        if _looksLikeFolderId(listQuery):
            # Direct folder ID - use it directly
            folderPaths = [listQuery]
            logger.info(f"Using direct folder ID: {listQuery}")
        else:
            # The site prefix only selects the site; it must not be part of the
            # drive-relative path. Microsoft-standard path: /sites/SiteName/Path -> /Path
            pathQueryForResolve = parsedPathQuery
            if parsedPathQuery.startswith('/sites/'):
                parsedPath = self.siteDiscovery.extractSiteFromStandardPath(parsedPathQuery)
                if parsedPath:
                    innerPath = parsedPath.get("innerPath", "")
                    pathQueryForResolve = '/' + innerPath if innerPath else '/'
                else:
                    pathQueryForResolve = '/'
            # /drive/root already points at the default document library, so a
            # leading library name has to be removed from the path.
            # NOTE(review): this is applied to every non-ID path, including ones
            # that did not start with '/sites/' - confirm that scope is intended.
            pathQueryForResolve = _stripLibrarySegment(pathQueryForResolve)
            # Resolve path query into folder paths
            folderPaths = self.pathProcessing.resolvePathQuery(pathQueryForResolve)
            logger.info(f"Resolved folder paths: {folderPaths}")

        # Process each folder path across all sites
        listResults = []
        self.services.chat.progressLogUpdate(operationId, 0.5, f"Listing {len(folderPaths)} folder(s) across {len(sites)} site(s)")
        for folderPath in folderPaths:
            try:
                folderResults = []
                for site in sites:
                    siteResult = await _listFolderInSite(self, site, folderPath, includeSubfolders)
                    # None means the Graph call failed for this site; skip it.
                    if siteResult is not None:
                        folderResults.append(siteResult)
                listResults.append({
                    "folderPath": folderPath,
                    "sitesProcessed": len(folderResults),
                    "siteResults": folderResults
                })
            except Exception as e:
                # One failing folder must not abort the remaining folders.
                logger.error(f"Error listing folder {folderPath}: {str(e)}")
                listResults.append({
                    "folderPath": folderPath,
                    "error": str(e),
                    "sitesProcessed": 0,
                    "siteResults": []
                })

        # Create result data
        totalItems = sum(
            len(siteResult.get("items", []))
            for result in listResults
            for siteResult in result.get("siteResults", [])
        )
        resultData = {
            "listQuery": listQuery,
            "pathQuery": parsedPathQuery,
            "totalItems": totalItems,
            "foldersProcessed": len(listResults),
            "listResults": listResults,
            "includeSubfolders": includeSubfolders,
            "timestamp": self.services.utils.timestampGetUtc()
        }
        self.services.chat.progressLogUpdate(operationId, 0.9, f"Found {totalItems} item(s) in {len(listResults)} folder(s)")
        validationMetadata = {
            "actionType": "sharepoint.listDocuments",
            "listQuery": listQuery,
            "totalItems": totalItems,
            "foldersProcessed": len(listResults),
            "includeSubfolders": includeSubfolders
        }
        self.services.chat.progressLogFinish(operationId, True)
        return ActionResult(
            success=True,
            documents=[
                ActionDocument(
                    documentName=self._generateMeaningfulFileName("sharepoint_list", "json", None, "listDocuments"),
                    documentData=json.dumps(resultData, indent=2),
                    mimeType="application/json",
                    validationMetadata=validationMetadata
                )
            ]
        )
    except Exception as e:
        logger.error(f"Error listing SharePoint documents: {str(e)}")
        if operationId:
            # Best effort: closing the progress log must not mask the original
            # error, but cancellation and interpreter exit are not swallowed.
            try:
                self.services.chat.progressLogFinish(operationId, False)
            except Exception as finishError:
                logger.debug(f"Could not finish progress log {operationId}: {finishError}")
        return ActionResult(
            success=False,
            error=str(e)
        )