gateway/modules/workflows/methods/methodSharepoint/actions/listDocuments.py
2026-01-23 01:10:00 +01:00

327 lines
17 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
import logging
import time
import json
import urllib.parse
from typing import Dict, Any
from modules.datamodels.datamodelChat import ActionResult, ActionDocument
logger = logging.getLogger(__name__)
def _isDriveItemId(value: str) -> bool:
    """Return True when *value* is treated as a direct drive item ID instead of a path.

    The previous inline check tested the prefixes '01PPXICCB' and '01'; the
    first is subsumed by the second, so a single prefix test is equivalent.
    """
    return value.startswith('01')


def _buildItemInfo(item: Dict[str, Any], isFolder: bool, siteName, siteUrl, parentPath=None) -> Dict[str, Any]:
    """Convert one raw Graph drive item into the flat dict emitted in the result JSON.

    Args:
        item: Raw item dict taken from the ``value`` list of a children response.
        isFolder: Folder flag as decided by the caller's folder detection.
        siteName: Display name of the site the item was listed from.
        siteUrl: Web URL of the site the item was listed from.
        parentPath: When given, stored as ``parentPath`` (used for items that were
            listed from a subfolder); omitted from the dict when ``None``.

    Returns:
        Dict with id/name/size/timestamps/webUrl/type and site info, plus
        ``mimeType``/``downloadUrl`` when the item has a ``file`` facet and
        ``childCount`` when it has a ``folder`` facet.
    """
    info = {
        "id": item.get("id"),
        "name": item.get("name"),
        "size": item.get("size", 0),
        "createdDateTime": item.get("createdDateTime"),
        "lastModifiedDateTime": item.get("lastModifiedDateTime"),
        "webUrl": item.get("webUrl"),
        "type": "folder" if isFolder else "file",
    }
    # Key order is kept stable because the dict is serialised to JSON as-is.
    if parentPath is not None:
        info["parentPath"] = parentPath
    info["siteName"] = siteName
    info["siteUrl"] = siteUrl
    if "file" in item:
        info["mimeType"] = item["file"].get("mimeType")
        info["downloadUrl"] = item.get("@microsoft.graph.downloadUrl")
    if "folder" in item:
        # Applied to nested folders as well so both levels report the same fields.
        info["childCount"] = item["folder"].get("childCount", 0)
    return info


def _childrenEndpoint(siteId, folderPath: str) -> str:
    """Build the Graph endpoint that lists the children of *folderPath* in a site drive."""
    if folderPath in ("/", "", "*"):
        # Root folder
        return f"sites/{siteId}/drive/root/children"
    if _isDriveItemId(folderPath):
        # Direct folder ID
        return f"sites/{siteId}/drive/items/{folderPath}/children"
    # Path addressing: drop the leading slash and percent-encode everything
    # except the path separators (spaces and special characters need encoding).
    encodedPath = urllib.parse.quote(folderPath.lstrip('/'), safe='/')
    return f"sites/{siteId}/drive/root:/{encodedPath}:/children"


def _pathForResolve(self, pathQuery: str) -> str:
    """Strip the site prefix and a presumed document-library segment from *pathQuery*.

    The site prefix is only used for site filtering, and ``/drive/root`` already
    points at the default document library, so neither belongs in the path that
    is resolved against the drive.
    """
    resolvePath = pathQuery
    # Microsoft-standard path: /sites/SiteName/Path -> /Path
    if pathQuery.startswith('/sites/'):
        parsedPath = self.siteDiscovery.extractSiteFromStandardPath(pathQuery)
        if parsedPath:
            innerPath = parsedPath.get("innerPath", "")
            resolvePath = '/' + innerPath if innerPath else '/'
        else:
            resolvePath = '/'

    segments = [segment for segment in resolvePath.split('/') if segment.strip()]
    if len(segments) > 1:
        # The first of several segments is assumed to be a library name and dropped.
        # NOTE(review): earlier comments mentioned keeping the original path for a
        # fallback, but no fallback is implemented; a real top-level folder in a
        # multi-segment path is dropped as well — confirm this is intended.
        originalPath = resolvePath
        resolvePath = '/' + '/'.join(segments[1:])
        logger.info(f"Removed first path segment (potential library name), path changed from '{originalPath}' to '{resolvePath}'")
    elif len(segments) == 1:
        # A single segment is only dropped when it looks like a library name.
        firstSegmentLower = segments[0].lower()
        libraryIndicators = ['document', 'dokument', 'shared', 'freigegeben', 'library', 'bibliothek']
        if any(indicator in firstSegmentLower for indicator in libraryIndicators):
            resolvePath = '/'
            logger.info(f"First segment '{segments[0]}' appears to be a library name, using root")
    return resolvePath


async def _appendSubfolderItems(self, siteId, siteName, siteUrl, folderPath: str, processedItems, maxSubfolders: int = 10) -> None:
    """Append the direct children of each top-level folder to *processedItems*.

    Only one level is expanded (no recursion), and at most *maxSubfolders*
    folders are expanded so the number of extra API calls stays bounded.
    *processedItems* is extended in place.
    """
    # Snapshot of the top-level folders; entries appended below are never expanded.
    folders = [entry for entry in processedItems if entry["type"] == "folder"]
    logger.info(f"Including subfolders - processing {len(folders)} folders")

    for folder in folders[:maxSubfolders]:
        subfolderPath = f"{folderPath.rstrip('/')}/{folder['name']}"
        subfolderEndpoint = f"sites/{siteId}/drive/items/{folder['id']}/children"
        logger.debug(f"Getting contents of subfolder: {folder['name']}")
        subfolderResult = await self.apiClient.makeGraphApiCall(subfolderEndpoint)
        if "error" in subfolderResult:
            logger.warning(f"Failed to get contents of subfolder {folder['name']}: {subfolderResult.get('error')}")
            continue
        children = subfolderResult.get("value", [])
        logger.debug(f"Found {len(children)} items in subfolder {folder['name']}")
        for child in children:
            childIsFolder = self.services.sharepoint.detectFolderType(child)
            processedItems.append(
                _buildItemInfo(child, childIsFolder, siteName, siteUrl, parentPath=subfolderPath)
            )

    # Warn only when folders were actually left out.
    if len(folders) > maxSubfolders:
        logger.warning(f"Reached maximum subfolder limit ({maxSubfolders}), skipping remaining folders")
    logger.info(f"Processed {min(len(folders), maxSubfolders)} subfolders, total items: {len(processedItems)}")


async def listDocuments(self, parameters: Dict[str, Any]) -> ActionResult:
    """List the contents of one or more SharePoint folders across the resolved sites.

    Keys read from *parameters*:
        connectionReference: Required; passed to ``self.connection.getMicrosoftConnection``.
        documentList: Optional string or list; parsed for a folder path and sites.
        pathQuery: Optional path/query, default ``"*"``; used when documentList
            yields no folder path. Empty and ``"*"`` count as "not provided".
        includeSubfolders: Optional, default ``False``; when truthy, the direct
            children of up to 10 top-level folders are listed as well.
        parentOperationId: Optional; forwarded to the progress log.

    Returns:
        On success, an ``ActionResult`` holding one JSON ``ActionDocument`` with
        the per-folder, per-site listing. On validation problems or any
        exception, a failed ``ActionResult`` carrying the error message; no
        exception is propagated to the caller.
    """
    operationId = None

    def fail(message):
        """Close the progress entry as failed and build the failure result."""
        if operationId:
            self.services.chat.progressLogFinish(operationId, False)
        return ActionResult.isFailure(error=message)

    try:
        # Init progress logger
        workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
        operationId = f"sharepoint_list_{workflowId}_{int(time.time())}"

        # Start progress tracking
        self.services.chat.progressLogStart(
            operationId,
            "List Documents",
            "SharePoint Listing",
            "Processing document list",
            parentOperationId=parameters.get('parentOperationId')
        )

        connectionReference = parameters.get("connectionReference")
        documentList = parameters.get("documentList")
        pathQuery = parameters.get("pathQuery", "*")
        if isinstance(documentList, str):
            documentList = [documentList]
        includeSubfolders = parameters.get("includeSubfolders", False)  # Default to False for better UX

        # "*" and blank strings mean "no explicit path"; a non-string value is
        # treated the same way instead of crashing on .strip().
        hasPathQuery = isinstance(pathQuery, str) and pathQuery.strip() not in ("", "*")

        if not connectionReference:
            return fail("Connection reference is required")

        # Require either documentList or pathQuery
        if not documentList and not hasPathQuery:
            return fail("Either documentList or pathQuery is required")

        # Parse documentList to extract folder path and site information
        listQuery, sites, _, errorMsg = await self.documentParsing.parseDocumentListForFolder(documentList)
        if errorMsg:
            return fail(errorMsg)

        # If no folder path found from documentList, use pathQuery if provided
        if not listQuery and hasPathQuery:
            listQuery = pathQuery
            logger.info(f"Using pathQuery for list query: {listQuery}")
            # Resolve sites from pathQuery
            sites, errorMsg = await self.siteDiscovery.resolveSitesFromPathQuery(pathQuery)
            if errorMsg:
                return fail(errorMsg)

        # Validate required parameters
        if not listQuery:
            return fail("Either documentList must contain findDocumentPath result with folder information, or pathQuery must be provided. Use findDocumentPath first to get folder path, or provide pathQuery directly.")
        if not sites:
            return fail("Site information missing. Cannot determine target site for list operation.")

        # Get connection
        self.services.chat.progressLogUpdate(operationId, 0.2, "Getting Microsoft connection")
        connection = self.connection.getMicrosoftConnection(connectionReference)
        if not connection:
            return fail("No valid Microsoft connection found for the provided connection reference")

        logger.info(f"Starting SharePoint listDocuments for listQuery: {listQuery}")
        logger.debug(f"Connection ID: {connection['id']}")
        self.services.chat.progressLogUpdate(operationId, 0.3, "Processing folder path")

        # Only the path component of the parsed query is used here; it is also
        # what gets reported as "pathQuery" in the result document.
        parsedPathQuery, _, _, _ = self.pathProcessing.parseSearchQuery(listQuery)

        if _isDriveItemId(listQuery):
            # Direct folder ID - use it directly
            folderPaths = [listQuery]
            logger.info(f"Using direct folder ID: {listQuery}")
        else:
            # Resolve path query into folder paths
            folderPaths = self.pathProcessing.resolvePathQuery(_pathForResolve(self, parsedPathQuery))
            logger.info(f"Resolved folder paths: {folderPaths}")

        # Process each folder path across all sites
        listResults = []
        self.services.chat.progressLogUpdate(operationId, 0.5, f"Listing {len(folderPaths)} folder(s) across {len(sites)} site(s)")
        for folderPath in folderPaths:
            try:
                siteResults = []
                for site in sites:
                    siteId = site["id"]
                    siteName = site["displayName"]
                    siteUrl = site["webUrl"]
                    logger.info(f"Listing folder {folderPath} in site: {siteName}")

                    # NOTE(review): only the returned "value" list is read; an
                    # "@odata.nextLink" is not followed here — confirm whether
                    # makeGraphApiCall pages internally.
                    apiResult = await self.apiClient.makeGraphApiCall(_childrenEndpoint(siteId, folderPath))
                    if "error" in apiResult:
                        # A site that cannot serve this folder is skipped, not fatal.
                        logger.warning(f"Failed to list folder {folderPath} in site {siteName}: {apiResult['error']}")
                        continue

                    processedItems = [
                        _buildItemInfo(item, self.services.sharepoint.detectFolderType(item), siteName, siteUrl)
                        for item in apiResult.get("value", [])
                    ]

                    # If include subfolders is enabled, get ONLY direct subfolder contents (1 level deep only)
                    if includeSubfolders:
                        await _appendSubfolderItems(self, siteId, siteName, siteUrl, folderPath, processedItems)

                    siteResults.append({
                        "siteName": siteName,
                        "siteUrl": siteUrl,
                        "itemCount": len(processedItems),
                        "items": processedItems
                    })

                listResults.append({
                    "folderPath": folderPath,
                    "sitesProcessed": len(siteResults),
                    "siteResults": siteResults
                })
            except Exception as e:
                # One failing folder must not abort the remaining folders.
                logger.error(f"Error listing folder {folderPath}: {str(e)}")
                listResults.append({
                    "folderPath": folderPath,
                    "error": str(e),
                    "sitesProcessed": 0,
                    "siteResults": []
                })

        # Create result data
        totalItems = sum(
            len(siteResult.get("items", []))
            for result in listResults
            for siteResult in result.get("siteResults", [])
        )
        resultData = {
            "listQuery": listQuery,
            "pathQuery": parsedPathQuery,
            "totalItems": totalItems,
            "foldersProcessed": len(listResults),
            "listResults": listResults,
            "includeSubfolders": includeSubfolders,
            "timestamp": self.services.utils.timestampGetUtc()
        }
        self.services.chat.progressLogUpdate(operationId, 0.9, f"Found {totalItems} item(s) in {len(listResults)} folder(s)")

        validationMetadata = {
            "actionType": "sharepoint.listDocuments",
            "listQuery": listQuery,
            "totalItems": totalItems,
            "foldersProcessed": len(listResults),
            "includeSubfolders": includeSubfolders
        }
        self.services.chat.progressLogFinish(operationId, True)
        return ActionResult(
            success=True,
            documents=[
                ActionDocument(
                    documentName=self._generateMeaningfulFileName("sharepoint_list", "json", None, "listDocuments"),
                    documentData=json.dumps(resultData, indent=2),
                    mimeType="application/json",
                    validationMetadata=validationMetadata
                )
            ]
        )
    except Exception as e:
        logger.error(f"Error listing SharePoint documents: {str(e)}")
        if operationId:
            try:
                self.services.chat.progressLogFinish(operationId, False)
            except Exception as finishError:
                # Best effort: a failing progress log must not mask the original error.
                logger.debug(f"Could not finish progress log {operationId}: {finishError}")
        return ActionResult(
            success=False,
            error=str(e)
        )