gateway/modules/workflows/methods/methodSharepoint/actions/findDocumentPath.py
2026-01-23 01:10:00 +01:00

478 lines
27 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
import logging
import time
import json
import urllib.parse
from typing import Dict, Any
from modules.datamodels.datamodelChat import ActionResult, ActionDocument
# Module-level logger; records are emitted under this module's import name.
logger = logging.getLogger(__name__)
def _failAction(self, operationId, message: str) -> ActionResult:
    """Close the progress entry as failed (if one was opened) and build a failure result."""
    if operationId:
        self.services.chat.progressLogFinish(operationId, False)
    return ActionResult.isFailure(error=message)


def _isWildcardQuery(fileQuery) -> bool:
    """Return True when the file query is missing, blank or the '*' wildcard."""
    return not fileQuery or fileQuery.strip() in ("", "*")


def _extractFullPath(webUrl: str, parentPath: str) -> str:
    """Derive a backslash-separated path for an item.

    Prefers the part of ``webUrl`` after the first '/sites/' (URL-decoded);
    falls back to ``parentPath`` whenever the URL yields nothing; returns ""
    when neither source is usable.
    """
    if webUrl and '/sites/' in webUrl:
        # maxsplit=1 keeps the complete remainder, so a folder that is itself
        # called "sites" further down the URL no longer truncates the path.
        pathPart = webUrl.split('/sites/', 1)[1]
        decodedPath = urllib.parse.unquote(pathPart)
        return "\\" + decodedPath.replace('/', '\\')
    if parentPath:
        return parentPath.replace('/', '\\')
    return ""


def _buildDocInfo(item: Dict[str, Any], isFolder: bool, siteName: str, siteId: str) -> Dict[str, Any]:
    """Build the result record for one drive item (shared by both search strategies)."""
    webUrl = item.get("webUrl", "")
    # 'parentReference' may be present but null; treat that like a missing key.
    parentPath = (item.get("parentReference") or {}).get("path", "")
    return {
        "id": item.get("id"),
        "name": item.get("name"),
        "type": "folder" if isFolder else "file",
        "siteName": siteName,
        "siteId": siteId,
        "webUrl": webUrl,
        "fullPath": _extractFullPath(webUrl, parentPath),
        "parentPath": parentPath,
    }


def _resolveSiteFromWebUrl(webUrl: str, sites: list) -> tuple:
    """Map an item's webUrl to ``(siteName, siteId)`` using the known sites.

    Matching order: exact 'name', exact 'displayName', then case-insensitive
    substring match on either field. If nothing matches, the URL segment after
    '/sites/' is used as the site name and the id stays "unknown". Without a
    '/sites/' segment the result is ("Unknown Site", "unknown").
    """
    siteName = "Unknown Site"
    siteId = "unknown"
    if not webUrl or '/sites/' not in webUrl:
        return siteName, siteId
    try:
        sitePath = webUrl.split('/sites/', 1)[1].split('/')[0]
        if not sitePath:
            # An empty segment would substring-match every site; treat as unknown.
            return siteName, siteId

        matched = next((s for s in sites if s.get("name") == sitePath), None)
        if matched is None:
            matched = next((s for s in sites if s.get("displayName") == sitePath), None)
        if matched is None:
            # No exact match: default to the URL segment, then try a fuzzy match.
            siteName = sitePath
            needle = sitePath.lower()
            matched = next(
                (
                    s for s in sites
                    if needle in s.get("name", "").lower()
                    or needle in s.get("displayName", "").lower()
                ),
                None,
            )
        if matched is not None:
            siteName = matched.get("displayName", sitePath)
            siteId = matched.get("id", "unknown")
    except Exception as e:
        logger.warning(f"Error extracting site info from URL {webUrl}: {e}")
    return siteName, siteId


async def _resolveFolderPath(self, pathQuery, siteId):
    """Return the drive-relative folder path encoded in ``pathQuery``, or None for the drive root.

    With more than one path segment, the path without its first segment is
    probed via the Graph API; if that probe succeeds the first segment is
    dropped (it is assumed to be the document library name). A single segment
    is treated as the library itself and resolves to the root.
    """
    if not (pathQuery and pathQuery.startswith('/sites/')):
        return None
    parsedPath = self.siteDiscovery.extractSiteFromStandardPath(pathQuery)
    if not parsedPath:
        return None
    innerPath = parsedPath.get("innerPath", "")
    if not (innerPath and innerPath.strip()):
        return None

    folderPath = innerPath.lstrip('/')
    pathSegments = [s for s in folderPath.split('/') if s.strip()]
    if len(pathSegments) > 1:
        # Language-independent check: ask the API instead of guessing library names.
        testPath = '/'.join(pathSegments[1:])
        # NOTE(review): safe='' also percent-encodes '/' inside the path - confirm
        # that Graph path addressing accepts %2F for nested folders.
        testEndpoint = f"sites/{siteId}/drive/root:/{urllib.parse.quote(testPath, safe='')}:"
        testResult = await self.apiClient.makeGraphApiCall(testEndpoint)
        if testResult and "error" not in testResult:
            folderPath = testPath
            logger.info(f"Removed document library name '{pathSegments[0]}' from folder path (tested via API)")
        else:
            logger.info(f"Keeping original folder path '{folderPath}' (first segment is not a document library)")
    elif len(pathSegments) == 1:
        folderPath = None
        logger.info(f"Only one segment '{pathSegments[0]}' found, likely document library - using root")

    if folderPath:
        logger.info(f"Extracted folder path from pathQuery: '{folderPath}'")
    else:
        logger.info("Folder path resolved to root (only document library in path)")
    return folderPath or None


async def _searchFoldersUnified(self, fileQuery: str, sites: list) -> list:
    """Run one tenant-wide Graph 'search/query' request and return records for folder hits.

    Multiple whitespace-separated terms are combined with AND. Hits are kept
    only if ``self.services.sharepoint.detectFolderType`` accepts them. Site
    name/id are derived from each hit's webUrl and the supplied ``sites``.
    """
    terms = fileQuery.split()
    if len(terms) > 1:
        queryString = " AND ".join(terms)
    else:
        queryString = terms[0] if terms else fileQuery

    payload = {
        "requests": [
            {
                "entityTypes": ["driveItem"],
                "query": {"queryString": queryString},
                "from": 0,
                "size": 50,
            }
        ]
    }
    logger.info(f"Using unified search API for folders with queryString: {queryString}")
    unifiedResult = await self.apiClient.makeGraphApiCall(
        "search/query",
        method="POST",
        data=json.dumps(payload).encode("utf-8"),
    )

    items = []
    if unifiedResult is None or "error" in unifiedResult:
        reason = unifiedResult["error"] if unifiedResult else "empty response"
        logger.warning(f"Unified search failed: {reason}")
    else:
        # Flatten value -> hitsContainers -> hits -> resource
        for container in (unifiedResult.get("value", []) or []):
            for hitsContainer in (container.get("hitsContainers", []) or []):
                for hit in (hitsContainer.get("hits", []) or []):
                    resource = hit.get("resource")
                    if resource:
                        items.append(resource)
    logger.info(f"Unified search returned {len(items)} items (pre-filter)")

    folders = [entry for entry in items if self.services.sharepoint.detectFolderType(entry)]
    logger.info(f"Filtered to {len(folders)} folders using improved detection logic")

    documents = []
    for entry in folders:
        entryName = entry.get("name", "")
        entryPath = (entry.get("parentReference") or {}).get("path", "")
        logger.debug(f"Processing folder: '{entryName}' at path: '{entryPath}'")
        siteName, siteId = _resolveSiteFromWebUrl(entry.get("webUrl", ""), sites)
        logger.debug(f"Item '{entryName}' found - adding to results")
        documents.append(_buildDocInfo(entry, True, siteName, siteId))
    return documents


async def _searchSite(self, siteInfo: Dict[str, Any], pathQuery, fileQuery, searchType):
    """List or search one site's default drive and return its result records.

    Wildcard/empty queries list the children of the resolved folder (or drive
    root); any other query uses the drive search endpoint. Returns None when
    the Graph call reports an error, so the caller can skip the site.
    """
    siteId = siteInfo["id"]
    siteName = siteInfo["displayName"]
    siteUrl = siteInfo["webUrl"]
    logger.info(f"Searching in site: {siteName} ({siteUrl})")

    folderPath = await _resolveFolderPath(self, pathQuery, siteId)

    if _isWildcardQuery(fileQuery):
        failureLabel = "List"
        if folderPath:
            encodedPath = urllib.parse.quote(folderPath, safe='')
            endpoint = f"sites/{siteId}/drive/root:/{encodedPath}:/children"
            logger.info(f"Listing items in folder: '{folderPath}'")
        else:
            endpoint = f"sites/{siteId}/drive/root/children"
    else:
        failureLabel = "Search"
        searchQueryCleaned = self.pathProcessing.cleanSearchQuery(fileQuery)
        encodedQuery = urllib.parse.quote(searchQueryCleaned, safe='')
        if folderPath:
            encodedPath = urllib.parse.quote(folderPath, safe='')
            endpoint = f"sites/{siteId}/drive/root:/{encodedPath}:/search(q='{encodedQuery}')"
            logger.info(f"Searching in folder '{folderPath}' with query: '{searchQueryCleaned}' (encoded: '{encodedQuery}')")
        else:
            endpoint = f"sites/{siteId}/drive/root/search(q='{encodedQuery}')"
            logger.info(f"Using search API for files with query: '{searchQueryCleaned}' (encoded: '{encodedQuery}')")

    response = await self.apiClient.makeGraphApiCall(endpoint)
    if response is None or "error" in response:
        # A single failing site must not abort the search across the others.
        reason = response["error"] if response else "empty response"
        logger.warning(f"{failureLabel} failed for site {siteName}: {reason}")
        return None

    items = response.get("value", []) or []
    logger.info(f"Retrieved {len(items)} items from site {siteName}")

    siteDocuments = []
    for entry in items:
        entryName = entry.get("name", "")
        isFolder = self.services.sharepoint.detectFolderType(entry)
        entryType = "folder" if isFolder else "file"
        entryPath = (entry.get("parentReference") or {}).get("path", "")
        logger.debug(f"Processing {entryType}: '{entryName}' at path: '{entryPath}'")

        if searchType == "files" and isFolder:
            continue  # folders are not wanted when searching for files
        if searchType == "folders" and not isFolder:
            continue  # files are not wanted when searching for folders

        logger.debug(f"Item '{entryName}' found - adding to results")
        siteDocuments.append(_buildDocInfo(entry, isFolder, siteName, siteId))

    logger.info(f"Found {len(siteDocuments)} documents in site {siteName}")
    return siteDocuments


async def findDocumentPath(self, parameters: Dict[str, Any]) -> ActionResult:
    """Locate files or folders in SharePoint and return their paths as a JSON document.

    Keys read from ``parameters``:
        connectionReference: required; resolved via ``self.connection.getMicrosoftConnection``.
        searchQuery: parsed by ``self.pathProcessing.parseSearchQuery`` (default "*").
        site: optional site hint; takes priority over a site taken from the
            query path and over the parsed ``site_hint`` option.
        maxResults: cap on the number of returned records (default 1000).
        parentOperationId: optional parent id for progress logging.

    Flow: resolve the candidate sites (directly from a '/sites/...' path if
    possible, otherwise by discovery and optional hint filtering); for folder
    searches with a real query try one tenant-wide unified search; if that
    yields nothing, list/search each site's default drive.

    Returns:
        A successful ``ActionResult`` carrying one JSON ``ActionDocument`` with
        the keys searchQuery, totalResults, maxResults, foundDocuments and
        timestamp, or ``ActionResult.isFailure(...)`` on any error. Exceptions
        are caught and reported as failures rather than propagated.
    """
    operationId = None
    try:
        # Progress tracking setup
        workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
        operationId = f"sharepoint_find_{workflowId}_{int(time.time())}"
        parentOperationId = parameters.get('parentOperationId')
        self.services.chat.progressLogStart(
            operationId,
            "Find Document Path",
            "SharePoint Search",
            f"Query: {parameters.get('searchQuery', '*')}",
            parentOperationId=parentOperationId
        )

        connectionReference = parameters.get("connectionReference")
        siteParam = parameters.get("site")
        searchQuery = parameters.get("searchQuery", "*")
        maxResults = parameters.get("maxResults", 1000)

        if not connectionReference:
            return _failAction(self, operationId, "Connection reference is required")

        # Split the query into path, search terms, search type and options
        pathQuery, fileQuery, searchType, searchOptions = self.pathProcessing.parseSearchQuery(searchQuery)
        logger.debug(f"Parsed searchQuery '{searchQuery}' -> pathQuery='{pathQuery}', fileQuery='{fileQuery}', searchType='{searchType}'")

        self.services.chat.progressLogUpdate(operationId, 0.2, "Getting Microsoft connection")
        connection = self.connection.getMicrosoftConnection(connectionReference)
        if not connection:
            return _failAction(self, operationId, "No valid Microsoft connection found for the provided connection reference")

        # Shortcut: a '/sites/SiteName/...' path lets us fetch the site directly
        siteFromPath = None
        directSite = None
        if pathQuery and pathQuery.startswith('/sites/'):
            parsedPath = self.siteDiscovery.extractSiteFromStandardPath(pathQuery)
            if parsedPath:
                siteFromPath = parsedPath.get("siteName")
                logger.info(f"Extracted site from Microsoft-standard pathQuery '{pathQuery}': '{siteFromPath}'")
                directSite = await self.siteDiscovery.getSiteByStandardPath(siteFromPath)
                if directSite:
                    logger.info("Got site directly by standard path - no need to discover all sites")
                    sites = [directSite]
                else:
                    logger.warning("Could not get site directly, falling back to site discovery")
                    directSite = None
            else:
                logger.warning(f"Failed to parse site from standard pathQuery '{pathQuery}'")

        if not directSite:
            # Hint priority: site parameter > site from pathQuery > site_hint option
            siteHintToUse = siteParam or siteFromPath or searchOptions.get("site_hint")
            self.services.chat.progressLogUpdate(operationId, 0.3, "Discovering SharePoint sites")
            if siteHintToUse:
                allSites = await self.siteDiscovery.discoverSharePointSites()
                if not allSites:
                    return _failAction(self, operationId, "No SharePoint sites found or accessible")
                sites = self.siteDiscovery.filterSitesByHint(allSites, siteHintToUse)
                logger.info(f"Filtered sites by site hint '{siteHintToUse}' -> {len(sites)} sites")
                if not sites:
                    return _failAction(self, operationId, f"No SharePoint sites found matching '{siteHintToUse}'")
            else:
                sites = await self.siteDiscovery.discoverSharePointSites()
                if not sites:
                    return _failAction(self, operationId, "No SharePoint sites found or accessible")

        # Kept for parity with the original flow; the resolved paths are not used below.
        self.pathProcessing.resolvePathQuery(pathQuery)

        self.services.chat.progressLogUpdate(operationId, 0.5, f"Searching across {len(sites)} site(s)")
        try:
            foundDocuments = []

            if searchType == "folders" and not _isWildcardQuery(fileQuery):
                # Tenant-wide unified search; any failure falls through to per-site search
                try:
                    foundDocuments = await _searchFoldersUnified(self, fileQuery, sites)
                    logger.info(f"Found {len(foundDocuments)} documents from unified search")
                except Exception as e:
                    logger.error(f"Error performing unified folder search: {str(e)}")
                    foundDocuments = []

            if not foundDocuments:
                for siteInfo in sites:
                    siteDocuments = await _searchSite(self, siteInfo, pathQuery, fileQuery, searchType)
                    if siteDocuments is None:
                        continue  # request for this site failed; already logged
                    foundDocuments.extend(siteDocuments)

            if len(foundDocuments) > maxResults:
                foundDocuments = foundDocuments[:maxResults]
                logger.info(f"Limited results to {maxResults} items")

            self.services.chat.progressLogUpdate(operationId, 0.9, f"Found {len(foundDocuments)} document(s)")
            resultData = {
                "searchQuery": searchQuery,
                "totalResults": len(foundDocuments),
                "maxResults": maxResults,
                "foundDocuments": foundDocuments,
                "timestamp": self.services.utils.timestampGetUtc()
            }
        except Exception as e:
            logger.error(f"Error searching SharePoint: {str(e)}")
            return _failAction(self, operationId, str(e))

        outputMimeType = "application/json"
        validationMetadata = {
            "actionType": "sharepoint.findDocumentPath",
            "searchQuery": searchQuery,
            "maxResults": maxResults,
            "totalResults": len(foundDocuments),
            "hasResults": len(foundDocuments) > 0
        }
        self.services.chat.progressLogFinish(operationId, True)
        return ActionResult(
            success=True,
            documents=[
                ActionDocument(
                    documentName=self._generateMeaningfulFileName("sharepoint_find_path", "json", None, "findDocumentPath"),
                    documentData=json.dumps(resultData, indent=2),
                    mimeType=outputMimeType,
                    validationMetadata=validationMetadata
                )
            ]
        )
    except Exception as e:
        logger.error(f"Error finding document path: {str(e)}")
        if operationId:
            try:
                self.services.chat.progressLogFinish(operationId, False)
            except Exception as finishError:
                # Best effort only: the original error is what gets reported.
                logger.debug(f"Could not close progress log {operationId}: {finishError}")
        return ActionResult.isFailure(error=str(e))