# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Find Document Path action for SharePoint operations.
Finds documents and folders by name/path across SharePoint sites.
"""

import logging
import time
import json
import urllib.parse
from typing import Dict, Any, List, Optional, Tuple
from modules.workflows.methods.methodBase import action
from modules.datamodels.datamodelChat import ActionResult, ActionDocument

logger = logging.getLogger(__name__)

# Default for the optional ``maxResults`` parameter (documented in the action docstring).
_DEFAULT_MAX_RESULTS = 1000
# Page size requested from the unified Graph search endpoint.
_UNIFIED_SEARCH_PAGE_SIZE = 50
# URL/path segment that introduces a SharePoint site name.
_SITES_MARKER = "/sites/"


def _coerceMaxResults(value: Any, default: int = _DEFAULT_MAX_RESULTS) -> int:
    """Return ``value`` as a non-negative int, or ``default`` when it is unusable.

    ``maxResults`` is later used as a slice bound, so a string/None would raise
    and a negative value would silently drop trailing results.
    """
    try:
        limit = int(value)
    except (TypeError, ValueError):
        return default
    return limit if limit >= 0 else default


def _isWildcardQuery(fileQuery: Any) -> bool:
    """Return True when the file query is empty, whitespace-only or ``*``."""
    return not fileQuery or fileQuery.strip() in ("", "*")


def _matchesSearchType(searchType: Any, isFolder: bool) -> bool:
    """Return False when the item kind contradicts an explicit files/folders search."""
    if searchType == "files" and isFolder:
        return False
    if searchType == "folders" and not isFolder:
        return False
    return True


def _graphCallFailed(result: Any) -> bool:
    """Return True when a Graph response is unusable or carries an ``error`` entry."""
    return not isinstance(result, dict) or "error" in result


def _describeGraphError(result: Any) -> Any:
    """Return the error payload of a failed Graph response for logging."""
    if isinstance(result, dict):
        return result.get("error")
    return f"unexpected response: {result!r}"


def _extractFullPath(webUrl: str, parentPath: str) -> str:
    """Build the backslash-separated path used for action chaining.

    Prefers the part of ``webUrl`` after ``/sites/`` (URL-decoded); falls back
    to ``parentPath`` whenever the URL does not yield a path.
    """
    if webUrl and _SITES_MARKER in webUrl:
        # maxsplit=1 keeps the remainder intact if '/sites/' occurs again later.
        pathPart = webUrl.split(_SITES_MARKER, 1)[1]
        return "\\" + urllib.parse.unquote(pathPart).replace("/", "\\")
    if parentPath:
        return parentPath.replace("/", "\\")
    return ""


def _buildDocInfo(item: Dict[str, Any], isFolder: bool, siteName: str, siteId: str) -> Dict[str, Any]:
    """Convert a Graph driveItem dict into the result entry emitted by the action."""
    webUrl = item.get("webUrl") or ""
    parentPath = (item.get("parentReference") or {}).get("path") or ""
    return {
        "id": item.get("id"),
        "name": item.get("name"),
        "type": "folder" if isFolder else "file",
        "siteName": siteName,
        "siteId": siteId,
        "webUrl": webUrl,
        "fullPath": _extractFullPath(webUrl, parentPath),
        "parentPath": parentPath
    }


def _resolveSiteFromWebUrl(webUrl: str, sites: List[Dict[str, Any]]) -> Tuple[str, str]:
    """Map an item's ``webUrl`` to ``(siteName, siteId)`` using the known sites.

    Match priority: exact ``name``, exact ``displayName``, then case-insensitive
    substring. Without a match the URL segment is used as name and the id stays
    ``"unknown"``.
    """
    siteName, siteId = "Unknown Site", "unknown"
    if not webUrl or _SITES_MARKER not in webUrl:
        return siteName, siteId
    try:
        sitePath = webUrl.split(_SITES_MARKER, 1)[1].split("/")[0]
        if not sitePath:
            # An empty segment would substring-match every site.
            return siteName, siteId

        for key in ("name", "displayName"):
            for candidate in sites:
                if candidate.get(key) == sitePath:
                    return candidate.get("displayName", sitePath), candidate.get("id", "unknown")

        needle = sitePath.lower()
        for candidate in sites:
            candidateName = (candidate.get("name") or "").lower()
            candidateDisplay = (candidate.get("displayName") or "").lower()
            if needle in candidateName or needle in candidateDisplay:
                return candidate.get("displayName", sitePath), candidate.get("id", "unknown")

        return sitePath, siteId
    except (AttributeError, TypeError) as e:
        logger.warning(f"Error extracting site info from URL {webUrl}: {e}")
        return siteName, siteId


def _failProgress(self, operationId: Optional[str], error: str) -> ActionResult:
    """Mark the progress operation as failed and return a failure result."""
    if operationId:
        self.services.chat.progressLogFinish(operationId, False)
    return ActionResult.isFailure(error=error)


async def _searchFoldersUnified(self, fileQuery: str, sites: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Find folders through the global Graph ``search/query`` endpoint.

    Returns the folder entries found (possibly empty). The caller treats an
    empty list or a raised exception as "fall back to site-by-site search".
    """
    # Simple term search; multiple terms are AND-ed for more specific results.
    queryString = " AND ".join(fileQuery.split())
    payload = {
        "requests": [
            {
                "entityTypes": ["driveItem"],
                "query": {"queryString": queryString},
                "from": 0,
                "size": _UNIFIED_SEARCH_PAGE_SIZE
            }
        ]
    }
    logger.info(f"Using unified search API for folders with queryString: {queryString}")

    # The endpoint is global; results are mapped back to sites via webUrl below.
    unifiedResult = await self.apiClient.makeGraphApiCall(
        "search/query",
        method="POST",
        data=json.dumps(payload).encode("utf-8")
    )
    if _graphCallFailed(unifiedResult):
        logger.warning(f"Unified search failed: {_describeGraphError(unifiedResult)}")
        return []

    # Flatten value -> hitsContainers -> hits -> resource.
    resources = []
    for container in unifiedResult.get("value") or []:
        for hitsContainer in container.get("hitsContainers") or []:
            for hit in hitsContainer.get("hits") or []:
                resource = hit.get("resource")
                if resource:
                    resources.append(resource)
    logger.info(f"Unified search returned {len(resources)} items (pre-filter)")

    folders = [r for r in resources if self.services.sharepoint.detectFolderType(r)]
    logger.info(f"Filtered to {len(folders)} folders using improved detection logic")

    documents = []
    for item in folders:
        itemPath = (item.get("parentReference") or {}).get("path", "")
        logger.debug(f"Processing folder: '{item.get('name', '')}' at path: '{itemPath}'")
        siteName, siteId = _resolveSiteFromWebUrl(item.get("webUrl") or "", sites)
        documents.append(_buildDocInfo(item, True, siteName, siteId))
    return documents


async def _resolveFolderPathForSite(self, siteId: str, innerPath: Optional[str]) -> Optional[str]:
    """Turn the path after ``/sites/<name>`` into a drive-relative folder path.

    ``/drive/root`` already addresses the default document library, so a leading
    library segment has to be dropped. Whether the first segment is a library is
    probed via the API (language independent). Returns None for the drive root.
    """
    if not innerPath or not innerPath.strip():
        return None

    folderPath = innerPath.lstrip("/")
    pathSegments = [s for s in folderPath.split("/") if s.strip()]

    if len(pathSegments) > 1:
        # Probe the path without its first segment (likely the document library).
        testPath = "/".join(pathSegments[1:])
        # NOTE(review): safe='' also percent-encodes '/' inside the path - confirm
        # Graph path addressing accepts %2F for nested folders.
        testEndpoint = f"sites/{siteId}/drive/root:/{urllib.parse.quote(testPath, safe='')}:"
        testResult = await self.apiClient.makeGraphApiCall(testEndpoint)
        if testResult and "error" not in testResult:
            folderPath = testPath
            logger.info(f"Removed document library name '{pathSegments[0]}' from folder path (tested via API)")
        else:
            logger.info(f"Keeping original folder path '{folderPath}' (first segment is not a document library)")
    elif len(pathSegments) == 1:
        # A single segment is most likely the document library itself.
        folderPath = None
        logger.info(f"Only one segment '{pathSegments[0]}' found, likely document library - using root")

    if folderPath:
        logger.info(f"Extracted folder path from pathQuery: '{folderPath}'")
        return folderPath
    logger.info("Folder path resolved to root (only document library in path)")
    return None


async def _searchSite(
    self,
    site: Dict[str, Any],
    fileQuery: Any,
    searchType: Any,
    innerPath: Optional[str]
) -> Optional[List[Dict[str, Any]]]:
    """List or search one site's default drive.

    Returns the matching entries, or None when the site could not be queried
    (the caller then simply moves on to the next site).
    """
    siteId = site.get("id")
    if not siteId:
        logger.warning(f"Skipping site without id: {site}")
        return None
    siteName = site.get("displayName") or site.get("name") or siteId
    siteUrl = site.get("webUrl", "")
    logger.info(f"Searching in site: {siteName} ({siteUrl})")

    folderPath = await _resolveFolderPathForSite(self, siteId, innerPath)
    encodedPath = urllib.parse.quote(folderPath, safe='') if folderPath else None

    if _isWildcardQuery(fileQuery):
        # Wildcard/empty query: list children instead of searching.
        # NOTE(review): only the first result page is read here - confirm whether
        # makeGraphApiCall follows @odata.nextLink.
        failureLabel = "List"
        if folderPath:
            endpoint = f"sites/{siteId}/drive/root:/{encodedPath}:/children"
            logger.info(f"Listing items in folder: '{folderPath}'")
        else:
            endpoint = f"sites/{siteId}/drive/root/children"
    else:
        failureLabel = "Search"
        searchQueryCleaned = self.pathProcessing.cleanSearchQuery(fileQuery)
        encodedQuery = urllib.parse.quote(searchQueryCleaned, safe='')
        if folderPath:
            endpoint = f"sites/{siteId}/drive/root:/{encodedPath}:/search(q='{encodedQuery}')"
            logger.info(f"Searching in folder '{folderPath}' with query: '{searchQueryCleaned}' (encoded: '{encodedQuery}')")
        else:
            endpoint = f"sites/{siteId}/drive/root/search(q='{encodedQuery}')"
            logger.info(f"Using search API for files with query: '{searchQueryCleaned}' (encoded: '{encodedQuery}')")

    result = await self.apiClient.makeGraphApiCall(endpoint)
    if _graphCallFailed(result):
        logger.warning(f"{failureLabel} failed for site {siteName}: {_describeGraphError(result)}")
        return None

    items = result.get("value") or []
    logger.info(f"Retrieved {len(items)} items from site {siteName}")

    siteDocuments = []
    for item in items:
        isFolder = self.services.sharepoint.detectFolderType(item)
        itemType = "folder" if isFolder else "file"
        itemPath = (item.get("parentReference") or {}).get("path", "")
        logger.debug(f"Processing {itemType}: '{item.get('name', '')}' at path: '{itemPath}'")
        if not _matchesSearchType(searchType, isFolder):
            continue
        siteDocuments.append(_buildDocInfo(item, isFolder, siteName, siteId))

    logger.info(f"Found {len(siteDocuments)} documents in site {siteName}")
    return siteDocuments


# NOTE: the docstring below is kept verbatim; the @action decorator may consume
# it at runtime, so implementation notes live in comments instead.
@action
async def findDocumentPath(self, parameters: Dict[str, Any]) -> ActionResult:
    """
    GENERAL:
    - Purpose: Find documents and folders by name/path across sites.
    - Input requirements: connectionReference (required); searchQuery (required); optional site, maxResults.
    - Output format: JSON with found items and paths.

    Parameters:
    - connectionReference (str, required): Microsoft connection label.
    - site (str, optional): Site hint.
    - searchQuery (str, required): Search terms or path.
    - maxResults (int, optional): Maximum items to return. Default: 1000.
    """
    operationId = None
    try:
        # Init progress logger
        workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
        operationId = f"sharepoint_find_{workflowId}_{int(time.time())}"

        # Start progress tracking
        parentOperationId = parameters.get('parentOperationId')
        self.services.chat.progressLogStart(
            operationId,
            "Find Document Path",
            "SharePoint Search",
            f"Query: {parameters.get('searchQuery', '*')}",
            parentOperationId=parentOperationId
        )

        connectionReference = parameters.get("connectionReference")
        siteParam = parameters.get("site")
        searchQuery = parameters.get("searchQuery", "*")
        if searchQuery is None:
            searchQuery = "*"
        maxResults = _coerceMaxResults(parameters.get("maxResults", _DEFAULT_MAX_RESULTS))

        if not connectionReference:
            return _failProgress(self, operationId, "Connection reference is required")

        # Parse searchQuery to extract path, search terms, search type, and options
        pathQuery, fileQuery, searchType, searchOptions = self.pathProcessing.parseSearchQuery(searchQuery)
        logger.debug(f"Parsed searchQuery '{searchQuery}' -> pathQuery='{pathQuery}', fileQuery='{fileQuery}', searchType='{searchType}'")

        self.services.chat.progressLogUpdate(operationId, 0.2, "Getting Microsoft connection")
        connection = self.connection.getMicrosoftConnection(connectionReference)
        if not connection:
            return _failProgress(
                self,
                operationId,
                "No valid Microsoft connection found for the provided connection reference"
            )

        # A Microsoft-standard path (/sites/SiteName/...) names the site directly
        # and may carry a folder path inside that site.
        siteFromPath = None
        directSite = None
        innerPath = ""
        sites = []
        if pathQuery and pathQuery.startswith(_SITES_MARKER):
            parsedPath = self.siteDiscovery.extractSiteFromStandardPath(pathQuery)
            if parsedPath:
                siteFromPath = parsedPath.get("siteName")
                innerPath = parsedPath.get("innerPath", "")
                logger.info(f"Extracted site from Microsoft-standard pathQuery '{pathQuery}': '{siteFromPath}'")

                # Optimization: fetch the site directly instead of discovering all sites
                directSite = await self.siteDiscovery.getSiteByStandardPath(siteFromPath)
                if directSite:
                    logger.info("Got site directly by standard path - no need to discover all sites")
                    sites = [directSite]
                else:
                    logger.warning("Could not get site directly, falling back to site discovery")
                    directSite = None
            else:
                logger.warning(f"Failed to parse site from standard pathQuery '{pathQuery}'")

        # If we didn't get the site directly, use discovery and filtering
        if not directSite:
            # Priority: site parameter > site from pathQuery > site_hint from searchOptions
            siteHintToUse = siteParam or siteFromPath or (searchOptions or {}).get("site_hint")

            self.services.chat.progressLogUpdate(operationId, 0.3, "Discovering SharePoint sites")
            sites = await self.siteDiscovery.discoverSharePointSites()
            if not sites:
                return _failProgress(self, operationId, "No SharePoint sites found or accessible")

            if siteHintToUse:
                sites = self.siteDiscovery.filterSitesByHint(sites, siteHintToUse)
                logger.info(f"Filtered sites by site hint '{siteHintToUse}' -> {len(sites)} sites")
                if not sites:
                    return _failProgress(
                        self,
                        operationId,
                        f"No SharePoint sites found matching '{siteHintToUse}'"
                    )

        # Resolve path query into search paths.
        # NOTE(review): the result is not used by the search below; the call is
        # kept in case resolvePathQuery validates or raises - confirm.
        searchPaths = self.pathProcessing.resolvePathQuery(pathQuery)
        logger.debug(f"Resolved search paths: {searchPaths}")

        self.services.chat.progressLogUpdate(operationId, 0.5, f"Searching across {len(sites)} site(s)")

        try:
            foundDocuments = []

            # Named folder searches go through the global unified search first.
            if searchType == "folders" and not _isWildcardQuery(fileQuery):
                try:
                    foundDocuments = await _searchFoldersUnified(self, fileQuery, sites)
                    logger.info(f"Found {len(foundDocuments)} documents from unified search")
                except Exception as e:
                    # Best effort: discard partial results and fall back below.
                    logger.error(f"Error performing unified folder search: {str(e)}")
                    foundDocuments = []

            # If no unified search was performed or it found nothing, search site by site
            if not foundDocuments:
                for siteEntry in sites:
                    # Further sites cannot change the truncated result.
                    if len(foundDocuments) >= maxResults:
                        break
                    siteDocuments = await _searchSite(self, siteEntry, fileQuery, searchType, innerPath)
                    if siteDocuments is None:
                        continue
                    foundDocuments.extend(siteDocuments)

            # Limit total results to maxResults
            if len(foundDocuments) > maxResults:
                foundDocuments = foundDocuments[:maxResults]
                logger.info(f"Limited results to {maxResults} items")

            self.services.chat.progressLogUpdate(operationId, 0.9, f"Found {len(foundDocuments)} document(s)")

            resultData = {
                "searchQuery": searchQuery,
                "totalResults": len(foundDocuments),
                "maxResults": maxResults,
                "foundDocuments": foundDocuments,
                "timestamp": self.services.utils.timestampGetUtc()
            }

        except Exception as e:
            logger.error(f"Error searching SharePoint: {str(e)}", exc_info=True)
            return _failProgress(self, operationId, str(e))

        # Output is always JSON
        outputMimeType = "application/json"

        validationMetadata = {
            "actionType": "sharepoint.findDocumentPath",
            "searchQuery": searchQuery,
            "maxResults": maxResults,
            "totalResults": len(foundDocuments),
            "hasResults": len(foundDocuments) > 0
        }

        self.services.chat.progressLogFinish(operationId, True)

        return ActionResult(
            success=True,
            documents=[
                ActionDocument(
                    documentName=self._generateMeaningfulFileName("sharepoint_find_path", "json", None, "findDocumentPath"),
                    documentData=json.dumps(resultData, indent=2),
                    mimeType=outputMimeType,
                    validationMetadata=validationMetadata
                )
            ]
        )

    except Exception as e:
        logger.error(f"Error finding document path: {str(e)}", exc_info=True)
        if operationId:
            try:
                self.services.chat.progressLogFinish(operationId, False)
            except Exception as finishError:
                # Closing the progress log is best effort; never mask the original error.
                logger.debug(f"Could not finish progress log {operationId}: {finishError}")
        return ActionResult.isFailure(error=str(e))