# gateway/modules/workflows/methods/methodSharepoint/actions/findDocumentPath.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.

import logging
import time
import json
import urllib.parse
from typing import Dict, Any
from modules.datamodels.datamodelChat import ActionResult, ActionDocument

logger = logging.getLogger(__name__)

def _graphCallFailed(result: Any) -> tuple:
    """Classify the result of a Graph API call.

    Returns:
        ``(failed, detail)``. A ``None`` result or a result carrying an
        ``"error"`` key counts as failed; ``detail`` is the value to log.
    """
    if result is None:
        return True, "empty response"
    if "error" in result:
        return True, result["error"]
    return False, None


def _buildFullPath(webUrl: str, parentPath: str) -> str:
    """Build the backslash-separated path reported as ``fullPath``.

    The part of ``webUrl`` after the first ``/sites/`` is URL-decoded and
    prefixed with a backslash. When ``webUrl`` is missing or has no
    ``/sites/`` segment, ``parentPath`` is used instead; with neither the
    result is an empty string.
    """
    if webUrl and '/sites/' in webUrl:
        # Split only once so a folder that is itself called "sites" further
        # down the path does not truncate the result.
        pathPart = webUrl.split('/sites/', 1)[1]
        return "\\" + urllib.parse.unquote(pathPart).replace('/', '\\')
    if parentPath:
        return parentPath.replace('/', '\\')
    return ""


def _buildDocumentInfo(item: Dict[str, Any], isFolder: bool, siteName: str, siteId: str) -> Dict[str, Any]:
    """Convert one driveItem dict into the entry stored in ``foundDocuments``."""
    webUrl = item.get("webUrl", "")
    parentPath = (item.get("parentReference") or {}).get("path", "")
    return {
        "id": item.get("id"),
        "name": item.get("name"),
        "type": "folder" if isFolder else "file",
        "siteName": siteName,
        "siteId": siteId,
        "webUrl": webUrl,
        "fullPath": _buildFullPath(webUrl, parentPath),
        "parentPath": parentPath
    }


def _matchSiteForWebUrl(webUrl: str, sites: list) -> tuple:
    """Derive ``(siteName, siteId)`` for an item from its ``webUrl``.

    The first path segment after ``/sites/`` is compared against the given
    sites in this order: exact ``name``, exact ``displayName``, then a
    case-insensitive substring match on either field. Without any match the
    URL segment itself is returned as the name with id ``"unknown"``.
    """
    if not webUrl or '/sites/' not in webUrl:
        return "Unknown Site", "unknown"

    sitePath = webUrl.split('/sites/', 1)[1].split('/')[0]

    # Exact matches: all sites by "name" first, then all sites by "displayName".
    for key in ("name", "displayName"):
        for candidate in sites:
            if candidate.get(key) == sitePath:
                return candidate.get("displayName", sitePath), candidate.get("id", "unknown")

    # Loose match: the URL segment is contained in either field.
    needle = sitePath.lower()
    for candidate in sites:
        candidateName = (candidate.get("name") or "").lower()
        candidateDisplayName = (candidate.get("displayName") or "").lower()
        if needle in candidateName or needle in candidateDisplayName:
            return candidate.get("displayName", sitePath), candidate.get("id", "unknown")

    return sitePath, "unknown"


async def _resolveFolderPath(self, pathQuery: str, siteId: str) -> Any:
    """Return the drive-relative folder path named by ``pathQuery``, or ``None`` for the drive root.

    Only path queries starting with ``/sites/`` are considered. If the inner
    path has several segments, the path without its first segment is probed
    via the Graph API; when that probe succeeds the first segment is dropped
    (it is then taken to be the document library name). A single-segment
    inner path resolves to the drive root.
    """
    if not (pathQuery and pathQuery.startswith('/sites/')):
        return None

    parsedPath = self.siteDiscovery.extractSiteFromStandardPath(pathQuery)
    if not parsedPath:
        return None

    innerPath = parsedPath.get("innerPath", "")
    if not (innerPath and innerPath.strip()):
        return None

    folderPath = innerPath.lstrip('/')
    pathSegments = [s for s in folderPath.split('/') if s.strip()]

    if len(pathSegments) > 1:
        # /drive/root is addressed relative to the default document library, so a
        # leading library name must go. Probing the API avoids having to know the
        # (localized) library name.
        testPath = '/'.join(pathSegments[1:])
        testEndpoint = f"sites/{siteId}/drive/root:/{urllib.parse.quote(testPath, safe='')}:"
        testResult = await self.apiClient.makeGraphApiCall(testEndpoint)
        if testResult and "error" not in testResult:
            folderPath = testPath
            logger.info(f"Removed document library name '{pathSegments[0]}' from folder path (tested via API)")
        else:
            logger.info(f"Keeping original folder path '{folderPath}' (first segment is not a document library)")
    elif len(pathSegments) == 1:
        folderPath = None
        logger.info(f"Only one segment '{pathSegments[0]}' found, likely document library - using root")

    if folderPath:
        logger.info(f"Extracted folder path from pathQuery: '{folderPath}'")
    else:
        logger.info("Folder path resolved to root (only document library in path)")

    return folderPath or None


async def _unifiedFolderSearch(self, fileQuery: str, sites: list) -> list:
    """Search folders through the global ``search/query`` endpoint.

    All whitespace-separated terms of ``fileQuery`` are AND-combined. Hits are
    reduced to those that ``detectFolderType`` reports as folders and are
    returned as document-info dicts. A failed call yields an empty list.
    """
    terms = fileQuery.split()
    queryString = " AND ".join(terms) if terms else fileQuery

    payload = {
        "requests": [
            {
                "entityTypes": ["driveItem"],
                "query": {"queryString": queryString},
                "from": 0,
                "size": 50
            }
        ]
    }
    logger.info(f"Using unified search API for folders with queryString: {queryString}")

    # The search endpoint is tenant-wide; results are mapped back to sites below.
    unifiedResult = await self.apiClient.makeGraphApiCall(
        "search/query",
        method="POST",
        data=json.dumps(payload).encode("utf-8")
    )

    failed, detail = _graphCallFailed(unifiedResult)
    if failed:
        logger.warning(f"Unified search failed: {detail}")
        return []

    # Flatten value -> hitsContainers -> hits -> resource.
    resources = []
    for container in (unifiedResult.get("value", []) or []):
        for hitsContainer in (container.get("hitsContainers", []) or []):
            for hit in (hitsContainer.get("hits", []) or []):
                resource = hit.get("resource")
                if resource:
                    resources.append(resource)
    logger.info(f"Unified search returned {len(resources)} items (pre-filter)")

    detectFolder = self.services.sharepoint.detectFolderType
    folderItems = [resource for resource in resources if detectFolder(resource)]
    logger.info(f"Filtered to {len(folderItems)} folders using improved detection logic")

    documents = []
    for item in folderItems:
        webUrl = item.get("webUrl", "")
        try:
            siteName, siteId = _matchSiteForWebUrl(webUrl, sites)
        except Exception as e:
            logger.warning(f"Error extracting site info from URL {webUrl}: {e}")
            siteName, siteId = "Unknown Site", "unknown"

        itemPath = (item.get("parentReference") or {}).get("path", "")
        logger.debug(f"Processing folder: '{item.get('name', '')}' at path: '{itemPath}'")
        documents.append(_buildDocumentInfo(item, True, siteName, siteId))

    logger.info(f"Found {len(documents)} documents from unified search")
    return documents


async def findDocumentPath(self, parameters: Dict[str, Any]) -> ActionResult:
    """Find files and/or folders in SharePoint and return their paths as a JSON document.

    Args:
        parameters: Action parameters. Keys read here:
            ``connectionReference`` (required), ``searchQuery`` (default ``"*"``),
            ``site`` (optional site hint), ``maxResults`` (default ``1000``) and
            ``parentOperationId`` (optional, forwarded to the progress log).

    Returns:
        On success an ``ActionResult`` with one JSON ``ActionDocument`` holding
        ``searchQuery``, ``totalResults``, ``maxResults``, ``foundDocuments`` and
        ``timestamp``. On any failure ``ActionResult.isFailure(...)`` with the
        error text; exceptions are caught and reported the same way.
    """
    operationId = None
    try:
        workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
        operationId = f"sharepoint_find_{workflowId}_{int(time.time())}"

        self.services.chat.progressLogStart(
            operationId,
            "Find Document Path",
            "SharePoint Search",
            f"Query: {parameters.get('searchQuery', '*')}",
            parentOperationId=parameters.get('parentOperationId')
        )

        def fail(message: str) -> ActionResult:
            # Close the progress entry as failed and build the failure result.
            self.services.chat.progressLogFinish(operationId, False)
            return ActionResult.isFailure(error=message)

        connectionReference = parameters.get("connectionReference")
        siteParameter = parameters.get("site")
        searchQuery = parameters.get("searchQuery", "*")
        maxResults = parameters.get("maxResults", 1000)

        if not connectionReference:
            return fail("Connection reference is required")

        # Split the query into path part, file/folder terms, search type and options.
        pathQuery, fileQuery, searchType, searchOptions = self.pathProcessing.parseSearchQuery(searchQuery)
        logger.debug(f"Parsed searchQuery '{searchQuery}' -> pathQuery='{pathQuery}', fileQuery='{fileQuery}', searchType='{searchType}'")

        self.services.chat.progressLogUpdate(operationId, 0.2, "Getting Microsoft connection")
        connection = self.connection.getMicrosoftConnection(connectionReference)
        if not connection:
            return fail("No valid Microsoft connection found for the provided connection reference")

        # A Microsoft-standard path (/sites/SiteName/...) names the site directly;
        # fetching just that site avoids discovering every site.
        siteFromPath = None
        directSite = None
        if pathQuery and pathQuery.startswith('/sites/'):
            parsedPath = self.siteDiscovery.extractSiteFromStandardPath(pathQuery)
            if parsedPath:
                siteFromPath = parsedPath.get("siteName")
                logger.info(f"Extracted site from Microsoft-standard pathQuery '{pathQuery}': '{siteFromPath}'")

                directSite = await self.siteDiscovery.getSiteByStandardPath(siteFromPath)
                if directSite:
                    logger.info("Got site directly by standard path - no need to discover all sites")
                    sites = [directSite]
                else:
                    logger.warning("Could not get site directly, falling back to site discovery")
            else:
                logger.warning(f"Failed to parse site from standard pathQuery '{pathQuery}'")

        if not directSite:
            # Hint priority: site parameter > site from pathQuery > site_hint option.
            siteHintToUse = siteParameter or siteFromPath or searchOptions.get("site_hint")

            self.services.chat.progressLogUpdate(operationId, 0.3, "Discovering SharePoint sites")
            sites = await self.siteDiscovery.discoverSharePointSites()
            if not sites:
                return fail("No SharePoint sites found or accessible")

            if siteHintToUse:
                sites = self.siteDiscovery.filterSitesByHint(sites, siteHintToUse)
                logger.info(f"Filtered sites by site hint '{siteHintToUse}' -> {len(sites)} sites")
                if not sites:
                    return fail(f"No SharePoint sites found matching '{siteHintToUse}'")

        # NOTE(review): the resolved search paths are not used below; the call is
        # kept because it cannot be told from this file whether it also validates
        # pathQuery - confirm and remove if it is a pure lookup.
        self.pathProcessing.resolvePathQuery(pathQuery)

        self.services.chat.progressLogUpdate(operationId, 0.5, f"Searching across {len(sites)} site(s)")

        try:
            foundDocuments = []
            wildcardQuery = not fileQuery or fileQuery.strip() in ("", "*")

            # Named folders are looked up via the global search endpoint first.
            if searchType == "folders" and not wildcardQuery:
                try:
                    foundDocuments = await _unifiedFolderSearch(self, fileQuery, sites)
                except Exception as e:
                    # Best effort only: the per-site search below is the fallback.
                    logger.error(f"Error performing unified folder search: {str(e)}")

            # No unified search, or it produced nothing: query each site's drive.
            if not foundDocuments:
                searchQueryCleaned = None
                encodedQuery = None
                if not wildcardQuery:
                    # Strip path-like / invalid KQL syntax once, then URL-encode.
                    searchQueryCleaned = self.pathProcessing.cleanSearchQuery(fileQuery)
                    encodedQuery = urllib.parse.quote(searchQueryCleaned, safe='')

                for siteEntry in sites:
                    siteId = siteEntry["id"]
                    siteName = siteEntry["displayName"]
                    siteUrl = siteEntry["webUrl"]
                    logger.info(f"Searching in site: {siteName} ({siteUrl})")

                    folderPath = await _resolveFolderPath(self, pathQuery, siteId)
                    encodedPath = urllib.parse.quote(folderPath, safe='') if folderPath else None

                    if wildcardQuery:
                        # Empty or "*" query: list children instead of searching.
                        operationLabel = "List"
                        if folderPath:
                            endpoint = f"sites/{siteId}/drive/root:/{encodedPath}:/children"
                            logger.info(f"Listing items in folder: '{folderPath}'")
                        else:
                            endpoint = f"sites/{siteId}/drive/root/children"
                    else:
                        operationLabel = "Search"
                        if folderPath:
                            endpoint = f"sites/{siteId}/drive/root:/{encodedPath}:/search(q='{encodedQuery}')"
                            logger.info(f"Searching in folder '{folderPath}' with query: '{searchQueryCleaned}' (encoded: '{encodedQuery}')")
                        else:
                            endpoint = f"sites/{siteId}/drive/root/search(q='{encodedQuery}')"
                            logger.info(f"Using search API for files with query: '{searchQueryCleaned}' (encoded: '{encodedQuery}')")

                    apiResult = await self.apiClient.makeGraphApiCall(endpoint)
                    failed, detail = _graphCallFailed(apiResult)
                    if failed:
                        # One unreachable site must not abort the remaining sites.
                        logger.warning(f"{operationLabel} failed for site {siteName}: {detail}")
                        continue

                    items = apiResult.get("value") or []
                    logger.info(f"Retrieved {len(items)} items from site {siteName}")

                    siteDocuments = []
                    for item in items:
                        itemName = item.get("name", "")
                        isFolder = self.services.sharepoint.detectFolderType(item)
                        itemType = "folder" if isFolder else "file"
                        itemPath = (item.get("parentReference") or {}).get("path", "")
                        logger.debug(f"Processing {itemType}: '{itemName}' at path: '{itemPath}'")

                        # Honour the requested search type; any other type keeps both.
                        if searchType == "files" and isFolder:
                            continue
                        if searchType == "folders" and not isFolder:
                            continue

                        logger.debug(f"Item '{itemName}' found - adding to results")
                        siteDocuments.append(_buildDocumentInfo(item, isFolder, siteName, siteId))

                    foundDocuments.extend(siteDocuments)
                    logger.info(f"Found {len(siteDocuments)} documents in site {siteName}")

            if len(foundDocuments) > maxResults:
                foundDocuments = foundDocuments[:maxResults]
                logger.info(f"Limited results to {maxResults} items")

            self.services.chat.progressLogUpdate(operationId, 0.9, f"Found {len(foundDocuments)} document(s)")

            resultData = {
                "searchQuery": searchQuery,
                "totalResults": len(foundDocuments),
                "maxResults": maxResults,
                "foundDocuments": foundDocuments,
                "timestamp": self.services.utils.timestampGetUtc()
            }

        except Exception as e:
            logger.error(f"Error searching SharePoint: {str(e)}")
            return fail(str(e))

        validationMetadata = {
            "actionType": "sharepoint.findDocumentPath",
            "searchQuery": searchQuery,
            "maxResults": maxResults,
            "totalResults": len(foundDocuments),
            "hasResults": len(foundDocuments) > 0
        }

        self.services.chat.progressLogFinish(operationId, True)
        return ActionResult(
            success=True,
            documents=[
                ActionDocument(
                    documentName=self._generateMeaningfulFileName("sharepoint_find_path", "json", None, "findDocumentPath"),
                    documentData=json.dumps(resultData, indent=2),
                    mimeType="application/json",
                    validationMetadata=validationMetadata
                )
            ]
        )

    except Exception as e:
        logger.error(f"Error finding document path: {str(e)}")
        if operationId:
            try:
                self.services.chat.progressLogFinish(operationId, False)
            except Exception as finishError:
                # Closing the progress entry is best effort; report the original error.
                logger.debug(f"Could not close progress log '{operationId}': {finishError}")
        return ActionResult.isFailure(error=str(e))