# gateway/modules/workflows/methods/methodSharepoint/helpers/pathProcessing.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.

"""
Path Processing helper for SharePoint operations.
Handles search query parsing, path resolution, and query cleaning.
"""

import logging
import re
from typing import List, Optional, Dict, Any

logger = logging.getLogger(__name__)

class PathProcessingHelper:
    """Helper for path and query processing"""

    def __init__(self, methodInstance):
        """
        Bind this helper to the method object that owns it.

        Args:
            methodInstance: Instance of MethodSharepoint. A reference to it is
                kept on ``self.method`` and its ``services`` attribute is
                exposed directly as ``self.services``.
        """
        # Keep the owner first, then a shortcut to its services container.
        self.method = methodInstance
        self.services = methodInstance.services

    def parseSearchQuery(self, searchQuery: str) -> tuple[str, str, str, dict]:
        """
        Parse searchQuery to extract path, search terms, search type, and search options.

        CRITICAL: NEVER convert words to paths! Words stay as search terms.
        - "root document lesson" → fileQuery="root document lesson" (NOT "/root/document/lesson")
        - "root, gose" → fileQuery="root, gose" (NOT "/root/gose")
        - "druckersteuerung eskalation logobject" → fileQuery="druckersteuerung eskalation logobject"

        Processing order (each step consumes its token from the query):
            1. Leading type prefix "files:", "folders:" or "all:" -> searchType
            2. Site hint, either leading "site:Name" (name ends at the first space)
               or inline "site=Name" -> searchOptions["site_hint"]
            3. name="..." -> the quoted value replaces the whole remaining query
            4. Leading mode prefix "exact:", "regex:", "case:" or "and:" -> searchOptions flag
            5. "path:search" split on a colon; a relative path part is placed
               under "/Documents/"
        Whenever the remaining search term is empty, fileQuery is the wildcard "*".

        Parameters:
            searchQuery (str): Enhanced search query with options:
                - "budget" -> pathQuery="*", fileQuery="budget", searchType="all", options={}
                - "root document lesson" -> pathQuery="*", fileQuery="root document lesson", searchType="all", options={}
                - "root, gose" -> pathQuery="*", fileQuery="root, gose", searchType="all", options={}
                - "/Documents:budget" -> pathQuery="/Documents", fileQuery="budget", searchType="all", options={}
                - "files:budget" -> pathQuery="*", fileQuery="budget", searchType="files", options={}
                - "files:" -> pathQuery="*", fileQuery="*", searchType="files", options={}
                - "folders:DELTA" -> pathQuery="*", fileQuery="DELTA", searchType="folders", options={}
                - "exact:\"Operations 2025\"" -> exact phrase matching
                - "regex:^Operations.*2025$" -> regex pattern matching
                - "case:DELTA" -> case-sensitive search
                - "and:DELTA AND 2025 Mars AND Group" -> all AND terms must be present

        Returns:
            tuple[str, str, str, dict]: (pathQuery, fileQuery, searchType, searchOptions)

        Raises:
            ValueError: If anything fails while parsing (e.g. a non-string
                searchQuery); the original exception is attached as the cause.
        """
        typePrefixes = ("files:", "folders:", "all:")
        # Mode prefix -> flag set in searchOptions
        modeOptionKeys = {
            "exact": "exact_match",
            "regex": "regex_match",
            "case": "case_sensitive",
            "and": "and_terms",
        }

        def _extractSiteHint(q: str) -> tuple[str, Optional[str]]:
            """Return (query without the site token, site name or None)."""
            qStrip = q.strip()
            # Leading form: "site:Name rest of query" - name runs to the first space
            if qStrip.lower().startswith("site:"):
                siteName, _, rest = qStrip[5:].lstrip().partition(' ')
                return rest.strip(), siteName.strip()
            # Inline key=value form anywhere: "... site=Name; ..."
            m = re.search(r"\bsite=([^;\s]+)", qStrip, flags=re.IGNORECASE)
            if m:
                # Remove the token (and an optional trailing ';') from the query
                qNew = re.sub(r"\bsite=[^;\s]+;?", "", qStrip, flags=re.IGNORECASE).strip()
                return qNew, m.group(1).strip()
            return q, None

        try:
            if not searchQuery or not searchQuery.strip() or searchQuery.strip() == "*":
                return "*", "*", "all", {}

            searchQuery = searchQuery.strip()
            searchOptions = {}

            # CRITICAL: Do NOT convert space-separated or comma-separated words to paths!
            # "root document lesson" should stay as "root document lesson", NOT "/root/document/lesson"
            # "root, gose" should stay as "root, gose", NOT "/root/gose"

            # Check for search type specification (files:, folders:, all:) FIRST
            searchType = "all"  # Default
            if searchQuery.startswith(typePrefixes):
                searchType, _, searchQuery = searchQuery.partition(':')
                searchQuery = searchQuery.strip()

            # Extract optional site hint tokens: "site=Name" or leading "site:Name"
            searchQuery, extractedSite = _extractSiteHint(searchQuery)
            if extractedSite:
                searchOptions["site_hint"] = extractedSite
                logger.info(f"Extracted site hint: '{extractedSite}'")

            # Extract name="..." if present (for quoted multi-word names)
            nameMatch = re.search(r"name=\"([^\"]+)\"", searchQuery)
            if nameMatch:
                searchQuery = nameMatch.group(1)
                logger.info(f"Extracted name from quotes: '{searchQuery}'")

            # Check for search mode specification (exact:, regex:, case:, and:)
            # NOTE(review): a colon inside the exact:/regex: term (e.g. "regex:^(?:a|b)")
            # is still interpreted as the path:search separator further down.
            if searchQuery.startswith(("exact:", "regex:", "case:", "and:")):
                mode, _, searchQuery = searchQuery.partition(':')
                searchQuery = searchQuery.strip()
                searchOptions[modeOptionKeys[mode]] = True
                # Exact phrases may be wrapped in double quotes - unwrap them
                if mode == "exact" and searchQuery.startswith('"') and searchQuery.endswith('"'):
                    searchQuery = searchQuery[1:-1]

            # Check if it contains path:search format
            # Microsoft-standard paths: /sites/SiteName/Path:files:.pdf
            if ':' in searchQuery:
                if searchQuery.startswith('/sites/'):
                    # The path itself may contain colons, so pick the first colon that is
                    # followed by a search type, a file extension or nothing; if none
                    # qualifies, fall back to the last colon. At least one colon exists
                    # here, and it can never be at index 0 because of the '/sites/' prefix.
                    colonPositions = [i for i, char in enumerate(searchQuery) if char == ':']
                    splitPos = colonPositions[-1]
                    for pos in colonPositions:
                        afterColon = searchQuery[pos+1:pos+10].strip().lower()
                        if afterColon == '' or afterColon.startswith(('files:', 'folders:', 'all:', '.')):
                            splitPos = pos
                            break
                    pathPart = searchQuery[:splitPos].strip()
                    searchPart = searchQuery[splitPos+1:].strip()
                else:
                    # Regular path:search format - split on first colon
                    pathPart, _, searchPart = searchQuery.partition(':')
                    pathPart = pathPart.strip()
                    searchPart = searchPart.strip()

                # The search part may itself start with a search type (files:, folders:, all:)
                if searchPart.startswith(typePrefixes):
                    searchType, _, searchPart = searchPart.partition(':')
                    searchPart = searchPart.strip()

                # Handle path part: empty/"*" is global, relative paths live under /Documents
                if not pathPart or pathPart == "*":
                    pathQuery = "*"
                elif pathPart.startswith('/'):
                    pathQuery = pathPart
                else:
                    pathQuery = f"/Documents/{pathPart}"

                # Handle search part: empty means "match everything"
                fileQuery = searchPart if searchPart and searchPart != "*" else "*"

                return pathQuery, fileQuery, searchType, searchOptions

            # No colon - a leading slash marks a path-only query
            if searchQuery.startswith('/'):
                return searchQuery, "*", searchType, searchOptions

            # It's a search term only - keep words as-is, do NOT convert to paths.
            # If stripping prefixes/hints left nothing (e.g. "files:" or "site:Name"),
            # return the wildcard instead of an empty term, like every other branch.
            return "*", searchQuery or "*", searchType, searchOptions

        except Exception as e:
            logger.error(f"Error parsing searchQuery '{searchQuery}': {str(e)}")
            raise ValueError(f"Failed to parse searchQuery '{searchQuery}': {str(e)}") from e

    def resolvePathQuery(self, pathQuery: str) -> List[str]:
        """
        Expand a pathQuery string into the ordered, de-duplicated list of
        search paths it denotes.

        Parameters:
            pathQuery (str): One or more entries separated by ';'. Each entry is
                handled as follows:
                - contains '*' or starts with '/'  -> kept unchanged
                - no spaces (e.g. "Project1")      -> "/Documents/Project1"
                - spaces plus '/' or '\\'          -> kept unchanged (folder names with spaces)
                - spaces only (plain words)        -> "*" (words are search terms, not folders)
                An empty, blank or "*" pathQuery means a global search.

        Returns:
            List[str]: Resolved paths in first-seen order without duplicates;
                ["*"] for a global search.

        Raises:
            ValueError: If resolving fails for any reason.
        """
        try:
            isBlank = not pathQuery or not pathQuery.strip()
            if isBlank or pathQuery.strip() == "*":
                return ["*"]  # Global search across all sites

            # Dict keys keep insertion order, which gives order-preserving de-duplication.
            ordered = {}
            for rawPath in (segment.strip() for segment in pathQuery.split(';')):
                if not rawPath:
                    continue  # skip empty entries such as "a;;b"

                if '*' in rawPath or rawPath.startswith('/'):
                    # Wildcards and absolute paths pass through untouched
                    resolved = rawPath
                elif ' ' not in rawPath:
                    # Single token without spaces: relative to the default folder
                    resolved = f"/Documents/{rawPath}"
                elif '/' in rawPath or '\\' in rawPath:
                    # Has separators, so the spaces belong to folder names
                    resolved = rawPath
                    logger.info(f"Path with spaces '{rawPath}' treated as valid folder path")
                else:
                    # Plain space-separated words are search terms -> search globally
                    logger.info(f"Space-separated words '{rawPath}' treated as search terms, not folder path")
                    resolved = "*"

                ordered.setdefault(resolved, None)

            uniquePaths = list(ordered)
            logger.info(f"Resolved pathQuery '{pathQuery}' to {len(uniquePaths)} paths: {uniquePaths}")
            return uniquePaths

        except Exception as e:
            logger.error(f"Error resolving pathQuery '{pathQuery}': {str(e)}")
            raise ValueError(f"Failed to resolve pathQuery '{pathQuery}': {str(e)}")

    def cleanSearchQuery(self, query: str) -> str:
        """
        Clean search query to make it compatible with Graph API KQL syntax.
        Removes path-like syntax and invalid KQL constructs.

        Steps:
            1. Drop a leading type prefix ("files:", "folders:", "all:") so that
               "files:budget" yields "budget" rather than the prefix word itself.
            2. Pull out a file extension, either from ":files:.ext" / ":folders:.ext"
               or from a known document extension at the end of the query.
            3. Keep only the last path segment (after '/' or '\\') and only the
               part of it before the first ':'.
            4. Append the extension (without the dot) as an additional term.
            5. Replace every character other than word characters, whitespace,
               '-', '.' and '*' with a space and collapse whitespace.

        Examples:
            "Company Share/Docs/expenses:files:.pdf" -> "expenses pdf"
            "report.pdf"                             -> "report pdf"
            "files:budget"                           -> "budget"

        Parameters:
            query (str): Raw search query that may contain paths and invalid syntax

        Returns:
            str: Cleaned query suitable for Graph API search endpoint. "" when
                the input is empty or blank, "*" when nothing searchable is left
                after cleaning.
        """
        if not query or not query.strip():
            return ""

        query = query.strip()

        # A leading type prefix is a filter, not a search term. Without this step
        # "files:budget" would be cut at the colon and come out as "files".
        leadingType = re.match(r'(?:(?:files|folders|all):\s*)+', query, flags=re.IGNORECASE)
        if leadingType:
            query = query[leadingType.end():].strip()

        # Handle patterns like: "Company Share/Freigegebene Dokumente/.../expenses:files:.pdf"
        # First, extract file extension if present (format: :files:.pdf or just .pdf at the end)
        fileExtension = ""
        loweredQuery = query.lower()
        if ':files:' in loweredQuery or ':folders:' in loweredQuery:
            # Extract extension after the type filter
            extMatch = re.search(r':(?:files|folders):(\.\w+)', query, re.IGNORECASE)
            if extMatch:
                fileExtension = extMatch.group(1)
            # Remove the type filter part (including whatever follows it directly)
            query = re.sub(r':(?:files|folders):\.?\w*', '', query, flags=re.IGNORECASE)
        elif query.endswith(('.pdf', '.doc', '.docx', '.xls', '.xlsx', '.txt', '.csv', '.ppt', '.pptx')):
            # Extract extension from end
            extMatch = re.search(r'(\.\w+)$', query)
            if extMatch:
                fileExtension = extMatch.group(1)
                query = query[:-len(fileExtension)]

        # Extract search term: last segment after the last slash (filename part),
        # truncated at the first colon so no "term:filter" syntax survives.
        lastSegment = query.replace('\\', '/').rsplit('/', 1)[-1]
        searchTerm = lastSegment.split(':', 1)[0].strip()

        # If we have a file extension, include it in the search term.
        # Note: Graph API search endpoint may not support filetype: syntax,
        # so the extension is added as a plain term (results can be filtered afterwards).
        if fileExtension:
            extWithoutDot = fileExtension.lstrip('.')
            searchTerm = f"{searchTerm} {extWithoutDot}" if searchTerm else extWithoutDot

        # Final cleanup: replace characters that are invalid for KQL with spaces.
        # Keeps word characters, whitespace, hyphens, dots and '*'.
        searchTerm = re.sub(r'[^\w\s\-\.\*]', ' ', searchTerm)
        searchTerm = ' '.join(searchTerm.split())  # Normalize whitespace

        return searchTerm if searchTerm else "*"