# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Path Processing helper for SharePoint operations.
Handles search query parsing, path resolution, and query cleaning.
"""

import logging
import re
from typing import List, Optional, Dict, Any

logger = logging.getLogger(__name__)


class PathProcessingHelper:
    """Helper for path and query processing"""

    # Prefixes that restrict the kind of item searched for.
    _TYPE_PREFIXES = ("files:", "folders:", "all:")

    # Search-mode prefix -> option key set in the returned searchOptions dict.
    _MODE_OPTIONS = {
        "exact": "exact_match",
        "regex": "regex_match",
        "case": "case_sensitive",
        "and": "and_terms",
    }

    # File extensions recognised at the very end of a raw query in cleanSearchQuery.
    _KNOWN_EXTENSIONS = ('.pdf', '.doc', '.docx', '.xls', '.xlsx', '.txt', '.csv', '.ppt', '.pptx')

    def __init__(self, methodInstance):
        """
        Initialize path processing helper.

        Args:
            methodInstance: Instance of MethodSharepoint (for access to services)
        """
        self.method = methodInstance
        self.services = methodInstance.services

    @staticmethod
    def _extractSiteHint(q: str) -> tuple[str, Optional[str]]:
        """
        Extract an optional site hint from the query.

        Supports the leading form "site:Name rest of query" and the inline
        form "site=Name" (optionally followed by ';') anywhere in the query.

        Returns:
            tuple[str, Optional[str]]: (query without the hint, site name or None)
        """
        try:
            qStrip = q.strip()

            # Leading form: the site name runs until the next space or the end
            if qStrip.lower().startswith("site:"):
                after = qStrip[5:].lstrip()
                siteName, _, rest = after.partition(' ')
                return rest.strip(), siteName.strip()

            # Inline key=value form anywhere in the query
            m = re.search(r"\bsite=([^;\s]+)", qStrip, flags=re.IGNORECASE)
            if m:
                siteName = m.group(1).strip()
                # Remove the token (and a trailing ';') from the query
                qNew = re.sub(r"\bsite=[^;\s]+;?", "", qStrip, flags=re.IGNORECASE).strip()
                return qNew, siteName
        except Exception as e:
            # Best effort: a failed hint extraction must not break parsing
            logger.debug(f"Site hint extraction failed: {str(e)}")
        return q, None

    @staticmethod
    def _splitSitesPath(searchQuery: str) -> tuple[str, str]:
        """
        Split a "/sites/...:search" query into (pathPart, searchPart).

        The separator is the first colon that is followed by a search type
        (files:, folders:, all:), by a file extension ('.'), or by nothing.
        If no colon qualifies, the last colon is used. The caller guarantees
        that the query contains at least one colon.
        """
        colonPositions = [i for i, char in enumerate(searchQuery) if char == ':']

        splitPos = None
        for pos in colonPositions:
            afterColon = searchQuery[pos + 1:pos + 10].strip().lower()
            if afterColon.startswith(('files:', 'folders:', 'all:', '.')) or afterColon == '':
                splitPos = pos
                break

        if splitPos is None:
            splitPos = colonPositions[-1]

        return searchQuery[:splitPos].strip(), searchQuery[splitPos + 1:].strip()

    def parseSearchQuery(self, searchQuery: str) -> tuple[str, str, str, dict]:
        """
        Parse searchQuery to extract path, search terms, search type, and search options.

        CRITICAL: NEVER convert words to paths! Words stay as search terms.
        - "root document lesson" → fileQuery="root document lesson" (NOT "/root/document/lesson")
        - "root, gose" → fileQuery="root, gose" (NOT "/root/gose")
        - "druckersteuerung eskalation logobject" → fileQuery="druckersteuerung eskalation logobject"

        Parameters:
            searchQuery (str): Enhanced search query with options:
                - "budget" -> pathQuery="*", fileQuery="budget", searchType="all", options={}
                - "root document lesson" -> pathQuery="*", fileQuery="root document lesson", searchType="all", options={}
                - "root, gose" -> pathQuery="*", fileQuery="root, gose", searchType="all", options={}
                - "/Documents:budget" -> pathQuery="/Documents", fileQuery="budget", searchType="all", options={}
                - "files:budget" -> pathQuery="*", fileQuery="budget", searchType="files", options={}
                - "folders:DELTA" -> pathQuery="*", fileQuery="DELTA", searchType="folders", options={}
                - "exact:\"Operations 2025\"" -> exact phrase matching
                - "regex:^Operations.*2025$" -> regex pattern matching
                - "case:DELTA" -> case-sensitive search
                - "and:DELTA AND 2025 Mars AND Group" -> all AND terms must be present
                - "site:Name ..." or "...site=Name..." -> options["site_hint"]="Name"
                - "name=\"Some Name\"" -> the quoted name becomes the whole query

        Returns:
            tuple[str, str, str, dict]: (pathQuery, fileQuery, searchType, searchOptions).
            An empty search part is always reported as fileQuery="*".

        Raises:
            ValueError: If parsing fails for any reason (original error is chained).
        """
        try:
            if not searchQuery or not searchQuery.strip() or searchQuery.strip() == "*":
                return "*", "*", "all", {}

            searchQuery = searchQuery.strip()
            searchOptions = {}

            # CRITICAL: Do NOT convert space-separated or comma-separated words to paths!
            # "root document lesson" should stay as "root document lesson", NOT "/root/document/lesson"
            # "root, gose" should stay as "root, gose", NOT "/root/gose"

            # Check for search type specification (files:, folders:, all:) FIRST
            searchType = "all"  # Default
            if searchQuery.startswith(self._TYPE_PREFIXES):
                typePrefix, _, remainder = searchQuery.partition(':')
                searchType = typePrefix.strip()
                searchQuery = remainder.strip()

            # Extract optional site hint tokens: "site=Name" or leading "site:Name"
            searchQuery, extractedSite = self._extractSiteHint(searchQuery)
            if extractedSite:
                searchOptions["site_hint"] = extractedSite
                logger.info(f"Extracted site hint: '{extractedSite}'")

            # Extract name="..." if present (for quoted multi-word names)
            nameMatch = re.search(r"name=\"([^\"]+)\"", searchQuery)
            if nameMatch:
                searchQuery = nameMatch.group(1)
                logger.info(f"Extracted name from quotes: '{searchQuery}'")

            # Check for search mode specification (exact:, regex:, case:, and:)
            if searchQuery.startswith(("exact:", "regex:", "case:", "and:")):
                mode, _, remainder = searchQuery.partition(':')
                mode = mode.strip()
                searchQuery = remainder.strip()
                searchOptions[self._MODE_OPTIONS[mode]] = True
                # Exact phrases may be wrapped in quotes - remove them
                if mode == "exact" and searchQuery.startswith('"') and searchQuery.endswith('"'):
                    searchQuery = searchQuery[1:-1]

            # No colon: either a bare path or plain search terms
            if ':' not in searchQuery:
                if searchQuery.startswith('/'):
                    # It's a path only
                    return searchQuery, "*", searchType, searchOptions
                # It's a search term only - keep words as-is, do NOT convert to paths.
                # A query left empty after prefix/hint extraction means "match all",
                # consistent with the path:search branch below.
                return "*", searchQuery or "*", searchType, searchOptions

            # path:search format
            if searchQuery.startswith('/sites/'):
                # Microsoft-standard paths: /sites/SiteName/Path:files:.pdf
                pathPart, searchPart = self._splitSitesPath(searchQuery)
            else:
                # Regular path:search format - split on first colon
                rawPath, _, rawSearch = searchQuery.partition(':')
                pathPart = rawPath.strip()
                searchPart = rawSearch.strip()

            # The search part may itself carry a search type (files:, folders:, all:)
            if searchPart.startswith(self._TYPE_PREFIXES):
                typePrefix, _, remainder = searchPart.partition(':')
                searchType = typePrefix.strip()
                searchPart = remainder.strip()

            # Handle path part: relative names are placed under the default library
            if not pathPart or pathPart == "*":
                pathQuery = "*"
            elif pathPart.startswith('/'):
                pathQuery = pathPart
            else:
                pathQuery = f"/Documents/{pathPart}"

            # Handle search part
            fileQuery = "*" if not searchPart or searchPart == "*" else searchPart

            return pathQuery, fileQuery, searchType, searchOptions

        except Exception as e:
            logger.error(f"Error parsing searchQuery '{searchQuery}': {str(e)}")
            raise ValueError(f"Failed to parse searchQuery '{searchQuery}': {str(e)}") from e

    def resolvePathQuery(self, pathQuery: str) -> List[str]:
        """
        Resolve pathQuery into a list of search paths for SharePoint operations.

        Parameters:
            pathQuery (str): Query string that can contain:
                - Direct paths (e.g., "/Documents/Project1")
                - Wildcards (e.g., "/Documents/*")
                - Multiple paths separated by semicolons (e.g., "/Docs; /Files")
                - Single word relative paths (e.g., "Project1" -> resolved to default folder)
                - Empty string or "*" for global search
                - Space-separated words are treated as search terms, NOT folder paths

        Returns:
            List[str]: List of resolved paths without duplicates, in input order.
            Never empty: a query without any usable path resolves to ["*"].

        Raises:
            ValueError: If resolution fails for any reason (original error is chained).
        """
        try:
            if not pathQuery or not pathQuery.strip() or pathQuery.strip() == "*":
                return ["*"]  # Global search across all sites

            # Split by semicolon to handle multiple paths
            rawPaths = [path.strip() for path in pathQuery.split(';') if path.strip()]

            resolvedPaths = []
            for rawPath in rawPaths:
                if '*' in rawPath or rawPath.startswith('/'):
                    # Wildcards and absolute paths are used as-is
                    resolvedPaths.append(rawPath)
                elif ' ' not in rawPath:
                    # Single word relative path - prepend default folder
                    resolvedPaths.append(f"/Documents/{rawPath}")
                elif '\\' in rawPath or '/' in rawPath:
                    # Has path separators: a path with spaces in folder names
                    resolvedPaths.append(rawPath)
                    logger.info(f"Path with spaces '{rawPath}' treated as valid folder path")
                else:
                    # Space-separated words without separators are search terms:
                    # search globally instead of inventing a folder path
                    logger.info(f"Space-separated words '{rawPath}' treated as search terms, not folder path")
                    resolvedPaths.append("*")

            # Remove duplicates while preserving order (dicts keep insertion order)
            uniquePaths = list(dict.fromkeys(resolvedPaths))

            # A query made only of separators (e.g. ";") carries no path at all;
            # treat it like an empty query instead of returning nothing to search
            if not uniquePaths:
                uniquePaths = ["*"]

            logger.info(f"Resolved pathQuery '{pathQuery}' to {len(uniquePaths)} paths: {uniquePaths}")
            return uniquePaths

        except Exception as e:
            logger.error(f"Error resolving pathQuery '{pathQuery}': {str(e)}")
            raise ValueError(f"Failed to resolve pathQuery '{pathQuery}': {str(e)}") from e

    def cleanSearchQuery(self, query: str) -> str:
        """
        Clean search query to make it compatible with Graph API KQL syntax.
        Removes path-like syntax and invalid KQL constructs.

        Examples:
            - "Company Share/Docs/expenses:files:.pdf" -> "expenses pdf"
            - "report.pdf" -> "report pdf"
            - "files:budget" -> "budget"

        Parameters:
            query (str): Raw search query that may contain paths and invalid syntax

        Returns:
            str: Cleaned query suitable for Graph API search endpoint.
            "" for an empty input, "*" when nothing searchable remains.
        """
        if not query or not query.strip():
            return ""

        query = query.strip()

        # Drop a leading type filter ("files:budget"). Without this the colon
        # handling below would keep "files" and throw away the real search term.
        query = re.sub(r'^(?:files|folders|all):', '', query, flags=re.IGNORECASE).strip()

        # Handle patterns like: "Company Share/Freigegebene Dokumente/.../expenses:files:.pdf"
        # First, extract file extension if present (format: :files:.pdf or just .pdf at the end)
        fileExtension = ""
        loweredQuery = query.lower()
        if ':files:' in loweredQuery or ':folders:' in loweredQuery:
            # Extract extension after the type filter
            extMatch = re.search(r':(?:files|folders):(\.\w+)', query, re.IGNORECASE)
            if extMatch:
                fileExtension = extMatch.group(1)
            # Remove the type filter part
            query = re.sub(r':(?:files|folders):\.?\w*', '', query, flags=re.IGNORECASE)
        elif query.endswith(self._KNOWN_EXTENSIONS):
            # Extract extension from end
            extMatch = re.search(r'(\.\w+)$', query)
            if extMatch:
                fileExtension = extMatch.group(1)
                query = query[:-len(fileExtension)]

        # Extract search term: the last segment after the last slash (filename part),
        # or the whole query when there are no path separators
        queryNormalized = query.replace('\\', '/')
        if '/' in queryNormalized:
            candidate = queryNormalized.rsplit('/', 1)[-1]
        else:
            candidate = query

        # Anything after a colon is a filter, not part of the term
        searchTerm = candidate.partition(':')[0].strip()

        # If we have a file extension, include it in the search term
        # Note: Graph API search endpoint may not support filetype: syntax,
        # so the extension is added as a plain search word
        if fileExtension:
            extWithoutDot = fileExtension.lstrip('.')
            searchTerm = f"{searchTerm} {extWithoutDot}" if searchTerm else extWithoutDot

        # Final cleanup: remove any remaining invalid characters for KQL
        # Keep alphanumeric, spaces, hyphens, underscores, dots, and asterisks
        searchTerm = re.sub(r'[^\w\s\-\.\*]', ' ', searchTerm)
        searchTerm = ' '.join(searchTerm.split())  # Normalize whitespace

        return searchTerm if searchTerm else "*"