# Copyright (c) 2025 Patrick Motsch
# All rights reserved.

"""
Path processing helper for SharePoint operations.

Handles search query parsing, path resolution, and query cleaning.
"""

import logging
|
|
import re
|
|
from typing import List, Optional, Dict, Any
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
class PathProcessingHelper:
    """Helper for SharePoint path and search-query processing.

    Offers three text-only utilities that do not touch the network:

    - ``parseSearchQuery``: split an enhanced query string into a path, a
      search term, a search type and a dict of option flags.
    - ``resolvePathQuery``: expand a path expression into a list of paths.
    - ``cleanSearchQuery``: reduce a raw query to a plain search term.
    """

    # Item-type filters accepted as a "<type>:" prefix.
    _SEARCH_TYPE_PREFIXES = ("files:", "folders:", "all:")

    # Match-mode prefixes and the option flag each one switches on.
    _SEARCH_MODE_PREFIXES = ("exact:", "regex:", "case:", "and:")
    _SEARCH_MODE_OPTIONS = {
        "exact": "exact_match",
        "regex": "regex_match",
        "case": "case_sensitive",
        "and": "and_terms",
    }

    # Extensions recognised (case-insensitively) at the end of a raw query.
    _KNOWN_EXTENSIONS = (
        ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".txt", ".csv", ".ppt", ".pptx",
    )

    # Folder that relative path fragments are anchored to.
    _DEFAULT_FOLDER = "/Documents"

    def __init__(self, methodInstance):
        """
        Initialize path processing helper.

        Args:
            methodInstance: Instance of MethodSharepoint (for access to services).
                Its ``services`` attribute is stored on this helper as well.
        """
        self.method = methodInstance
        self.services = methodInstance.services

    def parseSearchQuery(self, searchQuery: str) -> tuple[str, str, str, dict]:
        """
        Parse searchQuery into path, search terms, search type, and search options.

        CRITICAL: words are NEVER converted to paths. Space- or comma-separated
        words stay search terms:
            "root document lesson" -> fileQuery="root document lesson"
            "root, gose"           -> fileQuery="root, gose"

        Processing order (each step consumes its token from the query):
            1. leading type prefix:  files: / folders: / all:
            2. site hint:            leading "site:Name" or inline "site=Name"
            3. quoted name:          name="..." replaces the whole query
            4. match-mode prefix:    exact: / regex: / case: / and:
            5. "path:search" split on a colon, if one is left

        Examples:
            "budget"                   -> ("*", "budget", "all", {})
            "/Documents:budget"        -> ("/Documents", "budget", "all", {})
            "files:budget"             -> ("*", "budget", "files", {})
            "folders:DELTA"            -> ("*", "DELTA", "folders", {})
            'exact:"Operations 2025"'  -> exact_match option, quotes removed
            "regex:^Operations.*2025$" -> regex_match option
            "case:DELTA"               -> case_sensitive option
            "and:DELTA AND 2025"       -> and_terms option
            "/sites/S/Lib:files:.pdf"  -> ("/sites/S/Lib", ".pdf", "files", {})

        Parameters:
            searchQuery (str): Enhanced search query. Empty, blank or "*"
                means "everything".

        Returns:
            tuple[str, str, str, dict]: (pathQuery, fileQuery, searchType,
            searchOptions). pathQuery and fileQuery are "*" when unrestricted;
            fileQuery is never an empty string.

        Raises:
            ValueError: if parsing fails (e.g. a non-string argument); the
                original exception is attached as ``__cause__``.
        """
        try:
            if not searchQuery or not searchQuery.strip() or searchQuery.strip() == "*":
                return "*", "*", "all", {}

            query = searchQuery.strip()
            searchOptions: Dict[str, Any] = {}

            # 1. Type prefix comes first so "files:site:X ..." style input still works.
            searchType = "all"
            if query.startswith(self._SEARCH_TYPE_PREFIXES):
                searchType, _, query = query.partition(":")
                searchType = searchType.strip()
                query = query.strip()

            # 2. Optional site hint.
            query, siteHint = self._extractSiteHint(query)
            if siteHint:
                searchOptions["site_hint"] = siteHint
                logger.info(f"Extracted site hint: '{siteHint}'")

            # 3. name="..." carries a quoted multi-word name; it replaces the query.
            nameMatch = re.search(r"name=\"([^\"]+)\"", query)
            if nameMatch:
                query = nameMatch.group(1)
                logger.info(f"Extracted name from quotes: '{query}'")

            # 4. Match-mode prefix.
            if query.startswith(self._SEARCH_MODE_PREFIXES):
                mode, _, query = query.partition(":")
                query = query.strip()
                searchOptions[self._SEARCH_MODE_OPTIONS[mode.strip()]] = True
                # Exact phrases may be wrapped in double quotes; drop them.
                if mode == "exact" and query.startswith('"') and query.endswith('"'):
                    query = query[1:-1]

            # 5. "path:search" format.
            if ":" in query:
                pathPart, searchPart = self._splitPathAndSearch(query)

                # The search part may itself start with a type filter
                # (e.g. "/sites/S/Lib:files:.pdf"); it overrides the earlier one.
                if searchPart.startswith(self._SEARCH_TYPE_PREFIXES):
                    searchType, _, searchPart = searchPart.partition(":")
                    searchType = searchType.strip()
                    searchPart = searchPart.strip()

                if not pathPart or pathPart == "*":
                    pathQuery = "*"
                elif pathPart.startswith("/"):
                    pathQuery = pathPart
                else:
                    # Relative path fragment: anchor it to the default folder.
                    pathQuery = f"{self._DEFAULT_FOLDER}/{pathPart}"

                return pathQuery, searchPart or "*", searchType, searchOptions

            # No colon and a leading slash: a path only.
            if query.startswith("/"):
                return query, "*", searchType, searchOptions

            # Plain search terms - kept verbatim, never turned into a path.
            # A query emptied by prefix/site-hint removal means "match anything",
            # consistent with every other branch.
            return "*", query or "*", searchType, searchOptions

        except Exception as e:
            logger.error(f"Error parsing searchQuery '{searchQuery}': {str(e)}")
            raise ValueError(f"Failed to parse searchQuery '{searchQuery}': {str(e)}") from e

    def _extractSiteHint(self, query: str) -> tuple[str, Optional[str]]:
        """Pull an optional site hint out of ``query``.

        Supports a leading ``site:Name`` token (name ends at the first space)
        and an inline ``site=Name`` token anywhere in the query (name ends at
        whitespace or ';').

        Returns:
            (remaining query, site name); the name is None when no hint is found,
            in which case the query is returned unchanged.
        """
        stripped = query.strip()

        # Leading form: "site:Name rest of query"
        if stripped.lower().startswith("site:"):
            siteName, _, rest = stripped[len("site:"):].lstrip().partition(" ")
            return rest.strip(), siteName.strip()

        # Inline key=value form: "... site=Name; ..."
        match = re.search(r"\bsite=([^;\s]+)", stripped, flags=re.IGNORECASE)
        if match:
            remaining = re.sub(r"\bsite=[^;\s]+;?", "", stripped, flags=re.IGNORECASE).strip()
            return remaining, match.group(1).strip()

        return query, None

    def _splitPathAndSearch(self, query: str) -> tuple[str, str]:
        """Split a query that contains ':' into (pathPart, searchPart).

        For Microsoft-standard paths ("/sites/...") the split happens at the
        first colon that is followed by a type filter, a file extension (".")
        or nothing; if no colon qualifies, the last colon is used. Any other
        query is split at its first colon.
        """
        if not query.startswith("/sites/"):
            pathPart, _, searchPart = query.partition(":")
            return pathPart.strip(), searchPart.strip()

        colonPositions = [index for index, char in enumerate(query) if char == ":"]
        splitPos = colonPositions[-1]  # fallback when no colon clearly separates path from search
        for pos in colonPositions:
            # Only a short look-ahead is needed to recognise "folders:" or ".ext".
            afterColon = query[pos + 1:pos + 10].strip().lower()
            if afterColon == "" or afterColon.startswith(self._SEARCH_TYPE_PREFIXES + (".",)):
                splitPos = pos
                break

        return query[:splitPos].strip(), query[splitPos + 1:].strip()

    def resolvePathQuery(self, pathQuery: str) -> List[str]:
        """
        Resolve pathQuery into a list of search paths for SharePoint operations.

        Parameters:
            pathQuery (str): Query string that can contain:
                - Direct paths (e.g., "/Documents/Project1")
                - Wildcards (e.g., "/Documents/*") - kept as-is
                - Multiple paths separated by semicolons (e.g., "/Docs; /Files")
                - Single word relative paths (e.g., "Project1" ->
                  "/Documents/Project1")
                - Empty string or "*" for global search
                - Space-separated words without a path separator are search
                  terms, NOT folder paths, and resolve to "*"

        Returns:
            List[str]: Resolved paths, de-duplicated in first-seen order.

        Raises:
            ValueError: if resolution fails (e.g. a non-string argument); the
                original exception is attached as ``__cause__``.
        """
        try:
            if not pathQuery or not pathQuery.strip() or pathQuery.strip() == "*":
                return ["*"]  # Global search across all sites

            resolvedPaths = []
            for rawPath in (part.strip() for part in pathQuery.split(";")):
                if not rawPath:
                    continue

                if "*" in rawPath or rawPath.startswith("/"):
                    # Wildcards and absolute paths are used verbatim.
                    resolvedPaths.append(rawPath)
                elif " " not in rawPath:
                    # Single-word relative path: anchor to the default folder.
                    resolvedPaths.append(f"{self._DEFAULT_FOLDER}/{rawPath}")
                elif "\\" in rawPath or "/" in rawPath:
                    # Contains separators, so the spaces belong to folder names.
                    resolvedPaths.append(rawPath)
                    logger.info(f"Path with spaces '{rawPath}' treated as valid folder path")
                else:
                    # Bare words are search terms; fall back to a global search.
                    logger.info(f"Space-separated words '{rawPath}' treated as search terms, not folder path")
                    resolvedPaths.append("*")

            # dict keys keep insertion order, giving an order-preserving dedupe.
            uniquePaths = list(dict.fromkeys(resolvedPaths))

            logger.info(f"Resolved pathQuery '{pathQuery}' to {len(uniquePaths)} paths: {uniquePaths}")
            return uniquePaths

        except Exception as e:
            logger.error(f"Error resolving pathQuery '{pathQuery}': {str(e)}")
            raise ValueError(f"Failed to resolve pathQuery '{pathQuery}': {str(e)}") from e

    def cleanSearchQuery(self, query: str) -> str:
        """
        Clean a raw search query down to a plain search term.

        Intended for the Graph API search endpoint: path-like syntax, type
        filters and characters outside a conservative whitelist are removed.
        Input such as "Company Share/Docs/.../expenses:files:.pdf" becomes
        "expenses pdf".

        Steps:
            1. Drop a leading type prefix (files: / folders: / all:).
            2. Extract a file extension, either from a ":files:.ext" /
               ":folders:.ext" filter or from a known extension at the end
               of the query (both matched case-insensitively).
            3. Keep only the last path segment, up to its first colon.
            4. Append the extension (without the dot) as an extra search word.
            5. Replace every character other than word characters, whitespace,
               '-', '.', '*' with a space and collapse whitespace.

        Parameters:
            query (str): Raw search query that may contain paths and filters.

        Returns:
            str: "" for empty/blank input; otherwise the cleaned term, or "*"
            when nothing searchable is left.
        """
        if not query or not query.strip():
            return ""

        query = query.strip()

        # A leading type prefix is a filter, not a search term; without this
        # "files:budget" would be cleaned to "files".
        query = re.sub(r'^(?:files|folders|all):', '', query, flags=re.IGNORECASE).strip()

        fileExtension = ""
        lowered = query.lower()
        if ':files:' in lowered or ':folders:' in lowered:
            extMatch = re.search(r':(?:files|folders):(\.\w+)', query, re.IGNORECASE)
            if extMatch:
                fileExtension = extMatch.group(1)
            # Remove the type filter together with its extension.
            query = re.sub(r':(?:files|folders):\.?\w*', '', query, flags=re.IGNORECASE)
        elif lowered.endswith(self._KNOWN_EXTENSIONS):
            extMatch = re.search(r'(\.\w+)$', query)
            if extMatch:
                fileExtension = extMatch.group(1)
                query = query[:-len(fileExtension)]

        # The last path segment is taken as the name/term; anything after a
        # colon in that segment is leftover filter syntax.
        lastSegment = query.replace('\\', '/').split('/')[-1]
        searchTerm = lastSegment.split(':')[0].strip()

        # The extension is added as a plain search word rather than a
        # filetype: filter, which the search endpoint may not support;
        # callers can still filter results afterwards.
        if fileExtension:
            extWithoutDot = fileExtension.lstrip('.')
            searchTerm = f"{searchTerm} {extWithoutDot}" if searchTerm else extWithoutDot

        # Final cleanup: keep alphanumerics, whitespace, hyphens, underscores,
        # dots and the '*' wildcard; everything else becomes a space.
        searchTerm = re.sub(r'[^\w\s\-\.\*]', ' ', searchTerm)
        searchTerm = ' '.join(searchTerm.split())  # Normalize whitespace

        return searchTerm if searchTerm else "*"
|
|
|