# gateway/modules/workflows/methods/methodSharepoint/helpers/pathProcessing.py
# 2025-12-17 10:45:09 +01:00
#
# 338 lines
# 16 KiB
# Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Path Processing helper for SharePoint operations.
Handles search query parsing, path resolution, and query cleaning.
"""
import logging
import re
from typing import List, Optional, Dict, Any
logger = logging.getLogger(__name__)


class PathProcessingHelper:
    """Helper for SharePoint path and search-query processing.

    Provides three independent string transformations:

    - ``parseSearchQuery``: split an enhanced search query into path, search
      term, search type and search options.
    - ``resolvePathQuery``: expand a path query into a list of search paths.
    - ``cleanSearchQuery``: reduce a raw query to a plain term for the search
      endpoint.
    """

    # Prefixes that restrict a search to a kind of item.
    _TYPE_PREFIXES = ("files:", "folders:", "all:")
    # In a "/sites/..." query, the colon followed by one of these separates
    # the path from the search part.
    _SPLIT_MARKERS = _TYPE_PREFIXES + (".",)
    # Match-mode prefixes mapped to the option flag each one switches on.
    _MODE_OPTIONS = {
        "exact": "exact_match",
        "regex": "regex_match",
        "case": "case_sensitive",
        "and": "and_terms",
    }
    # Trailing extensions that cleanSearchQuery splits off from the term.
    _KNOWN_EXTENSIONS = (
        '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.txt', '.csv', '.ppt', '.pptx',
    )

    def __init__(self, methodInstance):
        """
        Initialize path processing helper.

        Args:
            methodInstance: Instance of MethodSharepoint (for access to services).
                Must expose a ``services`` attribute.
        """
        self.method = methodInstance
        self.services = methodInstance.services

    def parseSearchQuery(self, searchQuery: str) -> tuple[str, str, str, dict]:
        """
        Parse searchQuery into path, search terms, search type and search options.

        CRITICAL: words are NEVER converted to paths; they stay search terms.
        - "root document lesson" → fileQuery="root document lesson" (NOT "/root/document/lesson")
        - "root, gose" → fileQuery="root, gose" (NOT "/root/gose")

        Processing order: type prefix (files:/folders:/all:), site hint
        ("site:Name" leading or "site=Name" inline), name="..." extraction,
        match-mode prefix (exact:/regex:/case:/and:), then "path:search"
        splitting.

        Parameters:
            searchQuery (str): Enhanced search query, for example:
                - "budget" -> ("*", "budget", "all", {})
                - "/Documents:budget" -> ("/Documents", "budget", "all", {})
                - "files:budget" -> ("*", "budget", "files", {})
                - "folders:DELTA" -> ("*", "DELTA", "folders", {})
                - "exact:\\"Operations 2025\\"" -> exact_match option, quotes removed
                - "regex:^Operations.*2025$" -> regex_match option
                - "case:DELTA" -> case_sensitive option
                - "and:DELTA AND 2025" -> and_terms option
                - "/sites/Site/Path:files:.pdf" -> ("/sites/Site/Path", ".pdf", "files", {})

        Returns:
            tuple[str, str, str, dict]: (pathQuery, fileQuery, searchType, searchOptions).
            pathQuery and fileQuery are "*" when unrestricted; fileQuery is
            never returned empty.

        Raises:
            ValueError: If parsing fails; the underlying error is chained as the cause.
        """
        try:
            if not searchQuery or not searchQuery.strip() or searchQuery.strip() == "*":
                return "*", "*", "all", {}

            query = searchQuery.strip()
            searchOptions: Dict[str, Any] = {}

            # Type prefix is checked FIRST so "files:budget" is not mistaken
            # for a "path:search" pair further down.
            searchType = "all"
            if query.startswith(self._TYPE_PREFIXES):
                prefix, _, remainder = query.partition(':')
                searchType, query = prefix.strip(), remainder.strip()

            query, siteHint = self._extractSiteHint(query)
            if siteHint:
                searchOptions["site_hint"] = siteHint
                logger.info("Extracted site hint: '%s'", siteHint)

            # name="..." carries a quoted multi-word name; it replaces the whole query.
            nameMatch = re.search(r'name="([^"]+)"', query)
            if nameMatch:
                query = nameMatch.group(1)
                logger.info("Extracted name from quotes: '%s'", query)

            if query.startswith(("exact:", "regex:", "case:", "and:")):
                prefix, _, remainder = query.partition(':')
                mode, query = prefix.strip(), remainder.strip()
                searchOptions[self._MODE_OPTIONS[mode]] = True
                # Exact phrases may be wrapped in double quotes; drop them.
                if mode == "exact" and query.startswith('"') and query.endswith('"'):
                    query = query[1:-1]

            if ':' in query:
                pathPart, searchPart = self._splitPathAndSearch(query)

                # A type prefix inside the search part overrides the earlier one.
                if searchPart.startswith(self._TYPE_PREFIXES):
                    prefix, _, remainder = searchPart.partition(':')
                    searchType, searchPart = prefix.strip(), remainder.strip()

                if not pathPart or pathPart == "*":
                    pathQuery = "*"
                elif pathPart.startswith('/'):
                    pathQuery = pathPart
                else:
                    # Relative paths are anchored in the default folder.
                    pathQuery = f"/Documents/{pathPart}"

                fileQuery = searchPart if searchPart and searchPart != "*" else "*"
                return pathQuery, fileQuery, searchType, searchOptions

            if query.startswith('/'):
                # Path only, no search term.
                return query, "*", searchType, searchOptions

            # Search term only - words are kept as-is, never turned into a path.
            # An empty remainder (e.g. "files:" or "site:Name") means "match
            # everything", consistent with the path:search branch above.
            return "*", query or "*", searchType, searchOptions
        except Exception as e:
            logger.error("Error parsing searchQuery '%s': %s", searchQuery, e)
            raise ValueError(f"Failed to parse searchQuery '{searchQuery}': {str(e)}") from e

    @staticmethod
    def _extractSiteHint(query: str) -> tuple[str, Optional[str]]:
        """Split an optional site hint off the query.

        Supports a leading "site:Name rest" form (name ends at the first
        space) and an inline "site=Name" token (name ends at whitespace or
        ';'). Returns (remaining query, site name), or (query, None) when no
        hint is present.
        """
        stripped = query.strip()

        if stripped.lower().startswith("site:"):
            siteName, _, rest = stripped[len("site:"):].lstrip().partition(' ')
            return rest.strip(), siteName.strip()

        inlineMatch = re.search(r"\bsite=([^;\s]+)", stripped, flags=re.IGNORECASE)
        if inlineMatch:
            # Remove every site= token (and its trailing ';') from the query.
            remaining = re.sub(r"\bsite=[^;\s]+;?", "", stripped, flags=re.IGNORECASE).strip()
            return remaining, inlineMatch.group(1).strip()

        return query, None

    def _splitPathAndSearch(self, query: str) -> tuple[str, str]:
        """Split "path:search" into (path, search); query must contain a ':'.

        Regular queries split on the first colon. Queries starting with
        "/sites/" split on the first colon that is followed by a type prefix,
        a file extension or nothing; if none qualifies, on the last colon.
        """
        splitPos = query.find(':')

        if query.startswith('/sites/'):
            colonPositions = [idx for idx, char in enumerate(query) if char == ':']
            splitPos = colonPositions[-1]
            for pos in colonPositions:
                # Look at a short window after the colon only.
                afterColon = query[pos + 1:pos + 10].strip().lower()
                if afterColon == '' or afterColon.startswith(self._SPLIT_MARKERS):
                    splitPos = pos
                    break

        return query[:splitPos].strip(), query[splitPos + 1:].strip()

    def resolvePathQuery(self, pathQuery: str) -> List[str]:
        """
        Resolve pathQuery into a list of search paths for SharePoint operations.

        Parameters:
            pathQuery (str): Query string that can contain:
                - Direct paths (e.g., "/Documents/Project1")
                - Wildcards (e.g., "/Documents/*"), returned as-is
                - Multiple paths separated by semicolons (e.g., "/Docs; /Files")
                - Single word relative paths (e.g., "Project1" -> "/Documents/Project1")
                - Empty string or "*" for global search
                - Space-separated words without path separators are treated as
                  search terms, NOT folder paths, and resolve to "*"

        Returns:
            List[str]: Resolved paths, de-duplicated in first-seen order.

        Raises:
            ValueError: If resolution fails; the underlying error is chained as the cause.
        """
        try:
            if not pathQuery or not pathQuery.strip() or pathQuery.strip() == "*":
                return ["*"]  # Global search across all sites

            resolvedPaths = []
            for rawPath in (segment.strip() for segment in pathQuery.split(';')):
                if not rawPath:
                    continue

                if '*' in rawPath or rawPath.startswith('/'):
                    # Wildcards and absolute paths pass through unchanged.
                    resolvedPaths.append(rawPath)
                elif ' ' not in rawPath:
                    # Single-word relative path: anchor in the default folder.
                    resolvedPaths.append(f"/Documents/{rawPath}")
                elif '\\' in rawPath or '/' in rawPath:
                    # Has separators, so the spaces belong to folder names.
                    resolvedPaths.append(rawPath)
                    logger.info("Path with spaces '%s' treated as valid folder path", rawPath)
                else:
                    # Plain words are search terms; search globally instead.
                    logger.info(
                        "Space-separated words '%s' treated as search terms, not folder path",
                        rawPath,
                    )
                    resolvedPaths.append("*")

            # dict keys keep insertion order, so this de-duplicates stably.
            uniquePaths = list(dict.fromkeys(resolvedPaths))

            logger.info(
                "Resolved pathQuery '%s' to %s paths: %s",
                pathQuery, len(uniquePaths), uniquePaths,
            )
            return uniquePaths
        except Exception as e:
            logger.error("Error resolving pathQuery '%s': %s", pathQuery, e)
            raise ValueError(f"Failed to resolve pathQuery '{pathQuery}': {str(e)}") from e

    def cleanSearchQuery(self, query: str) -> str:
        """
        Clean search query to make it compatible with Graph API KQL syntax.

        Removes path-like syntax and type filters, keeps the last path segment
        as the search term and appends a detected file extension (without the
        dot) as an extra term.

        Examples:
            - "Share/Docs/expenses:files:.pdf" -> "expenses pdf"
            - "report.pdf" -> "report pdf"
            - "files:budget" -> "budget"

        Parameters:
            query (str): Raw search query that may contain paths and invalid syntax

        Returns:
            str: Cleaned query; "" for empty input, "*" when nothing usable remains.
        """
        if not query or not query.strip():
            return ""

        query = query.strip()

        # A leading type filter ("files:budget") is not part of the term.
        # Without this, the colon handling below would keep only "files".
        leadingFilter = re.match(r'(?:files|folders|all):', query, flags=re.IGNORECASE)
        if leadingFilter:
            query = query[leadingFilter.end():].strip()

        # Extract the file extension: either after an embedded type filter
        # (":files:.pdf") or as a known suffix at the very end (".pdf").
        fileExtension = ""
        lowered = query.lower()
        if ':files:' in lowered or ':folders:' in lowered:
            extMatch = re.search(r':(?:files|folders):(\.\w+)', query, re.IGNORECASE)
            if extMatch:
                fileExtension = extMatch.group(1)
            # Remove the type filter together with whatever follows it directly.
            query = re.sub(r':(?:files|folders):\.?\w*', '', query, flags=re.IGNORECASE)
        elif query.endswith(self._KNOWN_EXTENSIONS):
            extMatch = re.search(r'(\.\w+)$', query)
            if extMatch:
                fileExtension = extMatch.group(1)
                query = query[:-len(fileExtension)]

        # The search term is the last path segment (or the whole query when
        # there is no path), cut at the first remaining colon.
        normalized = query.replace('\\', '/')
        if '/' in normalized:
            searchTerm = normalized.rsplit('/', 1)[-1].split(':')[0].strip()
        else:
            searchTerm = query.split(':')[0].strip()

        # The extension is added as a plain term rather than a filetype:
        # filter; the original notes the search endpoint may not support that
        # syntax, so results may need filtering afterwards.
        if fileExtension:
            extWithoutDot = fileExtension.lstrip('.')
            searchTerm = f"{searchTerm} {extWithoutDot}" if searchTerm else extWithoutDot

        # Final cleanup: keep word characters, whitespace, hyphens, dots and
        # asterisks; everything else becomes a space.
        searchTerm = re.sub(r'[^\w\s\-\.\*]', ' ', searchTerm)
        searchTerm = ' '.join(searchTerm.split())  # Normalize whitespace

        return searchTerm if searchTerm else "*"