diff --git a/local/logs/debug/prompts/20251029-223021-api_sent_message_0.txt b/local/logs/debug/prompts/20251029-223021-api_sent_message_0.txt deleted file mode 100644 index 7a71526d..00000000 --- a/local/logs/debug/prompts/20251029-223021-api_sent_message_0.txt +++ /dev/null @@ -1,72 +0,0 @@ -Message 0 (user) -Length: 2015 chars -================================================================================ -User request: "Generate the first 1000 prime numbers." - -Generate a NEW, COMPLETE JSON response. The template below shows ONLY the structure pattern - it is NOT existing content. Start from the beginning. - -JSON structure template (reference only - shows the pattern): -{ - "metadata": { - "split_strategy": "single_document", - "source_documents": [], - "extraction_method": "ai_generation" - }, - "documents": [ - { - "id": "doc_1", - "title": "Generated Document", - "filename": "document.json", - "sections": [ - { - "id": "section_heading_example", - "content_type": "heading", - "elements": [ - {"level": 1, "text": "Heading Text"} - ], - "order": 0 - }, - { - "id": "section_paragraph_example", - "content_type": "paragraph", - "elements": [ - {"text": "Paragraph text content"} - ], - "order": 0 - }, - { - "id": "section_list_example", - "content_type": "list", - "elements": [ - { - "items": [ - {"text": "Item 1"}, - {"text": "Item 2"} - ], - "list_type": "numbered" - } - ], - "order": 0 - }, - { - "id": "section_table_example", - "content_type": "table", - "elements": [ - { - "headers": ["Column 1", "Column 2"], - "rows": [ - ["Row 1 Col 1", "Row 1 Col 2"], - ["Row 2 Col 1", "Row 2 Col 2"] - ], - "caption": "Table caption" - } - ], - "order": 0 - }, - { - "id": "section_code_example", - "content_type": "code", - "elements": [ - { - "code... [truncated] -================================================================================ diff --git a/local/logs/debug/prompts/20251029-223033-api_received_response.txt b/local/logs/debug/prompts/20251029-223033-api_received_response.txt deleted file mode 100644 index 31f8964c..00000000 --- a/local/logs/debug/prompts/20251029-223033-api_received_response.txt +++ /dev/null @@ -1,200 +0,0 @@ -API Response -Length: 6082 chars -================================================================================ -{ - "metadata": { - "split_strategy": "single_document", - "source_documents": [], - "extraction_method": "ai_generation" - }, - "documents": [ - { - "id": "doc_1", - "title": "List of First 1000 Prime Numbers", - "filename": "prime_numbers.json", - "sections": [ - { - "id": "section_prime_numbers", - "content_type": "list", - "elements": [ - { - "items": [ - {"number": 2}, - {"number": 3}, - {"number": 5}, - {"number": 7}, - {"number": 11}, - {"number": 13}, - {"number": 17}, - {"number": 19}, - {"number": 23}, - {"number": 29}, - {"number": 31}, - {"number": 37}, - {"number": 41}, - {"number": 43}, - {"number": 47}, - {"number": 53}, - {"number": 59}, - {"number": 61}, - {"number": 67}, - {"number": 71}, - {"number": 73}, - {"number": 79}, - {"number": 83}, - {"number": 89}, - {"number": 97}, - {"number": 101}, - {"number": 103}, - {"number": 107}, - {"number": 109}, - {"number": 113}, - {"number": 127}, - {"number": 131}, - {"number": 137}, - {"number": 139}, - {"number": 149}, - {"number": 151}, - {"number": 157}, - {"number": 163}, - {"number": 167}, - {"number": 173}, - {"number": 179}, - {"number": 181}, - {"number": 191}, - {"number": 193}, - {"number": 197}, - {"number": 199}, - {"number": 211}, - {"number": 223}, - {"number": 227}, 
- {"number": 229}, - {"number": 233}, - {"number": 239}, - {"number": 241}, - {"number": 251}, - {"number": 257}, - {"number": 263}, - {"number": 269}, - {"number": 271}, - {"number": 277}, - {"number": 281}, - {"number": 283}, - {"number": 293}, - {"number": 307}, - {"number": 311}, - {"number": 313}, - {"number": 317}, - {"number": 331}, - {"number": 337}, - {"number": 347}, - {"number": 349}, - {"number": 353}, - {"number": 359}, - {"number": 367}, - {"number": 373}, - {"number": 379}, - {"number": 383}, - {"number": 389}, - {"number": 397}, - {"number": 401}, - {"number": 409}, - {"number": 419}, - {"number": 421}, - {"number": 431}, - {"number": 433}, - {"number": 439}, - {"number": 443}, - {"number": 449}, - {"number": 457}, - {"number": 461}, - {"number": 463}, - {"number": 467}, - {"number": 479}, - {"number": 487}, - {"number": 491}, - {"number": 499}, - {"number": 503}, - {"number": 509}, - {"number": 521}, - {"number": 523}, - {"number": 541}, - {"number": 547}, - {"number": 557}, - {"number": 563}, - {"number": 569}, - {"number": 571}, - {"number": 577}, - {"number": 587}, - {"number": 593}, - {"number": 599}, - {"number": 601}, - {"number": 607}, - {"number": 613}, - {"number": 617}, - {"number": 619}, - {"number": 631}, - {"number": 641}, - {"number": 643}, - {"number": 647}, - {"number": 653}, - {"number": 659}, - {"number": 661}, - {"number": 673}, - {"number": 677}, - {"number": 683}, - {"number": 691}, - {"number": 701}, - {"number": 709}, - {"number": 719}, - {"number": 727}, - {"number": 733}, - {"number": 739}, - {"number": 743}, - {"number": 751}, - {"number": 757}, - {"number": 761}, - {"number": 769}, - {"number": 773}, - {"number": 787}, - {"number": 797}, - {"number": 809}, - {"number": 811}, - {"number": 821}, - {"number": 823}, - {"number": 827}, - {"number": 829}, - {"number": 839}, - {"number": 853}, - {"number": 857}, - {"number": 859}, - {"number": 863}, - {"number": 877}, - {"number": 881}, - {"number": 883}, - {"number": 887}, - {"number": 907}, - {"number": 911}, - {"number": 919}, - {"number": 929}, - {"number": 937}, - {"number": 941}, - {"number": 947}, - {"number": 953}, - {"number": 967}, - {"number": 971}, - {"number": 977}, - {"number": 983}, - {"number": 991}, - {"number": 997} - ], - "list_type": "numbered" - } - ], - "order": 0 - } - ] - } - ] -} -================================================================================ diff --git a/modules/interfaces/interfaceDbChatObjects.py b/modules/interfaces/interfaceDbChatObjects.py index 35ed2f32..94cbe974 100644 --- a/modules/interfaces/interfaceDbChatObjects.py +++ b/modules/interfaces/interfaceDbChatObjects.py @@ -1573,13 +1573,7 @@ class ChatObjects: executionLog["workflowId"] = workflow.id executionLog["status"] = "completed" executionLog["messages"].append(f"Workflow {workflow.id} started successfully") - - # Also store plan in module-level cache as backup (keyed by workflow ID) - from modules.workflows.processing.modes import modeAutomation - if not hasattr(modeAutomation, '_templatePlanCache'): - modeAutomation._templatePlanCache = {} - modeAutomation._templatePlanCache[workflow.id] = plan - logger.info(f"Stored template plan for workflow {workflow.id} (cache + prompt) with {len(plan.get('tasks', []))} tasks") + logger.info(f"Started workflow {workflow.id} with plan containing {len(plan.get('tasks', []))} tasks (plan embedded in userInput)") # Update automation with execution log executionLogs = automation.get("executionLogs", []) diff --git a/modules/workflows/methods/methodSharepoint.py 
b/modules/workflows/methods/methodSharepoint.py index e53b43a5..6dabdaf5 100644 --- a/modules/workflows/methods/methodSharepoint.py +++ b/modules/workflows/methods/methodSharepoint.py @@ -62,16 +62,22 @@ class MethodSharepoint(MethodBase): logger.error(f"Error getting Microsoft connection: {str(e)}") return None - async def _discoverSharePointSites(self) -> List[Dict[str, Any]]: + async def _discoverSharePointSites(self, limit: Optional[int] = None) -> List[Dict[str, Any]]: """ - Discover all SharePoint sites accessible to the user via Microsoft Graph API + Discover SharePoint sites accessible to the user via Microsoft Graph API + + Parameters: + limit (Optional[int]): Limit number of sites to return (for optimization when only hostname is needed) Returns: List[Dict[str, Any]]: List of SharePoint site information """ try: - # Query Microsoft Graph to get all sites the user has access to + # Query Microsoft Graph to get sites the user has access to endpoint = "sites?search=*" + if limit: + endpoint += f"&$top={limit}" + result = await self._makeGraphApiCall(endpoint) if "error" in result: @@ -79,12 +85,15 @@ class MethodSharepoint(MethodBase): return [] sites = result.get("value", []) - logger.info(f"Discovered {len(sites)} SharePoint sites") + if limit: + sites = sites[:limit] + + logger.info(f"Discovered {len(sites)} SharePoint sites" + (f" (limited to {limit})" if limit else "")) # Process and return site information - processed_sites = [] + processedSites = [] for site in sites: - site_info = { + siteInfo = { "id": site.get("id"), "displayName": site.get("displayName"), "name": site.get("name"), @@ -93,59 +102,122 @@ class MethodSharepoint(MethodBase): "createdDateTime": site.get("createdDateTime"), "lastModifiedDateTime": site.get("lastModifiedDateTime") } - processed_sites.append(site_info) - logger.debug(f"Site: {site_info['displayName']} - {site_info['webUrl']}") + processedSites.append(siteInfo) + logger.debug(f"Site: {siteInfo['displayName']} - {siteInfo['webUrl']}") - return processed_sites + return processedSites except Exception as e: logger.error(f"Error discovering SharePoint sites: {str(e)}") return [] - def _filter_sites_by_hint(self, sites: List[Dict[str, Any]], site_hint: str) -> List[Dict[str, Any]]: + def _extractHostnameFromWebUrl(self, webUrl: str) -> Optional[str]: + """Extract hostname from SharePoint webUrl (e.g., https://pcuster.sharepoint.com)""" + try: + if not webUrl: + return None + parsed = urllib.parse.urlparse(webUrl) + return parsed.hostname + except Exception as e: + logger.error(f"Error extracting hostname from webUrl '{webUrl}': {str(e)}") + return None + + async def _getSiteByStandardPath(self, sitePath: str) -> Optional[Dict[str, Any]]: + """ + Get SharePoint site directly by Microsoft-standard path (/sites/SiteName) + without loading all sites. Uses hostname from first available site. 
+ + Parameters: + sitePath (str): Site path like 'company-share' (without /sites/ prefix) + + Returns: + Optional[Dict[str, Any]]: Site information if found, None otherwise + """ + try: + # Get hostname from first available site (minimal load - only 1 site) + minimalSites = await self._discoverSharePointSites(limit=1) + if not minimalSites: + logger.warning("No sites available to extract hostname") + return None + + hostname = self._extractHostnameFromWebUrl(minimalSites[0].get("webUrl")) + if not hostname: + logger.warning("Could not extract hostname from site") + return None + + logger.info(f"Extracted hostname '{hostname}' from first site, now getting site by path: {sitePath}") + + # Get site directly using hostname + path + endpoint = f"sites/{hostname}:/sites/{sitePath}" + result = await self._makeGraphApiCall(endpoint) + + if "error" in result: + logger.warning(f"Could not get site directly by path '{sitePath}': {result['error']}") + return None + + siteInfo = { + "id": result.get("id"), + "displayName": result.get("displayName"), + "name": result.get("name"), + "webUrl": result.get("webUrl"), + "description": result.get("description"), + "createdDateTime": result.get("createdDateTime"), + "lastModifiedDateTime": result.get("lastModifiedDateTime") + } + + logger.info(f"Successfully got site by standard path: {siteInfo['displayName']} (ID: {siteInfo['id']})") + return siteInfo + + except Exception as e: + logger.error(f"Error getting site by standard path '{sitePath}': {str(e)}") + return None + + def _filterSitesByHint(self, sites: List[Dict[str, Any]], siteHint: str) -> List[Dict[str, Any]]: """Filter discovered sites by a human-entered site hint (case-insensitive substring).""" try: - if not site_hint: + if not siteHint: return sites - hint = site_hint.strip().lower() + hint = siteHint.strip().lower() filtered: List[Dict[str, Any]] = [] for site in sites: name = (site.get("displayName") or "").lower() - web_url = (site.get("webUrl") or "").lower() - if hint in name or hint in web_url: + webUrl = (site.get("webUrl") or "").lower() + if hint in name or hint in webUrl: filtered.append(site) return filtered if filtered else sites except Exception as e: - logger.error(f"Error filtering sites by hint '{site_hint}': {str(e)}") + logger.error(f"Error filtering sites by hint '{siteHint}': {str(e)}") return sites - - def _parse_site_scoped_path(self, path_query: str) -> Optional[Dict[str, str]]: + def _extractSiteFromStandardPath(self, pathQuery: str) -> Optional[Dict[str, str]]: """ - Parse a site-scoped path of the form: - /site:KM LayerFinance/Documents/Work or /site:KM LayerFinance/Shared Documents/Work - + Extract site name from Microsoft-standard server-relative path: + /sites/company-share/Freigegebene Dokumente/... + Returns dict with keys: siteName, innerPath (no leading slash) on success, else None. 
""" try: - if not path_query or not path_query.startswith('/'): + if not pathQuery or not pathQuery.startswith('/sites/'): return None - # expected syntax prefix - prefix = '/site:' - if not path_query.startswith(prefix): - return None - remainder = path_query[len(prefix):] - # split once on the next '/' + + # Remove leading /sites/ prefix + remainder = pathQuery[7:] # len('/sites/') = 7 + + # Split on first '/' to get site name if '/' not in remainder: + # Only site name, no inner path + return {"siteName": remainder, "innerPath": ""} + + siteName, inner = remainder.split('/', 1) + siteName = siteName.strip() + innerPath = inner.strip() + + if not siteName: return None - site_name, inner = remainder.split('/', 1) - site_name = site_name.strip() - inner_path = inner.strip() - if not site_name or not inner_path: - return None - return {"siteName": site_name, "innerPath": inner_path} + + return {"siteName": siteName, "innerPath": innerPath} except Exception as e: - logger.error(f"Error parsing site-scoped path '{path_query}': {str(e)}") + logger.error(f"Error extracting site from standard path '{pathQuery}': {str(e)}") return None def _parseSearchQuery(self, searchQuery: str) -> tuple[str, str, str, dict]: @@ -187,50 +259,50 @@ class MethodSharepoint(MethodBase): # Check for search type specification (files:, folders:, all:) FIRST searchType = "all" # Default if searchQuery.startswith(("files:", "folders:", "all:")): - type_parts = searchQuery.split(':', 1) - searchType = type_parts[0].strip() - searchQuery = type_parts[1].strip() + typeParts = searchQuery.split(':', 1) + searchType = typeParts[0].strip() + searchQuery = typeParts[1].strip() # Extract optional site hint tokens: support "site=Name" or leading "site:Name" - def _extract_site_hint(q: str) -> tuple[str, Optional[str]]: + def _extractSiteHint(q: str) -> tuple[str, Optional[str]]: try: - q_strip = q.strip() + qStrip = q.strip() # Leading form: site:KM LayerFinance ... - if q_strip.lower().startswith("site:"): - after = q_strip[5:].lstrip() + if qStrip.lower().startswith("site:"): + after = qStrip[5:].lstrip() # site name until next space or end if ' ' in after: - site_name, rest = after.split(' ', 1) + siteName, rest = after.split(' ', 1) else: - site_name, rest = after, '' - return rest.strip(), site_name.strip() + siteName, rest = after, '' + return rest.strip(), siteName.strip() # Inline key=value form anywhere - m = re.search(r"\bsite=([^;\s]+)", q_strip, flags=re.IGNORECASE) + m = re.search(r"\bsite=([^;\s]+)", qStrip, flags=re.IGNORECASE) if m: - site_name = m.group(1).strip() + siteName = m.group(1).strip() # remove the token from query - q_new = re.sub(r"\bsite=[^;\s]+;?", "", q_strip, flags=re.IGNORECASE).strip() - return q_new, site_name + qNew = re.sub(r"\bsite=[^;\s]+;?", "", qStrip, flags=re.IGNORECASE).strip() + return qNew, siteName except Exception: pass return q, None - searchQuery, extracted_site = _extract_site_hint(searchQuery) - if extracted_site: - searchOptions["site_hint"] = extracted_site - logger.info(f"Extracted site hint: '{extracted_site}'") + searchQuery, extractedSite = _extractSiteHint(searchQuery) + if extractedSite: + searchOptions["site_hint"] = extractedSite + logger.info(f"Extracted site hint: '{extractedSite}'") # Extract name="..." 
if present (for quoted multi-word names) - name_match = re.search(r"name=\"([^\"]+)\"", searchQuery) - if name_match: - searchQuery = name_match.group(1) + nameMatch = re.search(r"name=\"([^\"]+)\"", searchQuery) + if nameMatch: + searchQuery = nameMatch.group(1) logger.info(f"Extracted name from quotes: '{searchQuery}'") # Check for search mode specification (exact:, regex:, case:, and:) if searchQuery.startswith(("exact:", "regex:", "case:", "and:")): - mode_parts = searchQuery.split(':', 1) - mode = mode_parts[0].strip() - searchQuery = mode_parts[1].strip() + modeParts = searchQuery.split(':', 1) + mode = modeParts[0].strip() + searchQuery = modeParts[1].strip() if mode == "exact": searchOptions["exact_match"] = True @@ -245,26 +317,66 @@ class MethodSharepoint(MethodBase): searchOptions["and_terms"] = True # Check if it contains path:search format + # Microsoft-standard paths: /sites/SiteName/Path:files:.pdf if ':' in searchQuery: - parts = searchQuery.split(':', 1) # Split only on first colon - path_part = parts[0].strip() - search_part = parts[1].strip() + # For Microsoft-standard paths (/sites/...), find the colon that separates path from search + if searchQuery.startswith('/sites/'): + # Find the colon that separates path from search (after the full path) + # Look for pattern: /sites/SiteName/Path/...:files:.pdf + # We need to find the colon that's followed by search type or file extension + colonPositions = [] + for i, char in enumerate(searchQuery): + if char == ':': + colonPositions.append(i) + + # If we have colons, find the one that's followed by search type or file extension + splitPos = None + if colonPositions: + for pos in colonPositions: + afterColon = searchQuery[pos+1:pos+10].strip().lower() + # Check if this colon is followed by search type or looks like a file extension + if afterColon.startswith(('files:', 'folders:', 'all:', '.')) or afterColon == '': + splitPos = pos + break + + # If no clear split found, use the last colon + if splitPos is None and colonPositions: + splitPos = colonPositions[-1] + + if splitPos: + pathPart = searchQuery[:splitPos].strip() + searchPart = searchQuery[splitPos+1:].strip() + else: + # Fallback: split on first colon + parts = searchQuery.split(':', 1) + pathPart = parts[0].strip() + searchPart = parts[1].strip() + else: + # Regular path:search format - split on first colon + parts = searchQuery.split(':', 1) + pathPart = parts[0].strip() + searchPart = parts[1].strip() + + # Check if searchPart starts with search type (files:, folders:, all:) + if searchPart.startswith(("files:", "folders:", "all:")): + typeParts = searchPart.split(':', 1) + searchType = typeParts[0].strip() # Update searchType + searchPart = typeParts[1].strip() if len(typeParts) > 1 else "" # Handle path part - if not path_part or path_part == "*": + if not pathPart or pathPart == "*": pathQuery = "*" - elif path_part.startswith('/'): - pathQuery = path_part + elif pathPart.startswith('/'): + pathQuery = pathPart else: - pathQuery = f"/Documents/{path_part}" + pathQuery = f"/Documents/{pathPart}" # Handle search part - if not search_part or search_part == "*": + if not searchPart or searchPart == "*": fileQuery = "*" else: - fileQuery = search_part + fileQuery = searchPart - # Use search_part as fileQuery (name extraction already handled above) return pathQuery, fileQuery, searchType, searchOptions # No colon - check if it looks like a path @@ -303,42 +415,42 @@ class MethodSharepoint(MethodBase): return ["*"] # Global search across all sites # Split by semicolon to 
handle multiple paths - raw_paths = [path.strip() for path in pathQuery.split(';') if path.strip()] - resolved_paths = [] + rawPaths = [path.strip() for path in pathQuery.split(';') if path.strip()] + resolvedPaths = [] - for raw_path in raw_paths: + for rawPath in rawPaths: # Handle wildcards - return as-is - if '*' in raw_path: - resolved_paths.append(raw_path) + if '*' in rawPath: + resolvedPaths.append(rawPath) # Handle absolute paths - elif raw_path.startswith('/'): - resolved_paths.append(raw_path) + elif rawPath.startswith('/'): + resolvedPaths.append(rawPath) # Handle single word relative paths - prepend default folder # BUT NOT space-separated words (those are search terms, not paths) - elif ' ' not in raw_path: - resolved_paths.append(f"/Documents/{raw_path}") + elif ' ' not in rawPath: + resolvedPaths.append(f"/Documents/{rawPath}") else: # Check if this looks like a path (has path separators) or search terms - if '\\' in raw_path or '/' in raw_path: + if '\\' in rawPath or '/' in rawPath: # This looks like a path with spaces in folder names - treat as valid path - resolved_paths.append(raw_path) - logger.info(f"Path with spaces '{raw_path}' treated as valid folder path") + resolvedPaths.append(rawPath) + logger.info(f"Path with spaces '{rawPath}' treated as valid folder path") else: # Space-separated words without path separators are search terms # Return as "*" to search globally - logger.info(f"Space-separated words '{raw_path}' treated as search terms, not folder path") - resolved_paths.append("*") + logger.info(f"Space-separated words '{rawPath}' treated as search terms, not folder path") + resolvedPaths.append("*") # Remove duplicates while preserving order seen = set() - unique_paths = [] - for path in resolved_paths: + uniquePaths = [] + for path in resolvedPaths: if path not in seen: seen.add(path) - unique_paths.append(path) + uniquePaths.append(path) - logger.info(f"Resolved pathQuery '{pathQuery}' to {len(unique_paths)} paths: {unique_paths}") - return unique_paths + logger.info(f"Resolved pathQuery '{pathQuery}' to {len(uniquePaths)} paths: {uniquePaths}") + return uniquePaths except Exception as e: logger.error(f"Error resolving pathQuery '{pathQuery}': {str(e)}") @@ -359,6 +471,81 @@ class MethodSharepoint(MethodBase): logger.error(f"Error parsing site URL {siteUrl}: {str(e)}") return {"hostname": "", "sitePath": ""} + def _cleanSearchQuery(self, query: str) -> str: + """ + Clean search query to make it compatible with Graph API KQL syntax. + Removes path-like syntax and invalid KQL constructs. 
+ + Parameters: + query (str): Raw search query that may contain paths and invalid syntax + + Returns: + str: Cleaned query suitable for Graph API search endpoint + """ + if not query or not query.strip(): + return "" + + query = query.strip() + + # Handle patterns like: "Company Share/Freigegebene Dokumente/.../expenses:files:.pdf" + # Extract the search term and file extension + + # First, extract file extension if present (format: :files:.pdf or just .pdf at the end) + fileExtension = "" + if ':files:' in query.lower() or ':folders:' in query.lower(): + # Extract extension after the type filter + extMatch = re.search(r':(?:files|folders):(\.\w+)', query, re.IGNORECASE) + if extMatch: + fileExtension = extMatch.group(1) + # Remove the type filter part + query = re.sub(r':(?:files|folders):\.?\w*', '', query, flags=re.IGNORECASE) + elif query.endswith(('.pdf', '.doc', '.docx', '.xls', '.xlsx', '.txt', '.csv', '.ppt', '.pptx')): + # Extract extension from end + extMatch = re.search(r'(\.\w+)$', query) + if extMatch: + fileExtension = extMatch.group(1) + query = query[:-len(fileExtension)] + + # Extract search term: get the last segment after the last slash (filename part) + queryNormalized = query.replace('\\', '/') + if '/' in queryNormalized: + # Extract the last segment (usually the filename/search term) + lastSegment = queryNormalized.split('/')[-1] + # Remove any remaining colons or type filters + if ':' in lastSegment: + lastSegment = lastSegment.split(':')[0] + searchTerm = lastSegment.strip() + else: + # No path separators, use the query as-is but remove type filters + if ':' in query: + searchTerm = query.split(':')[0].strip() + else: + searchTerm = query.strip() + + # Remove any remaining type filters or invalid syntax + searchTerm = re.sub(r':(?:files|folders|all):?', '', searchTerm, flags=re.IGNORECASE) + searchTerm = searchTerm.strip() + + # If we have a file extension, include it in the search term + # Note: Graph API search endpoint may not support filetype: syntax + # So we include the extension as part of the search term or filter results after + if fileExtension: + extWithoutDot = fileExtension.lstrip('.') + # Try simple approach: add extension as search term + # If this doesn't work, we'll filter results after search + if searchTerm: + # Include extension in search - Graph API will search in filename + searchTerm = f"{searchTerm} {extWithoutDot}" + else: + searchTerm = extWithoutDot + + # Final cleanup: remove any remaining invalid characters for KQL + # Keep alphanumeric, spaces, hyphens, underscores, dots, and common search operators + searchTerm = re.sub(r'[^\w\s\-\.\*]', ' ', searchTerm) + searchTerm = ' '.join(searchTerm.split()) # Normalize whitespace + + return searchTerm if searchTerm else "*" + async def _makeGraphApiCall(self, endpoint: str, method: str = "GET", data: bytes = None) -> Dict[str, Any]: """Make a Microsoft Graph API call with timeout and detailed logging""" try: @@ -386,9 +573,9 @@ class MethodSharepoint(MethodBase): logger.debug(f"Graph API success: {len(str(result))} characters response") return result else: - error_text = await response.text() - logger.error(f"Graph API call failed: {response.status} - {error_text}") - return {"error": f"API call failed: {response.status} - {error_text}"} + errorText = await response.text() + logger.error(f"Graph API call failed: {response.status} - {errorText}") + return {"error": f"API call failed: {response.status} - {errorText}"} elif method == "PUT": logger.debug(f"Starting PUT request to {url}") @@ -399,9 
+586,9 @@ class MethodSharepoint(MethodBase): logger.debug(f"Graph API success: {len(str(result))} characters response") return result else: - error_text = await response.text() - logger.error(f"Graph API call failed: {response.status} - {error_text}") - return {"error": f"API call failed: {response.status} - {error_text}"} + errorText = await response.text() + logger.error(f"Graph API call failed: {response.status} - {errorText}") + return {"error": f"API call failed: {response.status} - {errorText}"} elif method == "POST": logger.debug(f"Starting POST request to {url}") @@ -412,9 +599,9 @@ class MethodSharepoint(MethodBase): logger.debug(f"Graph API success: {len(str(result))} characters response") return result else: - error_text = await response.text() - logger.error(f"Graph API call failed: {response.status} - {error_text}") - return {"error": f"API call failed: {response.status} - {error_text}"} + errorText = await response.text() + logger.error(f"Graph API call failed: {response.status} - {errorText}") + return {"error": f"API call failed: {response.status} - {errorText}"} except asyncio.TimeoutError: logger.error(f"Graph API call timed out after 30 seconds: {endpoint}") @@ -423,10 +610,10 @@ class MethodSharepoint(MethodBase): logger.error(f"Error making Graph API call: {str(e)}") return {"error": f"Error making Graph API call: {str(e)}"} - async def _getSiteId(self, hostname: str, site_path: str) -> str: + async def _getSiteId(self, hostname: str, sitePath: str) -> str: """Get SharePoint site ID from hostname and site path""" try: - endpoint = f"sites/{hostname}:/{site_path}" + endpoint = f"sites/{hostname}:/{sitePath}" result = await self._makeGraphApiCall(endpoint) if "error" in result: @@ -464,35 +651,61 @@ class MethodSharepoint(MethodBase): # Parse searchQuery to extract path, search terms, search type, and options pathQuery, fileQuery, searchType, searchOptions = self._parseSearchQuery(searchQuery) + logger.debug(f"Parsed searchQuery '{searchQuery}' -> pathQuery='{pathQuery}', fileQuery='{fileQuery}', searchType='{searchType}'") connection = self._getMicrosoftConnection(connectionReference) if not connection: return ActionResult.isFailure(error="No valid Microsoft connection found for the provided connection reference") - # Discover SharePoint sites - use targeted approach when site parameter is provided - if site: - # When site parameter is provided, discover all sites first, then filter - all_sites = await self._discoverSharePointSites() - if not all_sites: - return ActionResult.isFailure(error="No SharePoint sites found or accessible") + # Extract site name from pathQuery if it contains Microsoft-standard path (/sites/SiteName/...) 
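+            # Editor's note (illustrative only, not part of the patch): a worked example of the
+            # fast path added below, using assumed sample values:
+            #   pathQuery = "/sites/company-share/Freigegebene Dokumente/Reports"
+            #   _extractSiteFromStandardPath(pathQuery)
+            #       -> {"siteName": "company-share", "innerPath": "Freigegebene Dokumente/Reports"}
+            #   _getSiteByStandardPath("company-share")
+            #       -> one Graph call: GET sites/{hostname}:/sites/company-share
+            # so full site discovery only runs as a fallback when the direct lookup fails.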
+ siteFromPath = None + directSite = None + if pathQuery and pathQuery.startswith('/sites/'): + parsedPath = self._extractSiteFromStandardPath(pathQuery) + if parsedPath: + siteFromPath = parsedPath.get("siteName") + logger.info(f"Extracted site from Microsoft-standard pathQuery '{pathQuery}': '{siteFromPath}'") + + # Try to get site directly by path (optimization - no need to load all 60 sites) + directSite = await self._getSiteByStandardPath(siteFromPath) + if directSite: + logger.info(f"Got site directly by standard path - no need to discover all sites") + sites = [directSite] + else: + logger.warning(f"Could not get site directly, falling back to site discovery") + directSite = None + else: + logger.warning(f"Failed to parse site from standard pathQuery '{pathQuery}'") + + # If we didn't get the site directly, use discovery and filtering + if not directSite: + # Determine which site hint to use (priority: site parameter > site from pathQuery > site_hint from searchOptions) + siteHintToUse = site or siteFromPath or searchOptions.get("site_hint") - sites = self._filter_sites_by_hint(all_sites, site) - logger.info(f"Filtered sites by site parameter: '{site}' -> {len(sites)} sites") - if not sites: - return ActionResult.isFailure(error=f"No SharePoint sites found matching '{site}'") - else: - # No site parameter - discover all sites - sites = await self._discoverSharePointSites() - if not sites: - return ActionResult.isFailure(error="No SharePoint sites found or accessible") + # Discover SharePoint sites - use targeted approach when site hint is available + if siteHintToUse: + # When site hint is available, discover all sites first, then filter + allSites = await self._discoverSharePointSites() + if not allSites: + return ActionResult.isFailure(error="No SharePoint sites found or accessible") + + sites = self._filterSitesByHint(allSites, siteHintToUse) + logger.info(f"Filtered sites by site hint '{siteHintToUse}' -> {len(sites)} sites") + if not sites: + return ActionResult.isFailure(error=f"No SharePoint sites found matching '{siteHintToUse}'") + else: + # No site hint - discover all sites + sites = await self._discoverSharePointSites() + if not sites: + return ActionResult.isFailure(error="No SharePoint sites found or accessible") # Resolve path query into search paths - search_paths = self._resolvePathQuery(pathQuery) + searchPaths = self._resolvePathQuery(pathQuery) try: # Search across all discovered sites - found_documents = [] - all_sites_searched = [] + foundDocuments = [] + allSitesSearched = [] # Handle different search approaches based on search type if searchType == "folders" and fileQuery and fileQuery.strip() != "" and fileQuery.strip() != "*": @@ -504,40 +717,40 @@ class MethodSharepoint(MethodBase): if len(terms) > 1: # Multiple terms: search for ALL terms (AND) - more specific results - query_string = " AND ".join(terms) + queryString = " AND ".join(terms) else: # Single term: search for the term - query_string = terms[0] if terms else fileQuery - logger.info(f"Using unified search for folders: {query_string}") + queryString = terms[0] if terms else fileQuery + logger.info(f"Using unified search for folders: {queryString}") payload = { "requests": [ { "entityTypes": ["driveItem"], - "query": {"queryString": query_string}, + "query": {"queryString": queryString}, "from": 0, "size": 50 } ] } - logger.info(f"Using unified search API for folders with queryString: {query_string}") + logger.info(f"Using unified search API for folders with queryString: {queryString}") # Use 
global search endpoint (site-specific search not available) - unified_result = await self._makeGraphApiCall( + unifiedResult = await self._makeGraphApiCall( "search/query", method="POST", data=json.dumps(payload).encode("utf-8") ) - if "error" in unified_result: - logger.warning(f"Unified search failed: {unified_result['error']}") + if "error" in unifiedResult: + logger.warning(f"Unified search failed: {unifiedResult['error']}") items = [] else: # Flatten hits -> driveItem resources items = [] - for container in (unified_result.get("value", []) or []): - for hits_container in (container.get("hitsContainers", []) or []): - for hit in (hits_container.get("hits", []) or []): + for container in (unifiedResult.get("value", []) or []): + for hitsContainer in (container.get("hitsContainers", []) or []): + for hit in (hitsContainer.get("hits", []) or []): resource = hit.get("resource") if resource: items.append(resource) @@ -545,125 +758,125 @@ class MethodSharepoint(MethodBase): logger.info(f"Unified search returned {len(items)} items (pre-filter)") # Apply our improved folder detection logic - folder_items = [] + folderItems = [] for item in items: resource = item # Use the same detection logic as our test - is_folder = False + isFolder = False if 'folder' in resource: - is_folder = True + isFolder = True else: # Try to detect by URL pattern or other indicators - web_url = resource.get('webUrl', '') + webUrl = resource.get('webUrl', '') name = resource.get('name', '') # Check if URL has no file extension and looks like a folder path - if '.' not in name and ('/' in web_url or '\\' in web_url): - is_folder = True + if '.' not in name and ('/' in webUrl or '\\' in webUrl): + isFolder = True - if is_folder: - folder_items.append(item) + if isFolder: + folderItems.append(item) - items = folder_items + items = folderItems logger.info(f"Filtered to {len(items)} folders using improved detection logic") # Process unified search results - extract site information from webUrl for item in items: - item_name = item.get("name", "") - web_url = item.get("webUrl", "") + itemName = item.get("name", "") + webUrl = item.get("webUrl", "") # Extract site information from webUrl - site_name = "Unknown Site" - site_id = "unknown" + siteName = "Unknown Site" + siteId = "unknown" - if web_url and '/sites/' in web_url: + if webUrl and '/sites/' in webUrl: try: # Extract site name from URL: https://pcuster.sharepoint.com/sites/SiteName/... 
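                            # Editor's note (illustrative only, not part of the patch): the split performed
                            # below, traced with an assumed URL:
                            #   webUrl = "https://pcuster.sharepoint.com/sites/Finance/Shared Documents/a.pdf"
                            #   webUrl.split('/sites/')[1]  -> "Finance/Shared Documents/a.pdf"
                            #   ...then .split('/')[0]      -> "Finance"
                            # which is matched against the discovered sites first by name, then by displayName.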
- url_parts = web_url.split('/sites/') - if len(url_parts) > 1: - site_path = url_parts[1].split('/')[0] + urlParts = webUrl.split('/sites/') + if len(urlParts) > 1: + sitePath = urlParts[1].split('/')[0] # Find matching site from discovered sites # First try to match by site name (URL path) for site in sites: - if site.get("name") == site_path: - site_name = site.get("displayName", site_path) - site_id = site.get("id", "unknown") + if site.get("name") == sitePath: + siteName = site.get("displayName", sitePath) + siteId = site.get("id", "unknown") break else: # If no match by name, try to match by displayName for site in sites: - if site.get("displayName") == site_path: - site_name = site.get("displayName", site_path) - site_id = site.get("id", "unknown") + if site.get("displayName") == sitePath: + siteName = site.get("displayName", sitePath) + siteId = site.get("id", "unknown") break else: # If no exact match, use the site path as site name - site_name = site_path + siteName = sitePath # Try to find a site with similar name for site in sites: - if site_path.lower() in site.get("name", "").lower() or site_path.lower() in site.get("displayName", "").lower(): - site_name = site.get("displayName", site_path) - site_id = site.get("id", "unknown") + if sitePath.lower() in site.get("name", "").lower() or sitePath.lower() in site.get("displayName", "").lower(): + siteName = site.get("displayName", sitePath) + siteId = site.get("id", "unknown") break except Exception as e: - logger.warning(f"Error extracting site info from URL {web_url}: {e}") + logger.warning(f"Error extracting site info from URL {webUrl}: {e}") # Use improved folder detection logic - is_folder = False + isFolder = False if 'folder' in item: - is_folder = True + isFolder = True else: # Try to detect by URL pattern or other indicators name = item.get('name', '') # Check if URL has no file extension and looks like a folder path - if '.' not in name and ('/' in web_url or '\\' in web_url): - is_folder = True + if '.' 
not in name and ('/' in webUrl or '\\' in webUrl): + isFolder = True - item_type = "folder" if is_folder else "file" - item_path = item.get("parentReference", {}).get("path", "") - logger.debug(f"Processing {item_type}: '{item_name}' at path: '{item_path}'") + itemType = "folder" if isFolder else "file" + itemPath = item.get("parentReference", {}).get("path", "") + logger.debug(f"Processing {itemType}: '{itemName}' at path: '{itemPath}'") # Simple filtering like test file - just check search type - if searchType == "files" and is_folder: + if searchType == "files" and isFolder: continue # Skip folders when searching for files - elif searchType == "folders" and not is_folder: + elif searchType == "folders" and not isFolder: continue # Skip files when searching for folders # Simple approach like test file - no complex filtering - logger.debug(f"Item '{item_name}' found - adding to results") + logger.debug(f"Item '{itemName}' found - adding to results") # Create result with full path information for proper action chaining - parent_path = item.get("parentReference", {}).get("path", "") + parentPath = item.get("parentReference", {}).get("path", "") # Extract the full SharePoint path from webUrl or parentReference - full_path = "" - if web_url: + fullPath = "" + if webUrl: # Extract path from webUrl: https://pcuster.sharepoint.com/sites/SSSRESYNachfolge/Freigegebene%20Dokumente/General/Eskalation%20LogObject/Druckersteuerung - if '/sites/' in web_url: - path_part = web_url.split('/sites/')[1] + if '/sites/' in webUrl: + pathPart = webUrl.split('/sites/')[1] # Decode URL encoding and convert to backslash format - decoded_path = urllib.parse.unquote(path_part) - full_path = "\\" + decoded_path.replace('/', '\\') - elif parent_path: + decodedPath = urllib.parse.unquote(pathPart) + fullPath = "\\" + decodedPath.replace('/', '\\') + elif parentPath: # Use parentReference path if available - full_path = parent_path.replace('/', '\\') + fullPath = parentPath.replace('/', '\\') - doc_info = { + docInfo = { "id": item.get("id"), "name": item.get("name"), - "type": "folder" if is_folder else "file", - "siteName": site_name, - "siteId": site_id, - "webUrl": web_url, - "fullPath": full_path, - "parentPath": parent_path + "type": "folder" if isFolder else "file", + "siteName": siteName, + "siteId": siteId, + "webUrl": webUrl, + "fullPath": fullPath, + "parentPath": parentPath } - found_documents.append(doc_info) + foundDocuments.append(docInfo) - logger.info(f"Found {len(found_documents)} documents from unified search") + logger.info(f"Found {len(foundDocuments)} documents from unified search") except Exception as e: logger.error(f"Error performing unified folder search: {str(e)}") @@ -671,118 +884,174 @@ class MethodSharepoint(MethodBase): pass # If no unified search was performed or it failed, fall back to site-by-site search - if not found_documents: + if not foundDocuments: # Use simple approach like test file - no complex filtering - site_scoped_sites = sites + siteScopedSites = sites - for site in site_scoped_sites: - site_id = site["id"] - site_name = site["displayName"] - site_url = site["webUrl"] + for site in siteScopedSites: + siteId = site["id"] + siteName = site["displayName"] + siteUrl = site["webUrl"] - logger.info(f"Searching in site: {site_name} ({site_url})") + logger.info(f"Searching in site: {siteName} ({siteUrl})") + + # Check if pathQuery contains a specific folder path (not just /sites/SiteName) + folderPath = None + if pathQuery and pathQuery.startswith('/sites/'): + parsedPath = 
self._extractSiteFromStandardPath(pathQuery) + if parsedPath: + innerPath = parsedPath.get("innerPath", "") + if innerPath and innerPath.strip(): + # Remove leading slash if present + folderPath = innerPath.lstrip('/') + + # Generic approach: probe the path with its first segment removed; if the API confirms it, that segment was the document library and is dropped + # This works for all languages because we test the actual API response + # In SharePoint Graph API, /drive/root already points to the default document library, + # so library names in paths should be removed + pathSegments = [s for s in folderPath.split('/') if s.strip()] + if len(pathSegments) > 1: + # Try with first segment removed (first segment is likely the document library) + testPath = '/'.join(pathSegments[1:]) + # Quick test: try to get folder info (this is fast and doesn't require full search) + testEndpoint = f"sites/{siteId}/drive/root:/{urllib.parse.quote(testPath, safe='')}:" + testResult = await self._makeGraphApiCall(testEndpoint) + if testResult and "error" not in testResult: + # Path without first segment works - first segment was likely the document library + folderPath = testPath + logger.info(f"Removed document library name '{pathSegments[0]}' from folder path (tested via API)") + else: + # Keep original path - first segment is not a document library + logger.info(f"Keeping original folder path '{folderPath}' (first segment is not a document library)") + elif len(pathSegments) == 1: + # Only one segment - likely the document library itself, use root + folderPath = None + logger.info(f"Only one segment '{pathSegments[0]}' found, likely document library - using root") + + if folderPath: + logger.info(f"Extracted folder path from pathQuery: '{folderPath}'") + else: + logger.info(f"Folder path resolved to root (only document library in path)") # Use Microsoft Graph API for this specific site # Handle empty or wildcard queries if not fileQuery or fileQuery.strip() == "" or fileQuery.strip() == "*": - # For wildcard/empty queries, list all items in the drive - endpoint = f"sites/{site_id}/drive/root/children" + # For wildcard/empty queries, list all items + if folderPath: + # List items in specific folder + encodedPath = urllib.parse.quote(folderPath, safe='') + endpoint = f"sites/{siteId}/drive/root:/{encodedPath}:/children" + logger.info(f"Listing items in folder: '{folderPath}'") + else: + # List all items in the drive root + endpoint = f"sites/{siteId}/drive/root/children" else: # For files, use regular search API - search_query = fileQuery.replace("'", "''") # Escape single quotes for OData - endpoint = f"sites/{site_id}/drive/root/search(q='{search_query}')" - logger.info(f"Using search API for files with query: '{search_query}'") + # Clean the query: remove path-like syntax and invalid KQL syntax + # Use a distinct local here so the searchQuery parameter (reported later in the result payload) is not shadowed + cleanedQuery = self._cleanSearchQuery(fileQuery) + # URL-encode the query parameter + encodedQuery = urllib.parse.quote(cleanedQuery, safe='') + + if folderPath: + # Search in specific folder + encodedPath = urllib.parse.quote(folderPath, safe='') + endpoint = f"sites/{siteId}/drive/root:/{encodedPath}:/search(q='{encodedQuery}')" + logger.info(f"Searching in folder '{folderPath}' with query: '{cleanedQuery}' (encoded: '{encodedQuery}')") + else: + # Search in drive root + endpoint = f"sites/{siteId}/drive/root/search(q='{encodedQuery}')" + logger.info(f"Using search API for files with query: '{cleanedQuery}' (encoded: '{encodedQuery}')") # Make the search API call (files) - search_result = await self._makeGraphApiCall(endpoint) - if "error" in search_result: -
logger.warning(f"Search failed for site {site_name}: {search_result['error']}") + searchResult = await self._makeGraphApiCall(endpoint) + if "error" in searchResult: + logger.warning(f"Search failed for site {siteName}: {searchResult['error']}") continue # Process search results for this site (files) - items = search_result.get("value", []) - logger.info(f"Retrieved {len(items)} items from site {site_name}") + items = searchResult.get("value", []) + logger.info(f"Retrieved {len(items)} items from site {siteName}") - site_documents = [] + siteDocuments = [] for item in items: - item_name = item.get("name", "") + itemName = item.get("name", "") # Use improved folder detection logic - is_folder = False + isFolder = False if 'folder' in item: - is_folder = True + isFolder = True else: # Try to detect by URL pattern or other indicators - web_url = item.get('webUrl', '') + webUrl = item.get('webUrl', '') name = item.get('name', '') # Check if URL has no file extension and looks like a folder path - if '.' not in name and ('/' in web_url or '\\' in web_url): - is_folder = True + if '.' not in name and ('/' in webUrl or '\\' in webUrl): + isFolder = True - item_type = "folder" if is_folder else "file" - item_path = item.get("parentReference", {}).get("path", "") - logger.debug(f"Processing {item_type}: '{item_name}' at path: '{item_path}'") + itemType = "folder" if isFolder else "file" + itemPath = item.get("parentReference", {}).get("path", "") + logger.debug(f"Processing {itemType}: '{itemName}' at path: '{itemPath}'") # Simple filtering like test file - just check search type - if searchType == "files" and is_folder: + if searchType == "files" and isFolder: continue # Skip folders when searching for files - elif searchType == "folders" and not is_folder: + elif searchType == "folders" and not isFolder: continue # Skip files when searching for folders # Simple approach like test file - no complex filtering - logger.debug(f"Item '{item_name}' found - adding to results") + logger.debug(f"Item '{itemName}' found - adding to results") # Create result with full path information for proper action chaining - web_url = item.get("webUrl", "") - parent_path = item.get("parentReference", {}).get("path", "") + webUrl = item.get("webUrl", "") + parentPath = item.get("parentReference", {}).get("path", "") # Extract the full SharePoint path from webUrl or parentReference - full_path = "" - if web_url: + fullPath = "" + if webUrl: # Extract path from webUrl: https://pcuster.sharepoint.com/sites/SSSRESYNachfolge/Freigegebene%20Dokumente/General/Eskalation%20LogObject/Druckersteuerung - if '/sites/' in web_url: - path_part = web_url.split('/sites/')[1] + if '/sites/' in webUrl: + pathPart = webUrl.split('/sites/')[1] # Decode URL encoding and convert to backslash format - decoded_path = urllib.parse.unquote(path_part) - full_path = "\\" + decoded_path.replace('/', '\\') - elif parent_path: + decodedPath = urllib.parse.unquote(pathPart) + fullPath = "\\" + decodedPath.replace('/', '\\') + elif parentPath: # Use parentReference path if available - full_path = parent_path.replace('/', '\\') + fullPath = parentPath.replace('/', '\\') - doc_info = { + docInfo = { "id": item.get("id"), "name": item.get("name"), - "type": "folder" if is_folder else "file", - "siteName": site_name, - "siteId": site_id, - "webUrl": web_url, - "fullPath": full_path, - "parentPath": parent_path + "type": "folder" if isFolder else "file", + "siteName": siteName, + "siteId": siteId, + "webUrl": webUrl, + "fullPath": fullPath, + "parentPath": 
parentPath } - site_documents.append(doc_info) + siteDocuments.append(docInfo) - found_documents.extend(site_documents) - all_sites_searched.append({ - "siteName": site_name, - "siteUrl": site_url, - "siteId": site_id, - "documentsFound": len(site_documents) + foundDocuments.extend(siteDocuments) + allSitesSearched.append({ + "siteName": siteName, + "siteUrl": siteUrl, + "siteId": siteId, + "documentsFound": len(siteDocuments) }) - logger.info(f"Found {len(site_documents)} documents in site {site_name}") + logger.info(f"Found {len(siteDocuments)} documents in site {siteName}") # Limit total results to maxResults - if len(found_documents) > maxResults: - found_documents = found_documents[:maxResults] + if len(foundDocuments) > maxResults: + foundDocuments = foundDocuments[:maxResults] logger.info(f"Limited results to {maxResults} items") - result_data = { + resultData = { "searchQuery": searchQuery, - "totalResults": len(found_documents), + "totalResults": len(foundDocuments), "maxResults": maxResults, - "foundDocuments": found_documents, + "foundDocuments": foundDocuments, "timestamp": self.services.utils.timestampGetUtc() } @@ -791,17 +1060,17 @@ class MethodSharepoint(MethodBase): return ActionResult.isFailure(error=str(e)) # Use default JSON format for output - output_extension = ".json" # Default - output_mime_type = "application/json" # Default + outputExtension = ".json" # Default + outputMimeType = "application/json" # Default return ActionResult( success=True, documents=[ ActionDocument( - documentName=f"sharepoint_find_path_{self._format_timestamp_for_filename()}{output_extension}", - documentData=json.dumps(result_data, indent=2), - mimeType=output_mime_type + documentName=f"sharepoint_find_path_{self._format_timestamp_for_filename()}{outputExtension}", + documentData=json.dumps(resultData, indent=2), + mimeType=outputMimeType ) ] ) @@ -815,15 +1084,24 @@ class MethodSharepoint(MethodBase): """ GENERAL: - Purpose: Read documents from SharePoint and extract content/metadata. - - Input requirements: connectionReference (required); documentList (required); optional pathObject or pathQuery; includeMetadata. - - Output format: JSON with read results per document. + - Input requirements: connectionReference (required); optional documentList, pathObject, or pathQuery; includeMetadata. + - Output format: Standardized ActionDocument format (documentName, documentData, mimeType). + - Binary files (PDFs, etc.) are Base64-encoded in documentData. + - Text files are stored as plain text in documentData. + - Returns ActionResult with documents list for template processing. Parameters: - - documentList (list, required): Document list reference(s) to read. - connectionReference (str, required): Microsoft connection label. - - pathObject (str, optional): Reference to a previous path result. - - pathQuery (str, optional): Path query if no pathObject. + - pathObject (str, optional): Reference to a previous path result (from findDocumentPath). + - documentList (list, optional): Document list reference(s) to read (backward compatibility). + - pathQuery (str, optional): Path query if no pathObject (backward compatibility). - includeMetadata (bool, optional): Include metadata. Default: True. 
+ + Returns: + - ActionResult with documents: List[ActionDocument] where each ActionDocument contains: + - documentName: File name + - documentData: Base64-encoded content (binary files) or plain text (text files) + - mimeType: MIME type (e.g., application/pdf, text/plain) """ try: documentList = parameters.get("documentList") @@ -834,48 +1112,196 @@ class MethodSharepoint(MethodBase): pathObject = parameters.get("pathObject") includeMetadata = parameters.get("includeMetadata", True) - if not documentList or not connectionReference: - return ActionResult.isFailure(error="Document list reference and connection reference are required") + # Validate connection reference + if not connectionReference: + return ActionResult.isFailure(error="Connection reference is required") - # If pathObject is provided, extract folder IDs from it - # Note: pathObject takes precedence over pathQuery when both are provided + # Get connection first - needed for both pathObject and documentList approaches + connection = self._getMicrosoftConnection(connectionReference) + if not connection: + return ActionResult.isFailure(error="No valid Microsoft connection found for the provided connection reference") + + # If pathObject is provided, extract SharePoint file IDs and read them directly + # pathObject contains the result from findDocumentPath with foundDocuments array + sharePointFileIds = None + sites = None if pathObject: if pathQuery and pathQuery != "*": logger.debug(f"Both pathObject and pathQuery provided - using pathObject (pathQuery '{pathQuery}' will be ignored)") try: # Resolve the reference label to get the actual document list - document_list = self.services.chat.getChatDocumentsFromDocumentList([pathObject]) - if not document_list or len(document_list) == 0: + pathObjectDocuments = self.services.chat.getChatDocumentsFromDocumentList([pathObject]) + if not pathObjectDocuments or len(pathObjectDocuments) == 0: return ActionResult.isFailure(error=f"No document list found for reference: {pathObject}") - # Get the first document's content (which should be the JSON) - first_document = document_list[0] - file_data = self.services.chat.getFileData(first_document.fileId) - if not file_data: + # Get the first document's content (which should be the JSON from findDocumentPath) + firstDocument = pathObjectDocuments[0] + fileData = self.services.chat.getFileData(firstDocument.fileId) + if not fileData: return ActionResult.isFailure(error=f"No file data found for document: {pathObject}") # Parse the JSON content - result_data = json.loads(file_data) - found_documents = result_data.get("foundDocuments", []) + resultData = json.loads(fileData) + foundDocuments = resultData.get("foundDocuments", []) - # Extract folder IDs from the result - folder_ids = [] - for doc in found_documents: - if doc.get("type") == "folder": - folder_ids.append(doc.get("id")) - - if folder_ids: - # Use the first folder ID found as pathQuery - pathQuery = folder_ids[0] - logger.info(f"Using folder ID from pathObject: {pathQuery}") + if foundDocuments: + # Extract SharePoint file IDs from foundDocuments + sharePointFileIds = [doc.get("id") for doc in foundDocuments if doc.get("type") == "file"] + if not sharePointFileIds: + return ActionResult.isFailure(error=f"No files found in pathObject '{pathObject}'") + logger.info(f"Extracted {len(sharePointFileIds)} SharePoint file IDs from pathObject '{pathObject}'") + + # Extract site information from foundDocuments + if foundDocuments: + firstDoc = foundDocuments[0] + siteName = 
firstDoc.get("siteName") + siteId = firstDoc.get("siteId") + + if siteName and siteId: + sites = [{ + "id": siteId, + "displayName": siteName, + "webUrl": firstDoc.get("webUrl", "") + }] + logger.info(f"Using specific site from pathObject: {siteName} (ID: {siteId})") + else: + return ActionResult.isFailure(error="Site information missing from pathObject. Cannot determine target site for read operation.") else: - return ActionResult.isFailure(error="No folders found in pathObject") + return ActionResult.isFailure(error=f"No documents found in pathObject '{pathObject}'") except json.JSONDecodeError as e: return ActionResult.isFailure(error=f"Invalid JSON in pathObject: {str(e)}") except Exception as e: return ActionResult.isFailure(error=f"Error resolving pathObject reference: {str(e)}") + # If we have SharePoint file IDs from pathObject, read them directly + if sharePointFileIds and sites: + # Read SharePoint files directly using their IDs + readResults = [] + siteId = sites[0]['id'] + + for fileId in sharePointFileIds: + try: + # Get file info from SharePoint + endpoint = f"sites/{siteId}/drive/items/{fileId}" + fileInfo = await self._makeGraphApiCall(endpoint) + + if "error" in fileInfo: + logger.warning(f"Failed to get file info for {fileId}: {fileInfo['error']}") + continue + + # Get file content using SharePoint service (handles binary data correctly) + fileName = fileInfo.get("name", f"file_{fileId}") + fileContent = await self.services.sharepoint.downloadFile(siteId, fileId) + + # Create result document + resultItem = { + "fileId": fileId, + "fileName": fileName, + "sharepointFileId": fileId, + "siteName": sites[0]['displayName'], + "siteUrl": sites[0]['webUrl'], + "size": fileInfo.get("size", 0), + "createdDateTime": fileInfo.get("createdDateTime"), + "lastModifiedDateTime": fileInfo.get("lastModifiedDateTime"), + "webUrl": fileInfo.get("webUrl") + } + + # Add content if available + if fileContent: + resultItem["content"] = fileContent + + # Add metadata if requested + if includeMetadata: + resultItem["metadata"] = { + "mimeType": fileInfo.get("file", {}).get("mimeType"), + "downloadUrl": fileInfo.get("@microsoft.graph.downloadUrl"), + "createdBy": fileInfo.get("createdBy", {}), + "lastModifiedBy": fileInfo.get("lastModifiedBy", {}), + "parentReference": fileInfo.get("parentReference", {}) + } + + readResults.append(resultItem) + except Exception as e: + logger.error(f"Error reading file {fileId}: {str(e)}") + continue + + if not readResults: + return ActionResult.isFailure(error="No files could be read from pathObject") + + # Convert read results to ActionDocument objects + # IMPORTANT: For binary files (PDFs), store Base64-encoded content directly in documentData + # The system will create FileData and ChatDocument automatically + from modules.datamodels.datamodelChat import ActionDocument + import base64 + + actionDocuments = [] + for resultItem in readResults: + fileContent = resultItem.get("content") + fileName = resultItem.get("fileName", f"file_{resultItem.get('fileId')}") + + # Determine MIME type from metadata or file extension + mimeType = "application/octet-stream" + if resultItem.get("metadata", {}).get("mimeType"): + mimeType = resultItem["metadata"]["mimeType"] + elif fileName: + if fileName.endswith('.pdf'): + mimeType = "application/pdf" + elif fileName.endswith('.txt'): + mimeType = "text/plain" + elif fileName.endswith('.json'): + mimeType = "application/json" + + # For binary files (PDFs, etc.), store Base64-encoded content directly + # The GenerationService will 
detect PDF mimeType and handle base64 decoding + if fileContent and isinstance(fileContent, bytes): + # Encode binary content as Base64 string + base64Content = base64.b64encode(fileContent).decode('utf-8') + actionDoc = ActionDocument( + documentName=fileName, + documentData=base64Content, # Base64 string for binary files + mimeType=mimeType + ) + actionDocuments.append(actionDoc) + logger.info(f"Stored binary file {fileName} ({len(fileContent)} bytes) as Base64 in ActionDocument") + elif fileContent: + # Text content - store directly in documentData + actionDoc = ActionDocument( + documentName=fileName, + documentData=fileContent if isinstance(fileContent, str) else str(fileContent), + mimeType=mimeType + ) + actionDocuments.append(actionDoc) + else: + # No content - store metadata only + docData = { + "fileName": fileName, + "sharepointFileId": resultItem.get("sharepointFileId"), + "siteName": resultItem.get("siteName"), + "siteUrl": resultItem.get("siteUrl"), + "size": resultItem.get("size"), + "createdDateTime": resultItem.get("createdDateTime"), + "lastModifiedDateTime": resultItem.get("lastModifiedDateTime"), + "webUrl": resultItem.get("webUrl") + } + if resultItem.get("metadata"): + docData["metadata"] = resultItem["metadata"] + + actionDoc = ActionDocument( + documentName=fileName, + documentData=json.dumps(docData, indent=2), + mimeType=mimeType + ) + actionDocuments.append(actionDoc) + + # Return success with action documents + return ActionResult.isSuccess(documents=actionDocuments) + + # Fallback: Use documentList parameter (for backward compatibility) + # Validate documentList + if not documentList: + return ActionResult.isFailure(error="Document list reference is required. Either provide documentList parameter or use pathObject that contains files.") + # Get documents from reference - ensure documentList is a list, not a string # documentList is already normalized above chatDocuments = self.services.chat.getChatDocumentsFromDocumentList(documentList) @@ -883,86 +1309,66 @@ class MethodSharepoint(MethodBase): if not chatDocuments: return ActionResult.isFailure(error="No documents found for the provided reference") - connection = self._getMicrosoftConnection(connectionReference) - if not connection: - return ActionResult.isFailure(error="No valid Microsoft connection found for the provided connection reference") - # Determine sites to use - strict validation: pathObject → pathQuery → ERROR - sites = None - - # Step 1: Check pathObject first - if pathObject: - # When pathObject is provided, we should have specific site information - # Extract site information from the pathObject result - try: - # Get the site information from the first folder in pathObject - if 'found_documents' in locals() and found_documents: - first_folder = found_documents[0] - site_name = first_folder.get("siteName") - site_id = first_folder.get("siteId") - - if site_name and site_id: - # Use the specific site from pathObject instead of discovering all sites - sites = [{ - "id": site_id, - "displayName": site_name, - "webUrl": first_folder.get("webUrl", "") - }] - logger.info(f"Using specific site from pathObject: {site_name} (ID: {site_id})") - else: - # Site info missing from pathObject - this is an error - return ActionResult.isFailure(error="Site information missing from pathObject. Cannot determine target site for read operation.") - else: - # No documents found in pathObject - this is an error - return ActionResult.isFailure(error="No valid folder information found in pathObject. 
Cannot determine target site for read operation.") - except Exception as e: - # Error processing pathObject - this is an error - return ActionResult.isFailure(error=f"Error processing pathObject: {str(e)}. Cannot determine target site for read operation.") - - # Step 2: If no pathObject, check pathQuery - elif pathQuery and pathQuery.strip() != "" and pathQuery.strip() != "*": - # Validate pathQuery format - if not pathQuery.startswith('/'): - return ActionResult.isFailure(error="pathQuery must start with '/' and include site name with syntax /site:/... e.g. /site:KM LayerFinance/Documents/Work") - - # Check if pathQuery contains search terms (words without proper path structure) - valid_path_prefixes = ['/site:', '/Documents', '/documents', '/Shared Documents', '/shared documents'] - if not any(pathQuery.startswith(prefix) for prefix in valid_path_prefixes): - return ActionResult.isFailure(error=f"Invalid pathQuery '{pathQuery}'. This appears to be search terms, not a valid SharePoint path. Use findDocumentPath action first to search for folders, then use the returned folder path as pathQuery.") - - # For pathQuery, we need to discover sites to find the specific one - all_sites = await self._discoverSharePointSites() - if not all_sites: - return ActionResult.isFailure(error="No SharePoint sites found or accessible") - - # If pathQuery starts with /site:, extract site name and filter - if pathQuery.startswith('/site:'): - # Extract site name from /site:Company Share/... format - site_path_part = pathQuery[6:] # Remove '/site:' - if '/' in site_path_part: - site_name = site_path_part.split('/', 1)[0] - else: - site_name = site_path_part + if not sites: + # Step 2: If no pathObject, check pathQuery + if pathQuery and pathQuery.strip() != "" and pathQuery.strip() != "*": + # Validate pathQuery format + if not pathQuery.startswith('/'): + return ActionResult.isFailure(error="pathQuery must start with '/' and include site name with Microsoft-standard syntax /sites//... e.g. /sites/company-share/Freigegebene Dokumente/Work") - # Filter sites by name (case-insensitive substring match) - sites = self._filter_sites_by_hint(all_sites, site_name) - if not sites: - return ActionResult.isFailure(error=f"No SharePoint site found matching '{site_name}'") - logger.info(f"Filtered to site(s) matching '{site_name}': {[s['displayName'] for s in sites]}") + # Check if pathQuery contains search terms (words without proper path structure) + validPathPrefixes = ['/sites/', '/Documents', '/documents', '/Shared Documents', '/shared documents'] + if not any(pathQuery.startswith(prefix) for prefix in validPathPrefixes): + return ActionResult.isFailure(error=f"Invalid pathQuery '{pathQuery}'. This appears to be search terms, not a valid SharePoint path. 
Use findDocumentPath action first to search for folders, then use the returned folder path as pathQuery.") + + # If pathQuery starts with Microsoft-standard /sites/, try to get site directly + directSite = None + if pathQuery.startswith('/sites/'): + parsedPath = self._extractSiteFromStandardPath(pathQuery) + if parsedPath: + siteName = parsedPath.get("siteName") + # Try to get site directly by path (optimization - no need to load all 60 sites) + directSite = await self._getSiteByStandardPath(siteName) + if directSite: + logger.info(f"Got site directly by standard path - no need to discover all sites") + sites = [directSite] + else: + logger.warning(f"Could not get site directly, falling back to site discovery") + + # If we didn't get the site directly, use discovery and filtering + if not directSite: + # For pathQuery, we need to discover sites to find the specific one + allSites = await self._discoverSharePointSites() + if not allSites: + return ActionResult.isFailure(error="No SharePoint sites found or accessible") + + # If pathQuery starts with Microsoft-standard /sites/, extract site name and filter + if pathQuery.startswith('/sites/'): + parsedPath = self._extractSiteFromStandardPath(pathQuery) + if parsedPath: + siteName = parsedPath.get("siteName") + # Filter sites by name (case-insensitive substring match) + sites = self._filterSitesByHint(allSites, siteName) + if not sites: + return ActionResult.isFailure(error=f"No SharePoint site found matching '{siteName}'") + logger.info(f"Filtered to site(s) matching '{siteName}': {[s['displayName'] for s in sites]}") + else: + sites = allSites + else: + sites = allSites else: - sites = all_sites - else: - # Step 3: Both pathObject and pathQuery failed - ERROR, NO FALLBACK - return ActionResult.isFailure(error="No valid upload path provided. Either provide pathObject (from findDocumentPath) or a valid pathQuery with specific site information.") + # Step 3: Both pathObject and pathQuery failed - ERROR, NO FALLBACK + return ActionResult.isFailure(error="No valid read path provided. 
Either provide pathObject (from findDocumentPath) or a valid pathQuery with specific site information.") if not sites: return ActionResult.isFailure(error="No valid target site determined for read operation") # Resolve path query into search paths - search_paths = self._resolvePathQuery(pathQuery) + searchPaths = self._resolvePathQuery(pathQuery) # Process each chat document across all sites - read_results = [] + readResults = [] for i, chatDocument in enumerate(chatDocuments): try: @@ -970,90 +1376,90 @@ class MethodSharepoint(MethodBase): fileName = chatDocument.fileName # Search for this file across all sites - file_found = False + fileFound = False for site in sites: - site_id = site["id"] - site_name = site["displayName"] - site_url = site["webUrl"] + siteId = site["id"] + siteName = site["displayName"] + siteUrl = site["webUrl"] # Try to find the file by name in this site - search_query = fileName.replace("'", "''") # Escape single quotes for OData - endpoint = f"sites/{site_id}/drive/root/search(q='{search_query}')" + searchQuery = fileName.replace("'", "''") # Escape single quotes for OData + endpoint = f"sites/{siteId}/drive/root/search(q='{searchQuery}')" - search_result = await self._makeGraphApiCall(endpoint) + searchResult = await self._makeGraphApiCall(endpoint) - if "error" in search_result: + if "error" in searchResult: continue - items = search_result.get("value", []) + items = searchResult.get("value", []) for item in items: if item.get("name") == fileName: # Found the file, get its details - file_id = item.get("id") - file_endpoint = f"sites/{site_id}/drive/items/{file_id}" + fileId = item.get("id") + fileEndpoint = f"sites/{siteId}/drive/items/{fileId}" # Get file metadata - file_info_result = await self._makeGraphApiCall(file_endpoint) + fileInfoResult = await self._makeGraphApiCall(fileEndpoint) - if "error" in file_info_result: + if "error" in fileInfoResult: continue # Build result with metadata - result_item = { + resultItem = { "fileId": fileId, "fileName": fileName, - "sharepointFileId": file_id, - "siteName": site_name, - "siteUrl": site_url, - "size": file_info_result.get("size", 0), - "createdDateTime": file_info_result.get("createdDateTime"), - "lastModifiedDateTime": file_info_result.get("lastModifiedDateTime"), - "webUrl": file_info_result.get("webUrl") + "sharepointFileId": fileId, + "siteName": siteName, + "siteUrl": siteUrl, + "size": fileInfoResult.get("size", 0), + "createdDateTime": fileInfoResult.get("createdDateTime"), + "lastModifiedDateTime": fileInfoResult.get("lastModifiedDateTime"), + "webUrl": fileInfoResult.get("webUrl") } # Add metadata if requested if includeMetadata: - result_item["metadata"] = { - "mimeType": file_info_result.get("file", {}).get("mimeType"), - "downloadUrl": file_info_result.get("@microsoft.graph.downloadUrl"), - "createdBy": file_info_result.get("createdBy", {}), - "lastModifiedBy": file_info_result.get("lastModifiedBy", {}), - "parentReference": file_info_result.get("parentReference", {}) + resultItem["metadata"] = { + "mimeType": fileInfoResult.get("file", {}).get("mimeType"), + "downloadUrl": fileInfoResult.get("@microsoft.graph.downloadUrl"), + "createdBy": fileInfoResult.get("createdBy", {}), + "lastModifiedBy": fileInfoResult.get("lastModifiedBy", {}), + "parentReference": fileInfoResult.get("parentReference", {}) } # Get file content if it's a readable format - mime_type = file_info_result.get("file", {}).get("mimeType", "") - if mime_type.startswith("text/") or mime_type in [ + mimeType = 
fileInfoResult.get("file", {}).get("mimeType", "") + if mimeType.startswith("text/") or mimeType in [ "application/json", "application/xml", "application/javascript" ]: # Download the file content - content_endpoint = f"sites/{site_id}/drive/items/{file_id}/content" + contentEndpoint = f"sites/{siteId}/drive/items/{fileId}/content" # For content download, we need to handle binary data try: async with aiohttp.ClientSession() as session: headers = {"Authorization": f"Bearer {self.services.sharepoint._target.accessToken}"} - async with session.get(f"https://graph.microsoft.com/v1.0/{content_endpoint}", headers=headers) as response: + async with session.get(f"https://graph.microsoft.com/v1.0/{contentEndpoint}", headers=headers) as response: if response.status == 200: content = await response.text() - result_item["content"] = content + resultItem["content"] = content else: - result_item["content"] = f"Could not download content: HTTP {response.status}" + resultItem["content"] = f"Could not download content: HTTP {response.status}" except Exception as e: - result_item["content"] = f"Error downloading content: {str(e)}" + resultItem["content"] = f"Error downloading content: {str(e)}" else: - result_item["content"] = f"Binary file type ({mime_type}) - content not retrieved" + resultItem["content"] = f"Binary file type ({mimeType}) - content not retrieved" - read_results.append(result_item) - file_found = True + readResults.append(resultItem) + fileFound = True break - if file_found: + if fileFound: break - if not file_found: - read_results.append({ + if not fileFound: + readResults.append({ "fileId": fileId, "fileName": fileName, "error": "File not found in any accessible SharePoint site", @@ -1062,20 +1468,20 @@ class MethodSharepoint(MethodBase): except Exception as e: logger.error(f"Error reading document {chatDocument.fileName}: {str(e)}") - read_results.append({ + readResults.append({ "fileId": chatDocument.fileId, "fileName": chatDocument.fileName, "error": str(e), "content": None }) - result_data = { + resultData = { "connectionReference": connectionReference, "pathQuery": pathQuery, "documentList": documentList, "includeMetadata": includeMetadata, "sitesSearched": len(sites), - "readResults": read_results, + "readResults": readResults, "connection": { "id": connection["id"], "authority": "microsoft", @@ -1085,17 +1491,17 @@ class MethodSharepoint(MethodBase): } # Use default JSON format for output - output_extension = ".json" # Default - output_mime_type = "application/json" # Default + outputExtension = ".json" # Default + outputMimeType = "application/json" # Default return ActionResult( success=True, documents=[ ActionDocument( - documentName=f"sharepoint_documents_{self._format_timestamp_for_filename()}{output_extension}", - documentData=json.dumps(result_data, indent=2), - mimeType=output_mime_type + documentName=f"sharepoint_documents_{self._format_timestamp_for_filename()}{outputExtension}", + documentData=json.dumps(resultData, indent=2), + mimeType=outputMimeType ) ] ) @@ -1128,7 +1534,7 @@ class MethodSharepoint(MethodBase): documentList = [documentList] pathObject = parameters.get("pathObject") - upload_path = pathQuery + uploadPath = pathQuery logger.debug(f"Using pathQuery: {pathQuery}") if not connectionReference or not documentList: @@ -1138,63 +1544,63 @@ class MethodSharepoint(MethodBase): if pathObject: try: # Resolve the reference label to get the actual document list - document_list = self.services.chat.getChatDocumentsFromDocumentList([pathObject]) - if not 
document_list or len(document_list) == 0: + documentList = self.services.chat.getChatDocumentsFromDocumentList([pathObject]) + if not documentList or len(documentList) == 0: return ActionResult.isFailure(error=f"No document list found for reference: {pathObject}") # Get the first document's content (which should be the JSON) - first_document = document_list[0] - file_data = self.services.chat.getFileData(first_document.fileId) - if not file_data: + firstDocument = documentList[0] + fileData = self.services.chat.getFileData(firstDocument.fileId) + if not fileData: return ActionResult.isFailure(error=f"No file data found for document: {pathObject}") # Parse the JSON content - result_data = json.loads(file_data) + resultData = json.loads(fileData) # Debug: Log the structure of the result document - logger.info(f"Result document keys: {list(result_data.keys())}") + logger.info(f"Result document keys: {list(resultData.keys())}") # Handle different result document formats - found_documents = [] + foundDocuments = [] # Check if it's a direct SharePoint result (has foundDocuments) - if "foundDocuments" in result_data: - found_documents = result_data.get("foundDocuments", []) - logger.info(f"Found {len(found_documents)} documents in foundDocuments array") + if "foundDocuments" in resultData: + foundDocuments = resultData.get("foundDocuments", []) + logger.info(f"Found {len(foundDocuments)} documents in foundDocuments array") # Check if it's an AI validation result (has result string with validationReport) - elif "result" in result_data and "validationReport" in result_data["result"]: + elif "result" in resultData and "validationReport" in resultData["result"]: try: # Parse the nested JSON in the result field - nested_result = json.loads(result_data["result"]) - validation_report = nested_result.get("validationReport", {}) - document_details = validation_report.get("documentDetails", {}) + nestedResult = json.loads(resultData["result"]) + validationReport = nestedResult.get("validationReport", {}) + documentDetails = validationReport.get("documentDetails", {}) - if document_details: + if documentDetails: # Convert the single document details to the expected format doc = { - "id": document_details.get("id"), - "name": document_details.get("name"), - "type": document_details.get("type", "").lower(), # Convert "Folder" to "folder" - "siteName": document_details.get("siteName"), - "siteId": document_details.get("siteId"), - "fullPath": document_details.get("fullPath"), - "webUrl": document_details.get("webUrl", ""), - "parentPath": document_details.get("parentPath", "") + "id": documentDetails.get("id"), + "name": documentDetails.get("name"), + "type": documentDetails.get("type", "").lower(), # Convert "Folder" to "folder" + "siteName": documentDetails.get("siteName"), + "siteId": documentDetails.get("siteId"), + "fullPath": documentDetails.get("fullPath"), + "webUrl": documentDetails.get("webUrl", ""), + "parentPath": documentDetails.get("parentPath", "") } - found_documents = [doc] + foundDocuments = [doc] logger.info(f"Extracted 1 document from validation report") except json.JSONDecodeError as e: logger.error(f"Failed to parse nested JSON in result field: {e}") return ActionResult.isFailure(error=f"Invalid nested JSON in pathObject: {str(e)}") # Debug: Log what we found in the result document - logger.info(f"Result document contains {len(found_documents)} documents") - for i, doc in enumerate(found_documents): + logger.info(f"Result document contains {len(foundDocuments)} documents") + for i, doc in 
enumerate(foundDocuments): logger.info(f" Document {i+1}: name='{doc.get('name')}', type='{doc.get('type')}', id='{doc.get('id')}'") # Extract folder information from the result folders = [] - for doc in found_documents: + for doc in foundDocuments: if doc.get("type") == "folder": folders.append(doc) @@ -1202,25 +1608,25 @@ class MethodSharepoint(MethodBase): if folders: # Use the first folder found - prefer folder ID for direct API calls - first_folder = folders[0] - if first_folder.get("id"): + firstFolder = folders[0] + if firstFolder.get("id"): # Use folder ID directly for most reliable API calls - upload_path = first_folder.get("id") - logger.info(f"Using folder ID from pathObject: {upload_path}") - elif first_folder.get("fullPath"): + uploadPath = firstFolder.get("id") + logger.info(f"Using folder ID from pathObject: {uploadPath}") + elif firstFolder.get("fullPath"): # Extract the correct path portion from fullPath by removing site name - full_path = first_folder.get("fullPath") + fullPath = firstFolder.get("fullPath") # fullPath format: \\SiteName\\Library\\Folder\\SubFolder # We need to remove the first two parts (\\SiteName\\) to get the actual folder path - path_parts = full_path.lstrip('\\').split('\\') - if len(path_parts) > 1: + pathParts = fullPath.lstrip('\\').split('\\') + if len(pathParts) > 1: # Remove the first part (site name) and reconstruct the path - actual_path = '\\'.join(path_parts[1:]) - upload_path = actual_path - logger.info(f"Extracted path from fullPath: {upload_path}") + actualPath = '\\'.join(pathParts[1:]) + uploadPath = actualPath + logger.info(f"Extracted path from fullPath: {uploadPath}") else: - upload_path = full_path - logger.info(f"Using full path from pathObject (no site name to remove): {upload_path}") + uploadPath = fullPath + logger.info(f"Using full path from pathObject (no site name to remove): {uploadPath}") else: return ActionResult.isFailure(error="No valid folder information found in pathObject") else: @@ -1250,19 +1656,19 @@ class MethodSharepoint(MethodBase): # Extract site information from the pathObject result try: # Get the site information from the first folder in pathObject - if 'found_documents' in locals() and found_documents: - first_folder = found_documents[0] - site_name = first_folder.get("siteName") - site_id = first_folder.get("siteId") + if 'foundDocuments' in locals() and foundDocuments: + firstFolder = foundDocuments[0] + siteName = firstFolder.get("siteName") + siteId = firstFolder.get("siteId") - if site_name and site_id: + if siteName and siteId: # Use the specific site from pathObject instead of discovering all sites sites = [{ - "id": site_id, - "displayName": site_name, - "webUrl": first_folder.get("webUrl", "") + "id": siteId, + "displayName": siteName, + "webUrl": firstFolder.get("webUrl", "") }] - logger.info(f"Using specific site from pathObject: {site_name} (ID: {site_id})") + logger.info(f"Using specific site from pathObject: {siteName} (ID: {siteId})") else: # Site info missing from pathObject - this is an error, not a fallback return ActionResult.isFailure(error="Site information missing from pathObject. Cannot determine target site for upload.") @@ -1274,73 +1680,106 @@ class MethodSharepoint(MethodBase): return ActionResult.isFailure(error=f"Error processing pathObject: {str(e)}. 
Cannot determine target site for upload.") else: # No pathObject provided - check if pathQuery is valid - if not upload_path or upload_path.strip() == "" or upload_path.strip() == "*": + if not uploadPath or uploadPath.strip() == "" or uploadPath.strip() == "*": return ActionResult.isFailure(error="No valid upload path provided. Either provide pathObject (from findDocumentPath) or a valid pathQuery with specific site information.") # Validate pathQuery format - if not upload_path.startswith('/'): - return ActionResult.isFailure(error="pathQuery must start with '/' and include site name with syntax /site:/... e.g. /site:KM LayerFinance/Documents/Work") + if not uploadPath.startswith('/'): + return ActionResult.isFailure(error="pathQuery must start with '/' and include site name with Microsoft-standard syntax /sites//... e.g. /sites/company-share/Freigegebene Dokumente/Work") - # Check if upload_path contains search terms (words without proper path structure) - if not upload_path.startswith('/site:') and not upload_path.startswith('/Documents') and not upload_path.startswith('/Shared Documents'): - return ActionResult.isFailure(error=f"Invalid pathQuery '{upload_path}'. This appears to be search terms, not a valid SharePoint path. Use findDocumentPath action first to search for folders, then use the returned folder path as pathQuery.") + # Check if uploadPath contains search terms (words without proper path structure) + validPathPrefixes = ['/sites/', '/Documents', '/documents', '/Shared Documents', '/shared documents'] + if not any(uploadPath.startswith(prefix) for prefix in validPathPrefixes): + return ActionResult.isFailure(error=f"Invalid pathQuery '{uploadPath}'. This appears to be search terms, not a valid SharePoint path. Use findDocumentPath action first to search for folders, then use the returned folder path as pathQuery.") - # For pathQuery, we need to discover sites to find the specific one - sites = await self._discoverSharePointSites() - if not sites: - return ActionResult.isFailure(error="No SharePoint sites found or accessible") + # If uploadPath starts with Microsoft-standard /sites/, try to get site directly + directSite = None + if uploadPath.startswith('/sites/'): + parsedPath = self._extractSiteFromStandardPath(uploadPath) + if parsedPath: + siteName = parsedPath.get("siteName") + # Try to get site directly by path (optimization - no need to load all 60 sites) + directSite = await self._getSiteByStandardPath(siteName) + if directSite: + logger.info(f"Got site directly by standard path - no need to discover all sites") + sites = [directSite] + else: + logger.warning(f"Could not get site directly, falling back to site discovery") + + # If we didn't get the site directly, use discovery and filtering + if not directSite: + # For pathQuery, we need to discover sites to find the specific one + allSites = await self._discoverSharePointSites() + if not allSites: + return ActionResult.isFailure(error="No SharePoint sites found or accessible") + + # If uploadPath starts with Microsoft-standard /sites/, extract site name and filter + if uploadPath.startswith('/sites/'): + parsedPath = self._extractSiteFromStandardPath(uploadPath) + if parsedPath: + siteName = parsedPath.get("siteName") + # Filter sites by name (case-insensitive substring match) + sites = self._filterSitesByHint(allSites, siteName) + if not sites: + return ActionResult.isFailure(error=f"No SharePoint site found matching '{siteName}'") + logger.info(f"Filtered to site(s) matching '{siteName}': {[s['displayName'] for 
s in sites]}") + else: + sites = allSites + else: + sites = allSites if not sites: return ActionResult.isFailure(error="No valid target site determined for upload") # Process upload paths based on whether pathObject was provided - upload_site_scope = None + uploadSiteScope = None if not pathObject: # Parse the validated pathQuery to extract site and path information - parsed = self._parse_site_scoped_path(upload_path) - if not parsed: - return ActionResult.isFailure(error="Invalid upload_path. Use /site:/") + parsed = self._extractSiteFromStandardPath(uploadPath) - # Find matching site - candidate_sites = self._filter_sites_by_hint(sites, parsed["siteName"]) # substring match + if not parsed: + return ActionResult.isFailure(error="Invalid uploadPath. Use Microsoft-standard /sites//") + + # Find matching site (already filtered above, but ensure we have the right one) + candidateSites = self._filterSitesByHint(sites, parsed["siteName"]) # substring match # Choose exact displayName match if available - exact = [s for s in candidate_sites if (s.get("displayName") or "").strip().lower() == parsed["siteName"].strip().lower()] - selected_site = exact[0] if exact else (candidate_sites[0] if candidate_sites else None) - if not selected_site: + exact = [s for s in candidateSites if (s.get("displayName") or "").strip().lower() == parsed["siteName"].strip().lower()] + selectedSite = exact[0] if exact else (candidateSites[0] if candidateSites else None) + if not selectedSite: return ActionResult.isFailure(error=f"SharePoint site '{parsed['siteName']}' not found or not accessible") - upload_site_scope = selected_site + uploadSiteScope = selectedSite # Use the inner path portion as the actual upload target path # Remove document library name from path (same logic as listDocuments) - inner_path = parsed['innerPath'].lstrip('/') - path_segments = [s for s in inner_path.split('/') if s.strip()] - if len(path_segments) > 1: + innerPath = parsed.get('innerPath', '').lstrip('/') + pathSegments = [s for s in innerPath.split('/') if s.strip()] + if len(pathSegments) > 1: # Path has multiple segments - first might be a library name # Try without first segment (assuming it's a library name) - inner_path = '/'.join(path_segments[1:]) - logger.info(f"Removed first path segment (potential library name), path changed from '{parsed['innerPath']}' to '{inner_path}'") - elif len(path_segments) == 1: + innerPath = '/'.join(pathSegments[1:]) + logger.info(f"Removed first path segment (potential library name), path changed from '{parsed['innerPath']}' to '{innerPath}'") + elif len(pathSegments) == 1: # Only one segment - if it's a common library-like name, use empty path (root) - first_segment_lower = path_segments[0].lower() - library_indicators = ['document', 'dokument', 'shared', 'freigegeben', 'library', 'bibliothek'] - if any(indicator in first_segment_lower for indicator in library_indicators): - inner_path = '' - logger.info(f"First segment '{path_segments[0]}' appears to be a library name, using root") + firstSegmentLower = pathSegments[0].lower() + libraryIndicators = ['document', 'dokument', 'shared', 'freigegeben', 'library', 'bibliothek'] + if any(indicator in firstSegmentLower for indicator in libraryIndicators): + innerPath = '' + logger.info(f"First segment '{pathSegments[0]}' appears to be a library name, using root") - upload_paths = [f"/{inner_path}" if inner_path else "/"] - sites = [selected_site] + uploadPaths = [f"/{innerPath}" if innerPath else "/"] + sites = [selectedSite] else: - # When using 
pathObject, check if upload_path is a folder ID or a path - if upload_path.startswith('01PPXICCB') or upload_path.startswith('01'): + # When using pathObject, check if uploadPath is a folder ID or a path + if uploadPath.startswith('01PPXICCB') or uploadPath.startswith('01'): # It's a folder ID - use it directly - upload_paths = [upload_path] - logger.info(f"Using folder ID directly for upload: {upload_path}") + uploadPaths = [uploadPath] + logger.info(f"Using folder ID directly for upload: {uploadPath}") else: # It's a path - resolve it normally - upload_paths = self._resolvePathQuery(upload_path) + uploadPaths = self._resolvePathQuery(uploadPath) # Process each document upload - upload_results = [] + uploadResults = [] # Extract file names from documents fileNames = [doc.fileName for doc in chatDocuments] @@ -1349,11 +1788,11 @@ class MethodSharepoint(MethodBase): for i, (chatDocument, fileName) in enumerate(zip(chatDocuments, fileNames)): try: fileId = chatDocument.fileId - file_data = self.services.chat.getFileData(fileId) + fileData = self.services.chat.getFileData(fileId) - if not file_data: + if not fileData: logger.warning(f"File data not found for fileId: {fileId}") - upload_results.append({ + uploadResults.append({ "fileName": fileName, "fileId": fileId, "error": "File data not found", @@ -1362,77 +1801,77 @@ class MethodSharepoint(MethodBase): continue # Upload to the first available site (or could be made configurable) - upload_successful = False + uploadSuccessful = False for site in sites: - site_id = site["id"] - site_name = site["displayName"] - site_url = site["webUrl"] + siteId = site["id"] + siteName = site["displayName"] + siteUrl = site["webUrl"] # Use the first upload path or default to Documents - upload_path = upload_paths[0] if upload_paths else "/Documents" + uploadPath = uploadPaths[0] if uploadPaths else "/Documents" # Handle wildcard paths - replace with default Documents folder - if upload_path == "*": - upload_path = "/Documents" + if uploadPath == "*": + uploadPath = "/Documents" logger.warning(f"Wildcard path '*' detected, using default '/Documents' folder for upload") - # Check if upload_path is a folder ID or a regular path - if upload_path.startswith('01PPXICCB') or upload_path.startswith('01'): + # Check if uploadPath is a folder ID or a regular path + if uploadPath.startswith('01PPXICCB') or uploadPath.startswith('01'): # It's a folder ID - use the folder-specific upload endpoint - upload_endpoint = f"sites/{site_id}/drive/items/{upload_path}:/{fileName}:/content" - logger.info(f"Using folder ID upload endpoint: {upload_endpoint}") + uploadEndpoint = f"sites/{siteId}/drive/items/{uploadPath}:/{fileName}:/content" + logger.info(f"Using folder ID upload endpoint: {uploadEndpoint}") else: # It's a regular path - use the root-based upload endpoint - upload_path = upload_path.rstrip('/') + '/' + fileName - upload_path_clean = upload_path.lstrip('/') - upload_endpoint = f"sites/{site_id}/drive/root:/{upload_path_clean}:/content" - logger.info(f"Using path-based upload endpoint: {upload_endpoint}") + uploadPath = uploadPath.rstrip('/') + '/' + fileName + uploadPathClean = uploadPath.lstrip('/') + uploadEndpoint = f"sites/{siteId}/drive/root:/{uploadPathClean}:/content" + logger.info(f"Using path-based upload endpoint: {uploadEndpoint}") # Upload endpoint for small files (< 4MB) - if len(file_data) < 4 * 1024 * 1024: # 4MB + if len(fileData) < 4 * 1024 * 1024: # 4MB # Upload the file - upload_result = await self._makeGraphApiCall( - upload_endpoint, + 
uploadResult = await self._makeGraphApiCall( + uploadEndpoint, method="PUT", - data=file_data + data=fileData ) - if "error" not in upload_result: - upload_results.append({ + if "error" not in uploadResult: + uploadResults.append({ "fileName": fileName, "fileId": fileId, "uploadStatus": "success", - "siteName": site_name, - "siteUrl": site_url, - "uploadPath": upload_path, - "uploadEndpoint": upload_endpoint, - "sharepointFileId": upload_result.get("id"), - "webUrl": upload_result.get("webUrl"), - "size": upload_result.get("size"), - "createdDateTime": upload_result.get("createdDateTime") + "siteName": siteName, + "siteUrl": siteUrl, + "uploadPath": uploadPath, + "uploadEndpoint": uploadEndpoint, + "sharepointFileId": uploadResult.get("id"), + "webUrl": uploadResult.get("webUrl"), + "size": uploadResult.get("size"), + "createdDateTime": uploadResult.get("createdDateTime") }) - upload_successful = True + uploadSuccessful = True break else: - logger.warning(f"Upload failed to site {site_name}: {upload_result['error']}") + logger.warning(f"Upload failed to site {siteName}: {uploadResult['error']}") else: # For large files, we would need to implement resumable upload - logger.warning(f"File too large ({len(file_data)} bytes) for site {site_name}") + logger.warning(f"File too large ({len(fileData)} bytes) for site {siteName}") continue - if not upload_successful: - upload_results.append({ + if not uploadSuccessful: + uploadResults.append({ "fileName": fileName, "fileId": fileId, - "error": f"File too large ({len(file_data)} bytes) or upload failed to all sites. Files larger than 4MB require resumable upload (not implemented).", + "error": f"File too large ({len(fileData)} bytes) or upload failed to all sites. Files larger than 4MB require resumable upload (not implemented).", "uploadStatus": "failed" }) except Exception as e: logger.error(f"Error uploading document {fileName}: {str(e)}") - upload_results.append({ + uploadResults.append({ "fileName": fileName, "fileId": fileId, "error": str(e), @@ -1440,13 +1879,13 @@ class MethodSharepoint(MethodBase): }) # Create result data - result_data = { + resultData = { "connectionReference": connectionReference, - "pathQuery": upload_path, + "pathQuery": uploadPath, "documentList": documentList, "fileNames": fileNames, "sitesAvailable": len(sites), - "uploadResults": upload_results, + "uploadResults": uploadResults, "connection": { "id": connection["id"], "authority": "microsoft", @@ -1456,17 +1895,17 @@ class MethodSharepoint(MethodBase): } # Use default JSON format for output - output_extension = ".json" # Default - output_mime_type = "application/json" # Default + outputExtension = ".json" # Default + outputMimeType = "application/json" # Default return ActionResult( success=True, documents=[ ActionDocument( - documentName=f"sharepoint_upload_{self._format_timestamp_for_filename()}{output_extension}", - documentData=json.dumps(result_data, indent=2), - mimeType=output_mime_type + documentName=f"sharepoint_upload_{self._format_timestamp_for_filename()}{outputExtension}", + documentData=json.dumps(resultData, indent=2), + mimeType=outputMimeType ) ] ) @@ -1498,7 +1937,7 @@ class MethodSharepoint(MethodBase): pathQuery = parameters.get("pathQuery") includeSubfolders = parameters.get("includeSubfolders", False) # Default to False for better UX - list_query = pathQuery + listQuery = pathQuery logger.info(f"Using pathQuery: {pathQuery}") if not connectionReference: @@ -1511,65 +1950,65 @@ class MethodSharepoint(MethodBase): logger.debug(f"Both pathObject 
and pathQuery provided - using pathObject (pathQuery '{pathQuery}' will be ignored)") try: # Resolve the reference label to get the actual document list - document_list = self.services.chat.getChatDocumentsFromDocumentList([pathObject]) - if not document_list or len(document_list) == 0: + documentList = self.services.chat.getChatDocumentsFromDocumentList([pathObject]) + if not documentList or len(documentList) == 0: return ActionResult.isFailure(error=f"No document list found for reference: {pathObject}") # Get the first document's content (which should be the JSON) - first_document = document_list[0] - logger.info(f"Document fileId: {first_document.fileId}, fileName: {first_document.fileName}") - file_data = self.services.chat.getFileData(first_document.fileId) - if not file_data: - return ActionResult.isFailure(error=f"No file data found for document: {pathObject} (fileId: {first_document.fileId})") - logger.info(f"File data length: {len(file_data) if file_data else 0}") + firstDocument = documentList[0] + logger.info(f"Document fileId: {firstDocument.fileId}, fileName: {firstDocument.fileName}") + fileData = self.services.chat.getFileData(firstDocument.fileId) + if not fileData: + return ActionResult.isFailure(error=f"No file data found for document: {pathObject} (fileId: {firstDocument.fileId})") + logger.info(f"File data length: {len(fileData) if fileData else 0}") # Parse the JSON content - result_data = json.loads(file_data) + resultData = json.loads(fileData) # Debug: Log the structure of the result document - logger.info(f"Result document keys: {list(result_data.keys())}") + logger.info(f"Result document keys: {list(resultData.keys())}") # Handle different result document formats - found_documents = [] + foundDocuments = [] # Check if it's a direct SharePoint result (has foundDocuments) - if "foundDocuments" in result_data: - found_documents = result_data.get("foundDocuments", []) - logger.info(f"Found {len(found_documents)} documents in foundDocuments array") + if "foundDocuments" in resultData: + foundDocuments = resultData.get("foundDocuments", []) + logger.info(f"Found {len(foundDocuments)} documents in foundDocuments array") # Check if it's an AI validation result (has result string with validationReport) - elif "result" in result_data and "validationReport" in result_data["result"]: + elif "result" in resultData and "validationReport" in resultData["result"]: try: # Parse the nested JSON in the result field - nested_result = json.loads(result_data["result"]) - validation_report = nested_result.get("validationReport", {}) - document_details = validation_report.get("documentDetails", {}) + nestedResult = json.loads(resultData["result"]) + validationReport = nestedResult.get("validationReport", {}) + documentDetails = validationReport.get("documentDetails", {}) - if document_details: + if documentDetails: # Convert the single document details to the expected format doc = { - "id": document_details.get("id"), - "name": document_details.get("name"), - "type": document_details.get("type", "").lower(), # Convert "Folder" to "folder" - "siteName": document_details.get("siteName"), - "siteId": document_details.get("siteId"), - "fullPath": document_details.get("fullPath"), - "webUrl": document_details.get("webUrl", ""), - "parentPath": document_details.get("parentPath", "") + "id": documentDetails.get("id"), + "name": documentDetails.get("name"), + "type": documentDetails.get("type", "").lower(), # Convert "Folder" to "folder" + "siteName": documentDetails.get("siteName"), + "siteId": 
documentDetails.get("siteId"), + "fullPath": documentDetails.get("fullPath"), + "webUrl": documentDetails.get("webUrl", ""), + "parentPath": documentDetails.get("parentPath", "") } - found_documents = [doc] + foundDocuments = [doc] logger.info(f"Extracted 1 document from validation report") except ValueError as e: logger.error(f"Failed to parse nested JSON in result field: {e}") return ActionResult.isFailure(error=f"Invalid nested JSON in pathObject: {str(e)}") # Debug: Log what we found in the result document - logger.info(f"Result document contains {len(found_documents)} documents") - for i, doc in enumerate(found_documents): + logger.info(f"Result document contains {len(foundDocuments)} documents") + for i, doc in enumerate(foundDocuments): logger.info(f" Document {i+1}: name='{doc.get('name')}', type='{doc.get('type')}', id='{doc.get('id')}'") # Extract folder information from the result folders = [] - for doc in found_documents: + for doc in foundDocuments: if doc.get("type") == "folder": folders.append(doc) @@ -1577,25 +2016,25 @@ class MethodSharepoint(MethodBase): if folders: # Use the first folder found - prefer folder ID for direct API calls - first_folder = folders[0] - if first_folder.get("id"): + firstFolder = folders[0] + if firstFolder.get("id"): # Use folder ID directly for most reliable API calls - list_query = first_folder.get("id") - logger.info(f"Using folder ID from pathObject: {list_query}") - elif first_folder.get("fullPath"): + listQuery = firstFolder.get("id") + logger.info(f"Using folder ID from pathObject: {listQuery}") + elif firstFolder.get("fullPath"): # Extract the correct path portion from fullPath by removing site name - full_path = first_folder.get("fullPath") + fullPath = firstFolder.get("fullPath") # fullPath format: \\SiteName\\Library\\Folder\\SubFolder # We need to remove the first two parts (\\SiteName\\) to get the actual folder path - path_parts = full_path.lstrip('\\').split('\\') - if len(path_parts) > 1: + pathParts = fullPath.lstrip('\\').split('\\') + if len(pathParts) > 1: # Remove the first part (site name) and reconstruct the path - actual_path = '\\'.join(path_parts[1:]) - list_query = actual_path - logger.info(f"Extracted path from fullPath: {list_query}") + actualPath = '\\'.join(pathParts[1:]) + listQuery = actualPath + logger.info(f"Extracted path from fullPath: {listQuery}") else: - list_query = full_path - logger.info(f"Using full path from pathObject (no site name to remove): {list_query}") + listQuery = fullPath + logger.info(f"Using full path from pathObject (no site name to remove): {listQuery}") else: return ActionResult.isFailure(error="No valid folder information found in pathObject") else: @@ -1611,19 +2050,11 @@ class MethodSharepoint(MethodBase): if not connection: return ActionResult.isFailure(error="No valid Microsoft connection found for the provided connection reference") - logger.info(f"Starting SharePoint listDocuments for list_query: {list_query}") + logger.info(f"Starting SharePoint listDocuments for listQuery: {listQuery}") logger.debug(f"Connection ID: {connection['id']}") - # For listDocuments, if pathQuery starts with /site:, use it directly without parsing - # (parsing would split on the colon and break the site name) - if list_query and list_query.strip().startswith('/site:'): - pathQuery = list_query.strip() - fileQuery = "*" - searchType = "all" - searchOptions = {} - else: - # Parse list_query to extract path, search terms, search type, and options - pathQuery, fileQuery, searchType, searchOptions = 
self._parseSearchQuery(list_query) + # Parse listQuery to extract path, search terms, search type, and options + pathQuery, fileQuery, searchType, searchOptions = self._parseSearchQuery(listQuery) # Determine sites to use - strict validation: pathObject → pathQuery → ERROR sites = None @@ -1634,19 +2065,19 @@ # Extract site information from the pathObject result try: # Get the site information from the first folder in pathObject - if 'found_documents' in locals() and found_documents: - first_folder = found_documents[0] - site_name = first_folder.get("siteName") - site_id = first_folder.get("siteId") + if 'foundDocuments' in locals() and foundDocuments: + firstFolder = foundDocuments[0] + siteName = firstFolder.get("siteName") + siteId = firstFolder.get("siteId") - if site_name and site_id: + if siteName and siteId: # Use the specific site from pathObject instead of discovering all sites sites = [{ - "id": site_id, - "displayName": site_name, - "webUrl": first_folder.get("webUrl", "") + "id": siteId, + "displayName": siteName, + "webUrl": firstFolder.get("webUrl", "") }] - logger.info(f"Using specific site from pathObject: {site_name} (ID: {site_id})") + logger.info(f"Using specific site from pathObject: {siteName} (ID: {siteId})") else: # Site info missing from pathObject - this is an error return ActionResult.isFailure(error="Site information missing from pathObject. Cannot determine target site for list operation.") @@ -1661,34 +2092,48 @@ elif pathQuery and pathQuery.strip() != "" and pathQuery.strip() != "*": # Validate pathQuery format if not pathQuery.startswith('/'): - return ActionResult.isFailure(error="pathQuery must start with '/' and include site name with syntax /site:<siteName>/... e.g. /site:KM LayerFinance/Documents/Work") + return ActionResult.isFailure(error="pathQuery must start with '/' and include site name with Microsoft-standard syntax /sites/<siteName>/... e.g. /sites/company-share/Freigegebene Dokumente/Work") # Check if pathQuery contains search terms (words without proper path structure) - valid_path_prefixes = ['/site:', '/Documents', '/documents', '/Shared Documents', '/shared documents'] - if not any(pathQuery.startswith(prefix) for prefix in valid_path_prefixes): + validPathPrefixes = ['/sites/', '/Documents', '/documents', '/Shared Documents', '/shared documents'] + if not any(pathQuery.startswith(prefix) for prefix in validPathPrefixes): return ActionResult.isFailure(error=f"Invalid pathQuery '{pathQuery}'. This appears to be search terms, not a valid SharePoint path. 
Use findDocumentPath action first to search for folders, then use the returned folder path as pathQuery.") - # For pathQuery, we need to discover sites to find the specific one - all_sites = await self._discoverSharePointSites() - if not all_sites: - return ActionResult.isFailure(error="No SharePoint sites found or accessible") + # If pathQuery starts with Microsoft-standard /sites/, try to get site directly + directSite = None + if pathQuery.startswith('/sites/'): + parsedPath = self._extractSiteFromStandardPath(pathQuery) + if parsedPath: + siteName = parsedPath.get("siteName") + # Try to get site directly by path (optimization - no need to load all 60 sites) + directSite = await self._getSiteByStandardPath(siteName) + if directSite: + logger.info(f"Got site directly by standard path - no need to discover all sites") + sites = [directSite] + else: + logger.warning(f"Could not get site directly, falling back to site discovery") - # If pathQuery starts with /site:, extract site name and filter - if pathQuery.startswith('/site:'): - # Extract site name from /site:Company Share/... format - site_path_part = pathQuery[6:] # Remove '/site:' - if '/' in site_path_part: - site_name = site_path_part.split('/', 1)[0] - else: - site_name = site_path_part + # If we didn't get the site directly, use discovery and filtering + if not directSite: + # For pathQuery, we need to discover sites to find the specific one + allSites = await self._discoverSharePointSites() + if not allSites: + return ActionResult.isFailure(error="No SharePoint sites found or accessible") - # Filter sites by name (case-insensitive substring match) - sites = self._filter_sites_by_hint(all_sites, site_name) - if not sites: - return ActionResult.isFailure(error=f"No SharePoint site found matching '{site_name}'") - logger.info(f"Filtered to site(s) matching '{site_name}': {[s['displayName'] for s in sites]}") - else: - sites = all_sites + # If pathQuery starts with Microsoft-standard /sites/, extract site name and filter + if pathQuery.startswith('/sites/'): + parsedPath = self._extractSiteFromStandardPath(pathQuery) + if parsedPath: + siteName = parsedPath.get("siteName") + # Filter sites by name (case-insensitive substring match) + sites = self._filterSitesByHint(allSites, siteName) + if not sites: + return ActionResult.isFailure(error=f"No SharePoint site found matching '{siteName}'") + logger.info(f"Filtered to site(s) matching '{siteName}': {[s['displayName'] for s in sites]}") + else: + sites = allSites + else: + sites = allSites else: # Step 3: Both pathObject and pathQuery failed - ERROR, NO FALLBACK return ActionResult.isFailure(error="No valid list path provided. Either provide pathObject (from findDocumentPath) or a valid pathQuery with specific site information.") @@ -1696,232 +2141,231 @@ class MethodSharepoint(MethodBase): if not sites: return ActionResult.isFailure(error="No valid target site determined for list operation") - # Check if list_query is a folder ID (starts with 01PPXICCB...) - if list_query.startswith('01PPXICCB') or list_query.startswith('01'): + # Check if listQuery is a folder ID (starts with 01PPXICCB...) 
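+ # Note: any ID matching '01PPXICCB' also starts with '01', so the second check alone decides here. + # Drive item IDs returned by Graph are opaque strings, so treating a leading '01' as "this is a folder ID" is a heuristic of this codebase, not a documented Graph guarantee.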
+ if listQuery.startswith('01PPXICCB') or listQuery.startswith('01'): # Direct folder ID - use it directly - folder_paths = [list_query] - logger.info(f"Using direct folder ID: {list_query}") + folderPaths = [listQuery] + logger.info(f"Using direct folder ID: {listQuery}") else: - # Remove /site:SiteName prefix from pathQuery before resolving (it's only for site filtering) + # Remove site prefix from pathQuery before resolving (it's only for site filtering) pathQueryForResolve = pathQuery - if pathQuery.startswith('/site:'): - # Remove /site:SiteName/ and keep the rest - site_path_part = pathQuery[6:] # Remove '/site:' - if '/' in site_path_part: - # Remove the site name part, keep the folder path - pathQueryForResolve = '/' + site_path_part.split('/', 1)[1] + # Microsoft-standard path: /sites/SiteName/Path -> /Path + if pathQuery.startswith('/sites/'): + parsedPath = self._extractSiteFromStandardPath(pathQuery) + if parsedPath: + innerPath = parsedPath.get("innerPath", "") + pathQueryForResolve = '/' + innerPath if innerPath else '/' else: - # Only site name, no path - use root pathQueryForResolve = '/' # Remove first path segment if it looks like a document library name # In SharePoint Graph API, /drive/root already points to the default document library, # so library names in paths should be removed # Generic approach: if path has multiple segments, store original for fallback - path_segments = [s for s in pathQueryForResolve.split('/') if s.strip()] - if len(path_segments) > 1: + pathSegments = [s for s in pathQueryForResolve.split('/') if s.strip()] + if len(pathSegments) > 1: # Path has multiple segments - first might be a library name # Store original for potential fallback - original_path = pathQueryForResolve + originalPath = pathQueryForResolve # Try without first segment (assuming it's a library name) - pathQueryForResolve = '/' + '/'.join(path_segments[1:]) - logger.info(f"Removed first path segment (potential library name), path changed from '{original_path}' to '{pathQueryForResolve}'") - elif len(path_segments) == 1: + pathQueryForResolve = '/' + '/'.join(pathSegments[1:]) + logger.info(f"Removed first path segment (potential library name), path changed from '{originalPath}' to '{pathQueryForResolve}'") + elif len(pathSegments) == 1: # Only one segment - if it's a common library-like name, use root - first_segment_lower = path_segments[0].lower() - library_indicators = ['document', 'dokument', 'shared', 'freigegeben', 'library', 'bibliothek'] - if any(indicator in first_segment_lower for indicator in library_indicators): + firstSegmentLower = pathSegments[0].lower() + libraryIndicators = ['document', 'dokument', 'shared', 'freigegeben', 'library', 'bibliothek'] + if any(indicator in firstSegmentLower for indicator in libraryIndicators): pathQueryForResolve = '/' - logger.info(f"First segment '{path_segments[0]}' appears to be a library name, using root") + logger.info(f"First segment '{pathSegments[0]}' appears to be a library name, using root") # Resolve path query into folder paths - folder_paths = self._resolvePathQuery(pathQueryForResolve) - logger.info(f"Resolved folder paths: {folder_paths}") + folderPaths = self._resolvePathQuery(pathQueryForResolve) + logger.info(f"Resolved folder paths: {folderPaths}") # Process each folder path across all sites - list_results = [] + listResults = [] - for folderPath in folder_paths: + for folderPath in folderPaths: try: - folder_results = [] + folderResults = [] for site in sites: - site_id = site["id"] - site_name = 
site["displayName"] - site_url = site["webUrl"] + siteId = site["id"] + siteName = site["displayName"] + siteUrl = site["webUrl"] - logger.info(f"Listing folder {folderPath} in site: {site_name}") + logger.info(f"Listing folder {folderPath} in site: {siteName}") # Determine the endpoint based on folder path if folderPath in ["/", ""] or folderPath == "*": # Root folder - endpoint = f"sites/{site_id}/drive/root/children" + endpoint = f"sites/{siteId}/drive/root/children" elif folderPath.startswith('01PPXICCB') or folderPath.startswith('01'): # Direct folder ID - endpoint = f"sites/{site_id}/drive/items/{folderPath}/children" + endpoint = f"sites/{siteId}/drive/items/{folderPath}/children" else: # Specific folder path - remove leading slash if present and URL encode - folder_path_clean = folderPath.lstrip('/') + folderPathClean = folderPath.lstrip('/') # URL encode the path for Graph API (spaces and special characters need encoding) - folder_path_encoded = urllib.parse.quote(folder_path_clean, safe='/') - endpoint = f"sites/{site_id}/drive/root:/{folder_path_encoded}:/children" + folderPathEncoded = urllib.parse.quote(folderPathClean, safe='/') + endpoint = f"sites/{siteId}/drive/root:/{folderPathEncoded}:/children" # Make the API call to list folder contents - api_result = await self._makeGraphApiCall(endpoint) + apiResult = await self._makeGraphApiCall(endpoint) - if "error" in api_result: - logger.warning(f"Failed to list folder {folderPath} in site {site_name}: {api_result['error']}") + if "error" in apiResult: + logger.warning(f"Failed to list folder {folderPath} in site {siteName}: {apiResult['error']}") continue # Process the results - items = api_result.get("value", []) - processed_items = [] + items = apiResult.get("value", []) + processedItems = [] for item in items: # Use improved folder detection logic - is_folder = False + isFolder = False if 'folder' in item: - is_folder = True + isFolder = True else: # Try to detect by URL pattern or other indicators - web_url = item.get('webUrl', '') + webUrl = item.get('webUrl', '') name = item.get('name', '') # Check if URL has no file extension and looks like a folder path - if '.' not in name and ('/' in web_url or '\\' in web_url): - is_folder = True + if '.' 
not in name and ('/' in webUrl or '\\' in webUrl): + isFolder = True - item_info = { + itemInfo = { "id": item.get("id"), "name": item.get("name"), "size": item.get("size", 0), "createdDateTime": item.get("createdDateTime"), "lastModifiedDateTime": item.get("lastModifiedDateTime"), "webUrl": item.get("webUrl"), - "type": "folder" if is_folder else "file", - "siteName": site_name, - "siteUrl": site_url + "type": "folder" if isFolder else "file", + "siteName": siteName, + "siteUrl": siteUrl } # Add file-specific information if "file" in item: - item_info.update({ + itemInfo.update({ "mimeType": item["file"].get("mimeType"), "downloadUrl": item.get("@microsoft.graph.downloadUrl") }) # Add folder-specific information if "folder" in item: - item_info.update({ + itemInfo.update({ "childCount": item["folder"].get("childCount", 0) }) - processed_items.append(item_info) + processedItems.append(itemInfo) # If include subfolders is enabled, get ONLY direct subfolder contents (1 level deep only) if includeSubfolders: - folder_items = [item for item in processed_items if item['type'] == 'folder'] - logger.info(f"Including subfolders - processing {len(folder_items)} folders") - subfolder_count = 0 - max_subfolders = 10 # Limit to prevent infinite loops + folderItems = [item for item in processedItems if item['type'] == 'folder'] + logger.info(f"Including subfolders - processing {len(folderItems)} folders") + subfolderCount = 0 + maxSubfolders = 10 # Limit to prevent infinite loops - for item in processed_items[:]: # Use slice to avoid modifying list during iteration - if item["type"] == "folder" and subfolder_count < max_subfolders: - subfolder_count += 1 - subfolder_path = f"{folderPath.rstrip('/')}/{item['name']}" - subfolder_endpoint = f"sites/{site_id}/drive/items/{item['id']}/children" + for item in processedItems[:]: # Use slice to avoid modifying list during iteration + if item["type"] == "folder" and subfolderCount < maxSubfolders: + subfolderCount += 1 + subfolderPath = f"{folderPath.rstrip('/')}/{item['name']}" + subfolderEndpoint = f"sites/{siteId}/drive/items/{item['id']}/children" logger.debug(f"Getting contents of subfolder: {item['name']}") - subfolder_result = await self._makeGraphApiCall(subfolder_endpoint) - if "error" not in subfolder_result: - subfolder_items = subfolder_result.get("value", []) - logger.debug(f"Found {len(subfolder_items)} items in subfolder {item['name']}") + subfolderResult = await self._makeGraphApiCall(subfolderEndpoint) + if "error" not in subfolderResult: + subfolderItems = subfolderResult.get("value", []) + logger.debug(f"Found {len(subfolderItems)} items in subfolder {item['name']}") - for subfolder_item in subfolder_items: + for subfolderItem in subfolderItems: # Use improved folder detection logic for subfolder items - subfolder_is_folder = False - if 'folder' in subfolder_item: - subfolder_is_folder = True + subfolderIsFolder = False + if 'folder' in subfolderItem: + subfolderIsFolder = True else: # Try to detect by URL pattern or other indicators - subfolder_web_url = subfolder_item.get('webUrl', '') - subfolder_name = subfolder_item.get('name', '') + subfolderWebUrl = subfolderItem.get('webUrl', '') + subfolderName = subfolderItem.get('name', '') # Check if URL has no file extension and looks like a folder path - if '.' not in subfolder_name and ('/' in subfolder_web_url or '\\' in subfolder_web_url): - subfolder_is_folder = True + if '.' 
not in subfolderName and ('/' in subfolderWebUrl or '\\' in subfolderWebUrl): + subfolderIsFolder = True # Only add files and direct subfolders, NO RECURSION - subfolder_item_info = { - "id": subfolder_item.get("id"), - "name": subfolder_item.get("name"), - "size": subfolder_item.get("size", 0), - "createdDateTime": subfolder_item.get("createdDateTime"), - "lastModifiedDateTime": subfolder_item.get("lastModifiedDateTime"), - "webUrl": subfolder_item.get("webUrl"), - "type": "folder" if subfolder_is_folder else "file", - "parentPath": subfolder_path, - "siteName": site_name, - "siteUrl": site_url + subfolderItemInfo = { + "id": subfolderItem.get("id"), + "name": subfolderItem.get("name"), + "size": subfolderItem.get("size", 0), + "createdDateTime": subfolderItem.get("createdDateTime"), + "lastModifiedDateTime": subfolderItem.get("lastModifiedDateTime"), + "webUrl": subfolderItem.get("webUrl"), + "type": "folder" if subfolderIsFolder else "file", + "parentPath": subfolderPath, + "siteName": siteName, + "siteUrl": siteUrl } - if "file" in subfolder_item: - subfolder_item_info.update({ - "mimeType": subfolder_item["file"].get("mimeType"), - "downloadUrl": subfolder_item.get("@microsoft.graph.downloadUrl") + if "file" in subfolderItem: + subfolderItemInfo.update({ + "mimeType": subfolderItem["file"].get("mimeType"), + "downloadUrl": subfolderItem.get("@microsoft.graph.downloadUrl") }) - processed_items.append(subfolder_item_info) + processedItems.append(subfolderItemInfo) else: - logger.warning(f"Failed to get contents of subfolder {item['name']}: {subfolder_result.get('error')}") - elif subfolder_count >= max_subfolders: - logger.warning(f"Reached maximum subfolder limit ({max_subfolders}), skipping remaining folders") + logger.warning(f"Failed to get contents of subfolder {item['name']}: {subfolderResult.get('error')}") + elif subfolderCount >= maxSubfolders: + logger.warning(f"Reached maximum subfolder limit ({maxSubfolders}), skipping remaining folders") break - logger.info(f"Processed {subfolder_count} subfolders, total items: {len(processed_items)}") + logger.info(f"Processed {subfolderCount} subfolders, total items: {len(processedItems)}") - folder_results.append({ - "siteName": site_name, - "siteUrl": site_url, - "itemCount": len(processed_items), - "items": processed_items + folderResults.append({ + "siteName": siteName, + "siteUrl": siteUrl, + "itemCount": len(processedItems), + "items": processedItems }) - list_results.append({ + listResults.append({ "folderPath": folderPath, - "sitesProcessed": len(folder_results), - "siteResults": folder_results + "sitesProcessed": len(folderResults), + "siteResults": folderResults }) except Exception as e: logger.error(f"Error listing folder {folderPath}: {str(e)}") - list_results.append({ + listResults.append({ "folderPath": folderPath, "error": str(e), "siteResults": [] }) # Create result data - result_data = { - "pathQuery": list_query, + resultData = { + "pathQuery": listQuery, "includeSubfolders": includeSubfolders, "sitesSearched": len(sites), - "listResults": list_results, + "listResults": listResults, "timestamp": self.services.utils.timestampGetUtc() } # Use default JSON format for output - output_extension = ".json" # Default - output_mime_type = "application/json" # Default + outputExtension = ".json" # Default + outputMimeType = "application/json" # Default return ActionResult( success=True, documents=[ ActionDocument( - documentName=f"sharepoint_document_list_{self._format_timestamp_for_filename()}{output_extension}", - 
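Note on the listing code above: endpoint selection follows three cases (drive root, DriveItem ID, path-addressed folder). A minimal standalone sketch of that routing; buildListChildrenEndpoint is a hypothetical name, and the "01" prefix check mirrors the heuristic in the diff, not a documented Graph API contract:

    import urllib.parse

    def buildListChildrenEndpoint(siteId: str, folderPath: str) -> str:
        # Case 1: root of the site's default drive
        if folderPath in ["/", ""] or folderPath == "*":
            return f"sites/{siteId}/drive/root/children"
        # Case 2: values starting with "01" are treated as DriveItem IDs (heuristic from the diff)
        if folderPath.startswith("01"):
            return f"sites/{siteId}/drive/items/{folderPath}/children"
        # Case 3: path-addressed folder; strip the leading slash and percent-encode, keeping "/" separators
        encoded = urllib.parse.quote(folderPath.lstrip("/"), safe="/")
        return f"sites/{siteId}/drive/root:/{encoded}:/children"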
diff --git a/modules/workflows/processing/core/taskPlanner.py b/modules/workflows/processing/core/taskPlanner.py
index 6a73d971..ec6d3bb7 100644
--- a/modules/workflows/processing/core/taskPlanner.py
+++ b/modules/workflows/processing/core/taskPlanner.py
@@ -37,16 +37,27 @@ class TaskPlanner:
         # Check workflow status before calling AI service
         checkWorkflowStopped(self.services)
-        
+
         # Analyze user intent to obtain cleaned user objective for planning
-        # This intent will be reused for workflow-level validation in executeTask
-        from modules.workflows.processing.adaptive import IntentAnalyzer
-        intentAnalyzer = IntentAnalyzer(self.services)
-        workflowIntent = await intentAnalyzer.analyzeUserIntent(actualUserPrompt, None)
-        # Store workflow intent for reuse in executeTask (avoid redundant analysis)
-        if not hasattr(workflow, '_workflowIntent'):
-            workflow._workflowIntent = workflowIntent
-        cleanedObjective = workflowIntent.get('primaryGoal', actualUserPrompt) if isinstance(workflowIntent, dict) else actualUserPrompt
+        # SKIP intent analysis for AUTOMATION mode - it uses predefined JSON plans
+        from modules.datamodels.datamodelChat import WorkflowModeEnum
+        workflowMode = getattr(workflow, 'workflowMode', None)
+        skipIntentionAnalysis = (workflowMode == WorkflowModeEnum.WORKFLOW_AUTOMATION)
+
+        if skipIntentionAnalysis:
+            logger.info("Skipping intent analysis for AUTOMATION mode - using direct user input")
+            # For automation mode, use user input directly without intent analysis
+            cleanedObjective = actualUserPrompt
+            workflowIntent = None
+        else:
+            # This intent will be reused for workflow-level validation in executeTask
+            from modules.workflows.processing.adaptive import IntentAnalyzer
+            intentAnalyzer = IntentAnalyzer(self.services)
+            workflowIntent = await intentAnalyzer.analyzeUserIntent(actualUserPrompt, None)
+            # Store workflow intent for reuse in executeTask (avoid redundant analysis)
+            if not hasattr(workflow, '_workflowIntent'):
+                workflow._workflowIntent = workflowIntent
+            cleanedObjective = workflowIntent.get('primaryGoal', actualUserPrompt) if isinstance(workflowIntent, dict) else actualUserPrompt
 
         # Create proper context object for task planning using cleaned intent
         # For task planning, we need to create a minimal TaskStep since TaskContext requires it
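Note on the taskPlanner change above: it reduces to a single mode gate in front of the IntentAnalyzer call, so AUTOMATION runs never pay for AI intent analysis. A condensed sketch of that control flow; the string comparison stands in for WorkflowModeEnum.WORKFLOW_AUTOMATION, and analyzeIntent stands in for the real IntentAnalyzer:

    async def resolveObjective(workflow, actualUserPrompt, analyzeIntent):
        # AUTOMATION workflows carry a predefined JSON plan, so the raw prompt is the objective
        if getattr(workflow, 'workflowMode', None) == 'WORKFLOW_AUTOMATION':
            return actualUserPrompt
        workflowIntent = await analyzeIntent(actualUserPrompt)
        # Cache the intent on the workflow so executeTask can reuse it later
        if not hasattr(workflow, '_workflowIntent'):
            workflow._workflowIntent = workflowIntent
        if isinstance(workflowIntent, dict):
            return workflowIntent.get('primaryGoal', actualUserPrompt)
        return actualUserPrompt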
diff --git a/modules/workflows/processing/modes/modeAutomation.py b/modules/workflows/processing/modes/modeAutomation.py
index a4609e15..96e379db 100644
--- a/modules/workflows/processing/modes/modeAutomation.py
+++ b/modules/workflows/processing/modes/modeAutomation.py
@@ -26,51 +26,39 @@ class AutomationMode(BaseMode):
     async def generateTaskPlan(self, userInput: str, workflow: ChatWorkflow) -> TaskPlan:
         """
-        Generate task plan from stored template plan (no AI planning needed).
-        The plan is stored in module-level cache by executeAutomation.
+        Generate task plan from JSON plan in userInput (no AI planning needed).
+        AUTOMATION mode ALWAYS requires a JSON plan to be provided in the user input.
+        The plan can be:
+        - Embedded between and
+        - Or as direct JSON in userInput
         """
         try:
-            # Get plan from module-level cache (stored by executeAutomation)
+            # AUTOMATION mode ALWAYS requires a JSON plan to be provided in userInput
+            # Try to extract plan from userInput (embedded JSON or direct JSON)
             templatePlan = None
-            if hasattr(self, '_templatePlanCache') and self._templatePlanCache:
-                templatePlan = self._templatePlanCache.get(workflow.id)
-
-            # Try module-level cache
-            if not templatePlan:
-                try:
-                    from modules.workflows.processing.modes import modeAutomation
-                    if hasattr(modeAutomation, '_templatePlanCache'):
-                        templatePlan = modeAutomation._templatePlanCache.get(workflow.id)
-                        if templatePlan:
-                            logger.info(f"Retrieved template plan from module cache for workflow {workflow.id}")
-                except Exception as e:
-                    logger.warning(f"Could not access module cache: {str(e)}")
-
-            if not templatePlan:
-                # Fallback: Extract from prompt (embedded as JSON comment)
-                try:
-                    # Look for embedded plan in prompt (between and )
-                    startMarker = ""
-                    endMarker = ""
-                    startIdx = userInput.find(startMarker)
-                    endIdx = userInput.find(endMarker)
-
-                    if startIdx >= 0 and endIdx > startIdx:
-                        planJson = userInput[startIdx + len(startMarker):endIdx].strip()
-                        templatePlan = json.loads(planJson)
-                        logger.info("Extracted template plan from embedded JSON in prompt")
-                    elif '{' in userInput and '"tasks"' in userInput:
-                        # Try parsing entire userInput as JSON (fallback)
-                        jsonStart = userInput.find('{')
-                        jsonEnd = userInput.rfind('}') + 1
-                        if jsonStart >= 0 and jsonEnd > jsonStart:
-                            templatePlan = json.loads(userInput[jsonStart:jsonEnd])
-                            logger.info("Parsed template plan from userInput JSON (fallback)")
-                    else:
-                        raise ValueError("No template plan found in cache or prompt")
-                except (json.JSONDecodeError, ValueError) as e:
-                    logger.error(f"Could not parse template plan: {str(e)}")
-                    raise ValueError(f"Template mode requires a predefined plan, but none was found: {str(e)}")
+            try:
+                # Look for embedded plan in prompt (between and )
+                startMarker = ""
+                endMarker = ""
+                startIdx = userInput.find(startMarker)
+                endIdx = userInput.find(endMarker)
+
+                if startIdx >= 0 and endIdx > startIdx:
+                    planJson = userInput[startIdx + len(startMarker):endIdx].strip()
+                    templatePlan = json.loads(planJson)
+                    logger.info("Extracted template plan from embedded JSON in prompt")
+                elif '{' in userInput and '"tasks"' in userInput:
+                    # Try parsing entire userInput as JSON (fallback)
+                    jsonStart = userInput.find('{')
+                    jsonEnd = userInput.rfind('}') + 1
+                    if jsonStart >= 0 and jsonEnd > jsonStart:
+                        templatePlan = json.loads(userInput[jsonStart:jsonEnd])
+                        logger.info("Parsed template plan from userInput JSON (fallback)")
+                else:
+                    raise ValueError("No template plan found in userInput. AUTOMATION mode requires a JSON plan to be provided in the user input.")
+            except (json.JSONDecodeError, ValueError) as e:
+                logger.error(f"Could not parse template plan: {str(e)}")
+                raise ValueError(f"AUTOMATION mode requires a predefined JSON plan with 'tasks' array, but none was found. Please provide the plan in the user input (embedded between and or as direct JSON). Error: {str(e)}")
 
             logger.info(f"Using template plan with {len(templatePlan.get('tasks', []))} tasks")
 
@@ -109,15 +97,6 @@ class AutomationMode(BaseMode):
 
         logger.info(f"Generated task plan from template with {len(tasks)} tasks")
 
-        # Clean up cache after retrieving plan (prevent memory leaks)
-        try:
-            from modules.workflows.processing.modes import modeAutomation
-            if hasattr(modeAutomation, '_templatePlanCache') and workflow.id in modeAutomation._templatePlanCache:
-                del modeAutomation._templatePlanCache[workflow.id]
-                logger.debug(f"Cleaned up template plan cache for workflow {workflow.id}")
-        except Exception as e:
-            logger.warning(f"Could not clean up template plan cache: {str(e)}")
-
         return taskPlan
 
     except Exception as e:
diff --git a/modules/workflows/workflowManager.py b/modules/workflows/workflowManager.py
index 363a42e6..6751bde6 100644
--- a/modules/workflows/workflowManager.py
+++ b/modules/workflows/workflowManager.py
@@ -204,118 +204,129 @@ class WorkflowManager:
         }
 
         # Analyze the user's input to detect language, normalize request, extract intent, and offload bulky context into documents
+        # SKIP user intention analysis for AUTOMATION mode - it uses predefined JSON plans
         createdDocs = []
+        workflowMode = getattr(workflow, 'workflowMode', None)
+        skipIntentionAnalysis = (workflowMode == WorkflowModeEnum.WORKFLOW_AUTOMATION)
-        try:
-            analyzerPrompt = (
-                "You are an input analyzer. From the user's message, perform ALL of the following in one pass:\n"
-                "1) detectedLanguage: detect ISO 639-1 language code (e.g., de, en).\n"
-                "2) normalizedRequest: full, explicit restatement of the user's request in the detected language; do NOT summarize; preserve ALL constraints and details.\n"
-                "3) intent: concise single-paragraph core request in the detected language for high-level routing.\n"
-                "4) contextItems: supportive data blocks to attach as separate documents if significantly larger than the intent (large literal content, long lists/tables, code/JSON blocks, transcripts, CSV fragments, detailed specs). Keep URLs in the intent unless they embed large pasted content.\n\n"
-                "Rules:\n"
-                "- If total content (intent + data) is < 10% of model max tokens, do not extract; return empty contextItems and keep intent compact and self-contained.\n"
-                "- If content exceeds that threshold, move bulky parts into contextItems; keep intent short and clear.\n"
-                "- Preserve critical references (URLs, filenames) in intent.\n"
-                "- Normalize to the primary detected language if mixed-language.\n\n"
-                "Return ONLY JSON (no markdown) with this shape:\n"
-                "{\n"
-                "  \"detectedLanguage\": \"de|en|fr|it|...\",\n"
-                "  \"normalizedRequest\": \"Full explicit instruction in detected language\",\n"
-                "  \"intent\": \"Concise normalized request...\",\n"
-                "  \"contextItems\": [\n"
-                "    {\n"
-                "      \"title\": \"User context 1\",\n"
-                "      \"mimeType\": \"text/plain\",\n"
-                "      \"content\": \"Full extracted content block here\"\n"
-                "    }\n"
-                "  ]\n"
-                "}\n\n"
-                f"User message:\n{self.services.utils.sanitizePromptContent(userInput.prompt, 'userinput')}"
-            )
-
-            # Call AI analyzer (planning call - will use static parameters)
-            aiResponse = await self.services.ai.callAiPlanning(
-                prompt=analyzerPrompt,
-                placeholders=None,
-                debugType="userintention"
-            )
-
+        if skipIntentionAnalysis:
+            logger.info("Skipping user intention analysis for AUTOMATION mode - using direct user input")
+            # For automation mode, use user input directly without AI analysis
+            self.services.currentUserPrompt = userInput.prompt
             detectedLanguage = None
             normalizedRequest = None
             intentText = userInput.prompt
             contextItems = []
-
-            # Parse analyzer response (JSON expected)
+        else:
             try:
-                jsonStart = aiResponse.find('{') if aiResponse else -1
-                jsonEnd = aiResponse.rfind('}') + 1 if aiResponse else 0
-                if jsonStart != -1 and jsonEnd > jsonStart:
-                    parsed = json.loads(aiResponse[jsonStart:jsonEnd])
-                    detectedLanguage = parsed.get('detectedLanguage') or None
-                    normalizedRequest = parsed.get('normalizedRequest') or None
-                    if parsed.get('intent'):
-                        intentText = parsed.get('intent')
-                    contextItems = parsed.get('contextItems') or []
-            except Exception:
+                analyzerPrompt = (
+                    "You are an input analyzer. From the user's message, perform ALL of the following in one pass:\n"
+                    "1) detectedLanguage: detect ISO 639-1 language code (e.g., de, en).\n"
+                    "2) normalizedRequest: full, explicit restatement of the user's request in the detected language; do NOT summarize; preserve ALL constraints and details.\n"
+                    "3) intent: concise single-paragraph core request in the detected language for high-level routing.\n"
+                    "4) contextItems: supportive data blocks to attach as separate documents if significantly larger than the intent (large literal content, long lists/tables, code/JSON blocks, transcripts, CSV fragments, detailed specs). Keep URLs in the intent unless they embed large pasted content.\n\n"
+                    "Rules:\n"
+                    "- If total content (intent + data) is < 10% of model max tokens, do not extract; return empty contextItems and keep intent compact and self-contained.\n"
+                    "- If content exceeds that threshold, move bulky parts into contextItems; keep intent short and clear.\n"
+                    "- Preserve critical references (URLs, filenames) in intent.\n"
+                    "- Normalize to the primary detected language if mixed-language.\n\n"
+                    "Return ONLY JSON (no markdown) with this shape:\n"
+                    "{\n"
+                    "  \"detectedLanguage\": \"de|en|fr|it|...\",\n"
+                    "  \"normalizedRequest\": \"Full explicit instruction in detected language\",\n"
+                    "  \"intent\": \"Concise normalized request...\",\n"
+                    "  \"contextItems\": [\n"
+                    "    {\n"
+                    "      \"title\": \"User context 1\",\n"
+                    "      \"mimeType\": \"text/plain\",\n"
+                    "      \"content\": \"Full extracted content block here\"\n"
+                    "    }\n"
+                    "  ]\n"
+                    "}\n\n"
+                    f"User message:\n{self.services.utils.sanitizePromptContent(userInput.prompt, 'userinput')}"
+                )
+
+                # Call AI analyzer (planning call - will use static parameters)
+                aiResponse = await self.services.ai.callAiPlanning(
+                    prompt=analyzerPrompt,
+                    placeholders=None,
+                    debugType="userintention"
+                )
+
+                detectedLanguage = None
+                normalizedRequest = None
+                intentText = userInput.prompt
                 contextItems = []
-
-            # Update services state
-            if detectedLanguage and isinstance(detectedLanguage, str):
-                self._setUserLanguage(detectedLanguage)
+                # Parse analyzer response (JSON expected)
                 try:
-                    setattr(self.services, 'currentUserLanguage', detectedLanguage)
+                    jsonStart = aiResponse.find('{') if aiResponse else -1
+                    jsonEnd = aiResponse.rfind('}') + 1 if aiResponse else 0
+                    if jsonStart != -1 and jsonEnd > jsonStart:
+                        parsed = json.loads(aiResponse[jsonStart:jsonEnd])
+                        detectedLanguage = parsed.get('detectedLanguage') or None
+                        normalizedRequest = parsed.get('normalizedRequest') or None
+                        if parsed.get('intent'):
+                            intentText = parsed.get('intent')
+                        contextItems = parsed.get('contextItems') or []
+                except Exception:
+                    contextItems = []
+
+                # Update services state
+                if detectedLanguage and isinstance(detectedLanguage, str):
+                    self._setUserLanguage(detectedLanguage)
+                    try:
+                        setattr(self.services, 'currentUserLanguage', detectedLanguage)
                 except Exception:
                     pass
-            self.services.currentUserPrompt = intentText or userInput.prompt
-            try:
-                if normalizedRequest:
-                    setattr(self.services, 'currentUserPromptNormalized', normalizedRequest)
-                if contextItems is not None:
-                    setattr(self.services, 'currentUserContextItems', contextItems)
-            except Exception:
-                pass
+                self.services.currentUserPrompt = intentText or userInput.prompt
+                try:
+                    if normalizedRequest:
+                        setattr(self.services, 'currentUserPromptNormalized', normalizedRequest)
+                    if contextItems is not None:
+                        setattr(self.services, 'currentUserContextItems', contextItems)
                 except Exception:
                     pass
+                # Create documents for context items
+                if contextItems and isinstance(contextItems, list):
+                    for idx, item in enumerate(contextItems):
+                        try:
+                            title = item.get('title') if isinstance(item, dict) else None
+                            mime = item.get('mimeType') if isinstance(item, dict) else None
+                            content = item.get('content') if isinstance(item, dict) else None
+                            if not content:
+                                continue
+                            fileName = (title or f"user_context_{idx+1}.txt").strip()
+                            mimeType = (mime or "text/plain").strip()
-            # Create documents for context items
-            if contextItems and isinstance(contextItems, list):
-                for idx, item in enumerate(contextItems):
-                    try:
-                        title = item.get('title') if isinstance(item, dict) else None
-                        mime = item.get('mimeType') if isinstance(item, dict) else None
-                        content = item.get('content') if isinstance(item, dict) else None
-                        if not content:
+                            # Neutralize content before storing if neutralization is enabled
+                            contentBytes = content.encode('utf-8')
+                            contentBytes = await self._neutralizeContentIfEnabled(contentBytes, mimeType)
+
+                            # Create file in component storage
+                            fileItem = self.services.interfaceDbComponent.createFile(
+                                name=fileName,
+                                mimeType=mimeType,
+                                content=contentBytes
+                            )
+                            # Persist file data
+                            self.services.interfaceDbComponent.createFileData(fileItem.id, contentBytes)
+
+                            # Collect file info
+                            fileInfo = self.services.chat.getFileInfo(fileItem.id)
+                            from modules.datamodels.datamodelChat import ChatDocument
+                            doc = ChatDocument(
+                                fileId=fileItem.id,
+                                fileName=fileInfo.get("fileName", fileName) if fileInfo else fileName,
+                                fileSize=fileInfo.get("size", len(contentBytes)) if fileInfo else len(contentBytes),
+                                mimeType=fileInfo.get("mimeType", mimeType) if fileInfo else mimeType
+                            )
+                            createdDocs.append(doc)
+                        except Exception:
                             continue
-                        fileName = (title or f"user_context_{idx+1}.txt").strip()
-                        mimeType = (mime or "text/plain").strip()
-
-                        # Neutralize content before storing if neutralization is enabled
-                        contentBytes = content.encode('utf-8')
-                        contentBytes = await self._neutralizeContentIfEnabled(contentBytes, mimeType)
-
-                        # Create file in component storage
-                        fileItem = self.services.interfaceDbComponent.createFile(
-                            name=fileName,
-                            mimeType=mimeType,
-                            content=contentBytes
-                        )
-                        # Persist file data
-                        self.services.interfaceDbComponent.createFileData(fileItem.id, contentBytes)
-
-                        # Collect file info
-                        fileInfo = self.services.chat.getFileInfo(fileItem.id)
-                        from modules.datamodels.datamodelChat import ChatDocument
-                        doc = ChatDocument(
-                            fileId=fileItem.id,
-                            fileName=fileInfo.get("fileName", fileName) if fileInfo else fileName,
-                            fileSize=fileInfo.get("size", len(contentBytes)) if fileInfo else len(contentBytes),
-                            mimeType=fileInfo.get("mimeType", mimeType) if fileInfo else mimeType
-                        )
-                        createdDocs.append(doc)
-                    except Exception:
-                        continue
-        except Exception as e:
-            logger.warning(f"Prompt analysis failed or skipped: {str(e)}")
+            except Exception as e:
+                logger.warning(f"Prompt analysis failed or skipped: {str(e)}")
 
         # Process user-uploaded documents (fileIds) and combine with context documents
         if userInput.listFileId: