gateway/modules/connectors/connectorSharepoint.py

"""Connector for SharePoint operations using Microsoft Graph API."""

import logging
import json
import aiohttp
import asyncio
from typing import Dict, Any, List, Optional
from datetime import datetime, UTC

logger = logging.getLogger(__name__)


class ConnectorSharepoint:
    """SharePoint connector using Microsoft Graph API for reliable authentication."""

    def __init__(self, access_token: str):
        """Initialize with access token.

        Args:
            access_token: Microsoft Graph access token
        """
        self.access_token = access_token
        self.base_url = "https://graph.microsoft.com/v1.0"

    async def _make_graph_api_call(self, endpoint: str, method: str = "GET", data: bytes = None) -> Dict[str, Any]:
        """Make a Microsoft Graph API call with proper error handling."""
        try:
            headers = {
                "Authorization": f"Bearer {self.access_token}",
                "Content-Type": "application/json" if data and method != "PUT" else "application/octet-stream" if data else "application/json"
            }

            # Remove leading slash from endpoint to avoid double slash
            clean_endpoint = endpoint.lstrip('/')
            url = f"{self.base_url}/{clean_endpoint}"
            logger.debug(f"Making Graph API call: {method} {url}")

            timeout = aiohttp.ClientTimeout(total=30)

            async with aiohttp.ClientSession(timeout=timeout) as session:
                if method == "GET":
                    async with session.get(url, headers=headers) as response:
                        if response.status == 200:
                            return await response.json()
                        else:
                            error_text = await response.text()
                            logger.error(f"Graph API call failed: {response.status} - {error_text}")
                            return {"error": f"API call failed: {response.status} - {error_text}"}

                elif method == "PUT":
                    async with session.put(url, headers=headers, data=data) as response:
                        if response.status in [200, 201]:
                            return await response.json()
                        else:
                            error_text = await response.text()
                            logger.error(f"Graph API call failed: {response.status} - {error_text}")
                            return {"error": f"API call failed: {response.status} - {error_text}"}

                elif method == "POST":
                    async with session.post(url, headers=headers, data=data) as response:
                        if response.status in [200, 201]:
                            return await response.json()
                        else:
                            error_text = await response.text()
                            logger.error(f"Graph API call failed: {response.status} - {error_text}")
                            return {"error": f"API call failed: {response.status} - {error_text}"}

        except asyncio.TimeoutError:
            logger.error(f"Graph API call timed out after 30 seconds: {endpoint}")
            return {"error": f"API call timed out after 30 seconds: {endpoint}"}
        except Exception as e:
            logger.error(f"Error making Graph API call: {str(e)}")
            return {"error": f"Error making Graph API call: {str(e)}"}

    async def discover_sites(self) -> List[Dict[str, Any]]:
        """Discover all SharePoint sites accessible to the user."""
        try:
            result = await self._make_graph_api_call("sites?search=*")

            if "error" in result:
                logger.error(f"Error discovering SharePoint sites: {result['error']}")
                return []

            sites = result.get("value", [])
            logger.info(f"Discovered {len(sites)} SharePoint sites")

            processed_sites = []
            for site in sites:
                site_info = {
                    "id": site.get("id"),
                    "displayName": site.get("displayName"),
                    "name": site.get("name"),
                    "webUrl": site.get("webUrl"),
                    "description": site.get("description"),
                    "createdDateTime": site.get("createdDateTime"),
                    "lastModifiedDateTime": site.get("lastModifiedDateTime")
                }
                processed_sites.append(site_info)
                logger.debug(f"Site: {site_info['displayName']} - {site_info['webUrl']}")

            return processed_sites

        except Exception as e:
            logger.error(f"Error discovering SharePoint sites: {str(e)}")
            return []

    async def find_site_by_name(self, site_name: str) -> Optional[Dict[str, Any]]:
        """Find a specific SharePoint site by name using direct Graph API call."""
        try:
            # Try to get the site directly by name using Graph API
            endpoint = f"sites/{site_name}"
            result = await self._make_graph_api_call(endpoint)

            if result and "error" not in result:
                site_info = {
                    "id": result.get("id"),
                    "displayName": result.get("displayName"),
                    "name": result.get("name"),
                    "webUrl": result.get("webUrl"),
                    "description": result.get("description"),
                    "createdDateTime": result.get("createdDateTime"),
                    "lastModifiedDateTime": result.get("lastModifiedDateTime")
                }
                logger.info(f"Found site directly: {site_info['displayName']} - {site_info['webUrl']}")
                return site_info

        except Exception as e:
            logger.debug(f"Direct site lookup failed for '{site_name}': {str(e)}")

        # Fallback to discovery if direct lookup fails
        logger.info(f"Direct lookup failed, trying discovery for site: {site_name}")
        sites = await self.discover_sites()
        if not sites:
            logger.warning("No sites discovered")
            return None

        logger.info(f"Discovered {len(sites)} SharePoint sites:")
        for site in sites:
            logger.info(f"  - {site.get('displayName', 'Unknown')} (ID: {site.get('id', 'Unknown')})")

        # Try exact match first
        for site in sites:
            if site.get("displayName", "").strip().lower() == site_name.strip().lower():
                logger.info(f"Found exact match: {site.get('displayName')}")
                return site

        # Try partial match
        for site in sites:
            if site_name.lower() in site.get("displayName", "").lower():
                logger.info(f"Found partial match: {site.get('displayName')}")
                return site

        logger.warning(f"No site found matching: {site_name}")
        return None

    async def find_site_by_web_url(self, web_url: str) -> Optional[Dict[str, Any]]:
        """Find a SharePoint site using its web URL (useful for guest sites)."""
        try:
            # Use the web URL format: sites/{hostname}:/sites/{site-path}
            # Extract hostname and site path from the web URL
            if not web_url.startswith("https://"):
                web_url = f"https://{web_url}"

            # Parse the URL to extract hostname and site path
            from urllib.parse import urlparse
            parsed = urlparse(web_url)
            hostname = parsed.hostname
            path_parts = parsed.path.strip('/').split('/')

            if len(path_parts) >= 2 and path_parts[0] == 'sites':
                site_path = '/'.join(path_parts[1:])  # Everything after 'sites/'
            else:
                logger.error(f"Invalid SharePoint URL format: {web_url}")
                return None

            endpoint = f"sites/{hostname}:/sites/{site_path}"
            logger.debug(f"Trying web URL format: {endpoint}")

            result = await self._make_graph_api_call(endpoint)

            if result and "error" not in result:
                site_info = {
                    "id": result.get("id"),
                    "displayName": result.get("displayName"),
                    "name": result.get("name"),
                    "webUrl": result.get("webUrl"),
                    "description": result.get("description"),
                    "createdDateTime": result.get("createdDateTime"),
                    "lastModifiedDateTime": result.get("lastModifiedDateTime")
                }
                logger.info(f"Found site by web URL: {site_info['displayName']} - {site_info['webUrl']} (ID: {site_info['id']})")
                return site_info
            else:
                logger.warning(f"Site not found using web URL: {web_url}")
                return None

        except Exception as e:
            logger.error(f"Error finding site by web URL: {str(e)}")
            return None

    async def find_site_by_url(self, hostname: str, site_path: str) -> Optional[Dict[str, Any]]:
        """Find a SharePoint site using the site URL format."""
        try:
            # For guest sites, try different URL formats
            url_formats = [
                f"sites/{hostname}:/sites/{site_path}",  # Standard format
                f"sites/{hostname}:/sites/{site_path}/",  # With trailing slash
                f"sites/{hostname}:/sites/{site_path.lower()}",  # Lowercase
                f"sites/{hostname}:/sites/{site_path.lower()}/",  # Lowercase with slash
            ]

            for endpoint in url_formats:
                logger.debug(f"Trying URL format: {endpoint}")
                result = await self._make_graph_api_call(endpoint)

                if result and "error" not in result:
                    site_info = {
                        "id": result.get("id"),
                        "displayName": result.get("displayName"),
                        "name": result.get("name"),
                        "webUrl": result.get("webUrl"),
                        "description": result.get("description"),
                        "createdDateTime": result.get("createdDateTime"),
                        "lastModifiedDateTime": result.get("lastModifiedDateTime")
                    }
                    logger.info(f"Found site by URL: {site_info['displayName']} - {site_info['webUrl']} (ID: {site_info['id']})")
                    return site_info
                else:
                    logger.debug(f"URL format failed: {endpoint} - {result.get('error', 'Unknown error')}")

            logger.warning(f"Site not found using any URL format for: {hostname}:/sites/{site_path}")
            return None

        except Exception as e:
            logger.error(f"Error finding site by URL: {str(e)}")
            return None

    async def get_folder_by_path(self, site_id: str, folder_path: str) -> Optional[Dict[str, Any]]:
        """Get folder information by path within a site."""
        try:
            # Clean the path
            clean_path = folder_path.lstrip('/')
            endpoint = f"sites/{site_id}/drive/root:/{clean_path}"

            result = await self._make_graph_api_call(endpoint)

            if "error" in result:
                logger.warning(f"Folder not found at path {folder_path}: {result['error']}")
                return None

            return result

        except Exception as e:
            logger.error(f"Error getting folder by path: {str(e)}")
            return None

    async def upload_file(self, site_id: str, folder_path: str, file_name: str, content: bytes) -> Dict[str, Any]:
        """Upload a file to SharePoint."""
        try:
            # Clean the path
            clean_path = folder_path.lstrip('/')
            upload_path = f"{clean_path.rstrip('/')}/{file_name}"
            endpoint = f"sites/{site_id}/drive/root:/{upload_path}:/content"

            logger.info(f"Uploading file to: {endpoint}")

            result = await self._make_graph_api_call(endpoint, method="PUT", data=content)

            if "error" in result:
                logger.error(f"Upload failed: {result['error']}")
                return result

            logger.info(f"File uploaded successfully: {file_name}")
            return result

        except Exception as e:
            logger.error(f"Error uploading file: {str(e)}")
            return {"error": f"Error uploading file: {str(e)}"}

    async def download_file(self, site_id: str, file_id: str) -> Optional[bytes]:
        """Download a file from SharePoint."""
        try:
            endpoint = f"sites/{site_id}/drive/items/{file_id}/content"

            headers = {"Authorization": f"Bearer {self.access_token}"}
            timeout = aiohttp.ClientTimeout(total=30)

            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.get(f"{self.base_url}/{endpoint}", headers=headers) as response:
                    if response.status == 200:
                        return await response.read()
                    else:
                        logger.error(f"Download failed: {response.status}")
                        return None

        except Exception as e:
            logger.error(f"Error downloading file: {str(e)}")
            return None

    async def list_folder_contents(self, site_id: str, folder_path: str = "") -> List[Dict[str, Any]]:
        """List contents of a folder."""
        try:
            if not folder_path or folder_path == "/":
                endpoint = f"sites/{site_id}/drive/root/children"
            else:
                clean_path = folder_path.lstrip('/')
                endpoint = f"sites/{site_id}/drive/root:/{clean_path}:/children"

            result = await self._make_graph_api_call(endpoint)

            if "error" in result:
                logger.warning(f"Failed to list folder contents: {result['error']}")
                return None

            items = result.get("value", [])
            processed_items = []

            for item in items:
                # Determine if it's a folder or file
                is_folder = 'folder' in item

                item_info = {
                    "id": item.get("id"),
                    "name": item.get("name"),
                    "type": "folder" if is_folder else "file",
                    "size": item.get("size", 0),
                    "createdDateTime": item.get("createdDateTime"),
                    "lastModifiedDateTime": item.get("lastModifiedDateTime"),
                    "webUrl": item.get("webUrl")
                }

                if "file" in item:
                    item_info["mimeType"] = item["file"].get("mimeType")
                    item_info["downloadUrl"] = item.get("@microsoft.graph.downloadUrl")

                if "folder" in item:
                    item_info["childCount"] = item["folder"].get("childCount", 0)

                processed_items.append(item_info)

            return processed_items

        except Exception as e:
            logger.error(f"Error listing folder contents: {str(e)}")
            return []

    async def search_files(self, site_id: str, query: str) -> List[Dict[str, Any]]:
        """Search for files in a site."""
        try:
            search_query = query.replace("'", "''")  # Escape single quotes for OData
            endpoint = f"sites/{site_id}/drive/root/search(q='{search_query}')"

            result = await self._make_graph_api_call(endpoint)

            if "error" in result:
                logger.warning(f"Search failed: {result['error']}")
                return []

            items = result.get("value", [])
            processed_items = []

            for item in items:
                is_folder = 'folder' in item

                item_info = {
                    "id": item.get("id"),
                    "name": item.get("name"),
                    "type": "folder" if is_folder else "file",
                    "size": item.get("size", 0),
                    "createdDateTime": item.get("createdDateTime"),
                    "lastModifiedDateTime": item.get("lastModifiedDateTime"),
                    "webUrl": item.get("webUrl"),
                    "parentPath": item.get("parentReference", {}).get("path", "")
                }

                if "file" in item:
                    item_info["mimeType"] = item["file"].get("mimeType")
                    item_info["downloadUrl"] = item.get("@microsoft.graph.downloadUrl")

                processed_items.append(item_info)

            return processed_items

        except Exception as e:
            logger.error(f"Error searching files: {str(e)}")
            return []

    async def copy_file_async(self, site_id: str, source_folder: str, source_file: str, dest_folder: str, dest_file: str) -> None:
        """Copy a file from source to destination folder (like original synchronizer)."""
        try:
            # First, download the source file
            source_path = f"{source_folder}/{source_file}"
            file_content = await self.download_file_by_path(site_id=site_id, file_path=source_path)

            if not file_content:
                raise Exception(f"Failed to download source file: {source_path}")

            # Upload to destination
            await self.upload_file(
                site_id=site_id,
                folder_path=dest_folder,
                file_name=dest_file,
                content=file_content
            )

            logger.info(f"File copied: {source_file} -> {dest_file}")

        except Exception as e:
            logger.error(f"Error copying file: {str(e)}")
            raise

    async def download_file_by_path(self, site_id: str, file_path: str) -> Optional[bytes]:
        """Download a file by its path within a site."""
        try:
            # Clean the path
            clean_path = file_path.strip('/')
            endpoint = f"sites/{site_id}/drive/root:/{clean_path}:/content"

            # Use direct HTTP call for file downloads (binary content)
            headers = {
                "Authorization": f"Bearer {self.access_token}",
            }

            # Remove leading slash from endpoint to avoid double slash
            clean_endpoint = endpoint.lstrip('/')
            url = f"{self.base_url}/{clean_endpoint}"
            logger.debug(f"Downloading file: GET {url}")

            timeout = aiohttp.ClientTimeout(total=30)

            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.get(url, headers=headers) as response:
                    if response.status == 200:
                        return await response.read()
                    else:
                        error_text = await response.text()
                        logger.error(f"File download failed: {response.status} - {error_text}")
                        return None

        except Exception as e:
            logger.error(f"Error downloading file by path: {str(e)}")
            return None