"""
|
|
Data Neutralization Service
|
|
Handles file processing for data neutralization including SharePoint integration
|
|
"""
|
|
|
|
import logging
|
|
import os
|
|
import uuid
|
|
from typing import Dict, List, Any, Optional, Tuple
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
import mimetypes
|
|
|
|
from modules.interfaces.interfaceAppObjects import getInterface
|
|
from modules.interfaces.interfaceAppModel import User, DataNeutraliserConfig, DataNeutralizerAttributes
|
|
from modules.services.serviceNeutralization.neutralizer import DataAnonymizer
|
|
from modules.shared.timezoneUtils import get_utc_timestamp
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
class NeutralizationService:
    """Service layer for data-neutralization operations (config, text
    neutralization, attribute lookups, and SharePoint file pipelines)."""

    def __init__(self, current_user: User):
        """Bind the service to *current_user* and its application interface."""
        # All operations are scoped to this user's mandate via the interface.
        interface = getInterface(current_user)
        self.current_user = current_user
        self.app_interface = interface
|
|
|
def get_config(self) -> Optional[DataNeutraliserConfig]:
|
|
"""Get the neutralization configuration for the current user's mandate"""
|
|
return self.app_interface.getNeutralizationConfig()
|
|
|
|
def save_config(self, config_data: Dict[str, Any]) -> DataNeutraliserConfig:
|
|
"""Save or update the neutralization configuration"""
|
|
return self.app_interface.createOrUpdateNeutralizationConfig(config_data)
|
|
|
|
def neutralize_text(self, text: str, file_id: Optional[str] = None) -> Dict[str, Any]:
|
|
"""Neutralize text content and return results with attribute mappings"""
|
|
return self.app_interface.neutralizeText(text, file_id)
|
|
|
|
def get_attributes(self, file_id: Optional[str] = None) -> List[DataNeutralizerAttributes]:
|
|
"""Get neutralization attributes, optionally filtered by file ID"""
|
|
return self.app_interface.getNeutralizationAttributes(file_id)
|
|
|
|
def resolve_text(self, text: str) -> str:
|
|
"""Resolve UIDs in neutralized text back to original text"""
|
|
return self.app_interface.resolveNeutralizedText(text)
|
|
|
|
async def process_sharepoint_files(self, source_path: str, target_path: str) -> Dict[str, Any]:
|
|
"""
|
|
Process files from SharePoint source path, neutralize them, and store in target path
|
|
|
|
Args:
|
|
source_path: SharePoint path to read files from
|
|
target_path: SharePoint path to store neutralized files
|
|
|
|
Returns:
|
|
Dictionary with processing results
|
|
"""
|
|
try:
|
|
logger.info(f"Processing SharePoint files from {source_path} to {target_path}")
|
|
|
|
# Get user's SharePoint connection that matches the source path
|
|
sharepoint_connection = await self._get_sharepoint_connection(source_path)
|
|
if not sharepoint_connection:
|
|
return {
|
|
"success": False,
|
|
"message": "No SharePoint connection found for user",
|
|
"processed_files": 0,
|
|
"errors": ["No SharePoint connection found"]
|
|
}
|
|
|
|
logger.info(f"Using SharePoint connection: {sharepoint_connection.get('id')} for path: {source_path}")
|
|
|
|
# Get SharePoint access token
|
|
sharepoint_token = self.app_interface.getConnectionToken(sharepoint_connection["id"])
|
|
if not sharepoint_token:
|
|
return {
|
|
"success": False,
|
|
"message": "No SharePoint access token found",
|
|
"processed_files": 0,
|
|
"errors": ["No SharePoint access token found"]
|
|
}
|
|
|
|
# Process files asynchronously
|
|
return await self._process_sharepoint_files_async(
|
|
source_path, target_path, sharepoint_token.tokenAccess
|
|
)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error processing SharePoint files: {str(e)}")
|
|
return {
|
|
"success": False,
|
|
"message": f"Error processing SharePoint files: {str(e)}",
|
|
"processed_files": 0,
|
|
"errors": [str(e)]
|
|
}
|
|
|
|
async def _get_sharepoint_connection(self, sharepoint_path: str = None):
|
|
"""Get user's SharePoint connection that matches the given path"""
|
|
try:
|
|
# Get all user connections
|
|
from modules.interfaces.interfaceAppModel import UserConnection
|
|
connections = self.app_interface.db.getRecordset(
|
|
UserConnection,
|
|
recordFilter={"userId": self.app_interface.userId}
|
|
)
|
|
|
|
# Find all Microsoft connections
|
|
msft_connections = [conn for conn in connections if conn.get("authority") == "msft"]
|
|
|
|
if not msft_connections:
|
|
logger.warning("No Microsoft connections found for user")
|
|
return None
|
|
|
|
if len(msft_connections) == 1:
|
|
logger.info(f"Found single Microsoft connection: {msft_connections[0].get('id')}")
|
|
return msft_connections[0]
|
|
|
|
# If multiple connections and we have a path, try to match
|
|
if sharepoint_path:
|
|
return await self._match_connection_to_path(msft_connections, sharepoint_path)
|
|
|
|
# If no path provided, return the first one
|
|
logger.info(f"Multiple Microsoft connections found, using first one: {msft_connections[0].get('id')}")
|
|
return msft_connections[0]
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting SharePoint connection: {str(e)}")
|
|
return None
|
|
|
|
async def _match_connection_to_path(self, connections: list, sharepoint_path: str):
|
|
"""Match a connection to the SharePoint path by testing access"""
|
|
try:
|
|
# Extract domain from the path
|
|
from urllib.parse import urlparse
|
|
parsed_url = urlparse(sharepoint_path)
|
|
target_domain = parsed_url.netloc.lower()
|
|
|
|
logger.info(f"Looking for connection matching domain: {target_domain}")
|
|
|
|
# Try each connection to see which one can access the site
|
|
for connection in connections:
|
|
try:
|
|
# Get token for this connection
|
|
token = self.app_interface.getConnectionToken(connection["id"])
|
|
if not token:
|
|
continue
|
|
|
|
# Test if this connection can access the SharePoint site
|
|
if await self._test_sharepoint_access(token.tokenAccess, sharepoint_path):
|
|
logger.info(f"Found matching connection for domain {target_domain}: {connection.get('id')}")
|
|
return connection
|
|
|
|
except Exception as e:
|
|
continue
|
|
|
|
# If no specific match found, return the first connection
|
|
logger.warning(f"No specific connection match found for {target_domain}, using first available")
|
|
return connections[0]
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error matching connection to path: {str(e)}")
|
|
return connections[0] if connections else None
|
|
|
|
async def _test_sharepoint_access(self, access_token: str, sharepoint_path: str) -> bool:
|
|
"""Test if the access token can access the given SharePoint path"""
|
|
try:
|
|
return await self._test_sharepoint_access_async(access_token, sharepoint_path)
|
|
except Exception as e:
|
|
return False
|
|
|
|
async def _test_sharepoint_access_async(self, access_token: str, sharepoint_path: str) -> bool:
|
|
"""Async test for SharePoint access"""
|
|
try:
|
|
from modules.connectors.connectorSharepoint import ConnectorSharepoint
|
|
|
|
connector = ConnectorSharepoint(access_token=access_token)
|
|
|
|
# Parse the path to get site URL
|
|
site_url, _ = self._parse_sharepoint_path(sharepoint_path)
|
|
if not site_url:
|
|
return False
|
|
|
|
# Try to find the site
|
|
site_info = await connector.find_site_by_web_url(site_url)
|
|
return site_info is not None
|
|
|
|
except Exception as e:
|
|
return False
|
|
|
|
    async def _process_sharepoint_files_async(self, source_path: str, target_path: str, access_token: str) -> Dict[str, Any]:
        """
        Process SharePoint files asynchronously.

        Downloads every text file found under the source folder, neutralizes
        its content via the application interface, and uploads the result
        (renamed with a "neutralized_" prefix) to the target folder. All files
        are processed concurrently with asyncio.gather.

        Args:
            source_path: Full SharePoint URL of the folder to read from.
            target_path: Full SharePoint URL of the folder to write to.
            access_token: SharePoint access token passed to the connector.

        Returns:
            Result dictionary with keys: success, message, processed_files,
            errors, plus "files" on the success path and "available_folders"
            when the source folder could not be found.
        """
        try:
            import asyncio
            from modules.connectors.connectorSharepoint import ConnectorSharepoint

            # Initialize SharePoint connector
            connector = ConnectorSharepoint(access_token=access_token)

            # Parse source and target paths to extract site and folder info
            source_site, source_folder = self._parse_sharepoint_path(source_path)
            target_site, target_folder = self._parse_sharepoint_path(target_path)

            if not source_site or not target_site:
                return {
                    "success": False,
                    "message": "Invalid SharePoint path format",
                    "processed_files": 0,
                    "errors": ["Invalid SharePoint path format"]
                }

            # Find source site
            source_site_info = await connector.find_site_by_web_url(source_site)
            if not source_site_info:
                return {
                    "success": False,
                    "message": f"Source site not found: {source_site}",
                    "processed_files": 0,
                    "errors": [f"Source site not found: {source_site}"]
                }

            # Find target site
            target_site_info = await connector.find_site_by_web_url(target_site)
            if not target_site_info:
                return {
                    "success": False,
                    "message": f"Target site not found: {target_site}",
                    "processed_files": 0,
                    "errors": [f"Target site not found: {target_site}"]
                }

            # List files in source folder
            logger.info(f"Listing files in folder: {source_folder} for site: {source_site_info['id']}")
            files = await connector.list_folder_contents(source_site_info["id"], source_folder)

            # If no files found, try listing the root folder to see what's available
            if not files:
                logger.warning(f"No files found in folder '{source_folder}', trying root folder")
                files = await connector.list_folder_contents(source_site_info["id"], "")

                if files:
                    # List available folders for debugging; these go back to the
                    # caller so the UI can show what folders actually exist.
                    folders = [f for f in files if f.get("type") == "folder"]
                    folder_names = [f.get('name') for f in folders]
                    logger.info(f"Available folders in root: {folder_names}")

                    # Format folder list for better UI display
                    folder_list = ", ".join(folder_names) if folder_names else "None"

                    return {
                        "success": False,
                        "message": f"Folder '{source_folder}' not found. Available folders in root: {folder_list}",
                        "processed_files": 0,
                        "errors": [f"Folder '{source_folder}' not found. Available folders: {folder_list}"],
                        "available_folders": folder_names
                    }
                else:
                    return {
                        "success": False,
                        "message": f"No files found in source folder: {source_folder}",
                        "processed_files": 0,
                        "errors": [f"No files found in source folder: {source_folder}"]
                    }

            # Filter for text files only (by extension; see _is_text_file)
            text_files = [f for f in files if f.get("type") == "file" and self._is_text_file(f.get("name", ""))]

            if not text_files:
                return {
                    "success": False,
                    "message": "No text files found in source folder",
                    "processed_files": 0,
                    "errors": ["No text files found in source folder"]
                }

            # Process files in parallel for better performance
            processed_files = []
            errors = []

            # Closure over connector/site info; each call handles exactly one
            # file and reports either a success dict or an {"error": ...} dict.
            async def process_single_file(file_info):
                """Process a single file - download, neutralize, upload"""
                try:
                    # Download file
                    file_content = await connector.download_file(source_site_info["id"], file_info["id"])
                    if not file_content:
                        return {"error": f"Failed to download file: {file_info['name']}"}

                    # Convert to text; latin-1 decodes any byte sequence, so
                    # the fallback cannot itself raise UnicodeDecodeError.
                    try:
                        text_content = file_content.decode('utf-8')
                    except UnicodeDecodeError:
                        text_content = file_content.decode('latin-1')

                    # Neutralize the text, keyed to the SharePoint file id
                    neutralization_result = self.app_interface.neutralizeText(text_content, file_info["id"])

                    # Create neutralized filename
                    neutralized_filename = f"neutralized_{file_info['name']}"

                    # Upload neutralized file
                    neutralized_content = neutralization_result["neutralized_text"].encode('utf-8')
                    upload_result = await connector.upload_file(
                        target_site_info["id"],
                        target_folder,
                        neutralized_filename,
                        neutralized_content
                    )

                    if "error" in upload_result:
                        return {"error": f"Failed to upload neutralized file: {neutralized_filename} - {upload_result['error']}"}
                    else:
                        return {
                            "success": True,
                            "original_name": file_info["name"],
                            "neutralized_name": neutralized_filename,
                            "attributes_count": len(neutralization_result.get("attributes", []))
                        }

                except Exception as e:
                    error_msg = f"Error processing file {file_info['name']}: {str(e)}"
                    logger.error(error_msg)
                    return {"error": error_msg}

            # Process all files in parallel; return_exceptions=True keeps one
            # failed file from cancelling the rest of the batch.
            logger.info(f"Processing {len(text_files)} files in parallel...")
            tasks = [process_single_file(file_info) for file_info in text_files]
            results = await asyncio.gather(*tasks, return_exceptions=True)

            # Process results (results[i] corresponds to text_files[i])
            for i, result in enumerate(results):
                if isinstance(result, Exception):
                    error_msg = f"Exception processing file {text_files[i]['name']}: {str(result)}"
                    errors.append(error_msg)
                    logger.error(error_msg)
                elif isinstance(result, dict) and "error" in result:
                    errors.append(result["error"])
                elif isinstance(result, dict) and result.get("success"):
                    processed_files.append({
                        "original_name": result["original_name"],
                        "neutralized_name": result["neutralized_name"],
                        "attributes_count": result["attributes_count"]
                    })
                    logger.info(f"Successfully processed file: {result['original_name']} -> {result['neutralized_name']}")
                else:
                    error_msg = f"Unknown result processing file {text_files[i]['name']}: {result}"
                    errors.append(error_msg)
                    logger.error(error_msg)

            # Success means at least one file made it through, even if others
            # failed; per-file failures are reported in "errors".
            return {
                "success": len(processed_files) > 0,
                "message": f"Processed {len(processed_files)} files successfully",
                "processed_files": len(processed_files),
                "files": processed_files,
                "errors": errors
            }

        except Exception as e:
            logger.error(f"Error in async SharePoint processing: {str(e)}")
            return {
                "success": False,
                "message": f"Error in async SharePoint processing: {str(e)}",
                "processed_files": 0,
                "errors": [str(e)]
            }
|
|
|
|
def _parse_sharepoint_path(self, path: str) -> tuple[str, str]:
|
|
"""Parse SharePoint path to extract site URL and folder path"""
|
|
try:
|
|
# Expected format: https://domain.sharepoint.com/sites/sitename/folder/path
|
|
if not path.startswith("https://"):
|
|
return None, None
|
|
|
|
# Remove query parameters
|
|
if "?" in path:
|
|
path = path.split("?")[0]
|
|
|
|
# Split by /sites/
|
|
if "/sites/" not in path:
|
|
return None, None
|
|
|
|
parts = path.split("/sites/", 1)
|
|
if len(parts) != 2:
|
|
return None, None
|
|
|
|
# Extract domain and site name
|
|
domain = parts[0].replace("https://", "")
|
|
site_name = parts[1].split("/")[0]
|
|
|
|
# Create proper site URL for Graph API
|
|
site_url = f"https://{domain}/sites/{site_name}"
|
|
|
|
# Extract folder path (everything after the site name)
|
|
folder_parts = parts[1].split("/")[1:]
|
|
folder_path = "/".join(folder_parts) if folder_parts else ""
|
|
|
|
# URL decode the folder path
|
|
from urllib.parse import unquote
|
|
folder_path = unquote(folder_path)
|
|
|
|
|
|
return site_url, folder_path
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error parsing SharePoint path '{path}': {str(e)}")
|
|
return None, None
|
|
|
|
def _is_text_file(self, filename: str) -> bool:
|
|
"""Check if file is a text file based on extension"""
|
|
text_extensions = [
|
|
'.txt', '.csv', '.json', '.xml', '.md', '.log',
|
|
'.doc', '.docx', '.rtf', '.odt', # Document formats
|
|
'.html', '.htm', '.css', '.js', '.ts', '.py', '.java', '.cpp', '.c', '.h', # Code files
|
|
'.ini', '.cfg', '.conf', '.properties', # Config files
|
|
'.sql', '.yaml', '.yml', '.toml', # Data/config files
|
|
'.ps1', '.bat', '.sh', '.bash' # Script files
|
|
]
|
|
return any(filename.lower().endswith(ext) for ext in text_extensions)
|
|
|
|
def process_file_content(self, file_content: bytes, file_name: str, mime_type: str) -> Dict[str, Any]:
|
|
"""
|
|
Process file content for neutralization
|
|
|
|
Args:
|
|
file_content: Binary file content
|
|
file_name: Name of the file
|
|
mime_type: MIME type of the file
|
|
|
|
Returns:
|
|
Dictionary with neutralization results
|
|
"""
|
|
try:
|
|
# Determine content type based on MIME type
|
|
content_type = self._get_content_type_from_mime(mime_type)
|
|
|
|
# Decode content to text
|
|
try:
|
|
text_content = file_content.decode('utf-8')
|
|
except UnicodeDecodeError:
|
|
# Try with different encodings
|
|
for encoding in ['latin-1', 'cp1252', 'iso-8859-1']:
|
|
try:
|
|
text_content = file_content.decode(encoding)
|
|
break
|
|
except UnicodeDecodeError:
|
|
continue
|
|
else:
|
|
raise ValueError("Unable to decode file content")
|
|
|
|
# Generate a temporary file ID for tracking
|
|
temp_file_id = str(uuid.uuid4())
|
|
|
|
# Neutralize the content
|
|
neutralization_result = self.neutralize_text(text_content, temp_file_id)
|
|
|
|
# Encode the neutralized content back to bytes
|
|
neutralized_content = neutralization_result["neutralized_text"].encode('utf-8')
|
|
|
|
# Generate neutralized file name
|
|
neutralized_file_name = f"neutralized_{file_name}"
|
|
|
|
return {
|
|
"success": True,
|
|
"original_content": text_content,
|
|
"neutralized_content": neutralization_result["neutralized_text"],
|
|
"neutralized_file_name": neutralized_file_name,
|
|
"attributes": neutralization_result["attributes"],
|
|
"mapping": neutralization_result["mapping"],
|
|
"file_id": temp_file_id
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error processing file content: {str(e)}")
|
|
return {
|
|
"success": False,
|
|
"error": str(e),
|
|
"original_content": None,
|
|
"neutralized_content": None
|
|
}
|
|
|
|
def _get_content_type_from_mime(self, mime_type: str) -> str:
|
|
"""Determine content type from MIME type for neutralization processing"""
|
|
if mime_type.startswith('text/'):
|
|
return 'text'
|
|
elif mime_type in ['application/json', 'application/xml', 'text/xml']:
|
|
return 'json' if 'json' in mime_type else 'xml'
|
|
elif mime_type in ['text/csv', 'application/csv']:
|
|
return 'csv'
|
|
else:
|
|
return 'text' # Default to text processing
|
|
|
|
def batch_neutralize_files(self, files_data: List[Dict[str, Any]]) -> Dict[str, Any]:
|
|
"""
|
|
Process multiple files for neutralization
|
|
|
|
Args:
|
|
files_data: List of dictionaries containing file information
|
|
Each dict should have: content, name, mime_type
|
|
|
|
Returns:
|
|
Dictionary with batch processing results
|
|
"""
|
|
try:
|
|
results = []
|
|
total_files = len(files_data)
|
|
successful_files = 0
|
|
errors = []
|
|
|
|
for file_data in files_data:
|
|
try:
|
|
result = self.process_file_content(
|
|
file_data['content'],
|
|
file_data['name'],
|
|
file_data['mime_type']
|
|
)
|
|
|
|
if result['success']:
|
|
successful_files += 1
|
|
results.append({
|
|
'file_name': file_data['name'],
|
|
'neutralized_file_name': result['neutralized_file_name'],
|
|
'file_id': result['file_id'],
|
|
'attributes_count': len(result['attributes'])
|
|
})
|
|
else:
|
|
errors.append(f"Failed to process {file_data['name']}: {result['error']}")
|
|
|
|
except Exception as e:
|
|
error_msg = f"Error processing {file_data['name']}: {str(e)}"
|
|
errors.append(error_msg)
|
|
logger.error(error_msg)
|
|
|
|
return {
|
|
"success": len(errors) == 0,
|
|
"total_files": total_files,
|
|
"successful_files": successful_files,
|
|
"failed_files": len(errors),
|
|
"results": results,
|
|
"errors": errors
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error in batch neutralization: {str(e)}")
|
|
return {
|
|
"success": False,
|
|
"total_files": len(files_data),
|
|
"successful_files": 0,
|
|
"failed_files": len(files_data),
|
|
"results": [],
|
|
"errors": [str(e)]
|
|
}
|
|
|
|
def cleanup_file_attributes(self, file_id: str) -> bool:
|
|
"""Clean up neutralization attributes for a specific file"""
|
|
return self.app_interface.deleteNeutralizationAttributes(file_id)
|
|
|
|
def get_processing_stats(self) -> Dict[str, Any]:
|
|
"""Get statistics about neutralization processing"""
|
|
try:
|
|
# Get all attributes for the current mandate
|
|
all_attributes = self.get_attributes()
|
|
|
|
# Group by pattern type
|
|
pattern_counts = {}
|
|
for attr in all_attributes:
|
|
pattern_type = attr.patternType
|
|
pattern_counts[pattern_type] = pattern_counts.get(pattern_type, 0) + 1
|
|
|
|
# Get unique files
|
|
unique_files = set(attr.fileId for attr in all_attributes if attr.fileId)
|
|
|
|
return {
|
|
"total_attributes": len(all_attributes),
|
|
"unique_files": len(unique_files),
|
|
"pattern_counts": pattern_counts,
|
|
"mandate_id": self.current_user.mandateId
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting processing stats: {str(e)}")
|
|
return {
|
|
"total_attributes": 0,
|
|
"unique_files": 0,
|
|
"pattern_counts": {},
|
|
"error": str(e)
|
|
}
|