"""
Data Neutralization Service

Handles file processing for data neutralization, including the SharePoint
integration (download -> neutralize -> re-upload round trips).
"""
import logging
import os
import uuid
from typing import Dict, List, Any, Optional, Tuple
from datetime import datetime
from pathlib import Path
import mimetypes

from modules.interfaces.interfaceAppObjects import getInterface
from modules.interfaces.interfaceAppModel import User, DataNeutraliserConfig, DataNeutralizerAttributes
from modules.neutralizer.neutralizer import DataAnonymizer
from modules.shared.timezoneUtils import get_utc_timestamp

logger = logging.getLogger(__name__)


class NeutralizationService:
    """Service for handling data neutralization operations.

    All persistence and neutralization work is delegated to the per-user
    application interface returned by ``getInterface``; this class adds the
    SharePoint file-processing orchestration on top of it.
    """

    # Extensions treated as text for neutralization purposes.
    # NOTE(review): .doc/.docx/.rtf/.odt are binary/container formats — decoding
    # them as utf-8/latin-1 yields mojibake; confirm whether they belong here.
    _TEXT_EXTENSIONS = (
        '.txt', '.csv', '.json', '.xml', '.md', '.log',
        '.doc', '.docx', '.rtf', '.odt',                                            # Document formats
        '.html', '.htm', '.css', '.js', '.ts', '.py', '.java', '.cpp', '.c', '.h',  # Code files
        '.ini', '.cfg', '.conf', '.properties',                                     # Config files
        '.sql', '.yaml', '.yml', '.toml',                                           # Data/config files
        '.ps1', '.bat', '.sh', '.bash',                                             # Script files
    )

    def __init__(self, current_user: User):
        """Initialize the service with user context."""
        self.current_user = current_user
        self.app_interface = getInterface(current_user)

    def get_config(self) -> Optional[DataNeutraliserConfig]:
        """Get the neutralization configuration for the current user's mandate."""
        return self.app_interface.getNeutralizationConfig()

    def save_config(self, config_data: Dict[str, Any]) -> DataNeutraliserConfig:
        """Save or update the neutralization configuration."""
        return self.app_interface.createOrUpdateNeutralizationConfig(config_data)

    def neutralize_text(self, text: str, file_id: Optional[str] = None) -> Dict[str, Any]:
        """Neutralize text content and return results with attribute mappings."""
        return self.app_interface.neutralizeText(text, file_id)

    def get_attributes(self, file_id: Optional[str] = None) -> List[DataNeutralizerAttributes]:
        """Get neutralization attributes, optionally filtered by file ID."""
        return self.app_interface.getNeutralizationAttributes(file_id)

    def resolve_text(self, text: str) -> str:
        """Resolve UIDs in neutralized text back to original text."""
        return self.app_interface.resolveNeutralizedText(text)

    async def process_sharepoint_files(self, source_path: str, target_path: str) -> Dict[str, Any]:
        """
        Process files from SharePoint source path, neutralize them, and store
        in target path.

        Args:
            source_path: SharePoint path to read files from
            target_path: SharePoint path to store neutralized files

        Returns:
            Dictionary with processing results (``success``, ``message``,
            ``processed_files`` count and an ``errors`` list).
        """
        try:
            logger.info(f"Processing SharePoint files from {source_path} to {target_path}")

            # Get user's SharePoint connection that matches the source path
            sharepoint_connection = await self._get_sharepoint_connection(source_path)
            if not sharepoint_connection:
                return {
                    "success": False,
                    "message": "No SharePoint connection found for user",
                    "processed_files": 0,
                    "errors": ["No SharePoint connection found"]
                }

            logger.info(f"Using SharePoint connection: {sharepoint_connection.get('id')} for path: {source_path}")

            # Get SharePoint access token
            sharepoint_token = self.app_interface.getConnectionToken(sharepoint_connection["id"])
            if not sharepoint_token:
                return {
                    "success": False,
                    "message": "No SharePoint access token found",
                    "processed_files": 0,
                    "errors": ["No SharePoint access token found"]
                }

            # Process files asynchronously
            return await self._process_sharepoint_files_async(
                source_path, target_path, sharepoint_token.tokenAccess
            )

        except Exception as e:
            logger.error(f"Error processing SharePoint files: {str(e)}")
            return {
                "success": False,
                "message": f"Error processing SharePoint files: {str(e)}",
                "processed_files": 0,
                "errors": [str(e)]
            }

    async def _get_sharepoint_connection(self, sharepoint_path: Optional[str] = None):
        """Get the user's SharePoint connection that matches the given path.

        Returns ``None`` when the user has no Microsoft connection or when the
        lookup fails (best-effort by design — the caller reports the miss).
        """
        try:
            # Get all user connections
            from modules.interfaces.interfaceAppModel import UserConnection
            connections = self.app_interface.db.getRecordset(
                UserConnection,
                recordFilter={"userId": self.app_interface.userId}
            )

            # Find all Microsoft connections
            msft_connections = [conn for conn in connections if conn.get("authority") == "msft"]

            if not msft_connections:
                logger.warning("No Microsoft connections found for user")
                return None

            # Single connection: nothing to disambiguate.
            if len(msft_connections) == 1:
                logger.info(f"Found single Microsoft connection: {msft_connections[0].get('id')}")
                return msft_connections[0]

            # If multiple connections and we have a path, try to match
            if sharepoint_path:
                return await self._match_connection_to_path(msft_connections, sharepoint_path)

            # If no path provided, return the first one
            logger.info(f"Multiple Microsoft connections found, using first one: {msft_connections[0].get('id')}")
            return msft_connections[0]

        except Exception as e:
            logger.error(f"Error getting SharePoint connection: {str(e)}")
            return None

    async def _match_connection_to_path(self, connections: list, sharepoint_path: str):
        """Match a connection to the SharePoint path by testing access.

        Falls back to the first connection when no probe succeeds.
        """
        try:
            # Extract domain from the path
            from urllib.parse import urlparse
            parsed_url = urlparse(sharepoint_path)
            target_domain = parsed_url.netloc.lower()

            logger.info(f"Looking for connection matching domain: {target_domain}")

            # Try each connection to see which one can access the site
            for connection in connections:
                try:
                    # Get token for this connection
                    token = self.app_interface.getConnectionToken(connection["id"])
                    if not token:
                        continue

                    # Test if this connection can access the SharePoint site
                    if await self._test_sharepoint_access(token.tokenAccess, sharepoint_path):
                        logger.info(f"Found matching connection for domain {target_domain}: {connection.get('id')}")
                        return connection

                except Exception:
                    # Best-effort probe: a failing connection simply isn't a match.
                    continue

            # If no specific match found, return the first connection
            logger.warning(f"No specific connection match found for {target_domain}, using first available")
            return connections[0]

        except Exception as e:
            logger.error(f"Error matching connection to path: {str(e)}")
            return connections[0] if connections else None

    async def _test_sharepoint_access(self, access_token: str, sharepoint_path: str) -> bool:
        """Test if the access token can access the given SharePoint path."""
        try:
            return await self._test_sharepoint_access_async(access_token, sharepoint_path)
        except Exception:
            return False

    async def _test_sharepoint_access_async(self, access_token: str, sharepoint_path: str) -> bool:
        """Async test for SharePoint access: resolve the site URL via the connector."""
        try:
            from modules.connectors.connectorSharepoint import ConnectorSharepoint
            connector = ConnectorSharepoint(access_token=access_token)

            # Parse the path to get site URL
            site_url, _ = self._parse_sharepoint_path(sharepoint_path)
            if not site_url:
                return False

            # Try to find the site
            site_info = await connector.find_site_by_web_url(site_url)
            return site_info is not None

        except Exception:
            return False

    async def _process_sharepoint_files_async(self, source_path: str, target_path: str,
                                              access_token: str) -> Dict[str, Any]:
        """Download, neutralize and re-upload text files between SharePoint folders.

        Files are processed concurrently via ``asyncio.gather``; per-file
        failures are collected into ``errors`` rather than aborting the batch.
        """
        try:
            import asyncio
            from modules.connectors.connectorSharepoint import ConnectorSharepoint

            # Initialize SharePoint connector
            connector = ConnectorSharepoint(access_token=access_token)

            # Parse source and target paths to extract site and folder info
            source_site, source_folder = self._parse_sharepoint_path(source_path)
            target_site, target_folder = self._parse_sharepoint_path(target_path)

            if not source_site or not target_site:
                return {
                    "success": False,
                    "message": "Invalid SharePoint path format",
                    "processed_files": 0,
                    "errors": ["Invalid SharePoint path format"]
                }

            # Find source site
            source_site_info = await connector.find_site_by_web_url(source_site)
            if not source_site_info:
                return {
                    "success": False,
                    "message": f"Source site not found: {source_site}",
                    "processed_files": 0,
                    "errors": [f"Source site not found: {source_site}"]
                }

            # Find target site
            target_site_info = await connector.find_site_by_web_url(target_site)
            if not target_site_info:
                return {
                    "success": False,
                    "message": f"Target site not found: {target_site}",
                    "processed_files": 0,
                    "errors": [f"Target site not found: {target_site}"]
                }

            # List files in source folder
            logger.info(f"Listing files in folder: {source_folder} for site: {source_site_info['id']}")
            files = await connector.list_folder_contents(source_site_info["id"], source_folder)

            # If no files found, try listing the root folder to see what's available
            if not files:
                logger.warning(f"No files found in folder '{source_folder}', trying root folder")
                files = await connector.list_folder_contents(source_site_info["id"], "")
                if files:
                    # List available folders for debugging
                    folders = [f for f in files if f.get("type") == "folder"]
                    folder_names = [f.get('name') for f in folders]
                    logger.info(f"Available folders in root: {folder_names}")

                    # Format folder list for better UI display
                    folder_list = ", ".join(folder_names) if folder_names else "None"
                    return {
                        "success": False,
                        "message": f"Folder '{source_folder}' not found. Available folders in root: {folder_list}",
                        "processed_files": 0,
                        "errors": [f"Folder '{source_folder}' not found. Available folders: {folder_list}"],
                        "available_folders": folder_names
                    }
                else:
                    return {
                        "success": False,
                        "message": f"No files found in source folder: {source_folder}",
                        "processed_files": 0,
                        "errors": [f"No files found in source folder: {source_folder}"]
                    }

            # Filter for text files only
            text_files = [f for f in files
                          if f.get("type") == "file" and self._is_text_file(f.get("name", ""))]

            if not text_files:
                return {
                    "success": False,
                    "message": "No text files found in source folder",
                    "processed_files": 0,
                    "errors": ["No text files found in source folder"]
                }

            # Process files in parallel for better performance
            processed_files = []
            errors = []

            async def process_single_file(file_info):
                """Process a single file - download, neutralize, upload."""
                try:
                    # Download file
                    file_content = await connector.download_file(source_site_info["id"], file_info["id"])
                    if not file_content:
                        return {"error": f"Failed to download file: {file_info['name']}"}

                    # Convert to text; latin-1 cannot fail, so it is the terminal fallback.
                    try:
                        text_content = file_content.decode('utf-8')
                    except UnicodeDecodeError:
                        text_content = file_content.decode('latin-1')

                    # Neutralize the text
                    neutralization_result = self.app_interface.neutralizeText(text_content, file_info["id"])

                    # Create neutralized filename
                    neutralized_filename = f"neutralized_{file_info['name']}"

                    # Upload neutralized file
                    neutralized_content = neutralization_result["neutralized_text"].encode('utf-8')
                    upload_result = await connector.upload_file(
                        target_site_info["id"],
                        target_folder,
                        neutralized_filename,
                        neutralized_content
                    )

                    if "error" in upload_result:
                        return {"error": f"Failed to upload neutralized file: {neutralized_filename} - {upload_result['error']}"}
                    else:
                        return {
                            "success": True,
                            "original_name": file_info["name"],
                            "neutralized_name": neutralized_filename,
                            "attributes_count": len(neutralization_result.get("attributes", []))
                        }

                except Exception as e:
                    error_msg = f"Error processing file {file_info['name']}: {str(e)}"
                    logger.error(error_msg)
                    return {"error": error_msg}

            # Process all files in parallel
            logger.info(f"Processing {len(text_files)} files in parallel...")
            tasks = [process_single_file(file_info) for file_info in text_files]
            results = await asyncio.gather(*tasks, return_exceptions=True)

            # Process results; gather with return_exceptions=True can hand back
            # raw exceptions, error dicts, or success dicts.
            for i, result in enumerate(results):
                if isinstance(result, Exception):
                    error_msg = f"Exception processing file {text_files[i]['name']}: {str(result)}"
                    errors.append(error_msg)
                    logger.error(error_msg)
                elif isinstance(result, dict) and "error" in result:
                    errors.append(result["error"])
                elif isinstance(result, dict) and result.get("success"):
                    processed_files.append({
                        "original_name": result["original_name"],
                        "neutralized_name": result["neutralized_name"],
                        "attributes_count": result["attributes_count"]
                    })
                    logger.info(f"Successfully processed file: {result['original_name']} -> {result['neutralized_name']}")
                else:
                    error_msg = f"Unknown result processing file {text_files[i]['name']}: {result}"
                    errors.append(error_msg)
                    logger.error(error_msg)

            return {
                "success": len(processed_files) > 0,
                "message": f"Processed {len(processed_files)} files successfully",
                "processed_files": len(processed_files),
                "files": processed_files,
                "errors": errors
            }

        except Exception as e:
            logger.error(f"Error in async SharePoint processing: {str(e)}")
            return {
                "success": False,
                "message": f"Error in async SharePoint processing: {str(e)}",
                "processed_files": 0,
                "errors": [str(e)]
            }

    def _parse_sharepoint_path(self, path: str) -> Tuple[Optional[str], Optional[str]]:
        """Parse a SharePoint path into ``(site_url, folder_path)``.

        Expected format: ``https://domain.sharepoint.com/sites/sitename/folder/path``.
        Returns ``(None, None)`` when the path cannot be parsed.
        """
        try:
            if not path.startswith("https://"):
                return None, None

            # Remove query parameters
            if "?" in path:
                path = path.split("?")[0]

            # Split by /sites/
            if "/sites/" not in path:
                return None, None

            parts = path.split("/sites/", 1)
            if len(parts) != 2:
                return None, None

            # Extract domain and site name
            domain = parts[0].replace("https://", "")
            site_name = parts[1].split("/")[0]

            # Create proper site URL for Graph API
            site_url = f"https://{domain}/sites/{site_name}"

            # Extract folder path (everything after the site name)
            folder_parts = parts[1].split("/")[1:]
            folder_path = "/".join(folder_parts) if folder_parts else ""

            # URL decode the folder path
            from urllib.parse import unquote
            folder_path = unquote(folder_path)

            return site_url, folder_path

        except Exception as e:
            logger.error(f"Error parsing SharePoint path '{path}': {str(e)}")
            return None, None

    def _is_text_file(self, filename: str) -> bool:
        """Check if file is a text file based on extension."""
        # str.endswith accepts a tuple — one C-level call instead of a
        # generator over the extension list.
        return filename.lower().endswith(self._TEXT_EXTENSIONS)

    def process_file_content(self, file_content: bytes, file_name: str, mime_type: str) -> Dict[str, Any]:
        """
        Process file content for neutralization.

        Args:
            file_content: Binary file content
            file_name: Name of the file
            mime_type: MIME type of the file (currently informational only)

        Returns:
            Dictionary with neutralization results; ``success`` is False and
            ``error`` is set when decoding or neutralization fails.
        """
        try:
            # Decode content to text, trying UTF-8 first.
            try:
                text_content = file_content.decode('utf-8')
            except UnicodeDecodeError:
                # Try with different encodings.
                # NOTE(review): latin-1 never raises UnicodeDecodeError, so the
                # remaining fallbacks and the ValueError are effectively
                # unreachable — confirm whether cp1252 was meant to come first.
                for encoding in ['latin-1', 'cp1252', 'iso-8859-1']:
                    try:
                        text_content = file_content.decode(encoding)
                        break
                    except UnicodeDecodeError:
                        continue
                else:
                    raise ValueError("Unable to decode file content")

            # Generate a temporary file ID for tracking
            temp_file_id = str(uuid.uuid4())

            # Neutralize the content
            neutralization_result = self.neutralize_text(text_content, temp_file_id)

            # Generate neutralized file name
            neutralized_file_name = f"neutralized_{file_name}"

            return {
                "success": True,
                "original_content": text_content,
                "neutralized_content": neutralization_result["neutralized_text"],
                "neutralized_file_name": neutralized_file_name,
                "attributes": neutralization_result["attributes"],
                "mapping": neutralization_result["mapping"],
                "file_id": temp_file_id
            }

        except Exception as e:
            logger.error(f"Error processing file content: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "original_content": None,
                "neutralized_content": None
            }

    def _get_content_type_from_mime(self, mime_type: str) -> str:
        """Determine content type from MIME type for neutralization processing.

        Specific JSON/XML/CSV types are checked before the generic ``text/``
        prefix so that ``text/xml`` and ``text/csv`` map to their own types.
        """
        if mime_type == 'application/json':
            return 'json'
        if mime_type in ('application/xml', 'text/xml'):
            return 'xml'
        if mime_type in ('text/csv', 'application/csv'):
            return 'csv'
        return 'text'  # Default to text processing (covers all other text/*)

    def batch_neutralize_files(self, files_data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Process multiple files for neutralization.

        Args:
            files_data: List of dictionaries containing file information.
                Each dict should have: content, name, mime_type

        Returns:
            Dictionary with batch processing results.
        """
        try:
            results = []
            total_files = len(files_data)
            successful_files = 0
            errors = []

            for file_data in files_data:
                # Resolve the name defensively up front so a malformed entry
                # (missing 'name') cannot raise KeyError inside the handler.
                file_name = file_data.get('name', '<unknown>')
                try:
                    result = self.process_file_content(
                        file_data['content'],
                        file_data['name'],
                        file_data['mime_type']
                    )

                    if result['success']:
                        successful_files += 1
                        results.append({
                            'file_name': file_data['name'],
                            'neutralized_file_name': result['neutralized_file_name'],
                            'file_id': result['file_id'],
                            'attributes_count': len(result['attributes'])
                        })
                    else:
                        errors.append(f"Failed to process {file_data['name']}: {result['error']}")

                except Exception as e:
                    error_msg = f"Error processing {file_name}: {str(e)}"
                    errors.append(error_msg)
                    logger.error(error_msg)

            return {
                "success": len(errors) == 0,
                "total_files": total_files,
                "successful_files": successful_files,
                "failed_files": len(errors),
                "results": results,
                "errors": errors
            }

        except Exception as e:
            logger.error(f"Error in batch neutralization: {str(e)}")
            return {
                "success": False,
                "total_files": len(files_data),
                "successful_files": 0,
                "failed_files": len(files_data),
                "results": [],
                "errors": [str(e)]
            }

    def cleanup_file_attributes(self, file_id: str) -> bool:
        """Clean up neutralization attributes for a specific file."""
        return self.app_interface.deleteNeutralizationAttributes(file_id)

    def get_processing_stats(self) -> Dict[str, Any]:
        """Get statistics about neutralization processing for the current mandate."""
        try:
            # Get all attributes for the current mandate
            all_attributes = self.get_attributes()

            # Group by pattern type
            pattern_counts: Dict[str, int] = {}
            for attr in all_attributes:
                pattern_type = attr.patternType
                pattern_counts[pattern_type] = pattern_counts.get(pattern_type, 0) + 1

            # Get unique files (attributes without a fileId are skipped)
            unique_files = set(attr.fileId for attr in all_attributes if attr.fileId)

            return {
                "total_attributes": len(all_attributes),
                "unique_files": len(unique_files),
                "pattern_counts": pattern_counts,
                "mandate_id": self.current_user.mandateId
            }

        except Exception as e:
            logger.error(f"Error getting processing stats: {str(e)}")
            return {
                "total_attributes": 0,
                "unique_files": 0,
                "pattern_counts": {},
                "error": str(e)
            }