gateway/modules/features/neutralizePlayground/mainNeutralizePlayground.py
2025-09-22 23:34:47 +02:00

587 lines
25 KiB
Python

"""
Data Neutralization Service
Handles file processing for data neutralization including SharePoint integration
"""
import logging
import os
import uuid
from typing import Dict, List, Any, Optional, Tuple
from datetime import datetime
from pathlib import Path
import mimetypes
from modules.interfaces.interfaceAppObjects import getInterface
from modules.interfaces.interfaceAppModel import User, DataNeutraliserConfig, DataNeutralizerAttributes
from modules.services.serviceNeutralization.neutralizer import DataAnonymizer
from modules.shared.timezoneUtils import get_utc_timestamp
logger = logging.getLogger(__name__)
class NeutralizationService:
    """Service for handling data neutralization operations.

    Thin orchestration layer over the per-user application interface
    (``getInterface``): exposes configuration CRUD, text neutralization and
    resolution, plus a SharePoint batch pipeline that downloads text files
    from a source folder, neutralizes them, and uploads the results to a
    target folder.
    """

    def __init__(self, current_user: "User"):
        """Initialize the service with user context.

        Args:
            current_user: Authenticated user; all interface calls are scoped
                to this user's mandate.
        """
        self.current_user = current_user
        self.app_interface = getInterface(current_user)

    def get_config(self) -> Optional["DataNeutraliserConfig"]:
        """Get the neutralization configuration for the current user's mandate."""
        return self.app_interface.getNeutralizationConfig()

    def save_config(self, config_data: Dict[str, Any]) -> "DataNeutraliserConfig":
        """Save or update the neutralization configuration."""
        return self.app_interface.createOrUpdateNeutralizationConfig(config_data)

    def neutralize_text(self, text: str, file_id: Optional[str] = None) -> Dict[str, Any]:
        """Neutralize text content and return results with attribute mappings."""
        return self.app_interface.neutralizeText(text, file_id)

    def get_attributes(self, file_id: Optional[str] = None) -> List["DataNeutralizerAttributes"]:
        """Get neutralization attributes, optionally filtered by file ID."""
        return self.app_interface.getNeutralizationAttributes(file_id)

    def resolve_text(self, text: str) -> str:
        """Resolve UIDs in neutralized text back to original text."""
        return self.app_interface.resolveNeutralizedText(text)

    async def process_sharepoint_files(self, source_path: str, target_path: str) -> Dict[str, Any]:
        """
        Process files from SharePoint source path, neutralize them, and store in target path.

        Args:
            source_path: SharePoint path to read files from
            target_path: SharePoint path to store neutralized files

        Returns:
            Dictionary with processing results (``success``, ``message``,
            ``processed_files``, ``errors`` and, on success, ``files``).
            Failures are reported in the result dict; this method does not raise.
        """
        try:
            logger.info(f"Processing SharePoint files from {source_path} to {target_path}")
            # Get user's SharePoint connection that matches the source path
            sharepoint_connection = await self._get_sharepoint_connection(source_path)
            if not sharepoint_connection:
                return {
                    "success": False,
                    "message": "No SharePoint connection found for user",
                    "processed_files": 0,
                    "errors": ["No SharePoint connection found"]
                }
            logger.info(f"Using SharePoint connection: {sharepoint_connection.get('id')} for path: {source_path}")
            # Get SharePoint access token
            sharepoint_token = self.app_interface.getConnectionToken(sharepoint_connection["id"])
            if not sharepoint_token:
                return {
                    "success": False,
                    "message": "No SharePoint access token found",
                    "processed_files": 0,
                    "errors": ["No SharePoint access token found"]
                }
            # Process files asynchronously
            return await self._process_sharepoint_files_async(
                source_path, target_path, sharepoint_token.tokenAccess
            )
        except Exception as e:
            logger.error(f"Error processing SharePoint files: {str(e)}")
            return {
                "success": False,
                "message": f"Error processing SharePoint files: {str(e)}",
                "processed_files": 0,
                "errors": [str(e)]
            }

    async def _get_sharepoint_connection(self, sharepoint_path: str = None):
        """Get user's SharePoint connection that matches the given path.

        Returns the single Microsoft connection when there is exactly one;
        with several, probes each against *sharepoint_path* (when given) and
        otherwise falls back to the first. Returns None when the user has no
        Microsoft connections or on error.
        """
        try:
            # Imported locally, presumably to avoid a circular import — TODO confirm.
            from modules.interfaces.interfaceAppModel import UserConnection
            # Get all user connections
            connections = self.app_interface.db.getRecordset(
                UserConnection,
                recordFilter={"userId": self.app_interface.userId}
            )
            # Find all Microsoft connections
            msft_connections = [conn for conn in connections if conn.get("authority") == "msft"]
            if not msft_connections:
                logger.warning("No Microsoft connections found for user")
                return None
            if len(msft_connections) == 1:
                logger.info(f"Found single Microsoft connection: {msft_connections[0].get('id')}")
                return msft_connections[0]
            # If multiple connections and we have a path, try to match
            if sharepoint_path:
                return await self._match_connection_to_path(msft_connections, sharepoint_path)
            # If no path provided, return the first one
            logger.info(f"Multiple Microsoft connections found, using first one: {msft_connections[0].get('id')}")
            return msft_connections[0]
        except Exception as e:
            logger.error(f"Error getting SharePoint connection: {str(e)}")
            return None

    async def _match_connection_to_path(self, connections: list, sharepoint_path: str):
        """Match a connection to the SharePoint path by testing access.

        Tries each connection's token against the target site; falls back to
        the first connection when none matches (or None if *connections* is
        empty and an error occurred).
        """
        try:
            # Extract domain from the path (logging/diagnostics only)
            from urllib.parse import urlparse
            parsed_url = urlparse(sharepoint_path)
            target_domain = parsed_url.netloc.lower()
            logger.info(f"Looking for connection matching domain: {target_domain}")
            # Try each connection to see which one can access the site
            for connection in connections:
                try:
                    # Get token for this connection
                    token = self.app_interface.getConnectionToken(connection["id"])
                    if not token:
                        continue
                    # Test if this connection can access the SharePoint site
                    if await self._test_sharepoint_access(token.tokenAccess, sharepoint_path):
                        logger.info(f"Found matching connection for domain {target_domain}: {connection.get('id')}")
                        return connection
                except Exception as e:
                    # Best-effort probing: a connection that errors out simply
                    # doesn't match — record why instead of swallowing silently.
                    logger.debug(f"Connection {connection.get('id')} failed access probe: {str(e)}")
                    continue
            # If no specific match found, return the first connection
            logger.warning(f"No specific connection match found for {target_domain}, using first available")
            return connections[0]
        except Exception as e:
            logger.error(f"Error matching connection to path: {str(e)}")
            return connections[0] if connections else None

    async def _test_sharepoint_access(self, access_token: str, sharepoint_path: str) -> bool:
        """Test if the access token can access the given SharePoint path.

        Kept as a thin wrapper around the async probe for interface
        compatibility; never raises.
        """
        try:
            return await self._test_sharepoint_access_async(access_token, sharepoint_path)
        except Exception:
            return False

    async def _test_sharepoint_access_async(self, access_token: str, sharepoint_path: str) -> bool:
        """Async test for SharePoint access: True iff the site resolves."""
        try:
            from modules.connectors.connectorSharepoint import ConnectorSharepoint
            connector = ConnectorSharepoint(access_token=access_token)
            # Parse the path to get site URL
            site_url, _ = self._parse_sharepoint_path(sharepoint_path)
            if not site_url:
                return False
            # Try to find the site
            site_info = await connector.find_site_by_web_url(site_url)
            return site_info is not None
        except Exception:
            return False

    async def _process_sharepoint_files_async(self, source_path: str, target_path: str, access_token: str) -> Dict[str, Any]:
        """Process SharePoint files asynchronously.

        Pipeline: resolve both sites, list the source folder, filter to text
        files, then download/neutralize/upload each file in parallel.

        Args:
            source_path: Full SharePoint URL of the folder to read from.
            target_path: Full SharePoint URL of the folder to write to.
            access_token: Graph access token used for both sites.

        Returns:
            Result dict mirroring process_sharepoint_files().
        """
        try:
            import asyncio
            from modules.connectors.connectorSharepoint import ConnectorSharepoint
            # Initialize SharePoint connector
            connector = ConnectorSharepoint(access_token=access_token)
            # Parse source and target paths to extract site and folder info
            source_site, source_folder = self._parse_sharepoint_path(source_path)
            target_site, target_folder = self._parse_sharepoint_path(target_path)
            if not source_site or not target_site:
                return {
                    "success": False,
                    "message": "Invalid SharePoint path format",
                    "processed_files": 0,
                    "errors": ["Invalid SharePoint path format"]
                }
            # Find source site
            source_site_info = await connector.find_site_by_web_url(source_site)
            if not source_site_info:
                return {
                    "success": False,
                    "message": f"Source site not found: {source_site}",
                    "processed_files": 0,
                    "errors": [f"Source site not found: {source_site}"]
                }
            # Find target site
            target_site_info = await connector.find_site_by_web_url(target_site)
            if not target_site_info:
                return {
                    "success": False,
                    "message": f"Target site not found: {target_site}",
                    "processed_files": 0,
                    "errors": [f"Target site not found: {target_site}"]
                }
            # List files in source folder
            logger.info(f"Listing files in folder: {source_folder} for site: {source_site_info['id']}")
            files = await connector.list_folder_contents(source_site_info["id"], source_folder)
            # If no files found, try listing the root folder to see what's available
            if not files:
                logger.warning(f"No files found in folder '{source_folder}', trying root folder")
                files = await connector.list_folder_contents(source_site_info["id"], "")
                if files:
                    # List available folders for debugging
                    folders = [f for f in files if f.get("type") == "folder"]
                    folder_names = [f.get('name') for f in folders]
                    logger.info(f"Available folders in root: {folder_names}")
                    # Format folder list for better UI display
                    folder_list = ", ".join(folder_names) if folder_names else "None"
                    return {
                        "success": False,
                        "message": f"Folder '{source_folder}' not found. Available folders in root: {folder_list}",
                        "processed_files": 0,
                        "errors": [f"Folder '{source_folder}' not found. Available folders: {folder_list}"],
                        "available_folders": folder_names
                    }
                else:
                    return {
                        "success": False,
                        "message": f"No files found in source folder: {source_folder}",
                        "processed_files": 0,
                        "errors": [f"No files found in source folder: {source_folder}"]
                    }
            # Filter for text files only
            text_files = [f for f in files if f.get("type") == "file" and self._is_text_file(f.get("name", ""))]
            if not text_files:
                return {
                    "success": False,
                    "message": "No text files found in source folder",
                    "processed_files": 0,
                    "errors": ["No text files found in source folder"]
                }
            # Process files in parallel for better performance
            processed_files = []
            errors = []

            # Create tasks for parallel processing
            async def process_single_file(file_info):
                """Process a single file - download, neutralize, upload.

                Returns a dict with either "success" details or an "error" message.
                """
                try:
                    # Download file
                    file_content = await connector.download_file(source_site_info["id"], file_info["id"])
                    if not file_content:
                        return {"error": f"Failed to download file: {file_info['name']}"}
                    # Convert to text (UTF-8 with Latin-1 fallback)
                    text_content = self._decode_bytes(file_content)
                    # Neutralize the text
                    neutralization_result = self.app_interface.neutralizeText(text_content, file_info["id"])
                    # Create neutralized filename
                    neutralized_filename = f"neutralized_{file_info['name']}"
                    # Upload neutralized file
                    neutralized_content = neutralization_result["neutralized_text"].encode('utf-8')
                    upload_result = await connector.upload_file(
                        target_site_info["id"],
                        target_folder,
                        neutralized_filename,
                        neutralized_content
                    )
                    if "error" in upload_result:
                        return {"error": f"Failed to upload neutralized file: {neutralized_filename} - {upload_result['error']}"}
                    else:
                        return {
                            "success": True,
                            "original_name": file_info["name"],
                            "neutralized_name": neutralized_filename,
                            "attributes_count": len(neutralization_result.get("attributes", []))
                        }
                except Exception as e:
                    error_msg = f"Error processing file {file_info['name']}: {str(e)}"
                    logger.error(error_msg)
                    return {"error": error_msg}

            # Process all files in parallel
            logger.info(f"Processing {len(text_files)} files in parallel...")
            tasks = [process_single_file(file_info) for file_info in text_files]
            results = await asyncio.gather(*tasks, return_exceptions=True)
            # Process results; indices of `results` line up with `text_files`
            for i, result in enumerate(results):
                if isinstance(result, Exception):
                    error_msg = f"Exception processing file {text_files[i]['name']}: {str(result)}"
                    errors.append(error_msg)
                    logger.error(error_msg)
                elif isinstance(result, dict) and "error" in result:
                    errors.append(result["error"])
                elif isinstance(result, dict) and result.get("success"):
                    processed_files.append({
                        "original_name": result["original_name"],
                        "neutralized_name": result["neutralized_name"],
                        "attributes_count": result["attributes_count"]
                    })
                    logger.info(f"Successfully processed file: {result['original_name']} -> {result['neutralized_name']}")
                else:
                    error_msg = f"Unknown result processing file {text_files[i]['name']}: {result}"
                    errors.append(error_msg)
                    logger.error(error_msg)
            return {
                # Partial success counts as success as long as one file went through
                "success": len(processed_files) > 0,
                "message": f"Processed {len(processed_files)} files successfully",
                "processed_files": len(processed_files),
                "files": processed_files,
                "errors": errors
            }
        except Exception as e:
            logger.error(f"Error in async SharePoint processing: {str(e)}")
            return {
                "success": False,
                "message": f"Error in async SharePoint processing: {str(e)}",
                "processed_files": 0,
                "errors": [str(e)]
            }

    def _parse_sharepoint_path(self, path: str) -> Tuple[Optional[str], Optional[str]]:
        """Parse SharePoint path to extract site URL and folder path.

        Expected format: ``https://domain.sharepoint.com/sites/sitename/folder/path``.

        Returns:
            ``(site_url, folder_path)``; folder_path is "" for the site root.
            ``(None, None)`` when the path is not an https ``/sites/`` URL.
            (Annotation fixed: the failure path returns Nones.)
        """
        try:
            if not path.startswith("https://"):
                return None, None
            # Remove query parameters
            if "?" in path:
                path = path.split("?")[0]
            # Split by /sites/
            if "/sites/" not in path:
                return None, None
            parts = path.split("/sites/", 1)
            if len(parts) != 2:
                return None, None
            # Extract domain and site name
            domain = parts[0].replace("https://", "")
            site_name = parts[1].split("/")[0]
            # Create proper site URL for Graph API
            site_url = f"https://{domain}/sites/{site_name}"
            # Extract folder path (everything after the site name)
            folder_parts = parts[1].split("/")[1:]
            folder_path = "/".join(folder_parts) if folder_parts else ""
            # URL decode the folder path (e.g. "Shared%20Documents")
            from urllib.parse import unquote
            folder_path = unquote(folder_path)
            return site_url, folder_path
        except Exception as e:
            logger.error(f"Error parsing SharePoint path '{path}': {str(e)}")
            return None, None

    def _is_text_file(self, filename: str) -> bool:
        """Check if file is a text file based on extension (case-insensitive).

        NOTE(review): .doc/.docx/.rtf/.odt are binary container formats;
        decoding them as plain text will yield garbage — confirm whether they
        should really be listed here.
        """
        text_extensions = [
            '.txt', '.csv', '.json', '.xml', '.md', '.log',
            '.doc', '.docx', '.rtf', '.odt',  # Document formats
            '.html', '.htm', '.css', '.js', '.ts', '.py', '.java', '.cpp', '.c', '.h',  # Code files
            '.ini', '.cfg', '.conf', '.properties',  # Config files
            '.sql', '.yaml', '.yml', '.toml',  # Data/config files
            '.ps1', '.bat', '.sh', '.bash'  # Script files
        ]
        return any(filename.lower().endswith(ext) for ext in text_extensions)

    @staticmethod
    def _decode_bytes(content: bytes) -> str:
        """Decode raw file bytes to text.

        Tries UTF-8 first and falls back to Latin-1, which maps every byte,
        so this never raises. Shared by the SharePoint pipeline and
        process_file_content() (whose previous cp1252/iso-8859-1 fallbacks
        were unreachable because latin-1 was tried first and cannot fail).
        """
        try:
            return content.decode('utf-8')
        except UnicodeDecodeError:
            return content.decode('latin-1')

    def process_file_content(self, file_content: bytes, file_name: str, mime_type: str) -> Dict[str, Any]:
        """
        Process file content for neutralization.

        Args:
            file_content: Binary file content
            file_name: Name of the file
            mime_type: MIME type of the file (retained for interface
                compatibility; all content is currently processed as text)

        Returns:
            Dictionary with neutralization results
        """
        try:
            # Decode content to text; Latin-1 fallback accepts any bytes
            text_content = self._decode_bytes(file_content)
            # Generate a temporary file ID for tracking
            temp_file_id = str(uuid.uuid4())
            # Neutralize the content
            neutralization_result = self.neutralize_text(text_content, temp_file_id)
            # Generate neutralized file name
            neutralized_file_name = f"neutralized_{file_name}"
            return {
                "success": True,
                "original_content": text_content,
                "neutralized_content": neutralization_result["neutralized_text"],
                "neutralized_file_name": neutralized_file_name,
                "attributes": neutralization_result["attributes"],
                "mapping": neutralization_result["mapping"],
                "file_id": temp_file_id
            }
        except Exception as e:
            logger.error(f"Error processing file content: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "original_content": None,
                "neutralized_content": None
            }

    def _get_content_type_from_mime(self, mime_type: str) -> str:
        """Determine content type from MIME type for neutralization processing.

        Specific types (csv/json/xml) are checked before the generic
        ``text/`` prefix; previously ``text/csv`` and ``text/xml`` were
        unreachable because the prefix check ran first.
        """
        if mime_type in ('text/csv', 'application/csv'):
            return 'csv'
        if mime_type in ('application/json', 'application/xml', 'text/xml'):
            return 'json' if 'json' in mime_type else 'xml'
        if mime_type.startswith('text/'):
            return 'text'
        return 'text'  # Default to text processing

    def batch_neutralize_files(self, files_data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Process multiple files for neutralization.

        Args:
            files_data: List of dictionaries containing file information
                Each dict should have: content, name, mime_type

        Returns:
            Dictionary with batch processing results; "success" is True only
            when every file processed without error.
        """
        try:
            results = []
            total_files = len(files_data)
            successful_files = 0
            errors = []
            for file_data in files_data:
                try:
                    result = self.process_file_content(
                        file_data['content'],
                        file_data['name'],
                        file_data['mime_type']
                    )
                    if result['success']:
                        successful_files += 1
                        results.append({
                            'file_name': file_data['name'],
                            'neutralized_file_name': result['neutralized_file_name'],
                            'file_id': result['file_id'],
                            'attributes_count': len(result['attributes'])
                        })
                    else:
                        errors.append(f"Failed to process {file_data['name']}: {result['error']}")
                except Exception as e:
                    error_msg = f"Error processing {file_data['name']}: {str(e)}"
                    errors.append(error_msg)
                    logger.error(error_msg)
            return {
                "success": len(errors) == 0,
                "total_files": total_files,
                "successful_files": successful_files,
                "failed_files": len(errors),
                "results": results,
                "errors": errors
            }
        except Exception as e:
            logger.error(f"Error in batch neutralization: {str(e)}")
            return {
                "success": False,
                "total_files": len(files_data),
                "successful_files": 0,
                "failed_files": len(files_data),
                "results": [],
                "errors": [str(e)]
            }

    def cleanup_file_attributes(self, file_id: str) -> bool:
        """Clean up neutralization attributes for a specific file."""
        return self.app_interface.deleteNeutralizationAttributes(file_id)

    def get_processing_stats(self) -> Dict[str, Any]:
        """Get statistics about neutralization processing.

        Returns:
            Dict with total attribute count, number of distinct files,
            per-pattern-type counts, and the current mandate id. On failure
            the counts are zeroed and an "error" key is added (note:
            "mandate_id" is absent in the error case).
        """
        try:
            # Get all attributes for the current mandate
            all_attributes = self.get_attributes()
            # Group by pattern type
            pattern_counts: Dict[str, int] = {}
            for attr in all_attributes:
                pattern_type = attr.patternType
                pattern_counts[pattern_type] = pattern_counts.get(pattern_type, 0) + 1
            # Count unique files (attributes without a fileId are skipped)
            unique_files = {attr.fileId for attr in all_attributes if attr.fileId}
            return {
                "total_attributes": len(all_attributes),
                "unique_files": len(unique_files),
                "pattern_counts": pattern_counts,
                "mandate_id": self.current_user.mandateId
            }
        except Exception as e:
            logger.error(f"Error getting processing stats: {str(e)}")
            return {
                "total_attributes": 0,
                "unique_files": 0,
                "pattern_counts": {},
                "error": str(e)
            }