"""
|
|
Data Neutralization Service
|
|
Handles file processing for data neutralization including SharePoint integration
|
|
"""
|
|
|
|
import logging
|
|
import os
|
|
import uuid
|
|
from typing import Dict, List, Any, Optional, Tuple
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
import mimetypes
|
|
|
|
from modules.interfaces.interfaceAppObjects import getInterface
|
|
from modules.interfaces.interfaceAppModel import User, DataNeutraliserConfig, DataNeutralizerAttributes
|
|
from modules.services.serviceNeutralization.neutralizer import DataAnonymizer
|
|
from modules.shared.timezoneUtils import get_utc_timestamp
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
class NeutralizationService:
    """Service layer for data-neutralization operations (config, text
    neutralization, attribute lookups, and SharePoint file pipelines)."""

    def __init__(self, current_user: User):
        """Bind the service to *current_user* and its application interface."""
        # All operations are scoped to this user's mandate via the interface.
        interface = getInterface(current_user)
        self.current_user = current_user
        self.app_interface = interface
|
|
|
def get_config(self) -> Optional[DataNeutraliserConfig]:
|
|
"""Get the neutralization configuration for the current user's mandate"""
|
|
return self.app_interface.getNeutralizationConfig()
|
|
|
|
def save_config(self, config_data: Dict[str, Any]) -> DataNeutraliserConfig:
|
|
"""Save or update the neutralization configuration"""
|
|
return self.app_interface.createOrUpdateNeutralizationConfig(config_data)
|
|
|
|
def neutralize_text(self, text: str, file_id: Optional[str] = None) -> Dict[str, Any]:
|
|
"""Neutralize text content and return results with attribute mappings"""
|
|
return self.app_interface.neutralizeText(text, file_id)
|
|
|
|
def get_attributes(self, file_id: Optional[str] = None) -> List[DataNeutralizerAttributes]:
|
|
"""Get neutralization attributes, optionally filtered by file ID"""
|
|
return self.app_interface.getNeutralizationAttributes(file_id)
|
|
|
|
def resolve_text(self, text: str) -> str:
|
|
"""Resolve UIDs in neutralized text back to original text"""
|
|
return self.app_interface.resolveNeutralizedText(text)
|
|
|
|
async def process_sharepoint_files(self, source_path: str, target_path: str) -> Dict[str, Any]:
|
|
"""
|
|
Process files from SharePoint source path, neutralize them, and store in target path
|
|
|
|
Args:
|
|
source_path: SharePoint path to read files from
|
|
target_path: SharePoint path to store neutralized files
|
|
|
|
Returns:
|
|
Dictionary with processing results
|
|
"""
|
|
try:
|
|
logger.info(f"Processing SharePoint files from {source_path} to {target_path}")
|
|
|
|
# Get user's SharePoint connection that matches the source path
|
|
sharepoint_connection = await self._get_sharepoint_connection(source_path)
|
|
if not sharepoint_connection:
|
|
return {
|
|
"success": False,
|
|
"message": "No SharePoint connection found for user",
|
|
"processed_files": 0,
|
|
"errors": ["No SharePoint connection found"]
|
|
}
|
|
|
|
logger.info(f"Using SharePoint connection: {sharepoint_connection.get('id')} for path: {source_path}")
|
|
|
|
# Get SharePoint access token
|
|
sharepoint_token = self.app_interface.getConnectionToken(sharepoint_connection["id"])
|
|
if not sharepoint_token:
|
|
return {
|
|
"success": False,
|
|
"message": "No SharePoint access token found",
|
|
"processed_files": 0,
|
|
"errors": ["No SharePoint access token found"]
|
|
}
|
|
|
|
# Process files asynchronously
|
|
return await self._process_sharepoint_files_async(
|
|
source_path, target_path, sharepoint_token.tokenAccess
|
|
)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error processing SharePoint files: {str(e)}")
|
|
return {
|
|
"success": False,
|
|
"message": f"Error processing SharePoint files: {str(e)}",
|
|
"processed_files": 0,
|
|
"errors": [str(e)]
|
|
}
|
|
|
|
async def _get_sharepoint_connection(self, sharepoint_path: str = None):
|
|
"""Get user's SharePoint connection that matches the given path"""
|
|
try:
|
|
# Get all user connections
|
|
from modules.interfaces.interfaceAppModel import UserConnection
|
|
connections = self.app_interface.db.getRecordset(
|
|
UserConnection,
|
|
recordFilter={"userId": self.app_interface.userId}
|
|
)
|
|
|
|
# Find all Microsoft connections
|
|
msft_connections = [conn for conn in connections if conn.get("authority") == "msft"]
|
|
|
|
if not msft_connections:
|
|
logger.warning("No Microsoft connections found for user")
|
|
return None
|
|
|
|
if len(msft_connections) == 1:
|
|
logger.info(f"Found single Microsoft connection: {msft_connections[0].get('id')}")
|
|
return msft_connections[0]
|
|
|
|
# If multiple connections and we have a path, try to match
|
|
if sharepoint_path:
|
|
return await self._match_connection_to_path(msft_connections, sharepoint_path)
|
|
|
|
# If no path provided, return the first one
|
|
logger.info(f"Multiple Microsoft connections found, using first one: {msft_connections[0].get('id')}")
|
|
return msft_connections[0]
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting SharePoint connection: {str(e)}")
|
|
return None
|
|
|
|
async def _match_connection_to_path(self, connections: list, sharepoint_path: str):
|
|
"""Match a connection to the SharePoint path by testing access"""
|
|
try:
|
|
# Extract domain from the path
|
|
from urllib.parse import urlparse
|
|
parsed_url = urlparse(sharepoint_path)
|
|
target_domain = parsed_url.netloc.lower()
|
|
|
|
logger.info(f"Looking for connection matching domain: {target_domain}")
|
|
|
|
# Try each connection to see which one can access the site
|
|
for connection in connections:
|
|
try:
|
|
# Get token for this connection
|
|
token = self.app_interface.getConnectionToken(connection["id"])
|
|
if not token:
|
|
continue
|
|
|
|
# Test if this connection can access the SharePoint site
|
|
if await self._test_sharepoint_access(token.tokenAccess, sharepoint_path):
|
|
logger.info(f"Found matching connection for domain {target_domain}: {connection.get('id')}")
|
|
return connection
|
|
|
|
except Exception as e:
|
|
continue
|
|
|
|
# If no specific match found, return the first connection
|
|
logger.warning(f"No specific connection match found for {target_domain}, using first available")
|
|
return connections[0]
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error matching connection to path: {str(e)}")
|
|
return connections[0] if connections else None
|
|
|
|
async def _test_sharepoint_access(self, access_token: str, sharepoint_path: str) -> bool:
|
|
"""Test if the access token can access the given SharePoint path"""
|
|
try:
|
|
return await self._test_sharepoint_access_async(access_token, sharepoint_path)
|
|
except Exception as e:
|
|
return False
|
|
|
|
async def _test_sharepoint_access_async(self, access_token: str, sharepoint_path: str) -> bool:
|
|
"""Async test for SharePoint access"""
|
|
try:
|
|
from modules.connectors.connectorSharepoint import ConnectorSharepoint
|
|
|
|
connector = ConnectorSharepoint(access_token=access_token)
|
|
|
|
# Parse the path to get site URL
|
|
site_url, _ = self._parse_sharepoint_path(sharepoint_path)
|
|
if not site_url:
|
|
return False
|
|
|
|
# Try to find the site
|
|
site_info = await connector.find_site_by_web_url(site_url)
|
|
return site_info is not None
|
|
|
|
except Exception as e:
|
|
return False
|
|
|
|
    async def _process_sharepoint_files_async(self, source_path: str, target_path: str, access_token: str) -> Dict[str, Any]:
        """
        Process SharePoint files asynchronously.

        Downloads every text file found under the source folder, neutralizes
        its content via the application interface, and uploads the result
        (renamed with a "neutralized_" prefix) to the target folder. All files
        are processed concurrently with asyncio.gather.

        Args:
            source_path: Full SharePoint URL of the folder to read from.
            target_path: Full SharePoint URL of the folder to write to.
            access_token: SharePoint access token passed to the connector.

        Returns:
            Result dictionary with keys: success, message, processed_files,
            errors, plus "files" on the success path and "available_folders"
            when the source folder could not be found.
        """
        try:
            import asyncio
            from modules.connectors.connectorSharepoint import ConnectorSharepoint

            # Initialize SharePoint connector
            connector = ConnectorSharepoint(access_token=access_token)

            # Parse source and target paths to extract site and folder info
            source_site, source_folder = self._parse_sharepoint_path(source_path)
            target_site, target_folder = self._parse_sharepoint_path(target_path)

            if not source_site or not target_site:
                return {
                    "success": False,
                    "message": "Invalid SharePoint path format",
                    "processed_files": 0,
                    "errors": ["Invalid SharePoint path format"]
                }

            # Find source site
            source_site_info = await connector.find_site_by_web_url(source_site)
            if not source_site_info:
                return {
                    "success": False,
                    "message": f"Source site not found: {source_site}",
                    "processed_files": 0,
                    "errors": [f"Source site not found: {source_site}"]
                }

            # Find target site
            target_site_info = await connector.find_site_by_web_url(target_site)
            if not target_site_info:
                return {
                    "success": False,
                    "message": f"Target site not found: {target_site}",
                    "processed_files": 0,
                    "errors": [f"Target site not found: {target_site}"]
                }

            # List files in source folder
            logger.info(f"Listing files in folder: {source_folder} for site: {source_site_info['id']}")
            files = await connector.list_folder_contents(source_site_info["id"], source_folder)

            # If no files found, try listing the root folder to see what's available
            if not files:
                logger.warning(f"No files found in folder '{source_folder}', trying root folder")
                files = await connector.list_folder_contents(source_site_info["id"], "")

                if files:
                    # List available folders for debugging; these go back to the
                    # caller so the UI can show what folders actually exist.
                    folders = [f for f in files if f.get("type") == "folder"]
                    folder_names = [f.get('name') for f in folders]
                    logger.info(f"Available folders in root: {folder_names}")

                    # Format folder list for better UI display
                    folder_list = ", ".join(folder_names) if folder_names else "None"

                    return {
                        "success": False,
                        "message": f"Folder '{source_folder}' not found. Available folders in root: {folder_list}",
                        "processed_files": 0,
                        "errors": [f"Folder '{source_folder}' not found. Available folders: {folder_list}"],
                        "available_folders": folder_names
                    }
                else:
                    return {
                        "success": False,
                        "message": f"No files found in source folder: {source_folder}",
                        "processed_files": 0,
                        "errors": [f"No files found in source folder: {source_folder}"]
                    }

            # Filter for text files only (by extension; see _is_text_file)
            text_files = [f for f in files if f.get("type") == "file" and self._is_text_file(f.get("name", ""))]

            if not text_files:
                return {
                    "success": False,
                    "message": "No text files found in source folder",
                    "processed_files": 0,
                    "errors": ["No text files found in source folder"]
                }

            # Process files in parallel for better performance
            processed_files = []
            errors = []

            # Closure over connector/site info; each call handles exactly one
            # file and reports either a success dict or an {"error": ...} dict.
            async def process_single_file(file_info):
                """Process a single file - download, neutralize, upload"""
                try:
                    # Download file
                    file_content = await connector.download_file(source_site_info["id"], file_info["id"])
                    if not file_content:
                        return {"error": f"Failed to download file: {file_info['name']}"}

                    # Convert to text; latin-1 decodes any byte sequence, so
                    # the fallback cannot itself raise UnicodeDecodeError.
                    try:
                        text_content = file_content.decode('utf-8')
                    except UnicodeDecodeError:
                        text_content = file_content.decode('latin-1')

                    # Neutralize the text, keyed to the SharePoint file id
                    neutralization_result = self.app_interface.neutralizeText(text_content, file_info["id"])

                    # Create neutralized filename
                    neutralized_filename = f"neutralized_{file_info['name']}"

                    # Upload neutralized file
                    neutralized_content = neutralization_result["neutralized_text"].encode('utf-8')
                    upload_result = await connector.upload_file(
                        target_site_info["id"],
                        target_folder,
                        neutralized_filename,
                        neutralized_content
                    )

                    if "error" in upload_result:
                        return {"error": f"Failed to upload neutralized file: {neutralized_filename} - {upload_result['error']}"}
                    else:
                        return {
                            "success": True,
                            "original_name": file_info["name"],
                            "neutralized_name": neutralized_filename,
                            "attributes_count": len(neutralization_result.get("attributes", []))
                        }

                except Exception as e:
                    error_msg = f"Error processing file {file_info['name']}: {str(e)}"
                    logger.error(error_msg)
                    return {"error": error_msg}

            # Process all files in parallel; return_exceptions=True keeps one
            # failed file from cancelling the rest of the batch.
            logger.info(f"Processing {len(text_files)} files in parallel...")
            tasks = [process_single_file(file_info) for file_info in text_files]
            results = await asyncio.gather(*tasks, return_exceptions=True)

            # Process results (results[i] corresponds to text_files[i])
            for i, result in enumerate(results):
                if isinstance(result, Exception):
                    error_msg = f"Exception processing file {text_files[i]['name']}: {str(result)}"
                    errors.append(error_msg)
                    logger.error(error_msg)
                elif isinstance(result, dict) and "error" in result:
                    errors.append(result["error"])
                elif isinstance(result, dict) and result.get("success"):
                    processed_files.append({
                        "original_name": result["original_name"],
                        "neutralized_name": result["neutralized_name"],
                        "attributes_count": result["attributes_count"]
                    })
                    logger.info(f"Successfully processed file: {result['original_name']} -> {result['neutralized_name']}")
                else:
                    error_msg = f"Unknown result processing file {text_files[i]['name']}: {result}"
                    errors.append(error_msg)
                    logger.error(error_msg)

            # Success means at least one file made it through, even if others
            # failed; per-file failures are reported in "errors".
            return {
                "success": len(processed_files) > 0,
                "message": f"Processed {len(processed_files)} files successfully",
                "processed_files": len(processed_files),
                "files": processed_files,
                "errors": errors
            }

        except Exception as e:
            logger.error(f"Error in async SharePoint processing: {str(e)}")
            return {
                "success": False,
                "message": f"Error in async SharePoint processing: {str(e)}",
                "processed_files": 0,
                "errors": [str(e)]
            }
|
|
|
|
def _parse_sharepoint_path(self, path: str) -> tuple[str, str]:
|
|
"""Parse SharePoint path to extract site URL and folder path"""
|
|
try:
|
|
# Expected format: https://domain.sharepoint.com/sites/sitename/folder/path
|
|
if not path.startswith("https://"):
|
|
return None, None
|
|
|
|
# Remove query parameters
|
|
if "?" in path:
|
|
path = path.split("?")[0]
|
|
|
|
# Split by /sites/
|
|
if "/sites/" not in path:
|
|
return None, None
|
|
|
|
parts = path.split("/sites/", 1)
|
|
if len(parts) != 2:
|
|
return None, None
|
|
|
|
# Extract domain and site name
|
|
domain = parts[0].replace("https://", "")
|
|
site_name = parts[1].split("/")[0]
|
|
|
|
# Create proper site URL for Graph API
|
|
site_url = f"https://{domain}/sites/{site_name}"
|
|
|
|
# Extract folder path (everything after the site name)
|
|
folder_parts = parts[1].split("/")[1:]
|
|
folder_path = "/".join(folder_parts) if folder_parts else ""
|
|
|
|
# URL decode the folder path
|
|
from urllib.parse import unquote
|
|
folder_path = unquote(folder_path)
|
|
|
|
|
|
return site_url, folder_path
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error parsing SharePoint path '{path}': {str(e)}")
|
|
return None, None
|
|
|
|
def _is_text_file(self, filename: str) -> bool:
|
|
"""Check if file is a text file based on extension"""
|
|
text_extensions = [
|
|
'.txt', '.csv', '.json', '.xml', '.md', '.log',
|
|
'.doc', '.docx', '.rtf', '.odt', # Document formats
|
|
'.html', '.htm', '.css', '.js', '.ts', '.py', '.java', '.cpp', '.c', '.h', # Code files
|
|
'.ini', '.cfg', '.conf', '.properties', # Config files
|
|
'.sql', '.yaml', '.yml', '.toml', # Data/config files
|
|
'.ps1', '.bat', '.sh', '.bash' # Script files
|
|
]
|
|
return any(filename.lower().endswith(ext) for ext in text_extensions)
|
|
|
|
def process_file_content(self, file_content: bytes, file_name: str, mime_type: str) -> Dict[str, Any]:
|
|
"""
|
|
Process file content for neutralization
|
|
|
|
Args:
|
|
file_content: Binary file content
|
|
file_name: Name of the file
|
|
mime_type: MIME type of the file
|
|
|
|
Returns:
|
|
Dictionary with neutralization results
|
|
"""
|
|
try:
|
|
# Determine content type based on MIME type
|
|
content_type = self._get_content_type_from_mime(mime_type)
|
|
|
|
# Decode content to text
|
|
try:
|
|
text_content = file_content.decode('utf-8')
|
|
except UnicodeDecodeError:
|
|
# Try with different encodings
|
|
for encoding in ['latin-1', 'cp1252', 'iso-8859-1']:
|
|
try:
|
|
text_content = file_content.decode(encoding)
|
|
break
|
|
except UnicodeDecodeError:
|
|
continue
|
|
else:
|
|
raise ValueError("Unable to decode file content")
|
|
|
|
# Generate a temporary file ID for tracking
|
|
temp_file_id = str(uuid.uuid4())
|
|
|
|
# Neutralize the content
|
|
neutralization_result = self.neutralize_text(text_content, temp_file_id)
|
|
|
|
# Encode the neutralized content back to bytes
|
|
neutralized_content = neutralization_result["neutralized_text"].encode('utf-8')
|
|
|
|
# Generate neutralized file name
|
|
neutralized_file_name = f"neutralized_{file_name}"
|
|
|
|
return {
|
|
"success": True,
|
|
"original_content": text_content,
|
|
"neutralized_content": neutralization_result["neutralized_text"],
|
|
"neutralized_file_name": neutralized_file_name,
|
|
"attributes": neutralization_result["attributes"],
|
|
"mapping": neutralization_result["mapping"],
|
|
"file_id": temp_file_id
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error processing file content: {str(e)}")
|
|
return {
|
|
"success": False,
|
|
"error": str(e),
|
|
"original_content": None,
|
|
"neutralized_content": None
|
|
}
|
|
|
|
def _get_content_type_from_mime(self, mime_type: str) -> str:
|
|
"""Determine content type from MIME type for neutralization processing"""
|
|
if mime_type.startswith('text/'):
|
|
return 'text'
|
|
elif mime_type in ['application/json', 'application/xml', 'text/xml']:
|
|
return 'json' if 'json' in mime_type else 'xml'
|
|
elif mime_type in ['text/csv', 'application/csv']:
|
|
return 'csv'
|
|
else:
|
|
return 'text' # Default to text processing
|
|
|
|
def batch_neutralize_files(self, files_data: List[Dict[str, Any]]) -> Dict[str, Any]:
|
|
"""
|
|
Process multiple files for neutralization
|
|
|
|
Args:
|
|
files_data: List of dictionaries containing file information
|
|
Each dict should have: content, name, mime_type
|
|
|
|
Returns:
|
|
Dictionary with batch processing results
|
|
"""
|
|
try:
|
|
results = []
|
|
total_files = len(files_data)
|
|
successful_files = 0
|
|
errors = []
|
|
|
|
for file_data in files_data:
|
|
try:
|
|
result = self.process_file_content(
|
|
file_data['content'],
|
|
file_data['name'],
|
|
file_data['mime_type']
|
|
)
|
|
|
|
if result['success']:
|
|
successful_files += 1
|
|
results.append({
|
|
'file_name': file_data['name'],
|
|
'neutralized_file_name': result['neutralized_file_name'],
|
|
'file_id': result['file_id'],
|
|
'attributes_count': len(result['attributes'])
|
|
})
|
|
else:
|
|
errors.append(f"Failed to process {file_data['name']}: {result['error']}")
|
|
|
|
except Exception as e:
|
|
error_msg = f"Error processing {file_data['name']}: {str(e)}"
|
|
errors.append(error_msg)
|
|
logger.error(error_msg)
|
|
|
|
return {
|
|
"success": len(errors) == 0,
|
|
"total_files": total_files,
|
|
"successful_files": successful_files,
|
|
"failed_files": len(errors),
|
|
"results": results,
|
|
"errors": errors
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error in batch neutralization: {str(e)}")
|
|
return {
|
|
"success": False,
|
|
"total_files": len(files_data),
|
|
"successful_files": 0,
|
|
"failed_files": len(files_data),
|
|
"results": [],
|
|
"errors": [str(e)]
|
|
}
|
|
|
|
def cleanup_file_attributes(self, file_id: str) -> bool:
|
|
"""Clean up neutralization attributes for a specific file"""
|
|
return self.app_interface.deleteNeutralizationAttributes(file_id)
|
|
|
|
def get_processing_stats(self) -> Dict[str, Any]:
|
|
"""Get statistics about neutralization processing"""
|
|
try:
|
|
# Get all attributes for the current mandate
|
|
all_attributes = self.get_attributes()
|
|
|
|
# Group by pattern type
|
|
pattern_counts = {}
|
|
for attr in all_attributes:
|
|
pattern_type = attr.patternType
|
|
pattern_counts[pattern_type] = pattern_counts.get(pattern_type, 0) + 1
|
|
|
|
# Get unique files
|
|
unique_files = set(attr.fileId for attr in all_attributes if attr.fileId)
|
|
|
|
return {
|
|
"total_attributes": len(all_attributes),
|
|
"unique_files": len(unique_files),
|
|
"pattern_counts": pattern_counts,
|
|
"mandate_id": self.current_user.mandateId
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting processing stats: {str(e)}")
|
|
return {
|
|
"total_attributes": 0,
|
|
"unique_files": 0,
|
|
"pattern_counts": {},
|
|
"error": str(e)
|
|
}
|