333 lines
16 KiB
Python
333 lines
16 KiB
Python
import logging
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
from modules.datamodels.datamodelUam import User
|
|
from modules.datamodels.datamodelNeutralizer import DataNeutralizerAttributes, DataNeutraliserConfig
|
|
from modules.services.serviceNeutralization.mainServiceNeutralization import NeutralizationService
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class NeutralizationPlayground:
    """Feature/UI wrapper around NeutralizationService for playground & routes.

    Exposes camelCase methods used by the playground UI plus snake_case
    aliases expected by the route layer; everything delegates to a single
    NeutralizationService instance scoped to the current user.
    """

    def __init__(self, currentUser: "User"):
        self.currentUser = currentUser
        self.service = NeutralizationService(currentUser)

    def processText(self, text: str) -> Dict[str, Any]:
        """Neutralize sensitive patterns in a raw text string via the service."""
        return self.service.processText(text)

    def processFiles(self, fileIds: List[str]) -> Dict[str, Any]:
        """Neutralize a batch of files by ID.

        Each file is processed independently: a failure on one file is
        logged and recorded in ``errors`` without aborting the batch.

        Returns a summary dict with an overall ``success`` flag (True only
        when no file failed), counts, per-file results, and error messages.
        """
        results: List[Dict[str, Any]] = []
        errors: List[str] = []
        for fileId in fileIds:
            try:
                res = self.service.processFile(fileId)
                results.append({
                    'file_id': fileId,
                    'neutralized_file_name': res.get('neutralized_file_name'),
                    'attributes_count': len(res.get('attributes', []))
                })
            except Exception as e:
                logger.error(f"Error processing file {fileId}: {str(e)}")
                errors.append(f"{fileId}: {str(e)}")
        return {
            'success': len(errors) == 0,
            'total_files': len(fileIds),
            'successful_files': len(results),
            'failed_files': len(errors),
            'results': results,
            'errors': errors,
        }

    async def processSharepointFiles(self, sourcePath: str, targetPath: str) -> Dict[str, Any]:
        """Neutralize files from a SharePoint source folder into a target folder.

        The helper module is imported lazily to avoid a circular import
        between the feature module and its SharePoint helper.
        """
        from modules.features.neutralizePlayground.sharepoint import SharepointProcessor
        processor = SharepointProcessor(self.currentUser, self.service)
        return await processor.processSharepointFiles(sourcePath, targetPath)

    def cleanAttributes(self, fileId: str) -> bool:
        """Delete all stored neutralization attributes for a file.

        Returns False when no app interface is available.
        """
        if not self.service.app_interface:
            return False
        return self.service.app_interface.deleteNeutralizationAttributes(fileId)

    def getStats(self) -> Dict[str, Any]:
        """Aggregate neutralization statistics for the current user.

        Returns total attribute count, distinct file count, per-pattern
        counts, and the mandate id. On failure it returns zeroed stats with
        an ``error`` message instead of raising, so the UI can still render.
        """
        try:
            allAttributes = self.service._getAttributes()
            patternCounts: Dict[str, int] = {}
            for attr in allAttributes:
                patternType = attr.patternType
                patternCounts[patternType] = patternCounts.get(patternType, 0) + 1
            uniqueFiles = set(attr.fileId for attr in allAttributes if attr.fileId)
            return {
                'total_attributes': len(allAttributes),
                'unique_files': len(uniqueFiles),
                'pattern_counts': patternCounts,
                'mandate_id': self.currentUser.mandateId if self.currentUser else None,
            }
        except Exception as e:
            logger.error(f"Error getting stats: {str(e)}")
            return {
                'total_attributes': 0,
                'unique_files': 0,
                'pattern_counts': {},
                'error': str(e),
            }

    # --- snake_case aliases used by the route layer ---

    def get_config(self) -> Optional["DataNeutraliserConfig"]:
        """Get neutralization configuration"""
        return self.service.getConfig()

    def save_config(self, config_data: Dict[str, Any]) -> "DataNeutraliserConfig":
        """Save neutralization configuration"""
        return self.service.saveConfig(config_data)

    def neutralize_text(self, text: str, file_id: Optional[str] = None) -> Dict[str, Any]:
        """Neutralize text content.

        ``file_id`` is accepted for route compatibility but is currently
        not forwarded to the service.
        """
        return self.service.processText(text)

    def resolve_text(self, text: str) -> str:
        """Resolve UIDs in neutralized text back to original text"""
        return self.service.resolveText(text)

    def get_attributes(self, file_id: Optional[str] = None) -> List["DataNeutralizerAttributes"]:
        """Get neutralization attributes, optionally filtered by file ID"""
        if not self.service.app_interface:
            return []
        try:
            all_attributes = self.service._getAttributes()
            if file_id:
                return [attr for attr in all_attributes if attr.fileId == file_id]
            return all_attributes
        except Exception as e:
            logger.error(f"Error getting attributes: {str(e)}")
            return []

    async def process_sharepoint_files(self, source_path: str, target_path: str) -> Dict[str, Any]:
        """Process files from SharePoint source path and store neutralized files in target path"""
        return await self.processSharepointFiles(source_path, target_path)

    def batch_neutralize_files(self, files_data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Process multiple files for neutralization.

        Entries without a ``fileId`` key are silently skipped.
        """
        file_ids = [file_data.get('fileId') for file_data in files_data if file_data.get('fileId')]
        return self.processFiles(file_ids)

    def get_processing_stats(self) -> Dict[str, Any]:
        """Get neutralization processing statistics"""
        return self.getStats()

    def cleanup_file_attributes(self, file_id: str) -> bool:
        """Clean up neutralization attributes for a specific file"""
        return self.cleanAttributes(file_id)
|
|
|
|
|
|
# Internal SharePoint helper module separated to keep feature logic tidy
|
|
class SharepointProcessor:
    """Internal helper: downloads files from a SharePoint source folder,
    runs them through the neutralization service, and uploads the results
    to a target folder. All public entry points return a summary dict
    rather than raising."""

    def __init__(self, currentUser: "User", service: "NeutralizationService"):
        self.currentUser = currentUser
        self.service = service

    async def processSharepointFiles(self, sourcePath: str, targetPath: str) -> Dict[str, Any]:
        """Resolve a Microsoft connection + fresh token, then process the folder.

        Returns a dict with ``success``, ``message``, ``processed_files``
        and ``errors``; never raises.
        """
        try:
            logger.info(f"Processing SharePoint files from {sourcePath} to {targetPath}")
            connection = await self._getSharepointConnection(sourcePath)
            if not connection:
                return {
                    'success': False,
                    'message': 'No SharePoint connection found for user',
                    'processed_files': 0,
                    'errors': ['No SharePoint connection found'],
                }
            from modules.security.tokenManager import TokenManager
            token = TokenManager().getFreshToken(self.service.app_interface, connection['id'])
            if not token:
                return {
                    'success': False,
                    'message': 'No SharePoint access token found',
                    'processed_files': 0,
                    'errors': ['No SharePoint access token found'],
                }
            return await self._processSharepointFilesAsync(sourcePath, targetPath, token.tokenAccess)
        except Exception as e:
            logger.error(f"Error processing SharePoint files: {str(e)}")
            return {
                'success': False,
                'message': f'Error processing SharePoint files: {str(e)}',
                'processed_files': 0,
                'errors': [str(e)],
            }

    async def _getSharepointConnection(self, sharepointPath: Optional[str] = None):
        """Pick the user's Microsoft connection to use for *sharepointPath*.

        With a single 'msft' connection, use it; with several, try to match
        one against the path (falling back to the first). Returns None when
        the user has no Microsoft connection or lookup fails.
        """
        try:
            from modules.datamodels.datamodelUam import UserConnection
            connections = self.service.app_interface.db.getRecordset(
                UserConnection,
                recordFilter={"userId": self.service.app_interface.userId}
            )
            msftConnections = [c for c in connections if c.get('authority') == 'msft']
            if not msftConnections:
                logger.warning('No Microsoft connections found for user')
                return None
            if len(msftConnections) == 1:
                logger.info(f"Found single Microsoft connection: {msftConnections[0].get('id')}")
                return msftConnections[0]
            if sharepointPath:
                return await self._matchConnectionToPath(msftConnections, sharepointPath)
            logger.info(f"Multiple Microsoft connections found, using first one: {msftConnections[0].get('id')}")
            return msftConnections[0]
        except Exception:
            # logger.exception keeps the traceback that a bare error() would drop
            logger.exception('Error getting SharePoint connection')
            return None

    async def _matchConnectionToPath(self, connections: list, sharepointPath: str):
        """Probe each connection's token against *sharepointPath*; return the
        first that can access it, else fall back to the first connection."""
        try:
            from urllib.parse import urlparse
            targetDomain = urlparse(sharepointPath).netloc.lower()
            logger.info(f"Looking for connection matching domain: {targetDomain}")
            from modules.security.tokenManager import TokenManager
            for connection in connections:
                try:
                    token = TokenManager().getFreshToken(self.service.app_interface, connection['id'])
                    if not token:
                        continue
                    if await self._testSharepointAccess(token.tokenAccess, sharepointPath):
                        logger.info(f"Found matching connection for domain {targetDomain}: {connection.get('id')}")
                        return connection
                except Exception:
                    # best-effort probe: a failing connection just means "try the next one"
                    continue
            logger.warning(f"No specific connection match found for {targetDomain}, using first available")
            return connections[0]
        except Exception:
            logger.exception('Error matching connection to path')
            return connections[0] if connections else None

    async def _testSharepointAccess(self, accessToken: str, sharepointPath: str) -> bool:
        """Return True when *accessToken* can resolve the site of *sharepointPath*."""
        try:
            return await self._testSharepointAccessAsync(accessToken, sharepointPath)
        except Exception:
            return False

    async def _testSharepointAccessAsync(self, accessToken: str, sharepointPath: str) -> bool:
        """Resolve the site URL from the path and look it up via Graph."""
        try:
            from modules.services.serviceSharepoint.mainSharepoint import SharepointService
            connector = SharepointService(access_token=accessToken)
            siteUrl, _ = self._parseSharepointPath(sharepointPath)
            if not siteUrl:
                return False
            siteInfo = await connector.find_site_by_web_url(siteUrl)
            return siteInfo is not None
        except Exception:
            return False

    async def _processSharepointFilesAsync(self, sourcePath: str, targetPath: str, accessToken: str) -> Dict[str, Any]:
        """Download every file in the source folder, neutralize it, and upload
        the result as ``neutralized_<name>`` to the target folder.

        Files are processed concurrently via asyncio.gather; individual
        failures are collected in ``errors``. Overall ``success`` is True
        as soon as at least one file was processed.
        """
        try:
            import asyncio
            from modules.services.serviceSharepoint.mainSharepoint import SharepointService
            connector = SharepointService(access_token=accessToken)
            sourceSite, sourceFolder = self._parseSharepointPath(sourcePath)
            targetSite, targetFolder = self._parseSharepointPath(targetPath)
            if not sourceSite or not targetSite:
                return {'success': False, 'message': 'Invalid SharePoint path format', 'processed_files': 0, 'errors': ['Invalid SharePoint path format']}
            sourceSiteInfo = await connector.find_site_by_web_url(sourceSite)
            if not sourceSiteInfo:
                return {'success': False, 'message': f'Source site not found: {sourceSite}', 'processed_files': 0, 'errors': [f'Source site not found: {sourceSite}']}
            targetSiteInfo = await connector.find_site_by_web_url(targetSite)
            if not targetSiteInfo:
                return {'success': False, 'message': f'Target site not found: {targetSite}', 'processed_files': 0, 'errors': [f'Target site not found: {targetSite}']}
            logger.info(f"Listing files in folder: {sourceFolder} for site: {sourceSiteInfo['id']}")
            files = await connector.list_folder_contents(sourceSiteInfo['id'], sourceFolder)
            if not files:
                # Folder may be missing or misspelled: list the root to give the
                # caller a helpful set of candidate folder names.
                logger.warning(f"No files found in folder '{sourceFolder}', trying root folder")
                files = await connector.list_folder_contents(sourceSiteInfo['id'], '')
                if files:
                    folders = [f for f in files if f.get('type') == 'folder']
                    folderNames = [f.get('name') for f in folders]
                    logger.info(f"Available folders in root: {folderNames}")
                    folderList = ", ".join(folderNames) if folderNames else "None"
                    return {
                        'success': False,
                        'message': f"Folder '{sourceFolder}' not found. Available folders in root: {folderList}",
                        'processed_files': 0,
                        'errors': [f"Folder '{sourceFolder}' not found. Available folders: {folderList}"],
                        'available_folders': folderNames,
                    }
                else:
                    return {'success': False, 'message': f'No files found in source folder: {sourceFolder}', 'processed_files': 0, 'errors': [f'No files found in source folder: {sourceFolder}']}

            textFiles = [f for f in files if f.get('type') == 'file']
            processed: List[Dict[str, Any]] = []
            errors: List[str] = []

            async def _processSingle(fileInfo: Dict[str, Any]):
                # Per-file pipeline: download -> decode -> neutralize -> upload.
                # Returns {'error': ...} on failure so gather() results are uniform.
                try:
                    fileContent = await connector.download_file(sourceSiteInfo['id'], fileInfo['id'])
                    if not fileContent:
                        return {'error': f"Failed to download file: {fileInfo['name']}"}
                    try:
                        textContent = fileContent.decode('utf-8')
                    except UnicodeDecodeError:
                        # latin-1 never fails to decode, so this is a safe fallback
                        textContent = fileContent.decode('latin-1')
                    result = self.service._neutralizeText(textContent, 'text')
                    neutralizedFilename = f"neutralized_{fileInfo['name']}"
                    uploadResult = await connector.upload_file(targetSiteInfo['id'], targetFolder, neutralizedFilename, result['neutralized_text'].encode('utf-8'))
                    if 'error' in uploadResult:
                        return {'error': f"Failed to upload neutralized file: {neutralizedFilename} - {uploadResult['error']}"}
                    return {
                        'success': True,
                        'original_name': fileInfo['name'],
                        'neutralized_name': neutralizedFilename,
                        'attributes_count': len(result.get('attributes', [])),
                    }
                except Exception as e:
                    return {'error': f"Error processing file {fileInfo['name']}: {str(e)}"}

            tasks = [ _processSingle(f) for f in textFiles ]
            results = await asyncio.gather(*tasks, return_exceptions=True)
            for i, r in enumerate(results):
                if isinstance(r, Exception):
                    errors.append(f"Exception processing file {textFiles[i]['name']}: {str(r)}")
                elif isinstance(r, dict) and 'error' in r:
                    errors.append(r['error'])
                elif isinstance(r, dict) and r.get('success'):
                    processed.append({
                        'original_name': r['original_name'],
                        'neutralized_name': r['neutralized_name'],
                        'attributes_count': r['attributes_count'],
                    })
                else:
                    errors.append(f"Unknown result processing file {textFiles[i]['name']}: {r}")
            return {
                'success': len(processed) > 0,
                'message': f"Processed {len(processed)} files successfully",
                'processed_files': len(processed),
                'files': processed,
                'errors': errors,
            }
        except Exception as e:
            logger.error(f"Error in async SharePoint processing: {str(e)}")
            return {'success': False, 'message': f'Error in async SharePoint processing: {str(e)}', 'processed_files': 0, 'errors': [str(e)]}

    def _parseSharepointPath(self, path: str) -> tuple[Optional[str], Optional[str]]:
        """Split a SharePoint URL into (site URL, folder path).

        E.g. ``https://x.sharepoint.com/sites/Team/Docs/Sub`` ->
        ``("https://x.sharepoint.com/sites/Team", "Docs/Sub")``. Query
        strings are stripped; the folder path is URL-decoded. Returns
        ``(None, None)`` for anything that is not an https ``/sites/`` URL.
        """
        try:
            if not path.startswith('https://'):
                return None, None
            if '?' in path:
                path = path.split('?')[0]
            if '/sites/' not in path:
                return None, None
            parts = path.split('/sites/', 1)
            if len(parts) != 2:
                return None, None
            domain = parts[0].replace('https://', '')
            siteName = parts[1].split('/')[0]
            siteUrl = f"https://{domain}/sites/{siteName}"
            folderParts = parts[1].split('/')[1:]
            from urllib.parse import unquote
            folderPath = unquote('/'.join(folderParts) if folderParts else '')
            return siteUrl, folderPath
        except Exception:
            logger.exception(f"Error parsing SharePoint path '{path}'")
            return None, None
|