neutralizer 1.0

parent 70479aff88
commit a941963e78
9 changed files with 1390 additions and 81 deletions
@@ -74,6 +74,28 @@ class AppAccess:
             else:
                 # Regular users only see their own connections
                 filtered_records = [r for r in recordset if r.get("userId") == self.userId]
+        # Special handling for data neutralization config table
+        elif table_name == "DataNeutraliserConfig":
+            if self.privilege == UserPrivilege.SYSADMIN:
+                # SysAdmin sees all configs
+                filtered_records = recordset
+            elif self.privilege == UserPrivilege.ADMIN:
+                # Admin sees configs in their mandate
+                filtered_records = [r for r in recordset if r.get("mandateId", "-") == self.mandateId]
+            else:
+                # Regular users only see their own configs
+                filtered_records = [r for r in recordset if r.get("mandateId", "-") == self.mandateId and r.get("userId") == self.userId]
+        # Special handling for data neutralizer attributes table
+        elif table_name == "DataNeutralizerAttributes":
+            if self.privilege == UserPrivilege.SYSADMIN:
+                # SysAdmin sees all attributes
+                filtered_records = recordset
+            elif self.privilege == UserPrivilege.ADMIN:
+                # Admin sees attributes in their mandate
+                filtered_records = [r for r in recordset if r.get("mandateId", "-") == self.mandateId]
+            else:
+                # Regular users only see their own attributes
+                filtered_records = [r for r in recordset if r.get("mandateId", "-") == self.mandateId and r.get("userId") == self.userId]
         # System admins see all other records
         elif self.privilege == UserPrivilege.SYSADMIN:
             filtered_records = recordset
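
The filtering above reduces to three tiers per table: SysAdmin sees everything, Admin sees everything in the mandate, and everyone else sees only their own rows. A minimal, self-contained sketch of that rule in isolation (plain dict records; the enum and function names mirror the hunk above for illustration, they are not the real AppAccess class):

    from enum import Enum

    class UserPrivilege(Enum):
        SYSADMIN = "sysadmin"
        ADMIN = "admin"
        USER = "user"

    def filter_by_privilege(records, privilege, mandate_id, user_id):
        """Replicates the three-tier rule applied to the neutralizer tables."""
        if privilege == UserPrivilege.SYSADMIN:
            return records
        if privilege == UserPrivilege.ADMIN:
            return [r for r in records if r.get("mandateId", "-") == mandate_id]
        return [r for r in records
                if r.get("mandateId", "-") == mandate_id and r.get("userId") == user_id]

    records = [{"mandateId": "m1", "userId": "u1"}, {"mandateId": "m2", "userId": "u2"}]
    assert filter_by_privilege(records, UserPrivilege.ADMIN, "m1", "u9") == [records[0]]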
@@ -126,6 +148,37 @@ class AppAccess:
                 record["_hideEdit"] = record.get("userId") != self.userId
                 record["_hideDelete"] = record.get("userId") != self.userId

+        elif table_name == "DataNeutraliserConfig":
+            # Everyone can view configs they have access to
+            record["_hideView"] = False
+            # SysAdmin can edit/delete any config
+            if self.privilege == UserPrivilege.SYSADMIN:
+                record["_hideEdit"] = False
+                record["_hideDelete"] = False
+            # Admin can edit/delete configs in their mandate
+            elif self.privilege == UserPrivilege.ADMIN:
+                record["_hideEdit"] = record.get("mandateId", "-") != self.mandateId
+                record["_hideDelete"] = record.get("mandateId", "-") != self.mandateId
+            # Regular users can only edit/delete their own configs
+            else:
+                record["_hideEdit"] = record.get("userId") != self.userId
+                record["_hideDelete"] = record.get("userId") != self.userId
+
+        elif table_name == "DataNeutralizerAttributes":
+            # Everyone can view attributes they have access to
+            record["_hideView"] = False
+            # SysAdmin can edit/delete any attributes
+            if self.privilege == UserPrivilege.SYSADMIN:
+                record["_hideEdit"] = False
+                record["_hideDelete"] = False
+            # Admin can edit/delete attributes in their mandate
+            elif self.privilege == UserPrivilege.ADMIN:
+                record["_hideEdit"] = record.get("mandateId", "-") != self.mandateId
+                record["_hideDelete"] = record.get("mandateId", "-") != self.mandateId
+            # Regular users can only edit/delete their own attributes
+            else:
+                record["_hideEdit"] = record.get("userId") != self.userId
+                record["_hideDelete"] = record.get("userId") != self.userId
+
         elif table_name == "AuthEvent":
             # Only show auth events for the current user or if admin
             if self.privilege in [UserPrivilege.SYSADMIN, UserPrivilege.ADMIN]:
@@ -427,6 +427,126 @@ register_model_labels(
     }
 )

+class DataNeutraliserConfig(BaseModel, ModelMixin):
+    """Data model for data neutralization configuration"""
+    id: str = Field(
+        default_factory=lambda: str(uuid.uuid4()),
+        description="Unique ID of the configuration",
+        frontend_type="text",
+        frontend_readonly=True,
+        frontend_required=False
+    )
+    mandateId: str = Field(
+        description="ID of the mandate this configuration belongs to",
+        frontend_type="text",
+        frontend_readonly=True,
+        frontend_required=True
+    )
+    userId: str = Field(
+        description="ID of the user who created this configuration",
+        frontend_type="text",
+        frontend_readonly=True,
+        frontend_required=True
+    )
+    enabled: bool = Field(
+        default=True,
+        description="Whether data neutralization is enabled",
+        frontend_type="checkbox",
+        frontend_readonly=False,
+        frontend_required=False
+    )
+    namesToParse: str = Field(
+        default="",
+        description="Multiline list of names to parse for neutralization",
+        frontend_type="textarea",
+        frontend_readonly=False,
+        frontend_required=False
+    )
+    sharepointSourcePath: str = Field(
+        default="",
+        description="SharePoint path to read files for neutralization",
+        frontend_type="text",
+        frontend_readonly=False,
+        frontend_required=False
+    )
+    sharepointTargetPath: str = Field(
+        default="",
+        description="SharePoint path to store neutralized files",
+        frontend_type="text",
+        frontend_readonly=False,
+        frontend_required=False
+    )
+
+# Register labels for DataNeutraliserConfig
+register_model_labels(
+    "DataNeutraliserConfig",
+    {"en": "Data Neutralization Config", "fr": "Configuration de neutralisation des données"},
+    {
+        "id": {"en": "ID", "fr": "ID"},
+        "mandateId": {"en": "Mandate ID", "fr": "ID de mandat"},
+        "userId": {"en": "User ID", "fr": "ID utilisateur"},
+        "enabled": {"en": "Enabled", "fr": "Activé"},
+        "namesToParse": {"en": "Names to Parse", "fr": "Noms à analyser"},
+        "sharepointSourcePath": {"en": "Source Path", "fr": "Chemin source"},
+        "sharepointTargetPath": {"en": "Target Path", "fr": "Chemin cible"}
+    }
+)
+
+class DataNeutralizerAttributes(BaseModel, ModelMixin):
+    """Data model for neutralized data attributes mapping"""
+    id: str = Field(
+        default_factory=lambda: str(uuid.uuid4()),
+        description="Unique ID of the attribute mapping (used as UID in neutralized files)",
+        frontend_type="text",
+        frontend_readonly=True,
+        frontend_required=False
+    )
+    mandateId: str = Field(
+        description="ID of the mandate this attribute belongs to",
+        frontend_type="text",
+        frontend_readonly=True,
+        frontend_required=True
+    )
+    userId: str = Field(
+        description="ID of the user who created this attribute",
+        frontend_type="text",
+        frontend_readonly=True,
+        frontend_required=True
+    )
+    originalText: str = Field(
+        description="Original text that was neutralized",
+        frontend_type="text",
+        frontend_readonly=True,
+        frontend_required=True
+    )
+    fileId: Optional[str] = Field(
+        default=None,
+        description="ID of the file this attribute belongs to",
+        frontend_type="text",
+        frontend_readonly=True,
+        frontend_required=False
+    )
+    patternType: str = Field(
+        description="Type of pattern that matched (email, phone, name, etc.)",
+        frontend_type="text",
+        frontend_readonly=True,
+        frontend_required=True
+    )
+
+# Register labels for DataNeutralizerAttributes
+register_model_labels(
+    "DataNeutralizerAttributes",
+    {"en": "Neutralized Data Attribute", "fr": "Attribut de données neutralisées"},
+    {
+        "id": {"en": "ID", "fr": "ID"},
+        "mandateId": {"en": "Mandate ID", "fr": "ID de mandat"},
+        "userId": {"en": "User ID", "fr": "ID utilisateur"},
+        "originalText": {"en": "Original Text", "fr": "Texte original"},
+        "fileId": {"en": "File ID", "fr": "ID de fichier"},
+        "patternType": {"en": "Pattern Type", "fr": "Type de modèle"}
+    }
+)
+
 class SystemTable(BaseModel, ModelMixin):
     """Data model for system table entries"""
     table_name: str = Field(
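
For reference, namesToParse holds one name per line; consumers split and strip it (as the neutralizeText hunk below does). A hedged sketch of how a config instance would be built and consumed, assuming pydantic-style keyword construction as the Field definitions above imply (the IDs are invented for illustration):

    config = DataNeutraliserConfig(
        mandateId="mandate-001",   # hypothetical mandate ID
        userId="user-001",         # hypothetical user ID
        namesToParse="Alice Example\nBob Example\n",
    )
    # Split by newlines and drop empty entries, mirroring the consumer code
    names = [n.strip() for n in config.namesToParse.split("\n") if n.strip()]
    assert names == ["Alice Example", "Bob Example"]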
@@ -11,6 +11,7 @@ import importlib
 import json
 from passlib.context import CryptContext
 import uuid
+import re

 from modules.connectors.connectorDbPostgre import DatabaseConnector
 from modules.shared.configuration import APP_CONFIG
@@ -19,7 +20,8 @@ from modules.interfaces.interfaceAppAccess import AppAccess
 from modules.interfaces.interfaceAppModel import (
     User, Mandate, UserInDB, UserConnection,
     AuthAuthority, UserPrivilege,
-    ConnectionStatus, Token, AuthEvent
+    ConnectionStatus, Token, AuthEvent,
+    DataNeutraliserConfig, DataNeutralizerAttributes
 )

 logger = logging.getLogger(__name__)
@@ -993,6 +995,211 @@ class AppObjects:
             logger.error(f"Error during logout: {str(e)}")
             raise

+    # Data Neutralization methods
+
+    def getNeutralizationConfig(self) -> Optional[DataNeutraliserConfig]:
+        """Get the data neutralization configuration for the current user's mandate"""
+        try:
+            configs = self.db.getRecordset(DataNeutraliserConfig, recordFilter={"mandateId": self.mandateId})
+            if not configs:
+                return None
+
+            # Apply access control
+            filtered_configs = self._uam(DataNeutraliserConfig, configs)
+            if not filtered_configs:
+                return None
+
+            return DataNeutraliserConfig.from_dict(filtered_configs[0])
+
+        except Exception as e:
+            logger.error(f"Error getting neutralization config: {str(e)}")
+            return None
+
+    def createOrUpdateNeutralizationConfig(self, config_data: Dict[str, Any]) -> DataNeutraliserConfig:
+        """Create or update the data neutralization configuration"""
+        try:
+            # Check if config already exists
+            existing_config = self.getNeutralizationConfig()
+
+            if existing_config:
+                # Update existing config
+                update_data = existing_config.to_dict()
+                update_data.update(config_data)
+                update_data["updatedAt"] = get_utc_timestamp()
+
+                updated_config = DataNeutraliserConfig.from_dict(update_data)
+                self.db.recordModify(DataNeutraliserConfig, existing_config.id, updated_config)
+
+                return updated_config
+            else:
+                # Create new config
+                config_data["mandateId"] = self.mandateId
+                config_data["userId"] = self.userId
+
+                new_config = DataNeutraliserConfig.from_dict(config_data)
+                created_record = self.db.recordCreate(DataNeutraliserConfig, new_config)
+
+                return DataNeutraliserConfig.from_dict(created_record)
+
+        except Exception as e:
+            logger.error(f"Error creating/updating neutralization config: {str(e)}")
+            raise ValueError(f"Failed to create/update neutralization config: {str(e)}")
+
+    def neutralizeText(self, text: str, file_id: Optional[str] = None) -> Dict[str, Any]:
+        """Neutralize text content and store attribute mappings"""
+        try:
+            from modules.neutralizer.neutralizer import DataAnonymizer
+
+            # Get neutralization configuration to extract namesToParse
+            config = self.getNeutralizationConfig()
+            names_to_parse = []
+            if config and hasattr(config, 'namesToParse') and config.namesToParse:
+                # Split by newlines and filter out empty strings
+                names_to_parse = [name.strip() for name in config.namesToParse.split('\n') if name.strip()]
+
+            # Initialize anonymizer with custom names
+            anonymizer = DataAnonymizer(names_to_parse=names_to_parse)
+
+            # Process the text
+            result = anonymizer.process_content(text, 'text')
+
+            # Store attribute mappings in database
+            stored_attributes = []
+            for original_text, neutralized_text in result.mapping.items():
+                # Extract pattern type and UUID from the neutralized text format [type.uuid]
+                pattern_type = "unknown"
+                placeholder_uuid = None
+
+                if neutralized_text.startswith("[") and "." in neutralized_text and neutralized_text.endswith("]"):
+                    # Extract type and UUID from [type.uuid] format
+                    inner = neutralized_text[1:-1]  # Remove [ and ]
+                    if "." in inner:
+                        pattern_type, placeholder_uuid = inner.split(".", 1)
+
+                # Check if this exact original text already has a placeholder in the database
+                existing_attribute = self.getExistingPlaceholder(original_text)
+
+                if existing_attribute:
+                    # Reuse existing placeholder
+                    existing_uuid = existing_attribute.id
+                    existing_pattern_type = existing_attribute.patternType
+
+                    # Update the neutralized text to use the existing UUID
+                    result.data = result.data.replace(neutralized_text, f"[{existing_pattern_type}.{existing_uuid}]")
+                    result.mapping[original_text] = f"[{existing_pattern_type}.{existing_uuid}]"
+
+                    stored_attributes.append(existing_attribute)
+                else:
+                    # Create new attribute record with the UUID that the neutralizer generated
+                    attribute_data = {
+                        "id": placeholder_uuid,  # Use the UUID from the neutralizer
+                        "mandateId": self.mandateId,
+                        "userId": self.userId,
+                        "originalText": original_text,
+                        "fileId": file_id,
+                        "patternType": pattern_type
+                    }
+
+                    attribute = DataNeutralizerAttributes.from_dict(attribute_data)
+                    created_attribute = self.db.recordCreate(DataNeutralizerAttributes, attribute)
+                    stored_attributes.append(created_attribute)
+
+            # The neutralized text is already in the correct [type.uuid] format,
+            # so no further replacement is needed.
+
+            return {
+                "neutralized_text": result.data,
+                "attributes": stored_attributes,
+                "mapping": result.mapping,
+                "replaced_fields": result.replaced_fields,
+                "processed_info": result.processed_info
+            }
+
+        except Exception as e:
+            logger.error(f"Error neutralizing text: {str(e)}")
+            raise ValueError(f"Failed to neutralize text: {str(e)}")
+
+    def getExistingPlaceholder(self, original_text: str) -> Optional[DataNeutralizerAttributes]:
+        """Get existing placeholder for original text if it exists"""
+        try:
+            existing_attributes = self.db.getRecordset(DataNeutralizerAttributes, recordFilter={
+                "mandateId": self.mandateId,
+                "userId": self.userId,
+                "originalText": original_text
+            })
+
+            if existing_attributes:
+                return DataNeutralizerAttributes.from_dict(existing_attributes[0])
+            return None
+
+        except Exception as e:
+            logger.error(f"Error getting existing placeholder: {str(e)}")
+            return None
+
+    def getNeutralizationAttributes(self, file_id: Optional[str] = None) -> List[DataNeutralizerAttributes]:
+        """Get neutralization attributes, optionally filtered by file ID"""
+        try:
+            filter_dict = {"mandateId": self.mandateId}
+            if file_id:
+                filter_dict["fileId"] = file_id
+
+            attributes = self.db.getRecordset(DataNeutralizerAttributes, recordFilter=filter_dict)
+            filtered_attributes = self._uam(DataNeutralizerAttributes, attributes)
+
+            return [DataNeutralizerAttributes.from_dict(attr) for attr in filtered_attributes]
+
+        except Exception as e:
+            logger.error(f"Error getting neutralization attributes: {str(e)}")
+            return []
+
+    def resolveNeutralizedText(self, text: str) -> str:
+        """Resolve UIDs in neutralized text back to original text"""
+        try:
+            # Find all placeholders in the new format [type.uuid]
+            placeholder_pattern = r'\[([a-z]+)\.([a-f0-9-]{36})\]'
+            matches = re.findall(placeholder_pattern, text)
+
+            resolved_text = text
+            for placeholder_type, uid in matches:
+                # Find the attribute with this UID (which is the record ID)
+                attributes = self.db.getRecordset(DataNeutralizerAttributes, recordFilter={
+                    "mandateId": self.mandateId,
+                    "id": uid
+                })
+
+                if attributes:
+                    attribute = attributes[0]
+                    # Replace placeholder with original text
+                    placeholder = f"[{placeholder_type}.{uid}]"
+                    resolved_text = resolved_text.replace(placeholder, attribute["originalText"])
+                else:
+                    logger.warning(f"No attribute found for UID {uid}")
+
+            return resolved_text
+
+        except Exception as e:
+            logger.error(f"Error resolving neutralized text: {str(e)}")
+            return text
+
+    def deleteNeutralizationAttributes(self, file_id: str) -> bool:
+        """Delete all neutralization attributes for a specific file"""
+        try:
+            attributes = self.db.getRecordset(DataNeutralizerAttributes, recordFilter={
+                "mandateId": self.mandateId,
+                "fileId": file_id
+            })
+
+            for attribute in attributes:
+                self.db.recordDelete(DataNeutralizerAttributes, attribute["id"])
+
+            logger.info(f"Deleted {len(attributes)} neutralization attributes for file {file_id}")
+            return True
+
+        except Exception as e:
+            logger.error(f"Error deleting neutralization attributes: {str(e)}")
+            return False
+
 # Public Methods

 def getInterface(currentUser: User) -> AppObjects:
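
The placeholder grammar these methods share is "[<type>.<uuid4>]", where the UUID doubles as the primary key of the DataNeutralizerAttributes record. A standalone sketch of the parse/resolve round trip, with a dict standing in for the database (the regex is the one from resolveNeutralizedText above):

    import re
    import uuid

    PLACEHOLDER = re.compile(r'\[([a-z]+)\.([a-f0-9-]{36})\]')

    store = {}  # id -> originalText, standing in for DataNeutralizerAttributes

    def neutralize(text, original, ptype):
        pid = str(uuid.uuid4())
        store[pid] = original
        return text.replace(original, f"[{ptype}.{pid}]")

    def resolve(text):
        # Each captured UUID is looked up and swapped back for the original value
        for ptype, pid in PLACEHOLDER.findall(text):
            if pid in store:
                text = text.replace(f"[{ptype}.{pid}]", store[pid])
        return text

    masked = neutralize("Contact alice@example.com today", "alice@example.com", "email")
    assert resolve(masked) == "Contact alice@example.com today"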
@@ -50,10 +50,15 @@ class ProcessResult:
 class DataAnonymizer:
     """Main class for data anonymization"""

-    def __init__(self):
-        """Initialize the anonymizer with patterns"""
+    def __init__(self, names_to_parse: List[str] = None):
+        """Initialize the anonymizer with patterns and custom names
+
+        Args:
+            names_to_parse: List of names to parse and replace (case-insensitive)
+        """
         self.header_patterns = HeaderPatterns.patterns
         self.data_patterns = DataPatterns.patterns
+        self.names_to_parse = names_to_parse or []
         self.replaced_fields = set()
         self.mapping = {}
         self.processing_info = []
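
A short usage sketch for the extended constructor (the names and sample text are invented; process_content and its 'text' mode are as introduced in the hunks below):

    anonymizer = DataAnonymizer(names_to_parse=["Alice Example", "Bob Example"])
    result = anonymizer.process_content("Alice Example wrote to bob@example.com", 'text')
    # result.data now carries [name.<uuid>] and [email.<uuid>] placeholders;
    # result.mapping maps each original string to its placeholder.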
@@ -64,6 +69,7 @@ class DataAnonymizer:
         text = text.replace('\r\n', '\n').replace('\r', '\n')
         return text.strip()

+
     def _is_table_line(self, line: str) -> bool:
         """Check if a line represents a table row"""
         return bool(re.match(r'^\s*[^:]+:\s*[^:]+$', line) or
@@ -110,46 +116,72 @@ class DataAnonymizer:

         except Exception as e:
             logger.error(f"Error anonymizing table: {str(e)}")
-            logger.debug(traceback.format_exc())
             raise

     def _anonymize_plain_text(self, text: PlainText) -> PlainText:
-        """Anonymize plain text content"""
+        """Anonymize plain text content using a simple search-and-replace approach"""
         try:
-            # Process the entire text at once instead of line by line
             current_text = text.content

-            # Find all matches in the entire text
-            matches = find_patterns_in_text(current_text, self.data_patterns)
-
-            # Process matches in reverse order to avoid position shifting
-            for match in sorted(matches, key=lambda x: x[2], reverse=True):
-                pattern_name, matched_text, start, end = match
-
-                # Skip if the matched text is already a placeholder
-                if re.match(r'\[[A-Z_]+\d+\]', matched_text):
-                    continue
-
-                # Find the pattern that matched
-                pattern = next((p for p in self.data_patterns if p.name == pattern_name), None)
-                if pattern:
-                    # Use the pattern's replacement template
-                    if matched_text not in self.mapping:
-                        self.mapping[matched_text] = pattern.replacement_template.format(len(self.mapping) + 1)
-                    replacement = self.mapping[matched_text]
-
-                    if pattern_name == 'email':
-                        print(f"DEBUG: Replacing email '{matched_text}' with '{replacement}'")
-                        print(f"DEBUG: Text after replacement: {current_text[:start] + replacement + current_text[end:]}")
-
-                    # Replace the matched text while preserving surrounding whitespace
-                    current_text = current_text[:start] + replacement + current_text[end:]
+            # Step 1: Replace custom names first (simple regex search-and-replace)
+            for name in self.names_to_parse:
+                if not name.strip():
+                    continue
+
+                # Create case-insensitive regex pattern with word boundaries
+                pattern = re.compile(r'\b' + re.escape(name.strip()) + r'\b', re.IGNORECASE)
+
+                # Find all matches for this name
+                matches = list(pattern.finditer(current_text))
+
+                # Replace each match with a placeholder
+                for match in reversed(matches):  # Process from right to left to avoid position shifts
+                    matched_text = match.group()
+                    if matched_text not in self.mapping:
+                        # Generate a UUID for the placeholder
+                        import uuid
+                        placeholder_id = str(uuid.uuid4())
+                        self.mapping[matched_text] = f"[name.{placeholder_id}]"
+
+                    replacement = self.mapping[matched_text]
+                    start, end = match.span()
+                    current_text = current_text[:start] + replacement + current_text[end:]
+
+            # Step 2: Replace pattern-based matches (emails, phones, etc.)
+            # Use the same simple approach for patterns
+            pattern_matches = find_patterns_in_text(current_text, self.data_patterns)
+
+            # Process pattern matches from right to left to avoid position shifts
+            for pattern_name, matched_text, start, end in reversed(pattern_matches):
+                # Skip if already a placeholder
+                if re.match(r'\[[a-z]+\.[a-f0-9-]+\]', matched_text):
+                    continue
+
+                # Skip if contains placeholder characters
+                if '[' in matched_text or ']' in matched_text:
+                    continue
+
+                if matched_text not in self.mapping:
+                    # Generate a UUID for the placeholder
+                    import uuid
+                    placeholder_id = str(uuid.uuid4())
+                    # Create placeholder in format [type.uuid]
+                    type_mapping = {
+                        'email': 'email',
+                        'phone': 'phone',
+                        'address': 'address',
+                        'id': 'id'
+                    }
+                    placeholder_type = type_mapping.get(pattern_name, 'data')
+                    self.mapping[matched_text] = f"[{placeholder_type}.{placeholder_id}]"
+
+                replacement = self.mapping[matched_text]
                 current_text = current_text[:start] + replacement + current_text[end:]

             return PlainText(content=current_text, source_type=text.source_type)

         except Exception as e:
             logger.error(f"Error anonymizing plain text: {str(e)}")
-            logger.debug(traceback.format_exc())
             raise

     def _anonymize_json_value(self, value: Any, key: str = None) -> Any:
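
Replacing matches from right to left is what keeps the (start, end) offsets valid: edits near the tail of the string never move spans that sit earlier. A minimal demonstration of the idiom, independent of the anonymizer:

    import re

    text = "a@x.com and b@x.com"
    matches = list(re.finditer(r'\S+@\S+\.com', text))
    for m in reversed(matches):  # right to left, so earlier spans stay valid
        start, end = m.span()
        text = text[:start] + "[email]" + text[end:]
    assert text == "[email] and [email]"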
@@ -173,16 +205,49 @@ class DataAnonymizer:
             pattern = get_pattern_for_header(key, self.header_patterns)
             if pattern:
                 if value not in self.mapping:
-                    self.mapping[value] = pattern.replacement_template.format(len(self.mapping) + 1)
+                    # Generate a UUID for the placeholder
+                    import uuid
+                    placeholder_id = str(uuid.uuid4())
+                    # Create placeholder in format [type.uuid]
+                    type_mapping = {
+                        'email': 'email',
+                        'phone': 'phone',
+                        'name': 'name',
+                        'address': 'address',
+                        'id': 'id'
+                    }
+                    placeholder_type = type_mapping.get(pattern.name, 'data')
+                    self.mapping[value] = f"[{placeholder_type}.{placeholder_id}]"
                 return self.mapping[value]

             # Check if the value itself matches any patterns
-            matches = find_patterns_in_text(value, self.data_patterns)
-            if matches:
-                # Use the first match's pattern
-                pattern_name = matches[0][0]
-                if value not in self.mapping:
-                    self.mapping[value] = f"{pattern_name.upper()}_{len(self.mapping) + 1}"
+            pattern_matches = find_patterns_in_text(value, self.data_patterns)
+            custom_name_matches = self._find_custom_names(value)
+
+            if pattern_matches or custom_name_matches:
+                # Use the first match's pattern or custom name
+                if pattern_matches:
+                    pattern_name = pattern_matches[0][0]
+                    if value not in self.mapping:
+                        # Generate a UUID for the placeholder
+                        import uuid
+                        placeholder_id = str(uuid.uuid4())
+                        # Create placeholder in format [type.uuid]
+                        type_mapping = {
+                            'email': 'email',
+                            'phone': 'phone',
+                            'name': 'name',
+                            'address': 'address',
+                            'id': 'id'
+                        }
+                        placeholder_type = type_mapping.get(pattern_name, 'data')
+                        self.mapping[value] = f"[{placeholder_type}.{placeholder_id}]"
+                elif custom_name_matches:
+                    if value not in self.mapping:
+                        # Generate a UUID for the placeholder
+                        import uuid
+                        placeholder_id = str(uuid.uuid4())
+                        self.mapping[value] = f"[name.{placeholder_id}]"
                 return self.mapping[value]

         return value
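
The UUID-placeholder block now appears near-verbatim in four hunks (plain text, JSON values, XML attributes, XML text). A hedged sketch of a helper that would produce the same strings if the duplication were factored out (the name _make_placeholder is hypothetical, not part of the commit):

    import uuid

    _TYPE_MAPPING = {'email': 'email', 'phone': 'phone', 'name': 'name',
                     'address': 'address', 'id': 'id'}

    def _make_placeholder(pattern_name: str) -> str:
        """Return a fresh [type.uuid] placeholder for the given pattern name."""
        placeholder_type = _TYPE_MAPPING.get(pattern_name, 'data')
        return f"[{placeholder_type}.{uuid.uuid4()}]"

    # Usage inside the repeated blocks would then reduce to:
    #     if value not in self.mapping:
    #         self.mapping[value] = _make_placeholder(pattern.name)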
@@ -207,7 +272,19 @@ class DataAnonymizer:
             pattern = get_pattern_for_header(attr_name, self.header_patterns)
             if pattern:
                 if attr_value not in self.mapping:
-                    self.mapping[attr_value] = pattern.replacement_template.format(len(self.mapping) + 1)
+                    # Generate a UUID for the placeholder
+                    import uuid
+                    placeholder_id = str(uuid.uuid4())
+                    # Create placeholder in format [type.uuid]
+                    type_mapping = {
+                        'email': 'email',
+                        'phone': 'phone',
+                        'name': 'name',
+                        'address': 'address',
+                        'id': 'id'
+                    }
+                    placeholder_type = type_mapping.get(pattern.name, 'data')
+                    self.mapping[attr_value] = f"[{placeholder_type}.{placeholder_id}]"
                 processed_attrs[attr_name] = self.mapping[attr_value]
             else:
                 # Check if attribute value matches any data patterns
@@ -217,7 +294,19 @@ class DataAnonymizer:
                 pattern = next((p for p in self.data_patterns if p.name == pattern_name), None)
                 if pattern:
                     if attr_value not in self.mapping:
-                        self.mapping[attr_value] = pattern.replacement_template.format(len(self.mapping) + 1)
+                        # Generate a UUID for the placeholder
+                        import uuid
+                        placeholder_id = str(uuid.uuid4())
+                        # Create placeholder in format [type.uuid]
+                        type_mapping = {
+                            'email': 'email',
+                            'phone': 'phone',
+                            'name': 'name',
+                            'address': 'address',
+                            'id': 'id'
+                        }
+                        placeholder_type = type_mapping.get(pattern_name, 'data')
+                        self.mapping[attr_value] = f"[{placeholder_type}.{placeholder_id}]"
                     processed_attrs[attr_name] = self.mapping[attr_value]
                 else:
                     processed_attrs[attr_name] = attr_value
@@ -230,14 +319,36 @@ class DataAnonymizer:
             # Process text content
             text = element.text.strip() if element.text and element.text.strip() else ''
             if text:
-                # Check if text matches any patterns
-                matches = find_patterns_in_text(text, self.data_patterns)
-                if matches:
-                    pattern_name = matches[0][0]
-                    pattern = next((p for p in self.data_patterns if p.name == pattern_name), None)
-                    if pattern:
-                        if text not in self.mapping:
-                            self.mapping[text] = pattern.replacement_template.format(len(self.mapping) + 1)
-                        text = self.mapping[text]
+                # Check if text matches any patterns or custom names
+                pattern_matches = find_patterns_in_text(text, self.data_patterns)
+                custom_name_matches = self._find_custom_names(text)
+
+                if pattern_matches or custom_name_matches:
+                    if pattern_matches:
+                        pattern_name = pattern_matches[0][0]
+                        pattern = next((p for p in self.data_patterns if p.name == pattern_name), None)
+                        if pattern:
+                            if text not in self.mapping:
+                                # Generate a UUID for the placeholder
+                                import uuid
+                                placeholder_id = str(uuid.uuid4())
+                                # Create placeholder in format [type.uuid]
+                                type_mapping = {
+                                    'email': 'email',
+                                    'phone': 'phone',
+                                    'name': 'name',
+                                    'address': 'address',
+                                    'id': 'id'
+                                }
+                                placeholder_type = type_mapping.get(pattern_name, 'data')
+                                self.mapping[text] = f"[{placeholder_type}.{placeholder_id}]"
+                            text = self.mapping[text]
+                    elif custom_name_matches:
+                        if text not in self.mapping:
+                            # Generate a UUID for the placeholder
+                            import uuid
+                            placeholder_id = str(uuid.uuid4())
+                            self.mapping[text] = f"[name.{placeholder_id}]"
+                        text = self.mapping[text]

             # Process child elements
@@ -271,18 +382,24 @@ class DataAnonymizer:
             ProcessResult: Contains anonymized data, mapping, replaced fields and processing info
         """
         try:

            # Check if content is binary data
             is_binary = False
             try:
-                # Try to decode base64 if it's a string
-                try:
-                    decoded = base64.b64decode(content)
-                    # If it's not valid text, consider it binary
-                    decoded.decode('utf-8')
-                except (base64.binascii.Error, UnicodeDecodeError):
-                    is_binary = True
-            except Exception:
-                is_binary = True
+                # First, check if content looks like base64 (contains only base64 characters)
+                if re.match(r'^[A-Za-z0-9+/]*={0,2}$', content.strip()):
+                    # Try to decode base64 if it looks like base64
+                    try:
+                        decoded = base64.b64decode(content)
+                        # If it's not valid text, consider it binary
+                        decoded.decode('utf-8')
+                        is_binary = True
+                    except (base64.binascii.Error, UnicodeDecodeError):
+                        is_binary = False
+                else:
+                    is_binary = False
+            except Exception as e:
+                is_binary = False

             if is_binary:
                 # TODO: Implement binary data neutralization
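
Note the base64 pre-check is a loose heuristic: short plain words such as "test" also consist only of base64 characters and will enter the decode branch. A small probe of the guard as written (the regex is copied from the hunk above):

    import base64
    import re

    def looks_like_base64(content: str) -> bool:
        return bool(re.match(r'^[A-Za-z0-9+/]*={0,2}$', content.strip()))

    assert looks_like_base64("dGVzdA==")           # real base64 for "test"
    assert looks_like_base64("test")               # false positive: a plain word
    assert not looks_like_base64("hello world")    # the space rejects it
    assert base64.b64decode("dGVzdA==").decode('utf-8') == "test"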
@@ -356,7 +473,7 @@ class DataAnonymizer:

             # Combine all processed content
             result = content
-            for text, anonymized_text in zip(plain_texts, anonymized_texts):
+            for i, (text, anonymized_text) in enumerate(zip(plain_texts, anonymized_texts)):
                 if text.content != anonymized_text.content:
                     result = result.replace(text.content, anonymized_text.content)
@@ -364,5 +481,4 @@ class DataAnonymizer:

         except Exception as e:
             logger.error(f"Error processing content: {str(e)}")
-            logger.debug(traceback.format_exc())
             return ProcessResult(None, self.mapping, [], {'type': 'error', 'error': str(e)})
@@ -232,16 +232,6 @@ class HeaderPatterns:
 class DataPatterns:
     """Patterns for identifying sensitive data in content"""
     patterns = [
-        # Name patterns
-        Pattern(
-            name="name",
-            patterns=[
-                # Person names with titles and academic degrees
-                r'\b(?:Dr\.|Prof\.|PhD\.?|MD\.?|Herr|Frau|Mr\.|Mrs\.|Ms\.|Monsieur|Madame|Signore|Signora)\s+[A-Z][a-z]{2,}(?:\s+[A-Za-z]{2,}){1,2}\b'
-            ],
-            replacement_template="[NAME_{}]"
-        ),
-
         # Email pattern for plain text
         Pattern(
             name="email",
@@ -392,11 +382,6 @@ def find_patterns_in_text(text: str, patterns: List[Pattern]) -> List[tuple]:
     matches = []
     for pattern in patterns:
         for p in pattern.patterns:
-            if pattern.name == 'email':
-                print(f"\nDEBUG: Checking email pattern '{p}'")
             for match in re.finditer(p, text, re.IGNORECASE):
-                if pattern.name == 'email':
-                    print(f"DEBUG: Found email match: '{match.group(0)}' at position {match.start()}-{match.end()}")
-                    print(f"DEBUG: Context: '{text[max(0, match.start()-20):match.end()+20]}'")
                 matches.append((pattern.name, match.group(0), match.start(), match.end()))
     return sorted(matches, key=lambda x: x[2])  # Sort by start position
@@ -17,7 +17,8 @@ from modules.security.auth import limiter, getCurrentUser
 import modules.interfaces.interfaceComponentObjects as interfaceComponentObjects
 from modules.interfaces.interfaceComponentModel import FileItem, FilePreview
 from modules.shared.attributeUtils import getModelAttributeDefinitions, AttributeResponse, AttributeDefinition
-from modules.interfaces.interfaceAppModel import User
+from modules.interfaces.interfaceAppModel import User, DataNeutraliserConfig, DataNeutralizerAttributes
+from modules.services.serviceNeutralization import NeutralizationService

 # Configure logger
 logger = logging.getLogger(__name__)
@@ -364,3 +365,253 @@ async def preview_file(
             detail=f"Error previewing file: {str(e)}"
         )

+# Data Neutralization endpoints
+
+@router.get("/neutralization/config", response_model=DataNeutraliserConfig)
+@limiter.limit("30/minute")
+async def get_neutralization_config(
+    request: Request,
+    currentUser: User = Depends(getCurrentUser)
+) -> DataNeutraliserConfig:
+    """Get data neutralization configuration"""
+    try:
+        service = NeutralizationService(currentUser)
+        config = service.get_config()
+
+        if not config:
+            # Return default config instead of 404
+            return DataNeutraliserConfig(
+                mandateId=currentUser.mandateId,
+                userId=currentUser.id,
+                enabled=True,
+                namesToParse="",
+                sharepointSourcePath="",
+                sharepointTargetPath=""
+            )
+
+        return config
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Error getting neutralization config: {str(e)}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Error getting neutralization config: {str(e)}"
+        )
+
+@router.post("/neutralization/config", response_model=DataNeutraliserConfig)
+@limiter.limit("10/minute")
+async def save_neutralization_config(
+    request: Request,
+    config_data: Dict[str, Any] = Body(...),
+    currentUser: User = Depends(getCurrentUser)
+) -> DataNeutraliserConfig:
+    """Save or update data neutralization configuration"""
+    try:
+        service = NeutralizationService(currentUser)
+        config = service.save_config(config_data)
+
+        return config
+
+    except Exception as e:
+        logger.error(f"Error saving neutralization config: {str(e)}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Error saving neutralization config: {str(e)}"
+        )
+
+@router.post("/neutralization/neutralize-text", response_model=Dict[str, Any])
+@limiter.limit("20/minute")
+async def neutralize_text(
+    request: Request,
+    text_data: Dict[str, Any] = Body(...),
+    currentUser: User = Depends(getCurrentUser)
+) -> Dict[str, Any]:
+    """Neutralize text content"""
+    try:
+        text = text_data.get("text", "")
+        file_id = text_data.get("fileId")
+
+        if not text:
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail="Text content is required"
+            )
+
+        service = NeutralizationService(currentUser)
+        result = service.neutralize_text(text, file_id)
+
+        return result
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Error neutralizing text: {str(e)}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Error neutralizing text: {str(e)}"
+        )
+
+@router.post("/neutralization/resolve-text", response_model=Dict[str, str])
+@limiter.limit("20/minute")
+async def resolve_text(
+    request: Request,
+    text_data: Dict[str, str] = Body(...),
+    currentUser: User = Depends(getCurrentUser)
+) -> Dict[str, str]:
+    """Resolve UIDs in neutralized text back to original text"""
+    try:
+        text = text_data.get("text", "")
+
+        if not text:
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail="Text content is required"
+            )
+
+        service = NeutralizationService(currentUser)
+        resolved_text = service.resolve_text(text)
+
+        return {"resolved_text": resolved_text}
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Error resolving text: {str(e)}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Error resolving text: {str(e)}"
+        )
+
+@router.get("/neutralization/attributes", response_model=List[DataNeutralizerAttributes])
+@limiter.limit("30/minute")
+async def get_neutralization_attributes(
+    request: Request,
+    fileId: Optional[str] = Query(None, description="Filter by file ID"),
+    currentUser: User = Depends(getCurrentUser)
+) -> List[DataNeutralizerAttributes]:
+    """Get neutralization attributes, optionally filtered by file ID"""
+    try:
+        service = NeutralizationService(currentUser)
+        attributes = service.get_attributes(fileId)
+
+        return attributes
+
+    except Exception as e:
+        logger.error(f"Error getting neutralization attributes: {str(e)}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Error getting neutralization attributes: {str(e)}"
+        )
+
+@router.post("/neutralization/process-sharepoint", response_model=Dict[str, Any])
+@limiter.limit("5/minute")
+async def process_sharepoint_files(
+    request: Request,
+    paths_data: Dict[str, str] = Body(...),
+    currentUser: User = Depends(getCurrentUser)
+) -> Dict[str, Any]:
+    """Process files from SharePoint source path and store neutralized files in target path"""
+    try:
+        source_path = paths_data.get("sourcePath", "")
+        target_path = paths_data.get("targetPath", "")
+
+        if not source_path or not target_path:
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail="Both source and target paths are required"
+            )
+
+        service = NeutralizationService(currentUser)
+        result = await service.process_sharepoint_files(source_path, target_path)
+
+        return result
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Error processing SharePoint files: {str(e)}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Error processing SharePoint files: {str(e)}"
+        )
+
+@router.post("/neutralization/batch-process", response_model=Dict[str, Any])
+@limiter.limit("10/minute")
+async def batch_process_files(
+    request: Request,
+    files_data: List[Dict[str, Any]] = Body(...),
+    currentUser: User = Depends(getCurrentUser)
+) -> Dict[str, Any]:
+    """Process multiple files for neutralization"""
+    try:
+        if not files_data:
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail="Files data is required"
+            )
+
+        service = NeutralizationService(currentUser)
+        result = service.batch_neutralize_files(files_data)
+
+        return result
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Error batch processing files: {str(e)}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Error batch processing files: {str(e)}"
+        )
+
+@router.get("/neutralization/stats", response_model=Dict[str, Any])
+@limiter.limit("30/minute")
+async def get_neutralization_stats(
+    request: Request,
+    currentUser: User = Depends(getCurrentUser)
+) -> Dict[str, Any]:
+    """Get neutralization processing statistics"""
+    try:
+        service = NeutralizationService(currentUser)
+        stats = service.get_processing_stats()
+
+        return stats
+
+    except Exception as e:
+        logger.error(f"Error getting neutralization stats: {str(e)}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Error getting neutralization stats: {str(e)}"
+        )
+
+@router.delete("/neutralization/attributes/{fileId}", response_model=Dict[str, str])
+@limiter.limit("10/minute")
+async def cleanup_file_attributes(
+    request: Request,
+    fileId: str = Path(..., description="File ID to cleanup attributes for"),
+    currentUser: User = Depends(getCurrentUser)
+) -> Dict[str, str]:
+    """Clean up neutralization attributes for a specific file"""
+    try:
+        service = NeutralizationService(currentUser)
+        success = service.cleanup_file_attributes(fileId)
+
+        if success:
+            return {"message": f"Successfully cleaned up attributes for file {fileId}"}
+        else:
+            raise HTTPException(
+                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                detail="Failed to cleanup file attributes"
+            )
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Error cleaning up file attributes: {str(e)}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Error cleaning up file attributes: {str(e)}"
+        )
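
A hedged client-side sketch of the round trip these endpoints expose (the base URL, router prefix, and auth header are deployment-specific assumptions; the payload and response keys match the handlers above):

    import requests

    BASE = "https://app.example.com/api"           # hypothetical deployment URL
    HEADERS = {"Authorization": "Bearer <token>"}  # whatever getCurrentUser expects

    # Neutralize a snippet of text
    r = requests.post(f"{BASE}/neutralization/neutralize-text",
                      json={"text": "Mail alice@example.com", "fileId": None},
                      headers=HEADERS)
    neutralized = r.json()["neutralized_text"]

    # Later, map the placeholders back to the original values
    r = requests.post(f"{BASE}/neutralization/resolve-text",
                      json={"text": neutralized}, headers=HEADERS)
    assert r.json()["resolved_text"] == "Mail alice@example.com"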
modules/services/serviceNeutralization.py (new file, 587 lines)

@@ -0,0 +1,587 @@
|
||||||
|
"""
|
||||||
|
Data Neutralization Service
|
||||||
|
Handles file processing for data neutralization including SharePoint integration
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import uuid
|
||||||
|
from typing import Dict, List, Any, Optional, Tuple
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
import mimetypes
|
||||||
|
|
||||||
|
from modules.interfaces.interfaceAppObjects import getInterface
|
||||||
|
from modules.interfaces.interfaceAppModel import User, DataNeutraliserConfig, DataNeutralizerAttributes
|
||||||
|
from modules.neutralizer.neutralizer import DataAnonymizer
|
||||||
|
from modules.shared.timezoneUtils import get_utc_timestamp
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
class NeutralizationService:
|
||||||
|
"""Service for handling data neutralization operations"""
|
||||||
|
|
||||||
|
def __init__(self, current_user: User):
|
||||||
|
"""Initialize the service with user context"""
|
||||||
|
self.current_user = current_user
|
||||||
|
self.app_interface = getInterface(current_user)
|
||||||
|
|
||||||
|
def get_config(self) -> Optional[DataNeutraliserConfig]:
|
||||||
|
"""Get the neutralization configuration for the current user's mandate"""
|
||||||
|
return self.app_interface.getNeutralizationConfig()
|
||||||
|
|
||||||
|
def save_config(self, config_data: Dict[str, Any]) -> DataNeutraliserConfig:
|
||||||
|
"""Save or update the neutralization configuration"""
|
||||||
|
return self.app_interface.createOrUpdateNeutralizationConfig(config_data)
|
||||||
|
|
||||||
|
def neutralize_text(self, text: str, file_id: Optional[str] = None) -> Dict[str, Any]:
|
||||||
|
"""Neutralize text content and return results with attribute mappings"""
|
||||||
|
return self.app_interface.neutralizeText(text, file_id)
|
||||||
|
|
||||||
|
def get_attributes(self, file_id: Optional[str] = None) -> List[DataNeutralizerAttributes]:
|
||||||
|
"""Get neutralization attributes, optionally filtered by file ID"""
|
||||||
|
return self.app_interface.getNeutralizationAttributes(file_id)
|
||||||
|
|
||||||
|
def resolve_text(self, text: str) -> str:
|
||||||
|
"""Resolve UIDs in neutralized text back to original text"""
|
||||||
|
return self.app_interface.resolveNeutralizedText(text)
|
||||||
|
|
||||||
|
async def process_sharepoint_files(self, source_path: str, target_path: str) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Process files from SharePoint source path, neutralize them, and store in target path
|
||||||
|
|
||||||
|
Args:
|
||||||
|
source_path: SharePoint path to read files from
|
||||||
|
target_path: SharePoint path to store neutralized files
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dictionary with processing results
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
logger.info(f"Processing SharePoint files from {source_path} to {target_path}")
|
||||||
|
|
||||||
|
# Get user's SharePoint connection that matches the source path
|
||||||
|
sharepoint_connection = await self._get_sharepoint_connection(source_path)
|
||||||
|
if not sharepoint_connection:
|
||||||
|
return {
|
||||||
|
"success": False,
|
||||||
|
"message": "No SharePoint connection found for user",
|
||||||
|
"processed_files": 0,
|
||||||
|
"errors": ["No SharePoint connection found"]
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.info(f"Using SharePoint connection: {sharepoint_connection.get('id')} for path: {source_path}")
|
||||||
|
|
||||||
|
# Get SharePoint access token
|
||||||
|
sharepoint_token = self.app_interface.getConnectionToken(sharepoint_connection["id"])
|
||||||
|
if not sharepoint_token:
|
||||||
|
return {
|
||||||
|
"success": False,
|
||||||
|
"message": "No SharePoint access token found",
|
||||||
|
"processed_files": 0,
|
||||||
|
"errors": ["No SharePoint access token found"]
|
||||||
|
}
|
||||||
|
|
||||||
|
# Process files asynchronously
|
||||||
|
return await self._process_sharepoint_files_async(
|
||||||
|
source_path, target_path, sharepoint_token.tokenAccess
|
||||||
|
)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error processing SharePoint files: {str(e)}")
|
||||||
|
return {
|
||||||
|
"success": False,
|
||||||
|
"message": f"Error processing SharePoint files: {str(e)}",
|
||||||
|
"processed_files": 0,
|
||||||
|
"errors": [str(e)]
|
||||||
|
}
|
||||||
|
|
||||||
|
async def _get_sharepoint_connection(self, sharepoint_path: str = None):
|
||||||
|
"""Get user's SharePoint connection that matches the given path"""
|
||||||
|
try:
|
||||||
|
# Get all user connections
|
||||||
|
from modules.interfaces.interfaceAppModel import UserConnection
|
||||||
|
connections = self.app_interface.db.getRecordset(
|
||||||
|
UserConnection,
|
||||||
|
recordFilter={"userId": self.app_interface.userId}
|
||||||
|
)
|
||||||
|
|
||||||
|
# Find all Microsoft connections
|
||||||
|
msft_connections = [conn for conn in connections if conn.get("authority") == "msft"]
|
||||||
|
|
||||||
|
if not msft_connections:
|
||||||
|
logger.warning("No Microsoft connections found for user")
|
||||||
|
return None
|
||||||
|
|
||||||
|
if len(msft_connections) == 1:
|
||||||
|
logger.info(f"Found single Microsoft connection: {msft_connections[0].get('id')}")
|
||||||
|
return msft_connections[0]
|
||||||
|
|
||||||
|
# If multiple connections and we have a path, try to match
|
||||||
|
if sharepoint_path:
|
||||||
|
return await self._match_connection_to_path(msft_connections, sharepoint_path)
|
||||||
|
|
||||||
|
# If no path provided, return the first one
|
||||||
|
logger.info(f"Multiple Microsoft connections found, using first one: {msft_connections[0].get('id')}")
|
||||||
|
return msft_connections[0]
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error getting SharePoint connection: {str(e)}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
async def _match_connection_to_path(self, connections: list, sharepoint_path: str):
|
||||||
|
"""Match a connection to the SharePoint path by testing access"""
|
||||||
|
try:
|
||||||
|
# Extract domain from the path
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
parsed_url = urlparse(sharepoint_path)
|
||||||
|
target_domain = parsed_url.netloc.lower()
|
||||||
|
|
||||||
|
logger.info(f"Looking for connection matching domain: {target_domain}")
|
||||||
|
|
||||||
|
# Try each connection to see which one can access the site
|
||||||
|
for connection in connections:
|
||||||
|
try:
|
||||||
|
# Get token for this connection
|
||||||
|
token = self.app_interface.getConnectionToken(connection["id"])
|
||||||
|
if not token:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Test if this connection can access the SharePoint site
|
||||||
|
if await self._test_sharepoint_access(token.tokenAccess, sharepoint_path):
|
||||||
|
logger.info(f"Found matching connection for domain {target_domain}: {connection.get('id')}")
|
||||||
|
return connection
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# If no specific match found, return the first connection
|
||||||
|
logger.warning(f"No specific connection match found for {target_domain}, using first available")
|
||||||
|
return connections[0]
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error matching connection to path: {str(e)}")
|
||||||
|
return connections[0] if connections else None
|
||||||
|
|
||||||
|
    async def _test_sharepoint_access(self, access_token: str, sharepoint_path: str) -> bool:
        """Test if the access token can access the given SharePoint path"""
        try:
            return await self._test_sharepoint_access_async(access_token, sharepoint_path)
        except Exception:
            return False

    async def _test_sharepoint_access_async(self, access_token: str, sharepoint_path: str) -> bool:
        """Async test for SharePoint access"""
        try:
            from modules.connectors.connectorSharepoint import ConnectorSharepoint

            connector = ConnectorSharepoint(access_token=access_token)

            # Parse the path to get the site URL
            site_url, _ = self._parse_sharepoint_path(sharepoint_path)
            if not site_url:
                return False

            # Try to find the site
            site_info = await connector.find_site_by_web_url(site_url)
            return site_info is not None

        except Exception:
            return False

    async def _process_sharepoint_files_async(self, source_path: str, target_path: str, access_token: str) -> Dict[str, Any]:
        """Process SharePoint files asynchronously"""
        try:
            import asyncio
            from modules.connectors.connectorSharepoint import ConnectorSharepoint

            # Initialize the SharePoint connector
            connector = ConnectorSharepoint(access_token=access_token)

            # Parse source and target paths to extract site and folder info
            source_site, source_folder = self._parse_sharepoint_path(source_path)
            target_site, target_folder = self._parse_sharepoint_path(target_path)

            if not source_site or not target_site:
                return {
                    "success": False,
                    "message": "Invalid SharePoint path format",
                    "processed_files": 0,
                    "errors": ["Invalid SharePoint path format"]
                }

            # Find the source site
            source_site_info = await connector.find_site_by_web_url(source_site)
            if not source_site_info:
                return {
                    "success": False,
                    "message": f"Source site not found: {source_site}",
                    "processed_files": 0,
                    "errors": [f"Source site not found: {source_site}"]
                }

            # Find the target site
            target_site_info = await connector.find_site_by_web_url(target_site)
            if not target_site_info:
                return {
                    "success": False,
                    "message": f"Target site not found: {target_site}",
                    "processed_files": 0,
                    "errors": [f"Target site not found: {target_site}"]
                }

            # List the files in the source folder
            logger.info(f"Listing files in folder: {source_folder} for site: {source_site_info['id']}")
            files = await connector.list_folder_contents(source_site_info["id"], source_folder)

            # If no files were found, list the root folder to see what's available
            if not files:
                logger.warning(f"No files found in folder '{source_folder}', trying root folder")
                files = await connector.list_folder_contents(source_site_info["id"], "")

                if files:
                    # List the available folders for debugging
                    folders = [f for f in files if f.get("type") == "folder"]
                    folder_names = [f.get('name') for f in folders]
                    logger.info(f"Available folders in root: {folder_names}")

                    # Format the folder list for better UI display
                    folder_list = ", ".join(folder_names) if folder_names else "None"

                    return {
                        "success": False,
                        "message": f"Folder '{source_folder}' not found. Available folders in root: {folder_list}",
                        "processed_files": 0,
                        "errors": [f"Folder '{source_folder}' not found. Available folders: {folder_list}"],
                        "available_folders": folder_names
                    }
                else:
                    return {
                        "success": False,
                        "message": f"No files found in source folder: {source_folder}",
                        "processed_files": 0,
                        "errors": [f"No files found in source folder: {source_folder}"]
                    }

            # Filter for text files only
            text_files = [f for f in files if f.get("type") == "file" and self._is_text_file(f.get("name", ""))]

            if not text_files:
                return {
                    "success": False,
                    "message": "No text files found in source folder",
                    "processed_files": 0,
                    "errors": ["No text files found in source folder"]
                }

            # Process files in parallel for better performance
            processed_files = []
            errors = []

            async def process_single_file(file_info):
                """Process a single file - download, neutralize, upload"""
                try:
                    # Download the file
                    file_content = await connector.download_file(source_site_info["id"], file_info["id"])
                    if not file_content:
                        return {"error": f"Failed to download file: {file_info['name']}"}

                    # Convert to text, falling back to latin-1 for non-UTF-8 content
                    try:
                        text_content = file_content.decode('utf-8')
                    except UnicodeDecodeError:
                        text_content = file_content.decode('latin-1')

                    # Neutralize the text
                    neutralization_result = self.app_interface.neutralizeText(text_content, file_info["id"])

                    # Create the neutralized filename
                    neutralized_filename = f"neutralized_{file_info['name']}"

                    # Upload the neutralized file
                    neutralized_content = neutralization_result["neutralized_text"].encode('utf-8')
                    upload_result = await connector.upload_file(
                        target_site_info["id"],
                        target_folder,
                        neutralized_filename,
                        neutralized_content
                    )

                    if "error" in upload_result:
                        return {"error": f"Failed to upload neutralized file: {neutralized_filename} - {upload_result['error']}"}
                    else:
                        return {
                            "success": True,
                            "original_name": file_info["name"],
                            "neutralized_name": neutralized_filename,
                            "attributes_count": len(neutralization_result.get("attributes", []))
                        }

                except Exception as e:
                    error_msg = f"Error processing file {file_info['name']}: {str(e)}"
                    logger.error(error_msg)
                    return {"error": error_msg}

            # Run all file tasks concurrently
            logger.info(f"Processing {len(text_files)} files in parallel...")
            tasks = [process_single_file(file_info) for file_info in text_files]
            results = await asyncio.gather(*tasks, return_exceptions=True)

            # Collect the results
            for i, result in enumerate(results):
                if isinstance(result, Exception):
                    error_msg = f"Exception processing file {text_files[i]['name']}: {str(result)}"
                    errors.append(error_msg)
                    logger.error(error_msg)
                elif isinstance(result, dict) and "error" in result:
                    errors.append(result["error"])
                elif isinstance(result, dict) and result.get("success"):
                    processed_files.append({
                        "original_name": result["original_name"],
                        "neutralized_name": result["neutralized_name"],
                        "attributes_count": result["attributes_count"]
                    })
                    logger.info(f"Successfully processed file: {result['original_name']} -> {result['neutralized_name']}")
                else:
                    error_msg = f"Unknown result processing file {text_files[i]['name']}: {result}"
                    errors.append(error_msg)
                    logger.error(error_msg)

            return {
                "success": len(processed_files) > 0,
                "message": f"Processed {len(processed_files)} files successfully",
                "processed_files": len(processed_files),
                "files": processed_files,
                "errors": errors
            }

        except Exception as e:
            logger.error(f"Error in async SharePoint processing: {str(e)}")
            return {
                "success": False,
                "message": f"Error in async SharePoint processing: {str(e)}",
                "processed_files": 0,
                "errors": [str(e)]
            }

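    # Optional hardening (a sketch, not part of the original code): the
    # unbounded asyncio.gather() above starts one download/upload per file at
    # once, which can trigger Graph API throttling on large folders. A
    # semaphore caps concurrency; the limit of 5 is an arbitrary example.
    #
    #     sem = asyncio.Semaphore(5)
    #
    #     async def process_single_file_bounded(file_info):
    #         async with sem:
    #             return await process_single_file(file_info)
    #
    #     results = await asyncio.gather(
    #         *(process_single_file_bounded(f) for f in text_files),
    #         return_exceptions=True,
    #     )
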
    def _parse_sharepoint_path(self, path: str) -> tuple[str, str]:
        """Parse SharePoint path to extract site URL and folder path"""
        try:
            # Expected format: https://domain.sharepoint.com/sites/sitename/folder/path
            if not path.startswith("https://"):
                return None, None

            # Remove query parameters
            if "?" in path:
                path = path.split("?")[0]

            # Split by /sites/
            if "/sites/" not in path:
                return None, None

            parts = path.split("/sites/", 1)
            if len(parts) != 2:
                return None, None

            # Extract the domain and site name
            domain = parts[0].replace("https://", "")
            site_name = parts[1].split("/")[0]

            # Build the site URL expected by the Graph API
            site_url = f"https://{domain}/sites/{site_name}"

            # Extract the folder path (everything after the site name)
            folder_parts = parts[1].split("/")[1:]
            folder_path = "/".join(folder_parts) if folder_parts else ""

            # URL-decode the folder path
            from urllib.parse import unquote
            folder_path = unquote(folder_path)

            return site_url, folder_path

        except Exception as e:
            logger.error(f"Error parsing SharePoint path '{path}': {str(e)}")
            return None, None

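    # Example of the parsing above (hypothetical URL):
    #
    #     _parse_sharepoint_path(
    #         "https://contoso.sharepoint.com/sites/Finance/Shared%20Documents/Reports?web=1")
    #     # -> ("https://contoso.sharepoint.com/sites/Finance",
    #     #     "Shared Documents/Reports")
    #
    # The query string is stripped and the folder segment is URL-decoded.
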
    def _is_text_file(self, filename: str) -> bool:
        """Check if file is a text file based on extension"""
        text_extensions = [
            '.txt', '.csv', '.json', '.xml', '.md', '.log',
            '.doc', '.docx', '.rtf', '.odt',  # Document formats
            '.html', '.htm', '.css', '.js', '.ts', '.py', '.java', '.cpp', '.c', '.h',  # Code files
            '.ini', '.cfg', '.conf', '.properties',  # Config files
            '.sql', '.yaml', '.yml', '.toml',  # Data/config files
            '.ps1', '.bat', '.sh', '.bash'  # Script files
        ]
        return any(filename.lower().endswith(ext) for ext in text_extensions)

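    # Matching is case-insensitive and purely extension-based, e.g.:
    #
    #     _is_text_file("Report.CSV")  # -> True
    #     _is_text_file("photo.png")   # -> False
    #
    # Caveat: .doc/.docx/.odt are binary container formats, so files matched
    # by those extensions will not decode cleanly in the plain-text pipeline.
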
    def process_file_content(self, file_content: bytes, file_name: str, mime_type: str) -> Dict[str, Any]:
        """
        Process file content for neutralization

        Args:
            file_content: Binary file content
            file_name: Name of the file
            mime_type: MIME type of the file

        Returns:
            Dictionary with neutralization results
        """
        try:
            # Determine content type based on MIME type
            # (derived here but not yet used by the pipeline below)
            content_type = self._get_content_type_from_mime(mime_type)

            # Decode content to text
            try:
                text_content = file_content.decode('utf-8')
            except UnicodeDecodeError:
                # Try with different encodings. Note that latin-1 maps every
                # byte value, so this loop succeeds on its first iteration and
                # the ValueError below is effectively unreachable.
                for encoding in ['latin-1', 'cp1252', 'iso-8859-1']:
                    try:
                        text_content = file_content.decode(encoding)
                        break
                    except UnicodeDecodeError:
                        continue
                else:
                    raise ValueError("Unable to decode file content")

            # Generate a temporary file ID for tracking
            temp_file_id = str(uuid.uuid4())

            # Neutralize the content
            neutralization_result = self.neutralize_text(text_content, temp_file_id)

            # Encode the neutralized content back to bytes
            neutralized_content = neutralization_result["neutralized_text"].encode('utf-8')

            # Generate the neutralized file name
            neutralized_file_name = f"neutralized_{file_name}"

            return {
                "success": True,
                "original_content": text_content,
                "neutralized_content": neutralization_result["neutralized_text"],
                "neutralized_file_name": neutralized_file_name,
                "attributes": neutralization_result["attributes"],
                "mapping": neutralization_result["mapping"],
                "file_id": temp_file_id
            }

        except Exception as e:
            logger.error(f"Error processing file content: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "original_content": None,
                "neutralized_content": None
            }

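    # A minimal usage sketch (hypothetical caller and content):
    #
    #     result = processor.process_file_content(
    #         b"Contact: jane.doe@example.com", "notes.txt", "text/plain")
    #     if result["success"]:
    #         result["neutralized_file_name"]  # -> "neutralized_notes.txt"
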
    def _get_content_type_from_mime(self, mime_type: str) -> str:
        """Determine content type from MIME type for neutralization processing"""
        # Check specific types first: 'text/csv' and 'text/xml' would otherwise
        # be swallowed by a generic 'text/' prefix check and never reach the
        # csv/xml branches.
        if mime_type in ['text/csv', 'application/csv']:
            return 'csv'
        elif mime_type in ['application/json', 'application/xml', 'text/xml']:
            return 'json' if 'json' in mime_type else 'xml'
        else:
            return 'text'  # Default: all other types, including any 'text/*'

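    # Example mappings with the specific checks ordered first:
    #
    #     'text/csv'         -> 'csv'
    #     'application/json' -> 'json'
    #     'text/xml'         -> 'xml'
    #     'text/plain'       -> 'text'
    #     'image/png'        -> 'text'  (default)
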
    def batch_neutralize_files(self, files_data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Process multiple files for neutralization

        Args:
            files_data: List of dictionaries containing file information
                        Each dict should have: content, name, mime_type

        Returns:
            Dictionary with batch processing results
        """
        try:
            results = []
            total_files = len(files_data)
            successful_files = 0
            errors = []

            for file_data in files_data:
                try:
                    result = self.process_file_content(
                        file_data['content'],
                        file_data['name'],
                        file_data['mime_type']
                    )

                    if result['success']:
                        successful_files += 1
                        results.append({
                            'file_name': file_data['name'],
                            'neutralized_file_name': result['neutralized_file_name'],
                            'file_id': result['file_id'],
                            'attributes_count': len(result['attributes'])
                        })
                    else:
                        errors.append(f"Failed to process {file_data['name']}: {result['error']}")

                except Exception as e:
                    error_msg = f"Error processing {file_data['name']}: {str(e)}"
                    errors.append(error_msg)
                    logger.error(error_msg)

            return {
                "success": len(errors) == 0,
                "total_files": total_files,
                "successful_files": successful_files,
                "failed_files": len(errors),
                "results": results,
                "errors": errors
            }

        except Exception as e:
            logger.error(f"Error in batch neutralization: {str(e)}")
            return {
                "success": False,
                "total_files": len(files_data),
                "successful_files": 0,
                "failed_files": len(files_data),
                "results": [],
                "errors": [str(e)]
            }

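    # A minimal batch-call sketch (hypothetical data):
    #
    #     summary = processor.batch_neutralize_files([
    #         {"content": b"call 555-0100", "name": "a.txt", "mime_type": "text/plain"},
    #         {"content": b'{"email": "x@y.example"}', "name": "b.json", "mime_type": "application/json"},
    #     ])
    #     summary["successful_files"], summary["failed_files"], summary["errors"]
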
    def cleanup_file_attributes(self, file_id: str) -> bool:
        """Clean up neutralization attributes for a specific file"""
        return self.app_interface.deleteNeutralizationAttributes(file_id)

    def get_processing_stats(self) -> Dict[str, Any]:
        """Get statistics about neutralization processing"""
        try:
            # Get all attributes for the current mandate
            all_attributes = self.get_attributes()

            # Group by pattern type
            pattern_counts = {}
            for attr in all_attributes:
                pattern_type = attr.patternType
                pattern_counts[pattern_type] = pattern_counts.get(pattern_type, 0) + 1

            # Count unique files
            unique_files = set(attr.fileId for attr in all_attributes if attr.fileId)

            return {
                "total_attributes": len(all_attributes),
                "unique_files": len(unique_files),
                "pattern_counts": pattern_counts,
                "mandate_id": self.current_user.mandateId
            }

        except Exception as e:
            logger.error(f"Error getting processing stats: {str(e)}")
            return {
                "total_attributes": 0,
                "unique_files": 0,
                "pattern_counts": {},
                "error": str(e)
            }
@@ -130,10 +130,6 @@ class DataAnonymizer:
                 self.mapping[matched_text] = pattern.replacement_template.format(len(self.mapping) + 1)
             replacement = self.mapping[matched_text]
 
-            if pattern_name == 'email':
-                print(f"DEBUG: Replacing email '{matched_text}' with '{replacement}'")
-                print(f"DEBUG: Text after replacement: {current_text[:start] + replacement + current_text[end:]}")
-
             # Replace the matched text while preserving surrounding whitespace
             current_text = current_text[:start] + replacement + current_text[end:]
 
@@ -392,11 +392,5 @@ def find_patterns_in_text(text: str, patterns: List[Pattern]) -> List[tuple]:
     matches = []
     for pattern in patterns:
         for p in pattern.patterns:
-            if pattern.name == 'email':
-                print(f"\nDEBUG: Checking email pattern '{p}'")
             for match in re.finditer(p, text, re.IGNORECASE):
-                if pattern.name == 'email':
-                    print(f"DEBUG: Found email match: '{match.group(0)}' at position {match.start()}-{match.end()}")
-                    print(f"DEBUG: Context: '{text[max(0, match.start()-20):match.end()+20]}'")
                 matches.append((pattern.name, match.group(0), match.start(), match.end()))
     return sorted(matches, key=lambda x: x[2])  # Sort by start position