neutralizer 1.0

parent 70479aff88
commit a941963e78
9 changed files with 1390 additions and 81 deletions
@@ -74,6 +74,28 @@ class AppAccess:
             else:
                 # Regular users only see their own connections
                 filtered_records = [r for r in recordset if r.get("userId") == self.userId]
+        # Special handling for data neutralization config table
+        elif table_name == "DataNeutraliserConfig":
+            if self.privilege == UserPrivilege.SYSADMIN:
+                # SysAdmin sees all configs
+                filtered_records = recordset
+            elif self.privilege == UserPrivilege.ADMIN:
+                # Admin sees configs in their mandate
+                filtered_records = [r for r in recordset if r.get("mandateId", "-") == self.mandateId]
+            else:
+                # Regular users only see their own configs
+                filtered_records = [r for r in recordset if r.get("mandateId", "-") == self.mandateId and r.get("userId") == self.userId]
+        # Special handling for data neutralizer attributes table
+        elif table_name == "DataNeutralizerAttributes":
+            if self.privilege == UserPrivilege.SYSADMIN:
+                # SysAdmin sees all attributes
+                filtered_records = recordset
+            elif self.privilege == UserPrivilege.ADMIN:
+                # Admin sees attributes in their mandate
+                filtered_records = [r for r in recordset if r.get("mandateId", "-") == self.mandateId]
+            else:
+                # Regular users only see their own attributes
+                filtered_records = [r for r in recordset if r.get("mandateId", "-") == self.mandateId and r.get("userId") == self.userId]
         # System admins see all other records
         elif self.privilege == UserPrivilege.SYSADMIN:
             filtered_records = recordset
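
The filtering above reduces to three tiers per table: SysAdmin sees everything, Admin sees everything in the mandate, and everyone else sees only their own rows. A minimal, self-contained sketch of that rule in isolation (plain dict records; the enum and function names mirror the hunk above for illustration, they are not the real AppAccess class):

    from enum import Enum

    class UserPrivilege(Enum):
        SYSADMIN = "sysadmin"
        ADMIN = "admin"
        USER = "user"

    def filter_by_privilege(records, privilege, mandate_id, user_id):
        """Replicates the three-tier rule applied to the neutralizer tables."""
        if privilege == UserPrivilege.SYSADMIN:
            return records
        if privilege == UserPrivilege.ADMIN:
            return [r for r in records if r.get("mandateId", "-") == mandate_id]
        return [r for r in records
                if r.get("mandateId", "-") == mandate_id and r.get("userId") == user_id]

    records = [{"mandateId": "m1", "userId": "u1"}, {"mandateId": "m2", "userId": "u2"}]
    assert filter_by_privilege(records, UserPrivilege.ADMIN, "m1", "u9") == [records[0]]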
@@ -126,6 +148,37 @@ class AppAccess:
                 record["_hideEdit"] = record.get("userId") != self.userId
                 record["_hideDelete"] = record.get("userId") != self.userId

+        elif table_name == "DataNeutraliserConfig":
+            # Everyone can view configs they have access to
+            record["_hideView"] = False
+            # SysAdmin can edit/delete any config
+            if self.privilege == UserPrivilege.SYSADMIN:
+                record["_hideEdit"] = False
+                record["_hideDelete"] = False
+            # Admin can edit/delete configs in their mandate
+            elif self.privilege == UserPrivilege.ADMIN:
+                record["_hideEdit"] = record.get("mandateId", "-") != self.mandateId
+                record["_hideDelete"] = record.get("mandateId", "-") != self.mandateId
+            # Regular users can only edit/delete their own configs
+            else:
+                record["_hideEdit"] = record.get("userId") != self.userId
+                record["_hideDelete"] = record.get("userId") != self.userId
+
+        elif table_name == "DataNeutralizerAttributes":
+            # Everyone can view attributes they have access to
+            record["_hideView"] = False
+            # SysAdmin can edit/delete any attributes
+            if self.privilege == UserPrivilege.SYSADMIN:
+                record["_hideEdit"] = False
+                record["_hideDelete"] = False
+            # Admin can edit/delete attributes in their mandate
+            elif self.privilege == UserPrivilege.ADMIN:
+                record["_hideEdit"] = record.get("mandateId", "-") != self.mandateId
+                record["_hideDelete"] = record.get("mandateId", "-") != self.mandateId
+            # Regular users can only edit/delete their own attributes
+            else:
+                record["_hideEdit"] = record.get("userId") != self.userId
+                record["_hideDelete"] = record.get("userId") != self.userId
+
         elif table_name == "AuthEvent":
             # Only show auth events for the current user or if admin
             if self.privilege in [UserPrivilege.SYSADMIN, UserPrivilege.ADMIN]:
@@ -427,6 +427,126 @@ register_model_labels(
     }
 )

+class DataNeutraliserConfig(BaseModel, ModelMixin):
+    """Data model for data neutralization configuration"""
+    id: str = Field(
+        default_factory=lambda: str(uuid.uuid4()),
+        description="Unique ID of the configuration",
+        frontend_type="text",
+        frontend_readonly=True,
+        frontend_required=False
+    )
+    mandateId: str = Field(
+        description="ID of the mandate this configuration belongs to",
+        frontend_type="text",
+        frontend_readonly=True,
+        frontend_required=True
+    )
+    userId: str = Field(
+        description="ID of the user who created this configuration",
+        frontend_type="text",
+        frontend_readonly=True,
+        frontend_required=True
+    )
+    enabled: bool = Field(
+        default=True,
+        description="Whether data neutralization is enabled",
+        frontend_type="checkbox",
+        frontend_readonly=False,
+        frontend_required=False
+    )
+    namesToParse: str = Field(
+        default="",
+        description="Multiline list of names to parse for neutralization",
+        frontend_type="textarea",
+        frontend_readonly=False,
+        frontend_required=False
+    )
+    sharepointSourcePath: str = Field(
+        default="",
+        description="SharePoint path to read files for neutralization",
+        frontend_type="text",
+        frontend_readonly=False,
+        frontend_required=False
+    )
+    sharepointTargetPath: str = Field(
+        default="",
+        description="SharePoint path to store neutralized files",
+        frontend_type="text",
+        frontend_readonly=False,
+        frontend_required=False
+    )
+
+# Register labels for DataNeutraliserConfig
+register_model_labels(
+    "DataNeutraliserConfig",
+    {"en": "Data Neutralization Config", "fr": "Configuration de neutralisation des données"},
+    {
+        "id": {"en": "ID", "fr": "ID"},
+        "mandateId": {"en": "Mandate ID", "fr": "ID de mandat"},
+        "userId": {"en": "User ID", "fr": "ID utilisateur"},
+        "enabled": {"en": "Enabled", "fr": "Activé"},
+        "namesToParse": {"en": "Names to Parse", "fr": "Noms à analyser"},
+        "sharepointSourcePath": {"en": "Source Path", "fr": "Chemin source"},
+        "sharepointTargetPath": {"en": "Target Path", "fr": "Chemin cible"}
+    }
+)
+
+class DataNeutralizerAttributes(BaseModel, ModelMixin):
+    """Data model for neutralized data attributes mapping"""
+    id: str = Field(
+        default_factory=lambda: str(uuid.uuid4()),
+        description="Unique ID of the attribute mapping (used as UID in neutralized files)",
+        frontend_type="text",
+        frontend_readonly=True,
+        frontend_required=False
+    )
+    mandateId: str = Field(
+        description="ID of the mandate this attribute belongs to",
+        frontend_type="text",
+        frontend_readonly=True,
+        frontend_required=True
+    )
+    userId: str = Field(
+        description="ID of the user who created this attribute",
+        frontend_type="text",
+        frontend_readonly=True,
+        frontend_required=True
+    )
+    originalText: str = Field(
+        description="Original text that was neutralized",
+        frontend_type="text",
+        frontend_readonly=True,
+        frontend_required=True
+    )
+    fileId: Optional[str] = Field(
+        default=None,
+        description="ID of the file this attribute belongs to",
+        frontend_type="text",
+        frontend_readonly=True,
+        frontend_required=False
+    )
+    patternType: str = Field(
+        description="Type of pattern that matched (email, phone, name, etc.)",
+        frontend_type="text",
+        frontend_readonly=True,
+        frontend_required=True
+    )
+
+# Register labels for DataNeutralizerAttributes
+register_model_labels(
+    "DataNeutralizerAttributes",
+    {"en": "Neutralized Data Attribute", "fr": "Attribut de données neutralisées"},
+    {
+        "id": {"en": "ID", "fr": "ID"},
+        "mandateId": {"en": "Mandate ID", "fr": "ID de mandat"},
+        "userId": {"en": "User ID", "fr": "ID utilisateur"},
+        "originalText": {"en": "Original Text", "fr": "Texte original"},
+        "fileId": {"en": "File ID", "fr": "ID de fichier"},
+        "patternType": {"en": "Pattern Type", "fr": "Type de modèle"}
+    }
+)
+
 class SystemTable(BaseModel, ModelMixin):
     """Data model for system table entries"""
     table_name: str = Field(
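
For reference, namesToParse holds one name per line; consumers split and strip it (as the neutralizeText hunk below does). A hedged sketch of how a config instance would be built and consumed, assuming pydantic-style keyword construction as the Field definitions above imply (the IDs are invented for illustration):

    config = DataNeutraliserConfig(
        mandateId="mandate-001",   # hypothetical mandate ID
        userId="user-001",         # hypothetical user ID
        namesToParse="Alice Example\nBob Example\n",
    )
    # Split by newlines and drop empty entries, mirroring the consumer code
    names = [n.strip() for n in config.namesToParse.split("\n") if n.strip()]
    assert names == ["Alice Example", "Bob Example"]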
@@ -11,6 +11,7 @@ import importlib
 import json
 from passlib.context import CryptContext
 import uuid
+import re

 from modules.connectors.connectorDbPostgre import DatabaseConnector
 from modules.shared.configuration import APP_CONFIG
@@ -19,7 +20,8 @@ from modules.interfaces.interfaceAppAccess import AppAccess
 from modules.interfaces.interfaceAppModel import (
     User, Mandate, UserInDB, UserConnection,
     AuthAuthority, UserPrivilege,
-    ConnectionStatus, Token, AuthEvent
+    ConnectionStatus, Token, AuthEvent,
+    DataNeutraliserConfig, DataNeutralizerAttributes
 )

 logger = logging.getLogger(__name__)
@@ -993,6 +995,211 @@ class AppObjects:
             logger.error(f"Error during logout: {str(e)}")
             raise

+    # Data Neutralization methods
+
+    def getNeutralizationConfig(self) -> Optional[DataNeutraliserConfig]:
+        """Get the data neutralization configuration for the current user's mandate"""
+        try:
+            configs = self.db.getRecordset(DataNeutraliserConfig, recordFilter={"mandateId": self.mandateId})
+            if not configs:
+                return None
+
+            # Apply access control
+            filtered_configs = self._uam(DataNeutraliserConfig, configs)
+            if not filtered_configs:
+                return None
+
+            return DataNeutraliserConfig.from_dict(filtered_configs[0])
+
+        except Exception as e:
+            logger.error(f"Error getting neutralization config: {str(e)}")
+            return None
+
+    def createOrUpdateNeutralizationConfig(self, config_data: Dict[str, Any]) -> DataNeutraliserConfig:
+        """Create or update the data neutralization configuration"""
+        try:
+            # Check if config already exists
+            existing_config = self.getNeutralizationConfig()
+
+            if existing_config:
+                # Update existing config
+                update_data = existing_config.to_dict()
+                update_data.update(config_data)
+                update_data["updatedAt"] = get_utc_timestamp()
+
+                updated_config = DataNeutraliserConfig.from_dict(update_data)
+                self.db.recordModify(DataNeutraliserConfig, existing_config.id, updated_config)
+
+                return updated_config
+            else:
+                # Create new config
+                config_data["mandateId"] = self.mandateId
+                config_data["userId"] = self.userId
+
+                new_config = DataNeutraliserConfig.from_dict(config_data)
+                created_record = self.db.recordCreate(DataNeutraliserConfig, new_config)
+
+                return DataNeutraliserConfig.from_dict(created_record)
+
+        except Exception as e:
+            logger.error(f"Error creating/updating neutralization config: {str(e)}")
+            raise ValueError(f"Failed to create/update neutralization config: {str(e)}")
+
+    def neutralizeText(self, text: str, file_id: Optional[str] = None) -> Dict[str, Any]:
+        """Neutralize text content and store attribute mappings"""
+        try:
+            from modules.neutralizer.neutralizer import DataAnonymizer
+
+            # Get neutralization configuration to extract namesToParse
+            config = self.getNeutralizationConfig()
+            names_to_parse = []
+            if config and hasattr(config, 'namesToParse') and config.namesToParse:
+                # Split by newlines and filter out empty strings
+                names_to_parse = [name.strip() for name in config.namesToParse.split('\n') if name.strip()]
+
+            # Initialize anonymizer with custom names
+            anonymizer = DataAnonymizer(names_to_parse=names_to_parse)
+
+            # Process the text
+            result = anonymizer.process_content(text, 'text')
+
+            # Store attribute mappings in database
+            stored_attributes = []
+            for original_text, neutralized_text in result.mapping.items():
+                # Extract pattern type and UUID from the neutralized text format [type.uuid]
+                pattern_type = "unknown"
+                placeholder_uuid = None
+
+                if neutralized_text.startswith("[") and "." in neutralized_text and neutralized_text.endswith("]"):
+                    # Extract type and UUID from [type.uuid] format
+                    inner = neutralized_text[1:-1]  # Remove [ and ]
+                    if "." in inner:
+                        pattern_type, placeholder_uuid = inner.split(".", 1)
+
+                # Check if this exact original text already has a placeholder in the database
+                existing_attribute = self.getExistingPlaceholder(original_text)
+
+                if existing_attribute:
+                    # Reuse existing placeholder
+                    existing_uuid = existing_attribute.id
+                    existing_pattern_type = existing_attribute.patternType
+
+                    # Update the neutralized text to use the existing UUID
+                    result.data = result.data.replace(neutralized_text, f"[{existing_pattern_type}.{existing_uuid}]")
+                    result.mapping[original_text] = f"[{existing_pattern_type}.{existing_uuid}]"
+
+                    stored_attributes.append(existing_attribute)
+                else:
+                    # Create new attribute record with the UUID that the neutralizer generated
+                    attribute_data = {
+                        "id": placeholder_uuid,  # Use the UUID from the neutralizer
+                        "mandateId": self.mandateId,
+                        "userId": self.userId,
+                        "originalText": original_text,
+                        "fileId": file_id,
+                        "patternType": pattern_type
+                    }
+
+                    attribute = DataNeutralizerAttributes.from_dict(attribute_data)
+                    created_attribute = self.db.recordCreate(DataNeutralizerAttributes, attribute)
+                    stored_attributes.append(created_attribute)
+
+            # The neutralized text is already in the correct [type.uuid] format,
+            # so no further replacement is needed.
+
+            return {
+                "neutralized_text": result.data,
+                "attributes": stored_attributes,
+                "mapping": result.mapping,
+                "replaced_fields": result.replaced_fields,
+                "processed_info": result.processed_info
+            }
+
+        except Exception as e:
+            logger.error(f"Error neutralizing text: {str(e)}")
+            raise ValueError(f"Failed to neutralize text: {str(e)}")
+
+    def getExistingPlaceholder(self, original_text: str) -> Optional[DataNeutralizerAttributes]:
+        """Get existing placeholder for original text if it exists"""
+        try:
+            existing_attributes = self.db.getRecordset(DataNeutralizerAttributes, recordFilter={
+                "mandateId": self.mandateId,
+                "userId": self.userId,
+                "originalText": original_text
+            })
+
+            if existing_attributes:
+                return DataNeutralizerAttributes.from_dict(existing_attributes[0])
+            return None
+
+        except Exception as e:
+            logger.error(f"Error getting existing placeholder: {str(e)}")
+            return None
+
+    def getNeutralizationAttributes(self, file_id: Optional[str] = None) -> List[DataNeutralizerAttributes]:
+        """Get neutralization attributes, optionally filtered by file ID"""
+        try:
+            filter_dict = {"mandateId": self.mandateId}
+            if file_id:
+                filter_dict["fileId"] = file_id
+
+            attributes = self.db.getRecordset(DataNeutralizerAttributes, recordFilter=filter_dict)
+            filtered_attributes = self._uam(DataNeutralizerAttributes, attributes)
+
+            return [DataNeutralizerAttributes.from_dict(attr) for attr in filtered_attributes]
+
+        except Exception as e:
+            logger.error(f"Error getting neutralization attributes: {str(e)}")
+            return []
+
+    def resolveNeutralizedText(self, text: str) -> str:
+        """Resolve UIDs in neutralized text back to original text"""
+        try:
+            # Find all placeholders in the new format [type.uuid]
+            placeholder_pattern = r'\[([a-z]+)\.([a-f0-9-]{36})\]'
+            matches = re.findall(placeholder_pattern, text)
+
+            resolved_text = text
+            for placeholder_type, uid in matches:
+                # Find the attribute with this UID (which is the record ID)
+                attributes = self.db.getRecordset(DataNeutralizerAttributes, recordFilter={
+                    "mandateId": self.mandateId,
+                    "id": uid
+                })
+
+                if attributes:
+                    attribute = attributes[0]
+                    # Replace placeholder with original text
+                    placeholder = f"[{placeholder_type}.{uid}]"
+                    resolved_text = resolved_text.replace(placeholder, attribute["originalText"])
+                else:
+                    logger.warning(f"No attribute found for UID {uid}")
+
+            return resolved_text
+
+        except Exception as e:
+            logger.error(f"Error resolving neutralized text: {str(e)}")
+            return text
+
+    def deleteNeutralizationAttributes(self, file_id: str) -> bool:
+        """Delete all neutralization attributes for a specific file"""
+        try:
+            attributes = self.db.getRecordset(DataNeutralizerAttributes, recordFilter={
+                "mandateId": self.mandateId,
+                "fileId": file_id
+            })
+
+            for attribute in attributes:
+                self.db.recordDelete(DataNeutralizerAttributes, attribute["id"])
+
+            logger.info(f"Deleted {len(attributes)} neutralization attributes for file {file_id}")
+            return True
+
+        except Exception as e:
+            logger.error(f"Error deleting neutralization attributes: {str(e)}")
+            return False
+
 # Public Methods

 def getInterface(currentUser: User) -> AppObjects:
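
The placeholder grammar these methods share is "[<type>.<uuid4>]", where the UUID doubles as the primary key of the DataNeutralizerAttributes record. A standalone sketch of the parse/resolve round trip, with a dict standing in for the database (the regex is the one from resolveNeutralizedText above):

    import re
    import uuid

    PLACEHOLDER = re.compile(r'\[([a-z]+)\.([a-f0-9-]{36})\]')

    store = {}  # id -> originalText, standing in for DataNeutralizerAttributes

    def neutralize(text, original, ptype):
        pid = str(uuid.uuid4())
        store[pid] = original
        return text.replace(original, f"[{ptype}.{pid}]")

    def resolve(text):
        # Each captured UUID is looked up and swapped back for the original value
        for ptype, pid in PLACEHOLDER.findall(text):
            if pid in store:
                text = text.replace(f"[{ptype}.{pid}]", store[pid])
        return text

    masked = neutralize("Contact alice@example.com today", "alice@example.com", "email")
    assert resolve(masked) == "Contact alice@example.com today"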
@@ -50,10 +50,15 @@ class ProcessResult:
 class DataAnonymizer:
     """Main class for data anonymization"""

-    def __init__(self):
-        """Initialize the anonymizer with patterns"""
+    def __init__(self, names_to_parse: List[str] = None):
+        """Initialize the anonymizer with patterns and custom names
+
+        Args:
+            names_to_parse: List of names to parse and replace (case-insensitive)
+        """
         self.header_patterns = HeaderPatterns.patterns
         self.data_patterns = DataPatterns.patterns
+        self.names_to_parse = names_to_parse or []
         self.replaced_fields = set()
         self.mapping = {}
         self.processing_info = []
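
A short usage sketch for the extended constructor (the names and sample text are invented; process_content and its 'text' mode are as introduced in the hunks below):

    anonymizer = DataAnonymizer(names_to_parse=["Alice Example", "Bob Example"])
    result = anonymizer.process_content("Alice Example wrote to bob@example.com", 'text')
    # result.data now carries [name.<uuid>] and [email.<uuid>] placeholders;
    # result.mapping maps each original string to its placeholder.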
@@ -64,6 +69,7 @@ class DataAnonymizer:
         text = text.replace('\r\n', '\n').replace('\r', '\n')
         return text.strip()

+
     def _is_table_line(self, line: str) -> bool:
         """Check if a line represents a table row"""
         return bool(re.match(r'^\s*[^:]+:\s*[^:]+$', line) or
@@ -110,46 +116,72 @@ class DataAnonymizer:

         except Exception as e:
             logger.error(f"Error anonymizing table: {str(e)}")
-            logger.debug(traceback.format_exc())
             raise

     def _anonymize_plain_text(self, text: PlainText) -> PlainText:
-        """Anonymize plain text content"""
+        """Anonymize plain text content using a simple search-and-replace approach"""
         try:
-            # Process the entire text at once instead of line by line
             current_text = text.content

-            # Find all matches in the entire text
-            matches = find_patterns_in_text(current_text, self.data_patterns)
-
-            # Process matches in reverse order to avoid position shifting
-            for match in sorted(matches, key=lambda x: x[2], reverse=True):
-                pattern_name, matched_text, start, end = match
-
-                # Skip if the matched text is already a placeholder
-                if re.match(r'\[[A-Z_]+\d+\]', matched_text):
-                    continue
-
-                # Find the pattern that matched
-                pattern = next((p for p in self.data_patterns if p.name == pattern_name), None)
-                if pattern:
-                    # Use the pattern's replacement template
-                    if matched_text not in self.mapping:
-                        self.mapping[matched_text] = pattern.replacement_template.format(len(self.mapping) + 1)
-                    replacement = self.mapping[matched_text]
-
-                    if pattern_name == 'email':
-                        print(f"DEBUG: Replacing email '{matched_text}' with '{replacement}'")
-                        print(f"DEBUG: Text after replacement: {current_text[:start] + replacement + current_text[end:]}")
-
-                    # Replace the matched text while preserving surrounding whitespace
-                    current_text = current_text[:start] + replacement + current_text[end:]
+            # Step 1: Replace custom names first (simple regex search-and-replace)
+            for name in self.names_to_parse:
+                if not name.strip():
+                    continue
+
+                # Create case-insensitive regex pattern with word boundaries
+                pattern = re.compile(r'\b' + re.escape(name.strip()) + r'\b', re.IGNORECASE)
+
+                # Find all matches for this name
+                matches = list(pattern.finditer(current_text))
+
+                # Replace each match with a placeholder
+                for match in reversed(matches):  # Process from right to left to avoid position shifts
+                    matched_text = match.group()
+                    if matched_text not in self.mapping:
+                        # Generate a UUID for the placeholder
+                        import uuid
+                        placeholder_id = str(uuid.uuid4())
+                        self.mapping[matched_text] = f"[name.{placeholder_id}]"
+
+                    replacement = self.mapping[matched_text]
+                    start, end = match.span()
+                    current_text = current_text[:start] + replacement + current_text[end:]
+
+            # Step 2: Replace pattern-based matches (emails, phones, etc.)
+            # Use the same simple approach for patterns
+            pattern_matches = find_patterns_in_text(current_text, self.data_patterns)
+
+            # Process pattern matches from right to left to avoid position shifts
+            for pattern_name, matched_text, start, end in reversed(pattern_matches):
+                # Skip if already a placeholder
+                if re.match(r'\[[a-z]+\.[a-f0-9-]+\]', matched_text):
+                    continue
+
+                # Skip if contains placeholder characters
+                if '[' in matched_text or ']' in matched_text:
+                    continue
+
+                if matched_text not in self.mapping:
+                    # Generate a UUID for the placeholder
+                    import uuid
+                    placeholder_id = str(uuid.uuid4())
+                    # Create placeholder in format [type.uuid]
+                    type_mapping = {
+                        'email': 'email',
+                        'phone': 'phone',
+                        'address': 'address',
+                        'id': 'id'
+                    }
+                    placeholder_type = type_mapping.get(pattern_name, 'data')
+                    self.mapping[matched_text] = f"[{placeholder_type}.{placeholder_id}]"
+
+                replacement = self.mapping[matched_text]
                 current_text = current_text[:start] + replacement + current_text[end:]

             return PlainText(content=current_text, source_type=text.source_type)

         except Exception as e:
             logger.error(f"Error anonymizing plain text: {str(e)}")
-            logger.debug(traceback.format_exc())
             raise

     def _anonymize_json_value(self, value: Any, key: str = None) -> Any:
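
Replacing matches from right to left is what keeps the (start, end) offsets valid: edits near the tail of the string never move spans that sit earlier. A minimal demonstration of the idiom, independent of the anonymizer:

    import re

    text = "a@x.com and b@x.com"
    matches = list(re.finditer(r'\S+@\S+\.com', text))
    for m in reversed(matches):  # right to left, so earlier spans stay valid
        start, end = m.span()
        text = text[:start] + "[email]" + text[end:]
    assert text == "[email] and [email]"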
@@ -173,16 +205,49 @@ class DataAnonymizer:
             pattern = get_pattern_for_header(key, self.header_patterns)
             if pattern:
                 if value not in self.mapping:
-                    self.mapping[value] = pattern.replacement_template.format(len(self.mapping) + 1)
+                    # Generate a UUID for the placeholder
+                    import uuid
+                    placeholder_id = str(uuid.uuid4())
+                    # Create placeholder in format [type.uuid]
+                    type_mapping = {
+                        'email': 'email',
+                        'phone': 'phone',
+                        'name': 'name',
+                        'address': 'address',
+                        'id': 'id'
+                    }
+                    placeholder_type = type_mapping.get(pattern.name, 'data')
+                    self.mapping[value] = f"[{placeholder_type}.{placeholder_id}]"
                 return self.mapping[value]

             # Check if the value itself matches any patterns
-            matches = find_patterns_in_text(value, self.data_patterns)
-            if matches:
-                # Use the first match's pattern
-                pattern_name = matches[0][0]
-                if value not in self.mapping:
-                    self.mapping[value] = f"{pattern_name.upper()}_{len(self.mapping) + 1}"
+            pattern_matches = find_patterns_in_text(value, self.data_patterns)
+            custom_name_matches = self._find_custom_names(value)
+
+            if pattern_matches or custom_name_matches:
+                # Use the first match's pattern or custom name
+                if pattern_matches:
+                    pattern_name = pattern_matches[0][0]
+                    if value not in self.mapping:
+                        # Generate a UUID for the placeholder
+                        import uuid
+                        placeholder_id = str(uuid.uuid4())
+                        # Create placeholder in format [type.uuid]
+                        type_mapping = {
+                            'email': 'email',
+                            'phone': 'phone',
+                            'name': 'name',
+                            'address': 'address',
+                            'id': 'id'
+                        }
+                        placeholder_type = type_mapping.get(pattern_name, 'data')
+                        self.mapping[value] = f"[{placeholder_type}.{placeholder_id}]"
+                elif custom_name_matches:
+                    if value not in self.mapping:
+                        # Generate a UUID for the placeholder
+                        import uuid
+                        placeholder_id = str(uuid.uuid4())
+                        self.mapping[value] = f"[name.{placeholder_id}]"
                 return self.mapping[value]

         return value
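
The UUID-placeholder block now appears near-verbatim in four hunks (plain text, JSON values, XML attributes, XML text). A hedged sketch of a helper that would produce the same strings if the duplication were factored out (the name _make_placeholder is hypothetical, not part of the commit):

    import uuid

    _TYPE_MAPPING = {'email': 'email', 'phone': 'phone', 'name': 'name',
                     'address': 'address', 'id': 'id'}

    def _make_placeholder(pattern_name: str) -> str:
        """Return a fresh [type.uuid] placeholder for the given pattern name."""
        placeholder_type = _TYPE_MAPPING.get(pattern_name, 'data')
        return f"[{placeholder_type}.{uuid.uuid4()}]"

    # Usage inside the repeated blocks would then reduce to:
    #     if value not in self.mapping:
    #         self.mapping[value] = _make_placeholder(pattern.name)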
@@ -207,7 +272,19 @@ class DataAnonymizer:
             pattern = get_pattern_for_header(attr_name, self.header_patterns)
             if pattern:
                 if attr_value not in self.mapping:
-                    self.mapping[attr_value] = pattern.replacement_template.format(len(self.mapping) + 1)
+                    # Generate a UUID for the placeholder
+                    import uuid
+                    placeholder_id = str(uuid.uuid4())
+                    # Create placeholder in format [type.uuid]
+                    type_mapping = {
+                        'email': 'email',
+                        'phone': 'phone',
+                        'name': 'name',
+                        'address': 'address',
+                        'id': 'id'
+                    }
+                    placeholder_type = type_mapping.get(pattern.name, 'data')
+                    self.mapping[attr_value] = f"[{placeholder_type}.{placeholder_id}]"
                 processed_attrs[attr_name] = self.mapping[attr_value]
             else:
                 # Check if attribute value matches any data patterns
@@ -217,7 +294,19 @@ class DataAnonymizer:
                 pattern = next((p for p in self.data_patterns if p.name == pattern_name), None)
                 if pattern:
                     if attr_value not in self.mapping:
-                        self.mapping[attr_value] = pattern.replacement_template.format(len(self.mapping) + 1)
+                        # Generate a UUID for the placeholder
+                        import uuid
+                        placeholder_id = str(uuid.uuid4())
+                        # Create placeholder in format [type.uuid]
+                        type_mapping = {
+                            'email': 'email',
+                            'phone': 'phone',
+                            'name': 'name',
+                            'address': 'address',
+                            'id': 'id'
+                        }
+                        placeholder_type = type_mapping.get(pattern_name, 'data')
+                        self.mapping[attr_value] = f"[{placeholder_type}.{placeholder_id}]"
                     processed_attrs[attr_name] = self.mapping[attr_value]
                 else:
                     processed_attrs[attr_name] = attr_value
@@ -230,14 +319,36 @@ class DataAnonymizer:
             # Process text content
             text = element.text.strip() if element.text and element.text.strip() else ''
             if text:
-                # Check if text matches any patterns
-                matches = find_patterns_in_text(text, self.data_patterns)
-                if matches:
-                    pattern_name = matches[0][0]
-                    pattern = next((p for p in self.data_patterns if p.name == pattern_name), None)
-                    if pattern:
-                        if text not in self.mapping:
-                            self.mapping[text] = pattern.replacement_template.format(len(self.mapping) + 1)
-                        text = self.mapping[text]
+                # Check if text matches any patterns or custom names
+                pattern_matches = find_patterns_in_text(text, self.data_patterns)
+                custom_name_matches = self._find_custom_names(text)
+
+                if pattern_matches or custom_name_matches:
+                    if pattern_matches:
+                        pattern_name = pattern_matches[0][0]
+                        pattern = next((p for p in self.data_patterns if p.name == pattern_name), None)
+                        if pattern:
+                            if text not in self.mapping:
+                                # Generate a UUID for the placeholder
+                                import uuid
+                                placeholder_id = str(uuid.uuid4())
+                                # Create placeholder in format [type.uuid]
+                                type_mapping = {
+                                    'email': 'email',
+                                    'phone': 'phone',
+                                    'name': 'name',
+                                    'address': 'address',
+                                    'id': 'id'
+                                }
+                                placeholder_type = type_mapping.get(pattern_name, 'data')
+                                self.mapping[text] = f"[{placeholder_type}.{placeholder_id}]"
+                            text = self.mapping[text]
+                    elif custom_name_matches:
+                        if text not in self.mapping:
+                            # Generate a UUID for the placeholder
+                            import uuid
+                            placeholder_id = str(uuid.uuid4())
+                            self.mapping[text] = f"[name.{placeholder_id}]"
+                        text = self.mapping[text]

             # Process child elements
@@ -271,18 +382,24 @@ class DataAnonymizer:
             ProcessResult: Contains anonymized data, mapping, replaced fields and processing info
         """
         try:

            # Check if content is binary data
             is_binary = False
             try:
-                # Try to decode base64 if it's a string
-                try:
-                    decoded = base64.b64decode(content)
-                    # If it's not valid text, consider it binary
-                    decoded.decode('utf-8')
-                except (base64.binascii.Error, UnicodeDecodeError):
-                    is_binary = True
-            except Exception:
-                is_binary = True
+                # First, check if content looks like base64 (contains only base64 characters)
+                if re.match(r'^[A-Za-z0-9+/]*={0,2}$', content.strip()):
+                    # Try to decode base64 if it looks like base64
+                    try:
+                        decoded = base64.b64decode(content)
+                        # If it's not valid text, consider it binary
+                        decoded.decode('utf-8')
+                        is_binary = True
+                    except (base64.binascii.Error, UnicodeDecodeError):
+                        is_binary = False
+                else:
+                    is_binary = False
+            except Exception as e:
+                is_binary = False

             if is_binary:
                 # TODO: Implement binary data neutralization
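
Note the base64 pre-check is a loose heuristic: short plain words such as "test" also consist only of base64 characters and will enter the decode branch. A small probe of the guard as written (the regex is copied from the hunk above):

    import base64
    import re

    def looks_like_base64(content: str) -> bool:
        return bool(re.match(r'^[A-Za-z0-9+/]*={0,2}$', content.strip()))

    assert looks_like_base64("dGVzdA==")           # real base64 for "test"
    assert looks_like_base64("test")               # false positive: a plain word
    assert not looks_like_base64("hello world")    # the space rejects it
    assert base64.b64decode("dGVzdA==").decode('utf-8') == "test"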
@@ -356,7 +473,7 @@ class DataAnonymizer:

             # Combine all processed content
             result = content
-            for text, anonymized_text in zip(plain_texts, anonymized_texts):
+            for i, (text, anonymized_text) in enumerate(zip(plain_texts, anonymized_texts)):
                 if text.content != anonymized_text.content:
                     result = result.replace(text.content, anonymized_text.content)
@@ -364,5 +481,4 @@ class DataAnonymizer:

         except Exception as e:
             logger.error(f"Error processing content: {str(e)}")
-            logger.debug(traceback.format_exc())
             return ProcessResult(None, self.mapping, [], {'type': 'error', 'error': str(e)})
@@ -232,16 +232,6 @@ class HeaderPatterns:
 class DataPatterns:
     """Patterns for identifying sensitive data in content"""
     patterns = [
-        # Name patterns
-        Pattern(
-            name="name",
-            patterns=[
-                # Person names with titles and academic degrees
-                r'\b(?:Dr\.|Prof\.|PhD\.?|MD\.?|Herr|Frau|Mr\.|Mrs\.|Ms\.|Monsieur|Madame|Signore|Signora)\s+[A-Z][a-z]{2,}(?:\s+[A-Za-z]{2,}){1,2}\b'
-            ],
-            replacement_template="[NAME_{}]"
-        ),
-
         # Email pattern for plain text
         Pattern(
             name="email",
@@ -392,11 +382,6 @@ def find_patterns_in_text(text: str, patterns: List[Pattern]) -> List[tuple]:
     matches = []
     for pattern in patterns:
         for p in pattern.patterns:
-            if pattern.name == 'email':
-                print(f"\nDEBUG: Checking email pattern '{p}'")
             for match in re.finditer(p, text, re.IGNORECASE):
-                if pattern.name == 'email':
-                    print(f"DEBUG: Found email match: '{match.group(0)}' at position {match.start()}-{match.end()}")
-                    print(f"DEBUG: Context: '{text[max(0, match.start()-20):match.end()+20]}'")
                 matches.append((pattern.name, match.group(0), match.start(), match.end()))
     return sorted(matches, key=lambda x: x[2])  # Sort by start position
@@ -17,7 +17,8 @@ from modules.security.auth import limiter, getCurrentUser
 import modules.interfaces.interfaceComponentObjects as interfaceComponentObjects
 from modules.interfaces.interfaceComponentModel import FileItem, FilePreview
 from modules.shared.attributeUtils import getModelAttributeDefinitions, AttributeResponse, AttributeDefinition
-from modules.interfaces.interfaceAppModel import User
+from modules.interfaces.interfaceAppModel import User, DataNeutraliserConfig, DataNeutralizerAttributes
+from modules.services.serviceNeutralization import NeutralizationService

 # Configure logger
 logger = logging.getLogger(__name__)
@@ -364,3 +365,253 @@ async def preview_file(
             detail=f"Error previewing file: {str(e)}"
         )

+# Data Neutralization endpoints
+
+@router.get("/neutralization/config", response_model=DataNeutraliserConfig)
+@limiter.limit("30/minute")
+async def get_neutralization_config(
+    request: Request,
+    currentUser: User = Depends(getCurrentUser)
+) -> DataNeutraliserConfig:
+    """Get data neutralization configuration"""
+    try:
+        service = NeutralizationService(currentUser)
+        config = service.get_config()
+
+        if not config:
+            # Return default config instead of 404
+            return DataNeutraliserConfig(
+                mandateId=currentUser.mandateId,
+                userId=currentUser.id,
+                enabled=True,
+                namesToParse="",
+                sharepointSourcePath="",
+                sharepointTargetPath=""
+            )
+
+        return config
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Error getting neutralization config: {str(e)}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Error getting neutralization config: {str(e)}"
+        )
+
+@router.post("/neutralization/config", response_model=DataNeutraliserConfig)
+@limiter.limit("10/minute")
+async def save_neutralization_config(
+    request: Request,
+    config_data: Dict[str, Any] = Body(...),
+    currentUser: User = Depends(getCurrentUser)
+) -> DataNeutraliserConfig:
+    """Save or update data neutralization configuration"""
+    try:
+        service = NeutralizationService(currentUser)
+        config = service.save_config(config_data)
+
+        return config
+
+    except Exception as e:
+        logger.error(f"Error saving neutralization config: {str(e)}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Error saving neutralization config: {str(e)}"
+        )
+
+@router.post("/neutralization/neutralize-text", response_model=Dict[str, Any])
+@limiter.limit("20/minute")
+async def neutralize_text(
+    request: Request,
+    text_data: Dict[str, Any] = Body(...),
+    currentUser: User = Depends(getCurrentUser)
+) -> Dict[str, Any]:
+    """Neutralize text content"""
+    try:
+        text = text_data.get("text", "")
+        file_id = text_data.get("fileId")
+
+        if not text:
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail="Text content is required"
+            )
+
+        service = NeutralizationService(currentUser)
+        result = service.neutralize_text(text, file_id)
+
+        return result
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Error neutralizing text: {str(e)}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Error neutralizing text: {str(e)}"
+        )
+
+@router.post("/neutralization/resolve-text", response_model=Dict[str, str])
+@limiter.limit("20/minute")
+async def resolve_text(
+    request: Request,
+    text_data: Dict[str, str] = Body(...),
+    currentUser: User = Depends(getCurrentUser)
+) -> Dict[str, str]:
+    """Resolve UIDs in neutralized text back to original text"""
+    try:
+        text = text_data.get("text", "")
+
+        if not text:
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail="Text content is required"
+            )
+
+        service = NeutralizationService(currentUser)
+        resolved_text = service.resolve_text(text)
+
+        return {"resolved_text": resolved_text}
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Error resolving text: {str(e)}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Error resolving text: {str(e)}"
+        )
+
+@router.get("/neutralization/attributes", response_model=List[DataNeutralizerAttributes])
+@limiter.limit("30/minute")
+async def get_neutralization_attributes(
+    request: Request,
+    fileId: Optional[str] = Query(None, description="Filter by file ID"),
+    currentUser: User = Depends(getCurrentUser)
+) -> List[DataNeutralizerAttributes]:
+    """Get neutralization attributes, optionally filtered by file ID"""
+    try:
+        service = NeutralizationService(currentUser)
+        attributes = service.get_attributes(fileId)
+
+        return attributes
+
+    except Exception as e:
+        logger.error(f"Error getting neutralization attributes: {str(e)}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Error getting neutralization attributes: {str(e)}"
+        )
+
+@router.post("/neutralization/process-sharepoint", response_model=Dict[str, Any])
+@limiter.limit("5/minute")
+async def process_sharepoint_files(
+    request: Request,
+    paths_data: Dict[str, str] = Body(...),
+    currentUser: User = Depends(getCurrentUser)
+) -> Dict[str, Any]:
+    """Process files from SharePoint source path and store neutralized files in target path"""
+    try:
+        source_path = paths_data.get("sourcePath", "")
+        target_path = paths_data.get("targetPath", "")
+
+        if not source_path or not target_path:
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail="Both source and target paths are required"
+            )
+
+        service = NeutralizationService(currentUser)
+        result = await service.process_sharepoint_files(source_path, target_path)
+
+        return result
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Error processing SharePoint files: {str(e)}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Error processing SharePoint files: {str(e)}"
+        )
+
+@router.post("/neutralization/batch-process", response_model=Dict[str, Any])
+@limiter.limit("10/minute")
+async def batch_process_files(
+    request: Request,
+    files_data: List[Dict[str, Any]] = Body(...),
+    currentUser: User = Depends(getCurrentUser)
+) -> Dict[str, Any]:
+    """Process multiple files for neutralization"""
+    try:
+        if not files_data:
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail="Files data is required"
+            )
+
+        service = NeutralizationService(currentUser)
+        result = service.batch_neutralize_files(files_data)
+
+        return result
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Error batch processing files: {str(e)}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Error batch processing files: {str(e)}"
+        )
+
+@router.get("/neutralization/stats", response_model=Dict[str, Any])
+@limiter.limit("30/minute")
+async def get_neutralization_stats(
+    request: Request,
+    currentUser: User = Depends(getCurrentUser)
+) -> Dict[str, Any]:
+    """Get neutralization processing statistics"""
+    try:
+        service = NeutralizationService(currentUser)
+        stats = service.get_processing_stats()
+
+        return stats
+
+    except Exception as e:
+        logger.error(f"Error getting neutralization stats: {str(e)}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Error getting neutralization stats: {str(e)}"
+        )
+
+@router.delete("/neutralization/attributes/{fileId}", response_model=Dict[str, str])
+@limiter.limit("10/minute")
+async def cleanup_file_attributes(
+    request: Request,
+    fileId: str = Path(..., description="File ID to cleanup attributes for"),
+    currentUser: User = Depends(getCurrentUser)
+) -> Dict[str, str]:
+    """Clean up neutralization attributes for a specific file"""
+    try:
+        service = NeutralizationService(currentUser)
+        success = service.cleanup_file_attributes(fileId)
+
+        if success:
+            return {"message": f"Successfully cleaned up attributes for file {fileId}"}
+        else:
+            raise HTTPException(
+                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                detail="Failed to cleanup file attributes"
+            )
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Error cleaning up file attributes: {str(e)}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Error cleaning up file attributes: {str(e)}"
+        )
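
A hedged client-side sketch of the round trip these endpoints expose (the base URL, router prefix, and auth header are deployment-specific assumptions; the payload and response keys match the handlers above):

    import requests

    BASE = "https://app.example.com/api"           # hypothetical deployment URL
    HEADERS = {"Authorization": "Bearer <token>"}  # whatever getCurrentUser expects

    # Neutralize a snippet of text
    r = requests.post(f"{BASE}/neutralization/neutralize-text",
                      json={"text": "Mail alice@example.com", "fileId": None},
                      headers=HEADERS)
    neutralized = r.json()["neutralized_text"]

    # Later, map the placeholders back to the original values
    r = requests.post(f"{BASE}/neutralization/resolve-text",
                      json={"text": neutralized}, headers=HEADERS)
    assert r.json()["resolved_text"] == "Mail alice@example.com"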
modules/services/serviceNeutralization.py (new file, 587 lines)

@@ -0,0 +1,587 @@
|
||||||
|
"""
|
||||||
|
Data Neutralization Service
|
||||||
|
Handles file processing for data neutralization including SharePoint integration
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import uuid
|
||||||
|
from typing import Dict, List, Any, Optional, Tuple
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
import mimetypes
|
||||||
|
|
||||||
|
from modules.interfaces.interfaceAppObjects import getInterface
|
||||||
|
from modules.interfaces.interfaceAppModel import User, DataNeutraliserConfig, DataNeutralizerAttributes
|
||||||
|
from modules.neutralizer.neutralizer import DataAnonymizer
|
||||||
|
from modules.shared.timezoneUtils import get_utc_timestamp
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
class NeutralizationService:
|
||||||
|
"""Service for handling data neutralization operations"""
|
||||||
|
|
||||||
|
def __init__(self, current_user: User):
|
||||||
|
"""Initialize the service with user context"""
|
||||||
|
self.current_user = current_user
|
||||||
|
self.app_interface = getInterface(current_user)
|
||||||
|
|
||||||
|
def get_config(self) -> Optional[DataNeutraliserConfig]:
|
||||||
|
"""Get the neutralization configuration for the current user's mandate"""
|
||||||
|
return self.app_interface.getNeutralizationConfig()
|
||||||
|
|
||||||
|
def save_config(self, config_data: Dict[str, Any]) -> DataNeutraliserConfig:
|
||||||
|
"""Save or update the neutralization configuration"""
|
||||||
|
return self.app_interface.createOrUpdateNeutralizationConfig(config_data)
|
||||||
|
|
||||||
|
def neutralize_text(self, text: str, file_id: Optional[str] = None) -> Dict[str, Any]:
|
||||||
|
"""Neutralize text content and return results with attribute mappings"""
|
||||||
|
return self.app_interface.neutralizeText(text, file_id)
|
||||||
|
|
||||||
|
def get_attributes(self, file_id: Optional[str] = None) -> List[DataNeutralizerAttributes]:
|
||||||
|
"""Get neutralization attributes, optionally filtered by file ID"""
|
||||||
|
return self.app_interface.getNeutralizationAttributes(file_id)
|
||||||
|
|
||||||
|
def resolve_text(self, text: str) -> str:
|
||||||
|
"""Resolve UIDs in neutralized text back to original text"""
|
||||||
|
return self.app_interface.resolveNeutralizedText(text)
|
||||||
|
|
||||||
|
async def process_sharepoint_files(self, source_path: str, target_path: str) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Process files from SharePoint source path, neutralize them, and store in target path
|
||||||
|
|
||||||
|
Args:
|
||||||
|
source_path: SharePoint path to read files from
|
||||||
|
target_path: SharePoint path to store neutralized files
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dictionary with processing results
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
logger.info(f"Processing SharePoint files from {source_path} to {target_path}")
|
||||||
|
|
||||||
|
# Get user's SharePoint connection that matches the source path
|
||||||
|
sharepoint_connection = await self._get_sharepoint_connection(source_path)
|
||||||
|
if not sharepoint_connection:
|
||||||
|
return {
|
||||||
|
"success": False,
|
||||||
|
"message": "No SharePoint connection found for user",
|
||||||
|
"processed_files": 0,
|
||||||
|
"errors": ["No SharePoint connection found"]
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.info(f"Using SharePoint connection: {sharepoint_connection.get('id')} for path: {source_path}")
|
||||||
|
|
||||||
|
# Get SharePoint access token
|
||||||
|
sharepoint_token = self.app_interface.getConnectionToken(sharepoint_connection["id"])
|
||||||
|
if not sharepoint_token:
|
||||||
|
return {
|
||||||
|
"success": False,
|
||||||
|
"message": "No SharePoint access token found",
|
||||||
|
"processed_files": 0,
|
||||||
|
"errors": ["No SharePoint access token found"]
|
||||||
|
}
|
||||||
|
|
||||||
|
# Process files asynchronously
|
||||||
|
return await self._process_sharepoint_files_async(
|
||||||
|
source_path, target_path, sharepoint_token.tokenAccess
|
||||||
|
)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error processing SharePoint files: {str(e)}")
|
||||||
|
return {
|
||||||
|
"success": False,
|
||||||
|
"message": f"Error processing SharePoint files: {str(e)}",
|
||||||
|
"processed_files": 0,
|
||||||
|
"errors": [str(e)]
|
||||||
|
}
|
||||||
|
|
||||||
|
async def _get_sharepoint_connection(self, sharepoint_path: str = None):
|
||||||
|
"""Get user's SharePoint connection that matches the given path"""
|
||||||
|
try:
|
||||||
|
# Get all user connections
|
||||||
|
from modules.interfaces.interfaceAppModel import UserConnection
|
||||||
|
connections = self.app_interface.db.getRecordset(
|
||||||
|
UserConnection,
|
||||||
|
recordFilter={"userId": self.app_interface.userId}
|
||||||
|
)
|
||||||
|
|
||||||
|
# Find all Microsoft connections
|
||||||
|
msft_connections = [conn for conn in connections if conn.get("authority") == "msft"]
|
||||||
|
|
||||||
|
if not msft_connections:
|
||||||
|
logger.warning("No Microsoft connections found for user")
|
||||||
|
return None
|
||||||
|
|
||||||
|
if len(msft_connections) == 1:
|
||||||
|
logger.info(f"Found single Microsoft connection: {msft_connections[0].get('id')}")
|
||||||
|
return msft_connections[0]
|
||||||
|
|
||||||
|
# If multiple connections and we have a path, try to match
|
||||||
|
if sharepoint_path:
|
||||||
|
return await self._match_connection_to_path(msft_connections, sharepoint_path)
|
||||||
|
|
||||||
|
# If no path provided, return the first one
|
||||||
|
logger.info(f"Multiple Microsoft connections found, using first one: {msft_connections[0].get('id')}")
|
||||||
|
return msft_connections[0]
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error getting SharePoint connection: {str(e)}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
async def _match_connection_to_path(self, connections: list, sharepoint_path: str):
|
||||||
|
"""Match a connection to the SharePoint path by testing access"""
|
||||||
|
try:
|
||||||
|
# Extract domain from the path
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
parsed_url = urlparse(sharepoint_path)
|
||||||
|
target_domain = parsed_url.netloc.lower()
|
||||||
|
|
||||||
|
logger.info(f"Looking for connection matching domain: {target_domain}")
|
||||||
|
|
||||||
|
# Try each connection to see which one can access the site
|
||||||
|
for connection in connections:
|
||||||
|
try:
|
||||||
|
# Get token for this connection
|
||||||
|
token = self.app_interface.getConnectionToken(connection["id"])
|
||||||
|
if not token:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Test if this connection can access the SharePoint site
|
||||||
|
if await self._test_sharepoint_access(token.tokenAccess, sharepoint_path):
|
||||||
|
logger.info(f"Found matching connection for domain {target_domain}: {connection.get('id')}")
|
||||||
|
return connection
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# If no specific match found, return the first connection
|
||||||
|
logger.warning(f"No specific connection match found for {target_domain}, using first available")
|
||||||
|
return connections[0]
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error matching connection to path: {str(e)}")
|
||||||
|
return connections[0] if connections else None
|
||||||
|
|
||||||
|
    async def _test_sharepoint_access(self, access_token: str, sharepoint_path: str) -> bool:
        """Test if the access token can access the given SharePoint path"""
        try:
            return await self._test_sharepoint_access_async(access_token, sharepoint_path)
        except Exception:
            return False

    async def _test_sharepoint_access_async(self, access_token: str, sharepoint_path: str) -> bool:
        """Async test for SharePoint access"""
        try:
            from modules.connectors.connectorSharepoint import ConnectorSharepoint

            connector = ConnectorSharepoint(access_token=access_token)

            # Parse the path to get the site URL
            site_url, _ = self._parse_sharepoint_path(sharepoint_path)
            if not site_url:
                return False

            # Try to find the site
            site_info = await connector.find_site_by_web_url(site_url)
            return site_info is not None

        except Exception:
            return False

    async def _process_sharepoint_files_async(self, source_path: str, target_path: str, access_token: str) -> Dict[str, Any]:
        """Process SharePoint files asynchronously"""
        try:
            import asyncio
            from modules.connectors.connectorSharepoint import ConnectorSharepoint

            # Initialize the SharePoint connector
            connector = ConnectorSharepoint(access_token=access_token)

            # Parse source and target paths to extract site and folder info
            source_site, source_folder = self._parse_sharepoint_path(source_path)
            target_site, target_folder = self._parse_sharepoint_path(target_path)

            if not source_site or not target_site:
                return {
                    "success": False,
                    "message": "Invalid SharePoint path format",
                    "processed_files": 0,
                    "errors": ["Invalid SharePoint path format"]
                }

            # Find the source site
            source_site_info = await connector.find_site_by_web_url(source_site)
            if not source_site_info:
                return {
                    "success": False,
                    "message": f"Source site not found: {source_site}",
                    "processed_files": 0,
                    "errors": [f"Source site not found: {source_site}"]
                }

            # Find the target site
            target_site_info = await connector.find_site_by_web_url(target_site)
            if not target_site_info:
                return {
                    "success": False,
                    "message": f"Target site not found: {target_site}",
                    "processed_files": 0,
                    "errors": [f"Target site not found: {target_site}"]
                }

            # List the files in the source folder
            logger.info(f"Listing files in folder: {source_folder} for site: {source_site_info['id']}")
            files = await connector.list_folder_contents(source_site_info["id"], source_folder)

            # If no files were found, list the root folder to see what's available
            if not files:
                logger.warning(f"No files found in folder '{source_folder}', trying root folder")
                files = await connector.list_folder_contents(source_site_info["id"], "")

                if files:
                    # List the available folders for debugging
                    folders = [f for f in files if f.get("type") == "folder"]
                    folder_names = [f.get('name') for f in folders]
                    logger.info(f"Available folders in root: {folder_names}")

                    # Format the folder list for better UI display
                    folder_list = ", ".join(folder_names) if folder_names else "None"

                    return {
                        "success": False,
                        "message": f"Folder '{source_folder}' not found. Available folders in root: {folder_list}",
                        "processed_files": 0,
                        "errors": [f"Folder '{source_folder}' not found. Available folders: {folder_list}"],
                        "available_folders": folder_names
                    }
                else:
                    return {
                        "success": False,
                        "message": f"No files found in source folder: {source_folder}",
                        "processed_files": 0,
                        "errors": [f"No files found in source folder: {source_folder}"]
                    }

            # Filter for text files only
            text_files = [f for f in files if f.get("type") == "file" and self._is_text_file(f.get("name", ""))]

            if not text_files:
                return {
                    "success": False,
                    "message": "No text files found in source folder",
                    "processed_files": 0,
                    "errors": ["No text files found in source folder"]
                }

            # Process files in parallel for better performance
            processed_files = []
            errors = []

            async def process_single_file(file_info):
                """Process a single file - download, neutralize, upload"""
                try:
                    # Download the file
                    file_content = await connector.download_file(source_site_info["id"], file_info["id"])
                    if not file_content:
                        return {"error": f"Failed to download file: {file_info['name']}"}

                    # Convert to text, falling back to latin-1 for non-UTF-8 content
                    try:
                        text_content = file_content.decode('utf-8')
                    except UnicodeDecodeError:
                        text_content = file_content.decode('latin-1')

                    # Neutralize the text
                    neutralization_result = self.app_interface.neutralizeText(text_content, file_info["id"])

                    # Create the neutralized filename
                    neutralized_filename = f"neutralized_{file_info['name']}"

                    # Upload the neutralized file
                    neutralized_content = neutralization_result["neutralized_text"].encode('utf-8')
                    upload_result = await connector.upload_file(
                        target_site_info["id"],
                        target_folder,
                        neutralized_filename,
                        neutralized_content
                    )

                    if "error" in upload_result:
                        return {"error": f"Failed to upload neutralized file: {neutralized_filename} - {upload_result['error']}"}
                    else:
                        return {
                            "success": True,
                            "original_name": file_info["name"],
                            "neutralized_name": neutralized_filename,
                            "attributes_count": len(neutralization_result.get("attributes", []))
                        }

                except Exception as e:
                    error_msg = f"Error processing file {file_info['name']}: {str(e)}"
                    logger.error(error_msg)
                    return {"error": error_msg}

            # Run all file tasks concurrently
            logger.info(f"Processing {len(text_files)} files in parallel...")
            tasks = [process_single_file(file_info) for file_info in text_files]
            results = await asyncio.gather(*tasks, return_exceptions=True)

            # Collect the results
            for i, result in enumerate(results):
                if isinstance(result, Exception):
                    error_msg = f"Exception processing file {text_files[i]['name']}: {str(result)}"
                    errors.append(error_msg)
                    logger.error(error_msg)
                elif isinstance(result, dict) and "error" in result:
                    errors.append(result["error"])
                elif isinstance(result, dict) and result.get("success"):
                    processed_files.append({
                        "original_name": result["original_name"],
                        "neutralized_name": result["neutralized_name"],
                        "attributes_count": result["attributes_count"]
                    })
                    logger.info(f"Successfully processed file: {result['original_name']} -> {result['neutralized_name']}")
                else:
                    error_msg = f"Unknown result processing file {text_files[i]['name']}: {result}"
                    errors.append(error_msg)
                    logger.error(error_msg)

            return {
                "success": len(processed_files) > 0,
                "message": f"Processed {len(processed_files)} files successfully",
                "processed_files": len(processed_files),
                "files": processed_files,
                "errors": errors
            }

        except Exception as e:
            logger.error(f"Error in async SharePoint processing: {str(e)}")
            return {
                "success": False,
                "message": f"Error in async SharePoint processing: {str(e)}",
                "processed_files": 0,
                "errors": [str(e)]
            }

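    # Optional hardening (a sketch, not part of the original code): the
    # unbounded asyncio.gather() above starts one download/upload per file at
    # once, which can trigger Graph API throttling on large folders. A
    # semaphore caps concurrency; the limit of 5 is an arbitrary example.
    #
    #     sem = asyncio.Semaphore(5)
    #
    #     async def process_single_file_bounded(file_info):
    #         async with sem:
    #             return await process_single_file(file_info)
    #
    #     results = await asyncio.gather(
    #         *(process_single_file_bounded(f) for f in text_files),
    #         return_exceptions=True,
    #     )
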
    def _parse_sharepoint_path(self, path: str) -> tuple[str, str]:
        """Parse SharePoint path to extract site URL and folder path"""
        try:
            # Expected format: https://domain.sharepoint.com/sites/sitename/folder/path
            if not path.startswith("https://"):
                return None, None

            # Remove query parameters
            if "?" in path:
                path = path.split("?")[0]

            # Split by /sites/
            if "/sites/" not in path:
                return None, None

            parts = path.split("/sites/", 1)
            if len(parts) != 2:
                return None, None

            # Extract the domain and site name
            domain = parts[0].replace("https://", "")
            site_name = parts[1].split("/")[0]

            # Build the site URL expected by the Graph API
            site_url = f"https://{domain}/sites/{site_name}"

            # Extract the folder path (everything after the site name)
            folder_parts = parts[1].split("/")[1:]
            folder_path = "/".join(folder_parts) if folder_parts else ""

            # URL-decode the folder path
            from urllib.parse import unquote
            folder_path = unquote(folder_path)

            return site_url, folder_path

        except Exception as e:
            logger.error(f"Error parsing SharePoint path '{path}': {str(e)}")
            return None, None

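    # Example of the parsing above (hypothetical URL):
    #
    #     _parse_sharepoint_path(
    #         "https://contoso.sharepoint.com/sites/Finance/Shared%20Documents/Reports?web=1")
    #     # -> ("https://contoso.sharepoint.com/sites/Finance",
    #     #     "Shared Documents/Reports")
    #
    # The query string is stripped and the folder segment is URL-decoded.
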
    def _is_text_file(self, filename: str) -> bool:
        """Check if file is a text file based on extension"""
        text_extensions = [
            '.txt', '.csv', '.json', '.xml', '.md', '.log',
            '.doc', '.docx', '.rtf', '.odt',  # Document formats
            '.html', '.htm', '.css', '.js', '.ts', '.py', '.java', '.cpp', '.c', '.h',  # Code files
            '.ini', '.cfg', '.conf', '.properties',  # Config files
            '.sql', '.yaml', '.yml', '.toml',  # Data/config files
            '.ps1', '.bat', '.sh', '.bash'  # Script files
        ]
        return any(filename.lower().endswith(ext) for ext in text_extensions)

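    # Matching is case-insensitive and purely extension-based, e.g.:
    #
    #     _is_text_file("Report.CSV")  # -> True
    #     _is_text_file("photo.png")   # -> False
    #
    # Caveat: .doc/.docx/.odt are binary container formats, so files matched
    # by those extensions will not decode cleanly in the plain-text pipeline.
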
    def process_file_content(self, file_content: bytes, file_name: str, mime_type: str) -> Dict[str, Any]:
        """
        Process file content for neutralization

        Args:
            file_content: Binary file content
            file_name: Name of the file
            mime_type: MIME type of the file

        Returns:
            Dictionary with neutralization results
        """
        try:
            # Determine content type based on MIME type
            # (derived here but not yet used by the pipeline below)
            content_type = self._get_content_type_from_mime(mime_type)

            # Decode content to text
            try:
                text_content = file_content.decode('utf-8')
            except UnicodeDecodeError:
                # Try with different encodings. Note that latin-1 maps every
                # byte value, so this loop succeeds on its first iteration and
                # the ValueError below is effectively unreachable.
                for encoding in ['latin-1', 'cp1252', 'iso-8859-1']:
                    try:
                        text_content = file_content.decode(encoding)
                        break
                    except UnicodeDecodeError:
                        continue
                else:
                    raise ValueError("Unable to decode file content")

            # Generate a temporary file ID for tracking
            temp_file_id = str(uuid.uuid4())

            # Neutralize the content
            neutralization_result = self.neutralize_text(text_content, temp_file_id)

            # Encode the neutralized content back to bytes
            neutralized_content = neutralization_result["neutralized_text"].encode('utf-8')

            # Generate the neutralized file name
            neutralized_file_name = f"neutralized_{file_name}"

            return {
                "success": True,
                "original_content": text_content,
                "neutralized_content": neutralization_result["neutralized_text"],
                "neutralized_file_name": neutralized_file_name,
                "attributes": neutralization_result["attributes"],
                "mapping": neutralization_result["mapping"],
                "file_id": temp_file_id
            }

        except Exception as e:
            logger.error(f"Error processing file content: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "original_content": None,
                "neutralized_content": None
            }

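    # A minimal usage sketch (hypothetical caller and content):
    #
    #     result = processor.process_file_content(
    #         b"Contact: jane.doe@example.com", "notes.txt", "text/plain")
    #     if result["success"]:
    #         result["neutralized_file_name"]  # -> "neutralized_notes.txt"
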
    def _get_content_type_from_mime(self, mime_type: str) -> str:
        """Determine content type from MIME type for neutralization processing"""
        # Check specific types first: 'text/csv' and 'text/xml' would otherwise
        # be swallowed by a generic 'text/' prefix check and never reach the
        # csv/xml branches.
        if mime_type in ['text/csv', 'application/csv']:
            return 'csv'
        elif mime_type in ['application/json', 'application/xml', 'text/xml']:
            return 'json' if 'json' in mime_type else 'xml'
        else:
            return 'text'  # Default: all other types, including any 'text/*'

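    # Example mappings with the specific checks ordered first:
    #
    #     'text/csv'         -> 'csv'
    #     'application/json' -> 'json'
    #     'text/xml'         -> 'xml'
    #     'text/plain'       -> 'text'
    #     'image/png'        -> 'text'  (default)
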
    def batch_neutralize_files(self, files_data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Process multiple files for neutralization

        Args:
            files_data: List of dictionaries containing file information
                        Each dict should have: content, name, mime_type

        Returns:
            Dictionary with batch processing results
        """
        try:
            results = []
            total_files = len(files_data)
            successful_files = 0
            errors = []

            for file_data in files_data:
                try:
                    result = self.process_file_content(
                        file_data['content'],
                        file_data['name'],
                        file_data['mime_type']
                    )

                    if result['success']:
                        successful_files += 1
                        results.append({
                            'file_name': file_data['name'],
                            'neutralized_file_name': result['neutralized_file_name'],
                            'file_id': result['file_id'],
                            'attributes_count': len(result['attributes'])
                        })
                    else:
                        errors.append(f"Failed to process {file_data['name']}: {result['error']}")

                except Exception as e:
                    error_msg = f"Error processing {file_data['name']}: {str(e)}"
                    errors.append(error_msg)
                    logger.error(error_msg)

            return {
                "success": len(errors) == 0,
                "total_files": total_files,
                "successful_files": successful_files,
                "failed_files": len(errors),
                "results": results,
                "errors": errors
            }

        except Exception as e:
            logger.error(f"Error in batch neutralization: {str(e)}")
            return {
                "success": False,
                "total_files": len(files_data),
                "successful_files": 0,
                "failed_files": len(files_data),
                "results": [],
                "errors": [str(e)]
            }

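    # A minimal batch-call sketch (hypothetical data):
    #
    #     summary = processor.batch_neutralize_files([
    #         {"content": b"call 555-0100", "name": "a.txt", "mime_type": "text/plain"},
    #         {"content": b'{"email": "x@y.example"}', "name": "b.json", "mime_type": "application/json"},
    #     ])
    #     summary["successful_files"], summary["failed_files"], summary["errors"]
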
    def cleanup_file_attributes(self, file_id: str) -> bool:
        """Clean up neutralization attributes for a specific file"""
        return self.app_interface.deleteNeutralizationAttributes(file_id)

    def get_processing_stats(self) -> Dict[str, Any]:
        """Get statistics about neutralization processing"""
        try:
            # Get all attributes for the current mandate
            all_attributes = self.get_attributes()

            # Group by pattern type
            pattern_counts = {}
            for attr in all_attributes:
                pattern_type = attr.patternType
                pattern_counts[pattern_type] = pattern_counts.get(pattern_type, 0) + 1

            # Count unique files
            unique_files = set(attr.fileId for attr in all_attributes if attr.fileId)

            return {
                "total_attributes": len(all_attributes),
                "unique_files": len(unique_files),
                "pattern_counts": pattern_counts,
                "mandate_id": self.current_user.mandateId
            }

        except Exception as e:
            logger.error(f"Error getting processing stats: {str(e)}")
            return {
                "total_attributes": 0,
                "unique_files": 0,
                "pattern_counts": {},
                "error": str(e)
            }
@@ -130,10 +130,6 @@ class DataAnonymizer:
                 self.mapping[matched_text] = pattern.replacement_template.format(len(self.mapping) + 1)
             replacement = self.mapping[matched_text]
 
-            if pattern_name == 'email':
-                print(f"DEBUG: Replacing email '{matched_text}' with '{replacement}'")
-                print(f"DEBUG: Text after replacement: {current_text[:start] + replacement + current_text[end:]}")
-
             # Replace the matched text while preserving surrounding whitespace
             current_text = current_text[:start] + replacement + current_text[end:]
 
@@ -392,11 +392,5 @@ def find_patterns_in_text(text: str, patterns: List[Pattern]) -> List[tuple]:
     matches = []
     for pattern in patterns:
         for p in pattern.patterns:
-            if pattern.name == 'email':
-                print(f"\nDEBUG: Checking email pattern '{p}'")
             for match in re.finditer(p, text, re.IGNORECASE):
-                if pattern.name == 'email':
-                    print(f"DEBUG: Found email match: '{match.group(0)}' at position {match.start()}-{match.end()}")
-                    print(f"DEBUG: Context: '{text[max(0, match.start()-20):match.end()+20]}'")
                 matches.append((pattern.name, match.group(0), match.start(), match.end()))
     return sorted(matches, key=lambda x: x[2])  # Sort by start position