gateway/modules/features/neutralization/serviceNeutralization/subProcessText.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Text processing module for data anonymization
Handles plain text processing without header information
"""

from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

from .subParseString import StringParser

@dataclass
class PlainText:
    """Represents a piece of ordinary text together with a tag naming its source."""

    # The text itself
    content: str
    # Source tag, e.g. 'txt', 'docx', 'text_plain'
    source_type: str

class TextProcessor:
    """Handles plain text processing for anonymization.

    All replacement work is delegated to a single ``StringParser`` instance
    stored in ``self.string_parser``; the mapping exposed by this class is
    whatever that parser reports.
    """

    def __init__(self, NamesToParse: Optional[List[str]] = None):
        """
        Initialize the text processor

        Args:
            NamesToParse: List of names to parse and replace. Forwarded
                unchanged (including the default ``None``) to ``StringParser``.
        """
        self.string_parser = StringParser(NamesToParse)

    def _extractTablesFromText(
        self, content: str
    ) -> Tuple[List[Any], List[PlainText]]:
        """
        Extract tables and plain text from content

        Table detection is not implemented yet: the table list is always
        empty and the entire content is returned as one ``PlainText``
        section tagged ``'text_plain'``.

        Args:
            content: Content to process

        Returns:
            Tuple of (list of tables, list of plain text sections)
        """
        # For now, process the entire content as plain text.
        # This can be extended later to detect table-like structures.
        tables: List[Any] = []
        plainTexts = [PlainText(content=content, source_type='text_plain')]

        return tables, plainTexts

    def _anonymizePlainText(self, text: PlainText) -> PlainText:
        """
        Anonymize plain text content

        Args:
            text: PlainText object to anonymize

        Returns:
            PlainText: New object holding the string parser's output for
            ``text.content`` and the same ``source_type`` as the input
        """
        # The string parser does the actual replacement work.
        anonymizedContent = self.string_parser.processString(text.content)

        return PlainText(content=anonymizedContent, source_type=text.source_type)

    def processTextContent(
        self, content: str
    ) -> Tuple[str, Dict[str, str], List[Any], Dict[str, Any]]:
        """
        Process text content and return anonymized data

        Args:
            content: Text content to process

        Returns:
            Tuple of (anonymized_content, mapping, replaced_fields, processed_info).
            ``replaced_fields`` is always an empty list in this processor.
            ``processed_info`` contains ``'type': 'text'`` and a ``'tables'``
            list with one ``{'headers', 'row_count'}`` entry per extracted table.
        """
        # Extract tables and plain text sections
        tables, plainTexts = self._extractTablesFromText(content)

        # Anonymize every section first, then splice the results back in
        anonymizedTexts = [self._anonymizePlainText(section) for section in plainTexts]

        result = content
        for original, anonymized in zip(plainTexts, anonymizedTexts):
            if original.content != anonymized.content:
                # NOTE: str.replace swaps every occurrence of the section text
                result = result.replace(original.content, anonymized.content)

        # An empty table list simply yields an empty summary list, so no
        # separate emptiness check is needed.
        processedInfo: Dict[str, Any] = {
            'type': 'text',
            'tables': [
                {'headers': table.headers, 'row_count': len(table.rows)}
                for table in tables
            ],
        }

        return result, self.string_parser.getMapping(), [], processedInfo

    def getMapping(self) -> Dict[str, str]:
        """
        Get the current mapping held by the string parser

        Returns:
            Dict[str, str]: Mapping dictionary as returned by
            ``StringParser.getMapping`` (documented in this file as original
            values to placeholders)
        """
        return self.string_parser.getMapping()

    def clearMapping(self) -> None:
        """Clear the current mapping by delegating to the string parser"""
        self.string_parser.clearMapping()