# gateway/modules/services/serviceNeutralization/subProcessText.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Text processing module for data anonymization
Handles plain text processing without header information
"""

from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

from modules.services.serviceNeutralization.subParseString import StringParser

@dataclass
class PlainText:
    """Represents a section of normal (plain) text."""
    # Text of this section.
    content: str
    # Label describing where the text came from: 'txt', 'docx', 'text_plain'
    source_type: str

class TextProcessor:
    """
    Handles plain text processing for anonymization

    The content is split into sections, every section is run through the
    StringParser, and the anonymized sections are spliced back into the
    original content.
    """

    def __init__(self, NamesToParse: Optional[List[str]] = None):
        """
        Initialize the text processor

        Args:
            NamesToParse: Optional list of names to parse and replace;
                handed unchanged to StringParser
        """
        self.string_parser = StringParser(NamesToParse)

    def _extractTablesFromText(self, content: str) -> Tuple[List[Any], List["PlainText"]]:
        """
        Extract tables and plain text from content

        Args:
            content: Content to process

        Returns:
            Tuple of (list of tables, list of plain text sections)
        """
        # Table detection is not implemented yet: the whole content is
        # returned as one plain text section and the table list stays empty.
        tables: List[Any] = []
        plainTexts = [PlainText(content=content, source_type='text_plain')]

        return tables, plainTexts

    def _anonymizePlainText(self, text: "PlainText") -> "PlainText":
        """
        Anonymize plain text content

        Args:
            text: PlainText object to anonymize

        Returns:
            PlainText: New object holding the anonymized content and the
            source_type of the input
        """
        anonymizedContent = self.string_parser.processString(text.content)

        return PlainText(content=anonymizedContent, source_type=text.source_type)

    @staticmethod
    def _spliceSections(content: str, replacements: List[Tuple[str, str]]) -> str:
        """
        Rebuild content with every section swapped for its anonymized form

        Sections are located left to right and each search starts after the
        previous match. Text inserted for an earlier section is therefore
        never re-scanned, and a section that occurs several times is only
        replaced at its own position (a plain str.replace on the running
        result would rewrite every occurrence).

        Args:
            content: Original content
            replacements: (original section text, anonymized section text)
                pairs in the order the sections appear in content

        Returns:
            str: Content with the anonymized sections spliced in
        """
        pieces: List[str] = []
        cursor = 0
        for original, anonymized in replacements:
            start = content.find(original, cursor)
            if start < 0:
                # Section cannot be located after the cursor; leave the
                # remaining content untouched for this pair.
                continue
            pieces.append(content[cursor:start])
            pieces.append(anonymized)
            cursor = start + len(original)
        # Keep whatever follows the last located section.
        pieces.append(content[cursor:])

        return "".join(pieces)

    def processTextContent(self, content: str) -> Tuple[str, Dict[str, str], List[Any], Dict[str, Any]]:
        """
        Process text content and return anonymized data

        Args:
            content: Text content to process

        Returns:
            Tuple of (anonymized_content, mapping, replaced_fields, processed_info).
            replaced_fields is always an empty list here; processed_info has
            the keys 'type' (always 'text') and 'tables'.
        """
        # Extract tables and plain text sections
        tables, plainTexts = self._extractTablesFromText(content)

        # Process plain text sections
        anonymizedTexts = [self._anonymizePlainText(text) for text in plainTexts]

        # Combine all processed content, position by position
        result = self._spliceSections(
            content,
            [(text.content, anonymized.content)
             for text, anonymized in zip(plainTexts, anonymizedTexts)],
        )

        # Get processing information (an empty table list yields an empty summary)
        processedInfo = {
            'type': 'text',
            'tables': [{'headers': t.headers, 'row_count': len(t.rows)} for t in tables],
        }

        return result, self.string_parser.getMapping(), [], processedInfo

    def getMapping(self) -> Dict[str, str]:
        """
        Get the current mapping of original values to placeholders

        Returns:
            Dict[str, str]: Mapping dictionary as reported by the string parser
        """
        return self.string_parser.getMapping()

    def clearMapping(self):
        """Clear the current mapping held by the string parser"""
        self.string_parser.clearMapping()