# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Text processing module for data anonymization
Handles plain text processing without header information
"""

from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

from .subParseString import StringParser


@dataclass
class PlainText:
    """Represents a section of plain text."""

    content: str
    source_type: str  # 'txt', 'docx', 'text_plain'


class TextProcessor:
    """Handles plain text processing for anonymization"""

    def __init__(self, NamesToParse: Optional[List[str]] = None):
        """
        Initialize the text processor

        Args:
            NamesToParse: List of names to parse and replace. Passed through
                unchanged (including None) to StringParser.
        """
        self.string_parser = StringParser(NamesToParse)

    def _extractTablesFromText(
        self, content: str
    ) -> Tuple[List[Any], List[PlainText]]:
        """
        Extract tables and plain text from content

        Args:
            content: Content to process

        Returns:
            Tuple of (list of tables, list of plain text sections). The table
            list is currently always empty and the whole content is returned
            as a single PlainText section with source_type 'text_plain'.
        """
        # For now, process the entire content as plain text.
        # This can be extended later to detect table-like structures.
        tables: List[Any] = []
        plainTexts = [PlainText(content=content, source_type='text_plain')]
        return tables, plainTexts

    def _anonymizePlainText(self, text: PlainText) -> PlainText:
        """
        Anonymize plain text content

        Args:
            text: PlainText object to anonymize

        Returns:
            PlainText: New object holding the string parser's output for
            text.content, with the original source_type carried over
        """
        anonymizedContent = self.string_parser.processString(text.content)
        return PlainText(content=anonymizedContent, source_type=text.source_type)

    def processTextContent(
        self, content: str
    ) -> Tuple[str, Dict[str, str], List[Any], Dict[str, Any]]:
        """
        Process text content and return anonymized data

        Args:
            content: Text content to process

        Returns:
            Tuple of (anonymized_content, mapping, replaced_fields,
            processed_info). replaced_fields is always an empty list here;
            processed_info is a dict with keys 'type' and 'tables'.
        """
        # Extract tables and plain text sections
        tables, plainTexts = self._extractTablesFromText(content)

        # Process plain text sections
        anonymizedTexts = [self._anonymizePlainText(text) for text in plainTexts]

        # Combine all processed content
        result = content
        for text, anonymizedText in zip(plainTexts, anonymizedTexts):
            if not text.content:
                # str.replace with an empty search string inserts the
                # replacement between every character; skip empty sections.
                continue
            if text.content != anonymizedText.content:
                result = result.replace(text.content, anonymizedText.content)

        # Get processing information
        processedInfo = {
            'type': 'text',
            'tables': [
                {'headers': t.headers, 'row_count': len(t.rows)}
                for t in (tables or [])
            ],
        }

        return result, self.string_parser.getMapping(), [], processedInfo

    def getMapping(self) -> Dict[str, str]:
        """
        Get the current mapping of original values to placeholders

        Returns:
            Dict[str, str]: Mapping dictionary as reported by the string parser
        """
        return self.string_parser.getMapping()

    def clearMapping(self) -> None:
        """Clear the current mapping held by the string parser"""
        self.string_parser.clearMapping()