""" Text processing module for data anonymization Handles plain text processing without header information """ from typing import Dict, List, Any from dataclasses import dataclass from modules.services.serviceNeutralization.subParseString import StringParser @dataclass class PlainText: """Repräsentiert normalen Text""" content: str source_type: str # 'txt', 'docx', 'text_plain' class TextProcessor: """Handles plain text processing for anonymization""" def __init__(self, NamesToParse: List[str] = None): """ Initialize the text processor Args: NamesToParse: List of names to parse and replace """ self.string_parser = StringParser(NamesToParse) def _extractTablesFromText(self, content: str) -> tuple: """ Extract tables and plain text from content Args: content: Content to process Returns: Tuple of (list of tables, list of plain text sections) """ # For now, process the entire content as plain text # This can be extended later to detect table-like structures tables = [] plainTexts = [PlainText(content=content, source_type='text_plain')] return tables, plainTexts def _anonymizePlainText(self, text: PlainText) -> PlainText: """ Anonymize plain text content Args: text: PlainText object to anonymize Returns: PlainText: Anonymized text """ # Use the string parser to process the content anonymizedContent = self.string_parser.processString(text.content) return PlainText(content=anonymizedContent, source_type=text.source_type) def processTextContent(self, content: str) -> tuple: """ Process text content and return anonymized data Args: content: Text content to process Returns: Tuple of (anonymized_content, mapping, replaced_fields, processed_info) """ # Extract tables and plain text sections tables, plainTexts = self._extractTablesFromText(content) # Process plain text sections anonymizedTexts = [self._anonymizePlainText(text) for text in plainTexts] # Combine all processed content result = content for text, anonymizedText in zip(plainTexts, anonymizedTexts): if text.content != anonymizedText.content: result = result.replace(text.content, anonymizedText.content) # Get processing information processedInfo = { 'type': 'text', 'tables': ([{'headers': t.headers, 'row_count': len(t.rows)} for t in tables] if tables else []) } return result, self.string_parser.getMapping(), [], processedInfo def getMapping(self) -> Dict[str, str]: """ Get the current mapping of original values to placeholders Returns: Dict[str, str]: Mapping dictionary """ return self.string_parser.getMapping() def clearMapping(self): """Clear the current mapping""" self.string_parser.clearMapping()