104 lines
3.3 KiB
Python
104 lines
3.3 KiB
Python
# Copyright (c) 2025 Patrick Motsch
|
|
# All rights reserved.
|
|
"""
|
|
Text processing module for data anonymization
|
|
Handles plain text processing without header information
|
|
"""
|
|
|
|
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

from modules.services.serviceNeutralization.subParseString import StringParser
|
|
|
|
@dataclass
class PlainText:
    """Represents a section of ordinary (non-tabular) text."""

    # The text of the section.
    content: str
    # Where the text came from: 'txt', 'docx' or 'text_plain'.
    source_type: str
|
|
|
|
class TextProcessor:
    """
    Handles plain text processing for anonymization

    The actual replacement work is delegated to a ``StringParser``; this
    class splits the input into sections, runs every plain-text section
    through the parser and reassembles the anonymized result.
    """

    def __init__(self, NamesToParse: Optional[List[str]] = None):
        """
        Initialize the text processor

        Args:
            NamesToParse: List of names to parse and replace. The value
                (including the default ``None``) is handed to
                ``StringParser`` unchanged.
        """
        self.string_parser = StringParser(NamesToParse)

    def _extractTablesFromText(
        self, content: str
    ) -> Tuple[List[Any], List[PlainText]]:
        """
        Extract tables and plain text from content

        Args:
            content: Content to process

        Returns:
            Tuple of (list of tables, list of plain text sections).
            Table detection is not implemented yet: the table list is
            always empty and the whole content is returned as a single
            ``PlainText`` section with source type ``'text_plain'``.
        """
        # For now, process the entire content as plain text.
        # This can be extended later to detect table-like structures.
        tables: List[Any] = []
        plainTexts = [PlainText(content=content, source_type='text_plain')]
        return tables, plainTexts

    def _anonymizePlainText(self, text: PlainText) -> PlainText:
        """
        Anonymize plain text content

        Args:
            text: PlainText object to anonymize

        Returns:
            PlainText: New object holding the parser's output for
            ``text.content`` and the same ``source_type``; the input
            object is not modified.
        """
        anonymizedContent = self.string_parser.processString(text.content)
        return PlainText(content=anonymizedContent, source_type=text.source_type)

    def processTextContent(
        self, content: str
    ) -> Tuple[str, Dict[str, str], List[Any], Dict[str, Any]]:
        """
        Process text content and return anonymized data

        Args:
            content: Text content to process

        Returns:
            Tuple of (anonymized_content, mapping, replaced_fields,
            processed_info). ``replaced_fields`` is always an empty list
            here; ``processed_info`` carries ``'type': 'text'`` and a
            ``'tables'`` summary list.
        """
        # Extract tables and plain text sections
        tables, plainTexts = self._extractTablesFromText(content)

        # Process plain text sections
        anonymizedTexts = [self._anonymizePlainText(text) for text in plainTexts]

        # Reassemble by position instead of a global str.replace(): replace()
        # rewrites *every* occurrence, can re-match inside text that an
        # earlier section already anonymized, and for an empty section would
        # splice the replacement between every character of the result.
        # Sections are assumed to be listed in document order.
        pieces: List[str] = []
        cursor = 0
        for text, anonymizedText in zip(plainTexts, anonymizedTexts):
            if not text.content:
                continue  # nothing to locate for an empty section
            start = content.find(text.content, cursor)
            if start < 0:
                continue  # section is not a verbatim slice of the input
            pieces.append(content[cursor:start])
            pieces.append(anonymizedText.content)
            cursor = start + len(text.content)

        if pieces:
            pieces.append(content[cursor:])
            result = "".join(pieces)
        else:
            # No section was substituted: hand the input back untouched.
            result = content

        # Get processing information. Table entries are expected to expose
        # ``headers`` and ``rows``; the list is empty while table detection
        # is not implemented.
        processedInfo = {
            'type': 'text',
            'tables': [
                {'headers': t.headers, 'row_count': len(t.rows)} for t in tables
            ],
        }

        return result, self.string_parser.getMapping(), [], processedInfo

    def getMapping(self) -> Dict[str, str]:
        """
        Get the current mapping of original values to placeholders

        Returns:
            Dict[str, str]: Mapping dictionary as reported by the
            underlying string parser
        """
        return self.string_parser.getMapping()

    def clearMapping(self) -> None:
        """Clear the current mapping held by the underlying string parser"""
        self.string_parser.clearMapping()
|