# gateway/modules/services/serviceNeutralization/subProcessText.py
# 2025-12-15 21:55:26 +01:00

# 104 lines
# 3.3 KiB
# Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Text processing module for data anonymization
Handles plain text processing without header information
"""
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

from modules.services.serviceNeutralization.subParseString import StringParser
@dataclass
class PlainText:
    """Represents a section of ordinary (non-tabular) text."""

    # The text of the section.
    content: str
    # Label for where the text came from: 'txt', 'docx', 'text_plain'.
    source_type: str
class TextProcessor:
    """Handles plain text processing for anonymization.

    Content is split into sections, every plain text section is passed
    through a ``StringParser``, and the anonymized sections are put back
    at the positions of the originals.
    """

    def __init__(self, NamesToParse: Optional[List[str]] = None):
        """
        Initialize the text processor.

        Args:
            NamesToParse: List of names to parse and replace. Forwarded
                as-is (including ``None``) to ``StringParser``.
        """
        self.string_parser = StringParser(NamesToParse)

    def _extractTablesFromText(
        self, content: str
    ) -> Tuple[List[Any], List[PlainText]]:
        """
        Extract tables and plain text sections from content.

        Table detection is not implemented yet: the table list is always
        empty and the whole content is returned as a single plain text
        section tagged ``'text_plain'``.

        Args:
            content: Content to process

        Returns:
            Tuple of (list of tables, list of plain text sections)
        """
        tables: List[Any] = []
        plainTexts = [PlainText(content=content, source_type='text_plain')]
        return tables, plainTexts

    def _anonymizePlainText(self, text: PlainText) -> PlainText:
        """
        Anonymize one plain text section.

        Args:
            text: PlainText object to anonymize

        Returns:
            PlainText: New object holding the output of
            ``StringParser.processString`` with the same ``source_type``.
        """
        anonymizedContent = self.string_parser.processString(text.content)
        return PlainText(content=anonymizedContent, source_type=text.source_type)

    @staticmethod
    def _mergeAnonymizedSections(
        content: str,
        originals: List[PlainText],
        anonymized: List[PlainText],
    ) -> str:
        """
        Rebuild content with each original section swapped for its
        anonymized counterpart.

        Sections are located one after another, scanning forward from the
        end of the previous match. Each section therefore replaces exactly
        one span, and text outside the sections (or text that was already
        substituted) is never touched. A global ``str.replace`` would
        rewrite every occurrence and would splice the replacement between
        all characters for an empty section.

        Args:
            content: The unmodified input text
            originals: Sections as extracted from ``content``, in order
            anonymized: Anonymized sections, parallel to ``originals``

        Returns:
            str: Content with the located sections substituted
        """
        parts: List[str] = []
        cursor = 0
        for before, after in zip(originals, anonymized):
            start = content.find(before.content, cursor)
            if start < 0:
                # Section text does not occur after the cursor; leave as is.
                continue
            parts.append(content[cursor:start])
            parts.append(after.content)
            cursor = start + len(before.content)
        parts.append(content[cursor:])
        return "".join(parts)

    def processTextContent(
        self, content: str
    ) -> Tuple[str, Dict[str, str], List[Any], Dict[str, Any]]:
        """
        Process text content and return anonymized data.

        Args:
            content: Text content to process

        Returns:
            Tuple of (anonymized_content, mapping, replaced_fields,
            processed_info). ``mapping`` comes from the string parser,
            ``replaced_fields`` is always an empty list here, and
            ``processed_info`` holds ``'type': 'text'`` plus a ``'tables'``
            summary (headers and row count per extracted table).
        """
        # Extract tables and plain text sections
        tables, plainTexts = self._extractTablesFromText(content)

        # Process plain text sections
        anonymizedTexts = [self._anonymizePlainText(text) for text in plainTexts]

        # Combine all processed content
        result = self._mergeAnonymizedSections(content, plainTexts, anonymizedTexts)

        # Get processing information; an empty table list yields an empty summary
        processedInfo = {
            'type': 'text',
            'tables': [
                {'headers': t.headers, 'row_count': len(t.rows)} for t in tables
            ],
        }
        return result, self.string_parser.getMapping(), [], processedInfo

    def getMapping(self) -> Dict[str, str]:
        """
        Get the current mapping of original values to placeholders.

        Returns:
            Dict[str, str]: Mapping dictionary as returned by the string parser
        """
        return self.string_parser.getMapping()

    def clearMapping(self):
        """Clear the current mapping held by the string parser."""
        self.string_parser.clearMapping()