gateway/modules/features/neutralization/serviceNeutralization/subProcessText.py
2026-01-25 03:01:01 +01:00

104 lines
3.3 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Text processing module for data anonymization
Handles plain text processing without header information
"""
from dataclasses import dataclass
from typing import Any, Dict, List, Optional

from .subParseString import StringParser
@dataclass
class PlainText:
    """Represents ordinary (plain) text.

    Attributes:
        content: The text itself.
        source_type: Kind of source the text came from.
    """

    content: str
    # Source kind, e.g. 'txt', 'docx', 'text_plain'.
    source_type: str
class TextProcessor:
    """Handles plain text processing for anonymization.

    The content is split into table and plain-text sections, every
    plain-text section is run through a ``StringParser`` and the processed
    sections are spliced back into the original content.
    """

    def __init__(self, NamesToParse: Optional[List[str]] = None):
        """Initialize the text processor.

        Args:
            NamesToParse: List of names to parse and replace. Passed to
                ``StringParser`` unchanged (including ``None``).
        """
        self.string_parser = StringParser(NamesToParse)

    def _extractTablesFromText(self, content: str) -> tuple:
        """Split content into tables and plain-text sections.

        Args:
            content: Content to process.

        Returns:
            Tuple of (list of tables, list of plain text sections).
            Table detection is not implemented yet, so the table list is
            always empty and the whole content is returned as a single
            ``PlainText`` section with source type ``'text_plain'``.
        """
        # For now, the entire content is treated as plain text. This can be
        # extended later to detect table-like structures.
        tables = []
        plainTexts = [PlainText(content=content, source_type='text_plain')]
        return tables, plainTexts

    def _anonymizePlainText(self, text: PlainText) -> PlainText:
        """Anonymize one plain-text section.

        Args:
            text: PlainText object to anonymize.

        Returns:
            A new PlainText holding the parser's output and the same
            source type as the input.
        """
        anonymizedContent = self.string_parser.processString(text.content)
        return PlainText(content=anonymizedContent, source_type=text.source_type)

    def processTextContent(self, content: str) -> tuple:
        """Process text content and return anonymized data.

        Args:
            content: Text content to process.

        Returns:
            Tuple of (anonymized_content, mapping, replaced_fields,
            processed_info). ``mapping`` is the string parser's current
            mapping, ``replaced_fields`` is always an empty list here and
            ``processed_info`` is a dict with the keys ``'type'`` and
            ``'tables'``.
        """
        tables, plainTexts = self._extractTablesFromText(content)

        # Splice each processed section back at the position where its
        # original text occurs, walking the content left to right. A global
        # str.replace would also rewrite other occurrences of the same text,
        # including text already produced for an earlier section.
        pieces = []
        cursor = 0
        for text in plainTexts:
            # Always run the parser so its mapping is updated even when the
            # section cannot be located in the content.
            anonymizedText = self._anonymizePlainText(text)
            start = content.find(text.content, cursor)
            if start == -1:
                continue
            pieces.append(content[cursor:start])
            pieces.append(anonymizedText.content)
            cursor = start + len(text.content)
        pieces.append(content[cursor:])
        result = "".join(pieces)

        # Table entries are expected to expose ``headers`` and ``rows``; an
        # empty table list simply yields an empty summary list.
        processedInfo = {
            'type': 'text',
            'tables': [
                {'headers': t.headers, 'row_count': len(t.rows)}
                for t in tables
            ],
        }
        return result, self.string_parser.getMapping(), [], processedInfo

    def getMapping(self) -> Dict[str, str]:
        """Return the string parser's current mapping.

        Returns:
            Dict[str, str]: Mapping dictionary as reported by the parser
            (documented by the original author as original values to
            placeholders).
        """
        return self.string_parser.getMapping()

    def clearMapping(self):
        """Clear the string parser's current mapping."""
        self.string_parser.clearMapping()