platform-core/modules/features/neutralization/serviceNeutralization/subProcessText.py
ValueOn AG 4a60086c80
Some checks failed
Deploy Plattform-Core (Int) / test (push) Failing after 15s
Deploy Plattform-Core (Int) / deploy (push) Has been skipped
cp adapted to 2026 poweron
2026-06-09 09:53:31 +02:00

104 lines
3.3 KiB
Python

# Copyright (c) 2026 PowerOn AG
# All rights reserved.
"""
Text processing module for data anonymization
Handles plain text processing without header information
"""
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

from .subParseString import StringParser
@dataclass
class PlainText:
    """Represents a section of plain text."""

    # The text of the section.
    content: str
    # Where the text came from, e.g. 'txt', 'docx' or 'text_plain'.
    source_type: str
class TextProcessor:
    """Handles plain text processing for anonymization.

    All replacement work is delegated to a ``StringParser`` instance held in
    ``self.string_parser``; this class only splits the content into sections,
    feeds them to the parser and assembles the result tuple.
    """

    def __init__(self, NamesToParse: Optional[List[str]] = None):
        """
        Initialize the text processor.

        Args:
            NamesToParse: List of names to parse and replace. Passed through
                to ``StringParser`` unchanged (including ``None``).
        """
        self.string_parser = StringParser(NamesToParse)

    def _extractTablesFromText(
        self, content: str
    ) -> Tuple[List[Any], List["PlainText"]]:
        """
        Split content into tables and plain text sections.

        Table detection is not implemented yet: the table list is always
        empty and the whole content is returned as one 'text_plain' section.

        Args:
            content: Content to process

        Returns:
            Tuple of (list of tables, list of plain text sections)
        """
        tables: List[Any] = []
        plainTexts = [PlainText(content=content, source_type='text_plain')]
        return tables, plainTexts

    def _anonymizePlainText(self, text: "PlainText") -> "PlainText":
        """
        Anonymize a single plain text section.

        Args:
            text: PlainText object to anonymize

        Returns:
            PlainText: New object holding the parser's output for
            ``text.content`` and the same ``source_type`` as the input.
        """
        anonymizedContent = self.string_parser.processString(text.content)
        return PlainText(content=anonymizedContent, source_type=text.source_type)

    def processTextContent(
        self, content: str
    ) -> Tuple[str, Dict[str, str], List[Any], Dict[str, Any]]:
        """
        Process text content and return anonymized data.

        Args:
            content: Text content to process

        Returns:
            Tuple of (anonymized_content, mapping, replaced_fields,
            processed_info). ``mapping`` is whatever the string parser's
            ``getMapping()`` returns, ``replaced_fields`` is always an empty
            list here, and ``processed_info`` is a dict with the keys
            'type' and 'tables'.
        """
        tables, plainTexts = self._extractTablesFromText(content)

        result = content
        for section in plainTexts:
            anonymized = self._anonymizePlainText(section)
            # Only touch the result when the parser actually changed the
            # section. str.replace substitutes every occurrence of the
            # section text, not just the first one.
            if section.content != anonymized.content:
                result = result.replace(section.content, anonymized.content)

        # A comprehension over an empty table list already yields [], so no
        # separate empty-case branch is needed.
        processedInfo = {
            'type': 'text',
            'tables': [
                {'headers': t.headers, 'row_count': len(t.rows)}
                for t in tables
            ],
        }
        return result, self.string_parser.getMapping(), [], processedInfo

    def getMapping(self) -> Dict[str, str]:
        """
        Get the current mapping of original values to placeholders.

        Returns:
            Dict[str, str]: The mapping as returned by the string parser.
        """
        return self.string_parser.getMapping()

    def clearMapping(self):
        """Clear the current mapping held by the string parser."""
        self.string_parser.clearMapping()