streamlined neutralization flow

This commit is contained in:
ValueOn AG 2026-03-30 00:14:57 +02:00
parent 7e88005721
commit e0a09ae6b1
13 changed files with 655 additions and 97 deletions

View file

@ -18,7 +18,9 @@ from typing import List, Dict, Any, Optional, AsyncGenerator, Union
from modules.datamodels.datamodelAi import AiModel, AiModelCall, AiModelResponse from modules.datamodels.datamodelAi import AiModel, AiModelCall, AiModelResponse
_RETRY_AFTER_PATTERN = _re.compile(r"try again in (\d+(?:\.\d+)?)\s*s", _re.IGNORECASE) _RETRY_AFTER_PATTERN = _re.compile(
r"(?:try again in|retry after)\s+(\d+(?:\.\d+)?)\s*s", _re.IGNORECASE
)
def _parseRetryAfterSeconds(message: str) -> float: def _parseRetryAfterSeconds(message: str) -> float:

View file

@ -22,7 +22,7 @@ import time
from typing import List, Optional, Dict, Any from typing import List, Optional, Dict, Any
from fastapi import HTTPException from fastapi import HTTPException
from modules.shared.configuration import APP_CONFIG from modules.shared.configuration import APP_CONFIG
from .aicoreBase import BaseConnectorAi from .aicoreBase import BaseConnectorAi, RateLimitExceededException
from modules.datamodels.datamodelAi import ( from modules.datamodels.datamodelAi import (
AiModel, AiModel,
PriorityEnum, PriorityEnum,
@ -370,6 +370,9 @@ class AiPrivateLlm(BaseConnectorAi):
if response.status_code != 200: if response.status_code != 200:
errorMessage = f"Private-LLM API error: {response.status_code} - {response.text}" errorMessage = f"Private-LLM API error: {response.status_code} - {response.text}"
if response.status_code == 429:
logger.warning(errorMessage)
raise RateLimitExceededException(errorMessage)
logger.error(errorMessage) logger.error(errorMessage)
raise HTTPException(status_code=500, detail=errorMessage) raise HTTPException(status_code=500, detail=errorMessage)
@ -461,6 +464,9 @@ class AiPrivateLlm(BaseConnectorAi):
if response.status_code != 200: if response.status_code != 200:
errorMessage = f"Private-LLM API error: {response.status_code} - {response.text}" errorMessage = f"Private-LLM API error: {response.status_code} - {response.text}"
if response.status_code == 429:
logger.warning(errorMessage)
raise RateLimitExceededException(errorMessage)
logger.error(errorMessage) logger.error(errorMessage)
raise HTTPException(status_code=500, detail=errorMessage) raise HTTPException(status_code=500, detail=errorMessage)

View file

@ -58,6 +58,17 @@ class DataNeutralizerAttributes(BaseModel):
originalText: str = Field(description="Original text that was neutralized", json_schema_extra={"frontend_type": "text", "frontend_readonly": True, "frontend_required": True}) originalText: str = Field(description="Original text that was neutralized", json_schema_extra={"frontend_type": "text", "frontend_readonly": True, "frontend_required": True})
fileId: Optional[str] = Field(default=None, description="ID of the file this attribute belongs to", json_schema_extra={"frontend_type": "text", "frontend_readonly": True, "frontend_required": False}) fileId: Optional[str] = Field(default=None, description="ID of the file this attribute belongs to", json_schema_extra={"frontend_type": "text", "frontend_readonly": True, "frontend_required": False})
patternType: str = Field(description="Type of pattern that matched (email, phone, name, etc.)", json_schema_extra={"frontend_type": "text", "frontend_readonly": True, "frontend_required": True}) patternType: str = Field(description="Type of pattern that matched (email, phone, name, etc.)", json_schema_extra={"frontend_type": "text", "frontend_readonly": True, "frontend_required": True})
class DataNeutralizationSnapshot(BaseModel):
"""Stores the full neutralized text (with embedded placeholders) per source."""
id: str = Field(default_factory=lambda: str(uuid.uuid4()))
mandateId: str = Field(description="Mandate scope")
featureInstanceId: str = Field(default="", description="Feature instance scope")
userId: str = Field(description="User who triggered neutralization")
sourceLabel: str = Field(description="Human label, e.g. 'Prompt', 'Kontext', 'Nachricht 3'")
neutralizedText: str = Field(description="Full text with [type.uuid] placeholders embedded")
placeholderCount: int = Field(default=0, description="Number of placeholders in the text")
registerModelLabels( registerModelLabels(
"DataNeutralizerAttributes", "DataNeutralizerAttributes",
{"en": "Neutralized Data Attribute", "fr": "Attribut de données neutralisées"}, {"en": "Neutralized Data Attribute", "fr": "Attribut de données neutralisées"},
@ -71,5 +82,18 @@ registerModelLabels(
"patternType": {"en": "Pattern Type", "fr": "Type de modèle"}, "patternType": {"en": "Pattern Type", "fr": "Type de modèle"},
}, },
) )
registerModelLabels(
"DataNeutralizationSnapshot",
{"en": "Neutralization Snapshot", "de": "Neutralisierungs-Snapshot"},
{
"id": {"en": "ID"},
"mandateId": {"en": "Mandate ID"},
"featureInstanceId": {"en": "Feature Instance ID"},
"userId": {"en": "User ID"},
"sourceLabel": {"en": "Source", "de": "Quelle"},
"neutralizedText": {"en": "Neutralized Text", "de": "Neutralisierter Text"},
"placeholderCount": {"en": "Placeholders", "de": "Platzhalter"},
},
)

View file

@ -11,6 +11,7 @@ from typing import Dict, List, Any, Optional
from modules.features.neutralization.datamodelFeatureNeutralizer import ( from modules.features.neutralization.datamodelFeatureNeutralizer import (
DataNeutraliserConfig, DataNeutraliserConfig,
DataNeutralizerAttributes, DataNeutralizerAttributes,
DataNeutralizationSnapshot,
) )
from modules.connectors.connectorDbPostgre import DatabaseConnector from modules.connectors.connectorDbPostgre import DatabaseConnector
from modules.interfaces.interfaceRbac import getRecordsetWithRBAC from modules.interfaces.interfaceRbac import getRecordsetWithRBAC
@ -227,6 +228,74 @@ class InterfaceFeatureNeutralizer:
logger.error(f"Error deleting attribute by ID: {str(e)}") logger.error(f"Error deleting attribute by ID: {str(e)}")
return False return False
# ------------------------------------------------------------------
# Snapshot CRUD
# ------------------------------------------------------------------
def getSnapshots(self) -> List[DataNeutralizationSnapshot]:
"""Return all neutralization snapshots for the current mandate + feature instance."""
try:
_filter: Dict[str, Any] = {"mandateId": self.mandateId}
if self.featureInstanceId:
_filter["featureInstanceId"] = self.featureInstanceId
rows = getRecordsetWithRBAC(
self.db,
DataNeutralizationSnapshot,
self.currentUser,
recordFilter=_filter,
mandateId=self.mandateId,
)
return [
DataNeutralizationSnapshot(**{k: v for k, v in r.items() if not k.startswith("_")})
for r in rows
]
except Exception as e:
logger.error(f"Error getting snapshots: {e}")
return []
def clearSnapshots(self) -> int:
"""Delete all snapshots for the current feature-instance scope. Returns count deleted."""
try:
_filter: Dict[str, Any] = {"mandateId": self.mandateId}
if self.featureInstanceId:
_filter["featureInstanceId"] = self.featureInstanceId
existing = self.db.getRecordset(DataNeutralizationSnapshot, recordFilter=_filter)
for row in existing:
self.db.recordDelete(DataNeutralizationSnapshot, row["id"])
return len(existing)
except Exception as e:
logger.error(f"Error clearing snapshots: {e}")
return 0
def createSnapshot(
self,
sourceLabel: str,
neutralizedText: str,
placeholderCount: int = 0,
) -> Optional[DataNeutralizationSnapshot]:
"""Persist one neutralization snapshot."""
try:
if not self.userId:
logger.warning("Cannot create snapshot: missing userId")
return None
snap = DataNeutralizationSnapshot(
mandateId=self.mandateId or "",
featureInstanceId=self.featureInstanceId or "",
userId=self.userId,
sourceLabel=sourceLabel,
neutralizedText=neutralizedText,
placeholderCount=placeholderCount,
)
created = self.db.recordCreate(DataNeutralizationSnapshot, snap.model_dump())
return DataNeutralizationSnapshot(**{k: v for k, v in created.items() if not k.startswith("_")})
except Exception as e:
logger.error(f"Error creating snapshot: {e}")
return None
# ------------------------------------------------------------------
# Attribute CRUD
# ------------------------------------------------------------------
def createAttribute( def createAttribute(
self, self,
attributeId: str, attributeId: str,

View file

@ -6,7 +6,7 @@ from typing import Any, Dict, List, Optional
from urllib.parse import urlparse, unquote from urllib.parse import urlparse, unquote
from modules.datamodels.datamodelUam import User from modules.datamodels.datamodelUam import User
from .datamodelFeatureNeutralizer import DataNeutralizerAttributes, DataNeutraliserConfig from .datamodelFeatureNeutralizer import DataNeutralizerAttributes, DataNeutraliserConfig, DataNeutralizationSnapshot
from .interfaceFeatureNeutralizer import getInterface as _getNeutralizerInterface from .interfaceFeatureNeutralizer import getInterface as _getNeutralizerInterface
from modules.serviceHub import getInterface as getServices from modules.serviceHub import getInterface as getServices
@ -86,7 +86,7 @@ class NeutralizationPlayground:
'neutralized_file_id': None, 'neutralized_file_id': None,
'processed_info': {'type': 'error', 'error': 'File could not be decoded as text. Supported: UTF-8, Latin-1. For PDF/Word/Excel, use supported binary formats.'} 'processed_info': {'type': 'error', 'error': 'File could not be decoded as text. Supported: UTF-8, Latin-1. For PDF/Word/Excel, use supported binary formats.'}
} }
result = self.services.neutralization.processText(text_content) result = await self.services.neutralization.processTextAsync(text_content)
result['neutralized_file_name'] = f'neutralized_{filename}' result['neutralized_file_name'] = f'neutralized_{filename}'
# Save neutralized text as file to user files # Save neutralized text as file to user files
if self.services.interfaceDbComponent and result.get('neutralized_text') is not None: if self.services.interfaceDbComponent and result.get('neutralized_text') is not None:
@ -198,12 +198,28 @@ class NeutralizationPlayground:
"""Resolve UIDs in neutralized text back to original text""" """Resolve UIDs in neutralized text back to original text"""
return self.services.neutralization.resolveText(text) return self.services.neutralization.resolveText(text)
def getSnapshots(self) -> List[DataNeutralizationSnapshot]:
"""Return stored neutralization text snapshots."""
try:
return self.services.neutralization.getSnapshots()
except Exception as e:
logger.error(f"Error getting snapshots: {e}")
return []
def getAttributes(self, fileId: str = None) -> List[DataNeutralizerAttributes]: def getAttributes(self, fileId: str = None) -> List[DataNeutralizerAttributes]:
"""Get neutralization attributes, optionally filtered by file ID""" """Get neutralization attributes, optionally filtered by file ID"""
try: try:
allAttributes = self.services.neutralization.getAttributes() allAttributes = self.services.neutralization.getAttributes()
if fileId: if fileId:
return [attr for attr in allAttributes if attr.fileId == fileId] want = str(fileId).strip()
def _matches(a: DataNeutralizerAttributes) -> bool:
af = a.fileId
if af is None or (isinstance(af, str) and not str(af).strip()):
return False
return str(af).strip() == want
return [attr for attr in allAttributes if _matches(attr)]
return allAttributes return allAttributes
except Exception as e: except Exception as e:
logger.error(f"Error getting attributes: {str(e)}") logger.error(f"Error getting attributes: {str(e)}")
@ -396,7 +412,7 @@ class SharepointProcessor:
textContent = fileContent.decode('utf-8') textContent = fileContent.decode('utf-8')
except UnicodeDecodeError: except UnicodeDecodeError:
textContent = fileContent.decode('latin-1') textContent = fileContent.decode('latin-1')
result = self.services.neutralization.processText(textContent) result = await self.services.neutralization.processTextAsync(textContent)
content_to_upload = (result.get('neutralized_text') or '').encode('utf-8') content_to_upload = (result.get('neutralized_text') or '').encode('utf-8')
neutralizedFilename = f"neutralized_{fileInfo['name']}" neutralizedFilename = f"neutralized_{fileInfo['name']}"

View file

@ -8,12 +8,33 @@ import logging
from modules.auth import limiter, getRequestContext, RequestContext from modules.auth import limiter, getRequestContext, RequestContext
# Import interfaces # Import interfaces
from .datamodelFeatureNeutralizer import DataNeutraliserConfig, DataNeutralizerAttributes from .datamodelFeatureNeutralizer import DataNeutraliserConfig, DataNeutralizerAttributes, DataNeutralizationSnapshot
from .neutralizePlayground import NeutralizationPlayground from .neutralizePlayground import NeutralizationPlayground
# Configure logger # Configure logger
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def _assertFeatureInstancePathMatchesContext(featureInstanceIdFromPath: str, context: RequestContext) -> None:
"""Reject path/instance mismatch when request context already carries an instance id."""
ctxId = str(context.featureInstanceId).strip() if getattr(context, "featureInstanceId", None) else ""
pathId = (featureInstanceIdFromPath or "").strip()
if ctxId and pathId and pathId != ctxId:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="Feature instance id in URL does not match request context (X-Instance-Id)",
)
def _fetchNeutralizationAttributes(context: RequestContext, fileId: Optional[str]) -> List[DataNeutralizerAttributes]:
service = NeutralizationPlayground(
context.user,
str(context.mandateId) if context.mandateId else "",
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
return service.getAttributes(fileId)
# Create router for neutralization endpoints # Create router for neutralization endpoints
router = APIRouter( router = APIRouter(
prefix="/api/neutralization", prefix="/api/neutralization",
@ -208,15 +229,9 @@ def get_neutralization_attributes(
) -> List[DataNeutralizerAttributes]: ) -> List[DataNeutralizerAttributes]:
"""Get neutralization attributes, optionally filtered by file ID""" """Get neutralization attributes, optionally filtered by file ID"""
try: try:
service = NeutralizationPlayground( return _fetchNeutralizationAttributes(context, fileId)
context.user, except HTTPException:
str(context.mandateId) if context.mandateId else "", raise
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None
)
attributes = service.getAttributes(fileId)
return attributes
except Exception as e: except Exception as e:
logger.error(f"Error getting neutralization attributes: {str(e)}") logger.error(f"Error getting neutralization attributes: {str(e)}")
raise HTTPException( raise HTTPException(
@ -224,6 +239,72 @@ def get_neutralization_attributes(
detail=f"Error getting neutralization attributes: {str(e)}" detail=f"Error getting neutralization attributes: {str(e)}"
) )
@router.get("/{feature_instance_id}/attributes", response_model=List[DataNeutralizerAttributes])
@limiter.limit("30/minute")
def get_neutralization_attributes_scoped(
request: Request,
feature_instance_id: str = Path(..., description="Workspace / feature instance id (must match X-Instance-Id when set)"),
fileId: Optional[str] = Query(None, description="Filter by file ID"),
context: RequestContext = Depends(getRequestContext),
) -> List[DataNeutralizerAttributes]:
"""Same as GET /attributes; path includes instance id for workspace UI compatibility."""
_assertFeatureInstancePathMatchesContext(feature_instance_id, context)
try:
return _fetchNeutralizationAttributes(context, fileId)
except HTTPException:
raise
except Exception as e:
logger.error(f"Error getting neutralization attributes: {str(e)}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Error getting neutralization attributes: {str(e)}"
)
@router.get("/snapshots", response_model=List[DataNeutralizationSnapshot])
@limiter.limit("30/minute")
def get_neutralization_snapshots(
request: Request,
context: RequestContext = Depends(getRequestContext),
) -> List[DataNeutralizationSnapshot]:
"""Return neutralized-text snapshots (full text with placeholders) for the current feature instance."""
try:
service = NeutralizationPlayground(
context.user,
str(context.mandateId) if context.mandateId else "",
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
return service.getSnapshots()
except HTTPException:
raise
except Exception as e:
logger.error(f"Error getting neutralization snapshots: {e}")
raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
@router.get("/{feature_instance_id}/snapshots", response_model=List[DataNeutralizationSnapshot])
@limiter.limit("30/minute")
def get_neutralization_snapshots_scoped(
request: Request,
feature_instance_id: str = Path(..., description="Workspace instance id (must match X-Instance-Id when set)"),
context: RequestContext = Depends(getRequestContext),
) -> List[DataNeutralizationSnapshot]:
"""Same as GET /snapshots; path includes instance id for workspace UI (explicit scope)."""
_assertFeatureInstancePathMatchesContext(feature_instance_id, context)
try:
service = NeutralizationPlayground(
context.user,
str(context.mandateId) if context.mandateId else "",
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
return service.getSnapshots()
except HTTPException:
raise
except Exception as e:
logger.error(f"Error getting neutralization snapshots (scoped): {e}")
raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
@router.post("/process-sharepoint", response_model=Dict[str, Any]) @router.post("/process-sharepoint", response_model=Dict[str, Any])
@limiter.limit("5/minute") @limiter.limit("5/minute")
async def process_sharepoint_files( async def process_sharepoint_files(
@ -317,6 +398,21 @@ def get_neutralization_stats(
detail=f"Error getting neutralization stats: {str(e)}" detail=f"Error getting neutralization stats: {str(e)}"
) )
def _deleteSingleNeutralizationAttribute(context: RequestContext, attributeId: str) -> Dict[str, str]:
service = NeutralizationPlayground(
context.user,
str(context.mandateId) if context.mandateId else "",
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
success = service.deleteAttribute(attributeId)
if success:
return {"message": f"Attribute {attributeId} deleted"}
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=f"Attribute {attributeId} not found",
)
@router.delete("/attributes/single/{attributeId}", response_model=Dict[str, str]) @router.delete("/attributes/single/{attributeId}", response_model=Dict[str, str])
@limiter.limit("30/minute") @limiter.limit("30/minute")
def deleteAttribute( def deleteAttribute(
@ -326,20 +422,7 @@ def deleteAttribute(
) -> Dict[str, str]: ) -> Dict[str, str]:
"""Delete a single neutralization attribute by ID.""" """Delete a single neutralization attribute by ID."""
try: try:
service = NeutralizationPlayground( return _deleteSingleNeutralizationAttribute(context, attributeId)
context.user,
str(context.mandateId) if context.mandateId else "",
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None
)
success = service.deleteAttribute(attributeId)
if success:
return {"message": f"Attribute {attributeId} deleted"}
else:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=f"Attribute {attributeId} not found"
)
except HTTPException: except HTTPException:
raise raise
except Exception as e: except Exception as e:
@ -347,6 +430,40 @@ def deleteAttribute(
raise HTTPException(status_code=500, detail=str(e)) raise HTTPException(status_code=500, detail=str(e))
@router.delete("/{feature_instance_id}/attributes/single/{attributeId}", response_model=Dict[str, str])
@limiter.limit("30/minute")
def deleteAttributeScoped(
request: Request,
feature_instance_id: str = Path(..., description="Workspace / feature instance id"),
attributeId: str = Path(..., description="Attribute ID to delete"),
context: RequestContext = Depends(getRequestContext),
) -> Dict[str, str]:
"""Same as DELETE /attributes/single/{attributeId}; path includes instance id for workspace UI."""
_assertFeatureInstancePathMatchesContext(feature_instance_id, context)
try:
return _deleteSingleNeutralizationAttribute(context, attributeId)
except HTTPException:
raise
except Exception as e:
logger.error(f"Error deleting attribute: {str(e)}")
raise HTTPException(status_code=500, detail=str(e))
def _retriggerNeutralizationBody(context: RequestContext, fileId: str) -> Dict[str, str]:
if not fileId:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="fileId is required",
)
service = NeutralizationPlayground(
context.user,
str(context.mandateId) if context.mandateId else "",
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
service.cleanupFileAttributes(fileId)
return {"message": f"Neutralization re-triggered for file {fileId}", "fileId": fileId}
@router.post("/retrigger", response_model=Dict[str, str]) @router.post("/retrigger", response_model=Dict[str, str])
@limiter.limit("10/minute") @limiter.limit("10/minute")
def retriggerNeutralization( def retriggerNeutralization(
@ -356,20 +473,26 @@ def retriggerNeutralization(
) -> Dict[str, str]: ) -> Dict[str, str]:
"""Re-trigger neutralization for a specific file.""" """Re-trigger neutralization for a specific file."""
try: try:
fileId = retriggerData.get("fileId", "") return _retriggerNeutralizationBody(context, retriggerData.get("fileId", ""))
if not fileId: except HTTPException:
raise HTTPException( raise
status_code=status.HTTP_400_BAD_REQUEST, except Exception as e:
detail="fileId is required" logger.error(f"Error re-triggering neutralization: {str(e)}")
) raise HTTPException(status_code=500, detail=str(e))
service = NeutralizationPlayground(
context.user, @router.post("/{feature_instance_id}/retrigger", response_model=Dict[str, str])
str(context.mandateId) if context.mandateId else "", @limiter.limit("10/minute")
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None def retriggerNeutralizationScoped(
) request: Request,
service.cleanupFileAttributes(fileId) feature_instance_id: str = Path(..., description="Workspace / feature instance id"),
return {"message": f"Neutralization re-triggered for file {fileId}", "fileId": fileId} retriggerData: Dict[str, str] = Body(...),
context: RequestContext = Depends(getRequestContext),
) -> Dict[str, str]:
"""Same as POST /retrigger; path includes instance id for workspace UI compatibility."""
_assertFeatureInstancePathMatchesContext(feature_instance_id, context)
try:
return _retriggerNeutralizationBody(context, retriggerData.get("fileId", ""))
except HTTPException: except HTTPException:
raise raise
except Exception as e: except Exception as e:

View file

@ -60,6 +60,12 @@ class NeutralizationService:
mandateId=serviceCenter.mandateId or dbApp.mandateId, mandateId=serviceCenter.mandateId or dbApp.mandateId,
featureInstanceId=getattr(serviceCenter, 'featureInstanceId', None) or getattr(dbApp, 'featureInstanceId', None) featureInstanceId=getattr(serviceCenter, 'featureInstanceId', None) or getattr(dbApp, 'featureInstanceId', None)
) )
elif serviceCenter and getattr(serviceCenter, "user", None):
self.interfaceNeutralizer = getNeutralizerInterface(
currentUser=serviceCenter.user,
mandateId=getattr(serviceCenter, 'mandateId', None) or getattr(serviceCenter, 'mandate_id', None),
featureInstanceId=getattr(serviceCenter, 'featureInstanceId', None) or getattr(serviceCenter, 'feature_instance_id', None),
)
namesList = NamesToParse if isinstance(NamesToParse, list) else [] namesList = NamesToParse if isinstance(NamesToParse, list) else []
self.NamesToParse = namesList self.NamesToParse = namesList
@ -82,11 +88,213 @@ class NeutralizationService:
# Public API: process text or file # Public API: process text or file
def processText(self, text: str) -> Dict[str, Any]: _NEUT_INSTRUCTION = (
"""Neutralize a raw text string and return a standard result dict.""" "Analyze the following text and identify ALL sensitive content that must be neutralized:\n"
result = self._neutralizeText(text, 'text') "1. Personal data (PII): names of persons, email addresses, phone numbers, "
self._persistAttributes(result.get('mapping', {}), None) "physical addresses, ID numbers, dates of birth, financial data (IBAN, account numbers), "
return result "social security numbers\n"
"2. Protected business logic: proprietary algorithms, trade secrets, confidential "
"processes, internal procedures, code snippets that reveal implementation details\n"
"3. Named entities: company names, product names, project names, brand names\n\n"
"Return ONLY a JSON array (no markdown, no explanation):\n"
'[{"text":"exact substring","type":"name|email|phone|address|id|financial|logic|company|product|location|other"}]\n\n'
"Rules:\n"
"- Every entry's 'text' must be an exact, verbatim substring of the input.\n"
"- Do NOT include generic words, common language constructs or non-sensitive terms.\n"
"- If nothing is sensitive, return [].\n\n"
)
_BYTES_PER_TOKEN = 3
_SELECTOR_MAX_RATIO = 0.8
_CHUNK_SAFETY_MARGIN = 0.9
def _resolveNeutModel(self):
"""Query the model registry for the best NEUTRALIZATION_TEXT model.
Returns the model object (with contextLength etc.) or None."""
try:
from modules.aicore.aicoreModelRegistry import modelRegistry
from modules.aicore.aicoreModelSelector import modelSelector as _modSel
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum
_models = modelRegistry.getAvailableModels()
_opts = AiCallOptions(operationType=OperationTypeEnum.NEUTRALIZATION_TEXT)
_failover = _modSel.getFailoverModelList("x", "", _opts, _models)
return _failover[0] if _failover else None
except Exception as _e:
logger.warning(f"_resolveNeutModel failed: {_e}")
return None
def _calcMaxChunkChars(self, model) -> int:
"""Derive the maximum text-chunk size (in characters) from the selected
model's contextLength, mirroring the rules in aicoreModelSelector:
promptTokens = promptBytes / 3 must be <= contextLength * 0.8
Subtract the instruction overhead and apply a safety margin."""
if not model or getattr(model, 'contextLength', 0) <= 0:
return 5000
_instructionBytes = len(self._NEUT_INSTRUCTION.encode('utf-8')) + 30
_maxPromptBytes = int(model.contextLength * self._SELECTOR_MAX_RATIO * self._BYTES_PER_TOKEN)
_maxChunkChars = int((_maxPromptBytes - _instructionBytes) * self._CHUNK_SAFETY_MARGIN)
return max(_maxChunkChars, 500)
@staticmethod
def _splitTextIntoChunks(text: str, maxChars: int) -> List[str]:
"""Split *text* into chunks of at most *maxChars*, preferring paragraph
then sentence boundaries so that the LLM sees coherent blocks."""
if len(text) <= maxChars:
return [text]
chunks: List[str] = []
remaining = text
while remaining:
if len(remaining) <= maxChars:
chunks.append(remaining)
break
_cut = maxChars
_para = remaining.rfind("\n\n", 0, _cut)
if _para > maxChars // 3:
_cut = _para + 2
else:
_nl = remaining.rfind("\n", 0, _cut)
if _nl > maxChars // 3:
_cut = _nl + 1
else:
_dot = remaining.rfind(". ", 0, _cut)
if _dot > maxChars // 3:
_cut = _dot + 2
else:
_sp = remaining.rfind(" ", 0, _cut)
if _sp > maxChars // 3:
_cut = _sp + 1
chunks.append(remaining[:_cut])
remaining = remaining[_cut:]
return chunks
async def _analyseChunk(self, aiService, chunkText: str) -> List[dict]:
"""Send one chunk to the NEUTRALIZATION_TEXT model, return raw findings list."""
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum
_prompt = self._NEUT_INSTRUCTION + "Text to analyze:\n---\n" + chunkText + "\n---"
_request = AiCallRequest(
prompt=_prompt,
options=AiCallOptions(operationType=OperationTypeEnum.NEUTRALIZATION_TEXT),
)
_response = await aiService.callAi(_request)
if not _response or not getattr(_response, 'content', None):
raise RuntimeError(
"Neutralization AI call returned no response "
"(no model available for NEUTRALIZATION_TEXT?)"
)
if getattr(_response, 'errorCount', 0) > 0 or getattr(_response, 'modelName', '') == 'error':
raise RuntimeError(
f"Neutralization AI call failed: {_response.content}"
)
_content = _response.content.strip()
if _content.startswith("```"):
_content = _content.split("\n", 1)[-1].rsplit("```", 1)[0].strip()
try:
return json.loads(_content)
except json.JSONDecodeError:
_bracket = _content.find("[")
if _bracket >= 0:
try:
return json.loads(_content[_bracket:])
except json.JSONDecodeError:
pass
return []
async def processTextAsync(self, text: str, fileId: Optional[str] = None) -> Dict[str, Any]:
"""AI-powered text neutralization with automatic chunking.
If *text* exceeds the safe token budget for the neutralization model
it is split into smaller chunks, each analysed separately. Findings
are merged and de-duplicated before placeholder replacement.
Regex patterns run as a supplementary pass to catch anything the
model missed.
"""
import uuid as _uuid
aiService = None
if self._getService:
try:
aiService = self._getService("ai")
except Exception:
pass
aiMapping: Dict[str, str] = {}
if not aiService or not hasattr(aiService, 'callAi'):
raise RuntimeError("Neutralization requires an AI service but none is available")
if text.strip():
_neutModel = self._resolveNeutModel()
_maxChunkChars = self._calcMaxChunkChars(_neutModel)
logger.info(
f"processTextAsync: model={getattr(_neutModel, 'name', '?')}, "
f"contextLength={getattr(_neutModel, 'contextLength', '?')} tokens, "
f"maxChunkChars={_maxChunkChars}"
)
_chunks = self._splitTextIntoChunks(text, _maxChunkChars)
if len(_chunks) > 1:
logger.info(
f"processTextAsync: text ({len(text)} chars) "
f"split into {len(_chunks)} chunk(s) of max {_maxChunkChars} chars"
)
for _chunkIdx, _chunkText in enumerate(_chunks):
_findings = await self._analyseChunk(aiService, _chunkText)
if not isinstance(_findings, list):
continue
for _f in _findings:
if not isinstance(_f, dict):
continue
_origText = _f.get("text", "")
_patType = _f.get("type", "other").lower()
if not _origText or _origText not in text:
continue
if _origText in aiMapping:
continue
_uid = str(_uuid.uuid4())
_placeholder = f"[{_patType}.{_uid}]"
aiMapping[_origText] = _placeholder
logger.info(f"AI neutralization found {len(aiMapping)} item(s)"
+ (f" across {len(_chunks)} chunk(s)" if len(_chunks) > 1 else ""))
neutralizedText = text
for _orig, _ph in sorted(aiMapping.items(), key=lambda x: -len(x[0])):
neutralizedText = neutralizedText.replace(_orig, _ph)
regexMapping: Dict[str, str] = {}
finalText = neutralizedText
allMapping = {**aiMapping, **regexMapping}
if allMapping:
_loop = asyncio.get_event_loop()
await _loop.run_in_executor(
None, self._persistAttributes, allMapping, fileId
)
logger.debug(f"processTextAsync: {len(allMapping)} attribute(s) persisted")
return {
'neutralized_text': finalText,
'mapping': allMapping,
'attributes': [
NeutralizationAttribute(original=k, placeholder=v)
for k, v in allMapping.items()
],
'processed_info': {'type': 'text', 'ai_findings': len(aiMapping), 'regex_findings': len(regexMapping)},
}
def processText(self, text: str, fileId: Optional[str] = None) -> Dict[str, Any]:
"""Sync wrapper around processTextAsync. Propagates errors."""
try:
return asyncio.run(self.processTextAsync(text, fileId))
except RuntimeError as _re:
if "cannot be called from a running event loop" in str(_re):
loop = asyncio.get_event_loop()
return loop.run_until_complete(self.processTextAsync(text, fileId))
raise
def processFile(self, fileId: str) -> Dict[str, Any]: def processFile(self, fileId: str) -> Dict[str, Any]:
"""Neutralize a file referenced by its fileId using component interface. """Neutralize a file referenced by its fileId using component interface.
@ -153,8 +361,7 @@ class NeutralizationService:
raise ValueError("Unable to decode file content as text.") raise ValueError("Unable to decode file content as text.")
textContent = decoded textContent = decoded
result = self._neutralizeText(textContent, textType) result = self.processText(textContent, fileId)
self._persistAttributes(result.get('mapping', {}), fileId)
if fileName: if fileName:
result['neutralized_file_name'] = f"neutralized_{fileName}" result['neutralized_file_name'] = f"neutralized_{fileName}"
result['file_id'] = fileId result['file_id'] = fileId
@ -319,6 +526,22 @@ class NeutralizationService:
return False return False
return self.interfaceNeutralizer.deleteNeutralizationAttributes(fileId) return self.interfaceNeutralizer.deleteNeutralizationAttributes(fileId)
def getSnapshots(self):
if not self.interfaceNeutralizer:
return []
return self.interfaceNeutralizer.getSnapshots()
def clearSnapshots(self) -> int:
if not self.interfaceNeutralizer:
return 0
return self.interfaceNeutralizer.clearSnapshots()
def saveSnapshot(self, sourceLabel: str, neutralizedText: str, placeholderCount: int = 0):
if not self.interfaceNeutralizer:
logger.warning("saveSnapshot: interfaceNeutralizer is None — snapshot not stored")
return None
return self.interfaceNeutralizer.createSnapshot(sourceLabel, neutralizedText, placeholderCount)
def _persistAttributes(self, mapping: Dict[str, str], fileId: Optional[str]) -> None: def _persistAttributes(self, mapping: Dict[str, str], fileId: Optional[str]) -> None:
"""Persist mapping to DB for resolve to work. mapping: originalText -> placeholder e.g. '[email.uuid]'""" """Persist mapping to DB for resolve to work. mapping: originalText -> placeholder e.g. '[email.uuid]'"""
if not self.interfaceNeutralizer or not mapping: if not self.interfaceNeutralizer or not mapping:
@ -393,7 +616,7 @@ class NeutralizationService:
except Exception as _imgErr: except Exception as _imgErr:
logger.warning(f"Image check failed in binary file '{fileName}': {_imgErr}, removing (fail-safe)") logger.warning(f"Image check failed in binary file '{fileName}': {_imgErr}, removing (fail-safe)")
continue continue
nr = self._neutralizeText(str(data), 'text' if type_group != 'table' else 'csv') nr = await self.processTextAsync(str(data), fileId)
proc = nr.get('processed_info', {}) or {} proc = nr.get('processed_info', {}) or {}
if isinstance(proc, dict) and proc.get('type') == 'error': if isinstance(proc, dict) and proc.get('type') == 'error':
neutralization_error = proc.get('error', 'Neutralization failed') neutralization_error = proc.get('error', 'Neutralization failed')
@ -402,7 +625,6 @@ class NeutralizationService:
all_mapping.update(mapping) all_mapping.update(mapping)
new_part = {**p, 'data': neu_text} new_part = {**p, 'data': neu_text}
neutralized_parts.append(new_part) neutralized_parts.append(new_part)
self._persistAttributes(all_mapping, fileId)
# 3. PDF: Use in-place only; no fallback to render # 3. PDF: Use in-place only; no fallback to render
if mimeType == "application/pdf": if mimeType == "application/pdf":
@ -546,10 +768,31 @@ class NeutralizationService:
# Helper functions # Helper functions
def _neutralizeTextLight(self, text: str) -> Dict[str, Any]:
"""Regex-only supplementary pass using already-initialised processors.
Unlike ``_neutralizeText`` this does **no** DB I/O
(``_reloadNamesFromConfig`` is skipped) so it is safe to call from
an async context without blocking the event-loop or risking a
DB-connection-pool deadlock during parallel document processing.
"""
try:
data, mapping, replaced_fields, processed_info = self.textProcessor.processTextContent(text)
neutralized_text = str(data)
attributes = [NeutralizationAttribute(original=k, placeholder=v) for k, v in mapping.items()]
return NeutralizationResult(
neutralized_text=neutralized_text,
mapping=mapping,
attributes=attributes,
processed_info=processed_info,
).model_dump()
except Exception as e:
logger.warning(f"_neutralizeTextLight error: {e}")
return {'neutralized_text': text, 'mapping': {}, 'attributes': [], 'processed_info': {'type': 'error', 'error': str(e)}}
def _neutralizeText(self, text: str, textType: str = None) -> Dict[str, Any]: def _neutralizeText(self, text: str, textType: str = None) -> Dict[str, Any]:
"""Process text and return unified dict for API consumption.""" """Process text and return unified dict for API consumption."""
try: try:
# Reload names from config before processing to ensure we have the latest names
self._reloadNamesFromConfig() self._reloadNamesFromConfig()
# Auto-detect content type if not provided # Auto-detect content type if not provided

View file

@ -689,6 +689,9 @@ async def _runWorkspaceAgent(
if allowedProviders: if allowedProviders:
aiService.services.allowedProviders = allowedProviders aiService.services.allowedProviders = allowedProviders
logger.info(f"Workspace agent: allowedProviders={allowedProviders}")
else:
logger.debug("Workspace agent: no allowedProviders in request")
if requireNeutralization is not None: if requireNeutralization is not None:
ctx.requireNeutralization = requireNeutralization ctx.requireNeutralization = requireNeutralization

View file

@ -690,8 +690,8 @@ def _registerCoreTools(registry: ToolRegistry, services):
if _fileNeedNeutralize: if _fileNeedNeutralize:
try: try:
_nSvc = services.getService("neutralization") if hasattr(services, "getService") else None _nSvc = services.getService("neutralization") if hasattr(services, "getService") else None
if _nSvc and hasattr(_nSvc, 'processText'): if _nSvc and hasattr(_nSvc, 'processTextAsync'):
_nResult = _nSvc.processText(text) _nResult = await _nSvc.processTextAsync(text, fileId)
if _nResult and _nResult.get("neutralized_text"): if _nResult and _nResult.get("neutralized_text"):
text = _nResult["neutralized_text"] text = _nResult["neutralized_text"]
logger.debug(f"readFile: neutralized text for file {fileId}") logger.debug(f"readFile: neutralized text for file {fileId}")
@ -3054,7 +3054,7 @@ def _registerCoreTools(registry: ToolRegistry, services):
if not neutralizationService.interfaceDbComponent: if not neutralizationService.interfaceDbComponent:
neutralizationService.interfaceDbComponent = services.chat.interfaceDbComponent neutralizationService.interfaceDbComponent = services.chat.interfaceDbComponent
if text: if text:
result = neutralizationService.processText(text) result = await neutralizationService.processTextAsync(text, fileId or None)
else: else:
result = neutralizationService.processFile(fileId) result = neutralizationService.processFile(fileId)
if result: if result:

View file

@ -181,13 +181,11 @@ class AiService:
_wasNeutralized = False _wasNeutralized = False
_excludedDocs: List[str] = [] _excludedDocs: List[str] = []
if self._shouldNeutralize(request): if self._shouldNeutralize(request):
request, _wasNeutralized, _excludedDocs = self._neutralizeRequest(request) request, _wasNeutralized, _excludedDocs = await self._neutralizeRequest(request)
if _excludedDocs: if _excludedDocs:
logger.warning(f"Neutralization partial failures (continuing): {_excludedDocs}") logger.warning(f"Neutralization partial failures (continuing): {_excludedDocs}")
# Set billing callback on aiObjects BEFORE the AI call logger.debug("callAi: neutralization phase done, starting main AI call")
# This callback is invoked by _callWithModel() after EVERY individual model call
# For parallel content parts (e.g., 200 MB doc), each model call creates its own transaction
self.aiObjects.billingCallback = self._createBillingCallback() self.aiObjects.billingCallback = self._createBillingCallback()
try: try:
@ -229,10 +227,11 @@ class AiService:
_wasNeutralized = False _wasNeutralized = False
_excludedDocs: List[str] = [] _excludedDocs: List[str] = []
if self._shouldNeutralize(request): if self._shouldNeutralize(request):
request, _wasNeutralized, _excludedDocs = self._neutralizeRequest(request) request, _wasNeutralized, _excludedDocs = await self._neutralizeRequest(request)
if _excludedDocs: if _excludedDocs:
logger.warning(f"Neutralization partial failures in stream (continuing): {_excludedDocs}") logger.warning(f"Neutralization partial failures in stream (continuing): {_excludedDocs}")
logger.debug("callAiStream: neutralization phase done, starting main AI stream")
self.aiObjects.billingCallback = self._createBillingCallback() self.aiObjects.billingCallback = self._createBillingCallback()
try: try:
async for chunk in self.aiObjects.callWithTextContextStream(request): async for chunk in self.aiObjects.callWithTextContextStream(request):
@ -557,6 +556,25 @@ detectedIntent-Werte:
# NEUTRALIZATION: Centralized prompt neutralization / response rehydration # NEUTRALIZATION: Centralized prompt neutralization / response rehydration
# ========================================================================= # =========================================================================
async def _hasNeutralizationModel(self) -> bool:
"""Fast check: is at least one model available for NEUTRALIZATION_TEXT
given the current effective provider list? No AI call is made."""
try:
from modules.aicore.aicoreModelRegistry import modelRegistry
from modules.aicore.aicoreModelSelector import modelSelector as _modSel
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum
_models = modelRegistry.getAvailableModels()
_providers = self._calculateEffectiveProviders()
if _providers:
_models = [m for m in _models if m.connectorType in _providers]
_opts = AiCallOptions(operationType=OperationTypeEnum.NEUTRALIZATION_TEXT)
_failover = _modSel.getFailoverModelList("x", "", _opts, _models)
return bool(_failover)
except Exception as _e:
logger.warning(f"_hasNeutralizationModel check failed: {_e}")
return True
def _shouldNeutralize(self, request: AiCallRequest) -> bool: def _shouldNeutralize(self, request: AiCallRequest) -> bool:
"""Check if this AI request should have neutralization applied. """Check if this AI request should have neutralization applied.
@ -566,11 +584,17 @@ detectedIntent-Werte:
3. Per-request (request.requireNeutralization) 3. Per-request (request.requireNeutralization)
No source can override another's True with False. No source can override another's True with False.
Neutralization calls themselves (NEUTRALIZATION_TEXT / NEUTRALIZATION_IMAGE)
are never re-neutralized (recursion guard).
""" """
try: try:
if not request.prompt and not request.messages and not request.context: if not request.prompt and not request.messages and not request.context:
return False return False
_opType = request.options.operationType if request.options else None
if _opType in (OperationTypeEnum.NEUTRALIZATION_TEXT, OperationTypeEnum.NEUTRALIZATION_IMAGE):
return False
_sources = [] _sources = []
# Source 1: Feature-Instance config # Source 1: Feature-Instance config
@ -600,11 +624,15 @@ detectedIntent-Werte:
logger.error(f"_shouldNeutralize check failed: {e} — defaulting to False") logger.error(f"_shouldNeutralize check failed: {e} — defaulting to False")
return False return False
def _neutralizeRequest(self, request: AiCallRequest) -> Tuple[AiCallRequest, bool, List[str]]: async def _neutralizeRequest(self, request: AiCallRequest) -> Tuple[AiCallRequest, bool, List[str]]:
"""Neutralize the prompt text and messages in an AiCallRequest. """Neutralize the prompt text and messages in an AiCallRequest (async).
Returns (modifiedRequest, wasNeutralized, excludedDocs). Returns (modifiedRequest, wasNeutralized, excludedDocs).
Uses ``processTextAsync`` which calls AI with NEUTRALIZATION_TEXT
to identify PII, protected logic and names then applies regex as
supplementary pass.
FAILSAFE behaviour when ``requireNeutralization is True`` (explicit): FAILSAFE behaviour when ``requireNeutralization is True`` (explicit):
- Service unavailable raises (caller must not send raw data to AI). - Service unavailable raises (caller must not send raw data to AI).
- Prompt neutralization fails raises. - Prompt neutralization fails raises.
@ -619,7 +647,7 @@ detectedIntent-Werte:
excludedDocs: List[str] = [] excludedDocs: List[str] = []
neutralSvc = self._get_service("neutralization") neutralSvc = self._get_service("neutralization")
if not neutralSvc or not hasattr(neutralSvc, 'processText'): if not neutralSvc or not hasattr(neutralSvc, 'processTextAsync'):
if _hardMode: if _hardMode:
raise RuntimeError("Neutralization explicitly required but service unavailable — AI call BLOCKED") raise RuntimeError("Neutralization explicitly required but service unavailable — AI call BLOCKED")
logger.warning("Neutralization required by config but service unavailable — continuing without neutralization") logger.warning("Neutralization required by config but service unavailable — continuing without neutralization")
@ -627,13 +655,25 @@ detectedIntent-Werte:
return request, False, excludedDocs return request, False, excludedDocs
_wasNeutralized = False _wasNeutralized = False
_snapshots: list = []
if _hardMode:
_hasNeutModel = await self._hasNeutralizationModel()
if not _hasNeutModel:
raise RuntimeError(
"Neutralisierung ist aktiviert, aber es ist kein AI-Modell für "
"NEUTRALIZATION_TEXT verfügbar. Bitte ein Modell für Neutralisierung "
"freigeben oder die Neutralisierung deaktivieren."
)
if request.prompt: if request.prompt:
logger.debug(f"_neutralizeRequest: neutralizing prompt ({len(request.prompt)} chars)")
try: try:
result = neutralSvc.processText(request.prompt) result = await neutralSvc.processTextAsync(request.prompt)
if result and result.get("neutralized_text"): if result and result.get("neutralized_text"):
request.prompt = result["neutralized_text"] request.prompt = result["neutralized_text"]
_wasNeutralized = True _wasNeutralized = True
_snapshots.append(("Prompt", result["neutralized_text"], len(result.get("mapping", {}))))
logger.debug("Neutralized prompt in AiCallRequest") logger.debug("Neutralized prompt in AiCallRequest")
else: else:
if _hardMode: if _hardMode:
@ -649,11 +689,13 @@ detectedIntent-Werte:
excludedDocs.append(f"Prompt neutralization error: {e}") excludedDocs.append(f"Prompt neutralization error: {e}")
if request.context: if request.context:
logger.debug(f"_neutralizeRequest: neutralizing context ({len(request.context)} chars)")
try: try:
result = neutralSvc.processText(request.context) result = await neutralSvc.processTextAsync(request.context)
if result and result.get("neutralized_text"): if result and result.get("neutralized_text"):
request.context = result["neutralized_text"] request.context = result["neutralized_text"]
_wasNeutralized = True _wasNeutralized = True
_snapshots.append(("Kontext", result["neutralized_text"], len(result.get("mapping", {}))))
logger.debug("Neutralized context in AiCallRequest") logger.debug("Neutralized context in AiCallRequest")
else: else:
if _hardMode: if _hardMode:
@ -668,6 +710,9 @@ detectedIntent-Werte:
logger.warning(f"Neutralization of context failed: {e} — sending original context") logger.warning(f"Neutralization of context failed: {e} — sending original context")
excludedDocs.append(f"Context neutralization error: {e}") excludedDocs.append(f"Context neutralization error: {e}")
_msgCount = len(request.messages) if request.messages and isinstance(request.messages, list) else 0
if _msgCount:
logger.debug(f"_neutralizeRequest: neutralizing {_msgCount} message(s)")
if request.messages and isinstance(request.messages, list): if request.messages and isinstance(request.messages, list):
cleanMessages = [] cleanMessages = []
for idx, msg in enumerate(request.messages): for idx, msg in enumerate(request.messages):
@ -680,27 +725,33 @@ detectedIntent-Werte:
cleanMessages.append(msg) cleanMessages.append(msg)
continue continue
try: try:
result = neutralSvc.processText(content) result = await neutralSvc.processTextAsync(content)
if result and result.get("neutralized_text"): if result and result.get("neutralized_text"):
msg["content"] = result["neutralized_text"] msg["content"] = result["neutralized_text"]
_wasNeutralized = True _wasNeutralized = True
_role = msg.get("role", "?")
_snapshots.append((f"Nachricht {idx+1} ({_role})", result["neutralized_text"], len(result.get("mapping", {}))))
cleanMessages.append(msg) cleanMessages.append(msg)
else: else:
if _hardMode: if _hardMode:
logger.warning(f"Message[{idx}] neutralization empty — REMOVING message (hard mode)") raise RuntimeError(
excludedDocs.append(f"Message[{idx}] neutralization failed; message REMOVED") f"Neutralisierung von Nachricht {idx+1}/{_msgCount} schlug fehl "
else: f"(leere Antwort). Konversation kann nicht sicher gesendet werden."
logger.warning(f"Neutralization of message[{idx}] returned no neutralized_text — keeping original") )
excludedDocs.append(f"Message[{idx}] neutralization failed; original kept") logger.warning(f"Neutralization of message[{idx}] returned no neutralized_text — keeping original")
cleanMessages.append(msg) excludedDocs.append(f"Message[{idx}] neutralization failed; original kept")
cleanMessages.append(msg)
except RuntimeError:
raise
except Exception as e: except Exception as e:
if _hardMode: if _hardMode:
logger.warning(f"Message[{idx}] neutralization error — REMOVING message (hard mode): {e}") raise RuntimeError(
excludedDocs.append(f"Message[{idx}] neutralization error; message REMOVED: {e}") f"Neutralisierung von Nachricht {idx+1}/{_msgCount} schlug fehl: {e}. "
else: f"Konversation kann nicht sicher gesendet werden."
logger.warning(f"Neutralization of message[{idx}] failed: {e} — keeping original") ) from e
excludedDocs.append(f"Message[{idx}] neutralization error: {e}") logger.warning(f"Neutralization of message[{idx}] failed: {e} — keeping original")
cleanMessages.append(msg) excludedDocs.append(f"Message[{idx}] neutralization error: {e}")
cleanMessages.append(msg)
elif isinstance(content, list): elif isinstance(content, list):
_cleanParts = [] _cleanParts = []
for _partIdx, _part in enumerate(content): for _partIdx, _part in enumerate(content):
@ -710,23 +761,29 @@ detectedIntent-Werte:
_partType = _part.get("type", "") _partType = _part.get("type", "")
if _partType == "text" and _part.get("text"): if _partType == "text" and _part.get("text"):
try: try:
_result = neutralSvc.processText(_part["text"]) _result = await neutralSvc.processTextAsync(_part["text"])
if _result and _result.get("neutralized_text"): if _result and _result.get("neutralized_text"):
_part["text"] = _result["neutralized_text"] _part["text"] = _result["neutralized_text"]
_wasNeutralized = True _wasNeutralized = True
_role = msg.get("role", "?")
_snapshots.append((f"Nachricht {idx+1}.{_partIdx+1} ({_role})", _result["neutralized_text"], len(_result.get("mapping", {}))))
_cleanParts.append(_part) _cleanParts.append(_part)
else: else:
if _hardMode: if _hardMode:
logger.warning(f"Message[{idx}].content[{_partIdx}] text neutralization empty — REMOVING part") raise RuntimeError(
excludedDocs.append(f"Message[{idx}].content[{_partIdx}] text removed") f"Neutralisierung von Nachricht {idx+1}, Teil {_partIdx+1} "
else: f"schlug fehl (leere Antwort)."
_cleanParts.append(_part) )
_cleanParts.append(_part)
except RuntimeError:
raise
except Exception as e: except Exception as e:
if _hardMode: if _hardMode:
logger.warning(f"Message[{idx}].content[{_partIdx}] text neutralization error — REMOVING: {e}") raise RuntimeError(
excludedDocs.append(f"Message[{idx}].content[{_partIdx}] text error: {e}") f"Neutralisierung von Nachricht {idx+1}, Teil {_partIdx+1} "
else: f"schlug fehl: {e}"
_cleanParts.append(_part) ) from e
_cleanParts.append(_part)
elif _partType == "image_url": elif _partType == "image_url":
if _hardMode: if _hardMode:
logger.warning(f"Message[{idx}].content[{_partIdx}] image_url — REMOVING (neutralization active)") logger.warning(f"Message[{idx}].content[{_partIdx}] image_url — REMOVING (neutralization active)")
@ -738,12 +795,12 @@ detectedIntent-Werte:
if _cleanParts: if _cleanParts:
msg["content"] = _cleanParts msg["content"] = _cleanParts
cleanMessages.append(msg) cleanMessages.append(msg)
elif _hardMode: else:
logger.warning(f"Message[{idx}] all parts removed — REMOVING message") cleanMessages.append(msg)
excludedDocs.append(f"Message[{idx}] fully removed after neutralization")
else: else:
cleanMessages.append(msg) cleanMessages.append(msg)
request.messages = cleanMessages request.messages = cleanMessages
logger.debug(f"_neutralizeRequest: messages done, {len(cleanMessages)} kept of {_msgCount}")
if hasattr(request, 'contentParts') and request.contentParts: if hasattr(request, 'contentParts') and request.contentParts:
_cleanParts = [] _cleanParts = []
@ -752,10 +809,11 @@ detectedIntent-Werte:
_data = getattr(_cp, 'data', '') or '' _data = getattr(_cp, 'data', '') or ''
if _tg in ('text', 'table') and _data: if _tg in ('text', 'table') and _data:
try: try:
_result = neutralSvc.processText(str(_data)) _result = await neutralSvc.processTextAsync(str(_data))
if _result and _result.get("neutralized_text"): if _result and _result.get("neutralized_text"):
_cp.data = _result["neutralized_text"] _cp.data = _result["neutralized_text"]
_wasNeutralized = True _wasNeutralized = True
_snapshots.append((f"Inhalt {_cpIdx+1} ({_tg})", _result["neutralized_text"], len(_result.get("mapping", {}))))
_cleanParts.append(_cp) _cleanParts.append(_cp)
else: else:
if _hardMode: if _hardMode:
@ -778,7 +836,18 @@ detectedIntent-Werte:
else: else:
_cleanParts.append(_cp) _cleanParts.append(_cp)
request.contentParts = _cleanParts request.contentParts = _cleanParts
logger.debug(f"_neutralizeRequest: contentParts done, {len(_cleanParts)} kept")
if _snapshots and _wasNeutralized:
try:
neutralSvc.clearSnapshots()
for _label, _text, _phCount in _snapshots:
neutralSvc.saveSnapshot(_label, _text, _phCount)
logger.debug(f"_neutralizeRequest: saved {len(_snapshots)} snapshot(s)")
except Exception as _snapErr:
logger.warning(f"_neutralizeRequest: could not save snapshots: {_snapErr}")
logger.info(f"_neutralizeRequest complete: neutralized={_wasNeutralized}, excluded={len(excludedDocs)}")
return request, _wasNeutralized, excludedDocs return request, _wasNeutralized, excludedDocs
def _rehydrateResponse(self, responseText: str) -> str: def _rehydrateResponse(self, responseText: str) -> str:

View file

@ -11,6 +11,7 @@ import logging
import importlib import importlib
from typing import Dict, Type, List, Optional, Tuple from typing import Dict, Type, List, Optional, Tuple
from .documentRendererBaseTemplate import BaseRenderer from .documentRendererBaseTemplate import BaseRenderer
from .codeRendererBaseTemplate import BaseCodeRenderer
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -52,9 +53,9 @@ class RendererRegistry:
for attrName in dir(module): for attrName in dir(module):
attr = getattr(module, attrName) attr = getattr(module, attrName)
if (isinstance(attr, type) and if (isinstance(attr, type) and
issubclass(attr, BaseRenderer) and issubclass(attr, BaseRenderer) and
attr != BaseRenderer and attr not in (BaseRenderer, BaseCodeRenderer) and
hasattr(attr, 'getSupportedFormats')): hasattr(attr, 'getSupportedFormats')):
self._registerRendererClass(attr) self._registerRendererClass(attr)
@ -72,6 +73,8 @@ class RendererRegistry:
"""Register a renderer class keyed by (format, outputStyle).""" """Register a renderer class keyed by (format, outputStyle)."""
try: try:
supportedFormats = rendererClass.getSupportedFormats() supportedFormats = rendererClass.getSupportedFormats()
if not supportedFormats:
return
priority = rendererClass.getPriority() if hasattr(rendererClass, 'getPriority') else 0 priority = rendererClass.getPriority() if hasattr(rendererClass, 'getPriority') else 0
for formatName in supportedFormats: for formatName in supportedFormats:

View file

@ -158,7 +158,7 @@ class KnowledgeService:
if not _textContent: if not _textContent:
continue continue
try: try:
_neutralResult = _neutralSvc.processText(_textContent) _neutralResult = await _neutralSvc.processTextAsync(_textContent, fileId)
if _neutralResult and _neutralResult.get("neutralized_text"): if _neutralResult and _neutralResult.get("neutralized_text"):
_obj["data"] = _neutralResult["neutralized_text"] _obj["data"] = _neutralResult["neutralized_text"]
_neutralizedObjects.append(_obj) _neutralizedObjects.append(_obj)

View file

@ -169,7 +169,7 @@ async def neutralizeData(self, parameters: Dict[str, Any]) -> ActionResult:
f"Neutralizing part {len(neutralizedParts) + 1} of document {i+1}" f"Neutralizing part {len(neutralizedParts) + 1} of document {i+1}"
) )
neutralizationResult = self.services.neutralization.processText(part.data) neutralizationResult = await self.services.neutralization.processTextAsync(part.data)
if neutralizationResult and 'neutralized_text' in neutralizationResult: if neutralizationResult and 'neutralized_text' in neutralizationResult:
neutralizedData = neutralizationResult['neutralized_text'] neutralizedData = neutralizationResult['neutralized_text']