streamlined neutralization flow

This commit is contained in:
ValueOn AG 2026-03-30 00:14:57 +02:00
parent 7e88005721
commit e0a09ae6b1
13 changed files with 655 additions and 97 deletions

View file

@ -18,7 +18,9 @@ from typing import List, Dict, Any, Optional, AsyncGenerator, Union
from modules.datamodels.datamodelAi import AiModel, AiModelCall, AiModelResponse
_RETRY_AFTER_PATTERN = _re.compile(r"try again in (\d+(?:\.\d+)?)\s*s", _re.IGNORECASE)
_RETRY_AFTER_PATTERN = _re.compile(
r"(?:try again in|retry after)\s+(\d+(?:\.\d+)?)\s*s", _re.IGNORECASE
)
def _parseRetryAfterSeconds(message: str) -> float:

View file

@ -22,7 +22,7 @@ import time
from typing import List, Optional, Dict, Any
from fastapi import HTTPException
from modules.shared.configuration import APP_CONFIG
from .aicoreBase import BaseConnectorAi
from .aicoreBase import BaseConnectorAi, RateLimitExceededException
from modules.datamodels.datamodelAi import (
AiModel,
PriorityEnum,
@ -370,6 +370,9 @@ class AiPrivateLlm(BaseConnectorAi):
if response.status_code != 200:
errorMessage = f"Private-LLM API error: {response.status_code} - {response.text}"
if response.status_code == 429:
logger.warning(errorMessage)
raise RateLimitExceededException(errorMessage)
logger.error(errorMessage)
raise HTTPException(status_code=500, detail=errorMessage)
@ -461,6 +464,9 @@ class AiPrivateLlm(BaseConnectorAi):
if response.status_code != 200:
errorMessage = f"Private-LLM API error: {response.status_code} - {response.text}"
if response.status_code == 429:
logger.warning(errorMessage)
raise RateLimitExceededException(errorMessage)
logger.error(errorMessage)
raise HTTPException(status_code=500, detail=errorMessage)

View file

@ -58,6 +58,17 @@ class DataNeutralizerAttributes(BaseModel):
originalText: str = Field(description="Original text that was neutralized", json_schema_extra={"frontend_type": "text", "frontend_readonly": True, "frontend_required": True})
fileId: Optional[str] = Field(default=None, description="ID of the file this attribute belongs to", json_schema_extra={"frontend_type": "text", "frontend_readonly": True, "frontend_required": False})
patternType: str = Field(description="Type of pattern that matched (email, phone, name, etc.)", json_schema_extra={"frontend_type": "text", "frontend_readonly": True, "frontend_required": True})
class DataNeutralizationSnapshot(BaseModel):
"""Stores the full neutralized text (with embedded placeholders) per source."""
id: str = Field(default_factory=lambda: str(uuid.uuid4()))
mandateId: str = Field(description="Mandate scope")
featureInstanceId: str = Field(default="", description="Feature instance scope")
userId: str = Field(description="User who triggered neutralization")
sourceLabel: str = Field(description="Human label, e.g. 'Prompt', 'Kontext', 'Nachricht 3'")
neutralizedText: str = Field(description="Full text with [type.uuid] placeholders embedded")
placeholderCount: int = Field(default=0, description="Number of placeholders in the text")
registerModelLabels(
"DataNeutralizerAttributes",
{"en": "Neutralized Data Attribute", "fr": "Attribut de données neutralisées"},
@ -71,5 +82,18 @@ registerModelLabels(
"patternType": {"en": "Pattern Type", "fr": "Type de modèle"},
},
)
registerModelLabels(
"DataNeutralizationSnapshot",
{"en": "Neutralization Snapshot", "de": "Neutralisierungs-Snapshot"},
{
"id": {"en": "ID"},
"mandateId": {"en": "Mandate ID"},
"featureInstanceId": {"en": "Feature Instance ID"},
"userId": {"en": "User ID"},
"sourceLabel": {"en": "Source", "de": "Quelle"},
"neutralizedText": {"en": "Neutralized Text", "de": "Neutralisierter Text"},
"placeholderCount": {"en": "Placeholders", "de": "Platzhalter"},
},
)

View file

@ -11,6 +11,7 @@ from typing import Dict, List, Any, Optional
from modules.features.neutralization.datamodelFeatureNeutralizer import (
DataNeutraliserConfig,
DataNeutralizerAttributes,
DataNeutralizationSnapshot,
)
from modules.connectors.connectorDbPostgre import DatabaseConnector
from modules.interfaces.interfaceRbac import getRecordsetWithRBAC
@ -227,6 +228,74 @@ class InterfaceFeatureNeutralizer:
logger.error(f"Error deleting attribute by ID: {str(e)}")
return False
# ------------------------------------------------------------------
# Snapshot CRUD
# ------------------------------------------------------------------
def getSnapshots(self) -> List[DataNeutralizationSnapshot]:
"""Return all neutralization snapshots for the current mandate + feature instance."""
try:
_filter: Dict[str, Any] = {"mandateId": self.mandateId}
if self.featureInstanceId:
_filter["featureInstanceId"] = self.featureInstanceId
rows = getRecordsetWithRBAC(
self.db,
DataNeutralizationSnapshot,
self.currentUser,
recordFilter=_filter,
mandateId=self.mandateId,
)
return [
DataNeutralizationSnapshot(**{k: v for k, v in r.items() if not k.startswith("_")})
for r in rows
]
except Exception as e:
logger.error(f"Error getting snapshots: {e}")
return []
def clearSnapshots(self) -> int:
"""Delete all snapshots for the current feature-instance scope. Returns count deleted."""
try:
_filter: Dict[str, Any] = {"mandateId": self.mandateId}
if self.featureInstanceId:
_filter["featureInstanceId"] = self.featureInstanceId
existing = self.db.getRecordset(DataNeutralizationSnapshot, recordFilter=_filter)
for row in existing:
self.db.recordDelete(DataNeutralizationSnapshot, row["id"])
return len(existing)
except Exception as e:
logger.error(f"Error clearing snapshots: {e}")
return 0
def createSnapshot(
self,
sourceLabel: str,
neutralizedText: str,
placeholderCount: int = 0,
) -> Optional[DataNeutralizationSnapshot]:
"""Persist one neutralization snapshot."""
try:
if not self.userId:
logger.warning("Cannot create snapshot: missing userId")
return None
snap = DataNeutralizationSnapshot(
mandateId=self.mandateId or "",
featureInstanceId=self.featureInstanceId or "",
userId=self.userId,
sourceLabel=sourceLabel,
neutralizedText=neutralizedText,
placeholderCount=placeholderCount,
)
created = self.db.recordCreate(DataNeutralizationSnapshot, snap.model_dump())
return DataNeutralizationSnapshot(**{k: v for k, v in created.items() if not k.startswith("_")})
except Exception as e:
logger.error(f"Error creating snapshot: {e}")
return None
# ------------------------------------------------------------------
# Attribute CRUD
# ------------------------------------------------------------------
def createAttribute(
self,
attributeId: str,

View file

@ -6,7 +6,7 @@ from typing import Any, Dict, List, Optional
from urllib.parse import urlparse, unquote
from modules.datamodels.datamodelUam import User
from .datamodelFeatureNeutralizer import DataNeutralizerAttributes, DataNeutraliserConfig
from .datamodelFeatureNeutralizer import DataNeutralizerAttributes, DataNeutraliserConfig, DataNeutralizationSnapshot
from .interfaceFeatureNeutralizer import getInterface as _getNeutralizerInterface
from modules.serviceHub import getInterface as getServices
@ -86,7 +86,7 @@ class NeutralizationPlayground:
'neutralized_file_id': None,
'processed_info': {'type': 'error', 'error': 'File could not be decoded as text. Supported: UTF-8, Latin-1. For PDF/Word/Excel, use supported binary formats.'}
}
result = self.services.neutralization.processText(text_content)
result = await self.services.neutralization.processTextAsync(text_content)
result['neutralized_file_name'] = f'neutralized_{filename}'
# Save neutralized text as file to user files
if self.services.interfaceDbComponent and result.get('neutralized_text') is not None:
@ -198,12 +198,28 @@ class NeutralizationPlayground:
"""Resolve UIDs in neutralized text back to original text"""
return self.services.neutralization.resolveText(text)
def getSnapshots(self) -> List[DataNeutralizationSnapshot]:
"""Return stored neutralization text snapshots."""
try:
return self.services.neutralization.getSnapshots()
except Exception as e:
logger.error(f"Error getting snapshots: {e}")
return []
def getAttributes(self, fileId: str = None) -> List[DataNeutralizerAttributes]:
"""Get neutralization attributes, optionally filtered by file ID"""
try:
allAttributes = self.services.neutralization.getAttributes()
if fileId:
return [attr for attr in allAttributes if attr.fileId == fileId]
want = str(fileId).strip()
def _matches(a: DataNeutralizerAttributes) -> bool:
af = a.fileId
if af is None or (isinstance(af, str) and not str(af).strip()):
return False
return str(af).strip() == want
return [attr for attr in allAttributes if _matches(attr)]
return allAttributes
except Exception as e:
logger.error(f"Error getting attributes: {str(e)}")
@ -396,7 +412,7 @@ class SharepointProcessor:
textContent = fileContent.decode('utf-8')
except UnicodeDecodeError:
textContent = fileContent.decode('latin-1')
result = self.services.neutralization.processText(textContent)
result = await self.services.neutralization.processTextAsync(textContent)
content_to_upload = (result.get('neutralized_text') or '').encode('utf-8')
neutralizedFilename = f"neutralized_{fileInfo['name']}"

View file

@ -8,12 +8,33 @@ import logging
from modules.auth import limiter, getRequestContext, RequestContext
# Import interfaces
from .datamodelFeatureNeutralizer import DataNeutraliserConfig, DataNeutralizerAttributes
from .datamodelFeatureNeutralizer import DataNeutraliserConfig, DataNeutralizerAttributes, DataNeutralizationSnapshot
from .neutralizePlayground import NeutralizationPlayground
# Configure logger
logger = logging.getLogger(__name__)
def _assertFeatureInstancePathMatchesContext(featureInstanceIdFromPath: str, context: RequestContext) -> None:
"""Reject path/instance mismatch when request context already carries an instance id."""
ctxId = str(context.featureInstanceId).strip() if getattr(context, "featureInstanceId", None) else ""
pathId = (featureInstanceIdFromPath or "").strip()
if ctxId and pathId and pathId != ctxId:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="Feature instance id in URL does not match request context (X-Instance-Id)",
)
def _fetchNeutralizationAttributes(context: RequestContext, fileId: Optional[str]) -> List[DataNeutralizerAttributes]:
service = NeutralizationPlayground(
context.user,
str(context.mandateId) if context.mandateId else "",
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
return service.getAttributes(fileId)
# Create router for neutralization endpoints
router = APIRouter(
prefix="/api/neutralization",
@ -208,15 +229,9 @@ def get_neutralization_attributes(
) -> List[DataNeutralizerAttributes]:
"""Get neutralization attributes, optionally filtered by file ID"""
try:
service = NeutralizationPlayground(
context.user,
str(context.mandateId) if context.mandateId else "",
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None
)
attributes = service.getAttributes(fileId)
return attributes
return _fetchNeutralizationAttributes(context, fileId)
except HTTPException:
raise
except Exception as e:
logger.error(f"Error getting neutralization attributes: {str(e)}")
raise HTTPException(
@ -224,6 +239,72 @@ def get_neutralization_attributes(
detail=f"Error getting neutralization attributes: {str(e)}"
)
@router.get("/{feature_instance_id}/attributes", response_model=List[DataNeutralizerAttributes])
@limiter.limit("30/minute")
def get_neutralization_attributes_scoped(
request: Request,
feature_instance_id: str = Path(..., description="Workspace / feature instance id (must match X-Instance-Id when set)"),
fileId: Optional[str] = Query(None, description="Filter by file ID"),
context: RequestContext = Depends(getRequestContext),
) -> List[DataNeutralizerAttributes]:
"""Same as GET /attributes; path includes instance id for workspace UI compatibility."""
_assertFeatureInstancePathMatchesContext(feature_instance_id, context)
try:
return _fetchNeutralizationAttributes(context, fileId)
except HTTPException:
raise
except Exception as e:
logger.error(f"Error getting neutralization attributes: {str(e)}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Error getting neutralization attributes: {str(e)}"
)
@router.get("/snapshots", response_model=List[DataNeutralizationSnapshot])
@limiter.limit("30/minute")
def get_neutralization_snapshots(
request: Request,
context: RequestContext = Depends(getRequestContext),
) -> List[DataNeutralizationSnapshot]:
"""Return neutralized-text snapshots (full text with placeholders) for the current feature instance."""
try:
service = NeutralizationPlayground(
context.user,
str(context.mandateId) if context.mandateId else "",
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
return service.getSnapshots()
except HTTPException:
raise
except Exception as e:
logger.error(f"Error getting neutralization snapshots: {e}")
raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
@router.get("/{feature_instance_id}/snapshots", response_model=List[DataNeutralizationSnapshot])
@limiter.limit("30/minute")
def get_neutralization_snapshots_scoped(
request: Request,
feature_instance_id: str = Path(..., description="Workspace instance id (must match X-Instance-Id when set)"),
context: RequestContext = Depends(getRequestContext),
) -> List[DataNeutralizationSnapshot]:
"""Same as GET /snapshots; path includes instance id for workspace UI (explicit scope)."""
_assertFeatureInstancePathMatchesContext(feature_instance_id, context)
try:
service = NeutralizationPlayground(
context.user,
str(context.mandateId) if context.mandateId else "",
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
return service.getSnapshots()
except HTTPException:
raise
except Exception as e:
logger.error(f"Error getting neutralization snapshots (scoped): {e}")
raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
@router.post("/process-sharepoint", response_model=Dict[str, Any])
@limiter.limit("5/minute")
async def process_sharepoint_files(
@ -317,6 +398,21 @@ def get_neutralization_stats(
detail=f"Error getting neutralization stats: {str(e)}"
)
def _deleteSingleNeutralizationAttribute(context: RequestContext, attributeId: str) -> Dict[str, str]:
service = NeutralizationPlayground(
context.user,
str(context.mandateId) if context.mandateId else "",
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
success = service.deleteAttribute(attributeId)
if success:
return {"message": f"Attribute {attributeId} deleted"}
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=f"Attribute {attributeId} not found",
)
@router.delete("/attributes/single/{attributeId}", response_model=Dict[str, str])
@limiter.limit("30/minute")
def deleteAttribute(
@ -326,20 +422,7 @@ def deleteAttribute(
) -> Dict[str, str]:
"""Delete a single neutralization attribute by ID."""
try:
service = NeutralizationPlayground(
context.user,
str(context.mandateId) if context.mandateId else "",
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None
)
success = service.deleteAttribute(attributeId)
if success:
return {"message": f"Attribute {attributeId} deleted"}
else:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=f"Attribute {attributeId} not found"
)
return _deleteSingleNeutralizationAttribute(context, attributeId)
except HTTPException:
raise
except Exception as e:
@ -347,6 +430,40 @@ def deleteAttribute(
raise HTTPException(status_code=500, detail=str(e))
@router.delete("/{feature_instance_id}/attributes/single/{attributeId}", response_model=Dict[str, str])
@limiter.limit("30/minute")
def deleteAttributeScoped(
request: Request,
feature_instance_id: str = Path(..., description="Workspace / feature instance id"),
attributeId: str = Path(..., description="Attribute ID to delete"),
context: RequestContext = Depends(getRequestContext),
) -> Dict[str, str]:
"""Same as DELETE /attributes/single/{attributeId}; path includes instance id for workspace UI."""
_assertFeatureInstancePathMatchesContext(feature_instance_id, context)
try:
return _deleteSingleNeutralizationAttribute(context, attributeId)
except HTTPException:
raise
except Exception as e:
logger.error(f"Error deleting attribute: {str(e)}")
raise HTTPException(status_code=500, detail=str(e))
def _retriggerNeutralizationBody(context: RequestContext, fileId: str) -> Dict[str, str]:
if not fileId:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="fileId is required",
)
service = NeutralizationPlayground(
context.user,
str(context.mandateId) if context.mandateId else "",
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
service.cleanupFileAttributes(fileId)
return {"message": f"Neutralization re-triggered for file {fileId}", "fileId": fileId}
@router.post("/retrigger", response_model=Dict[str, str])
@limiter.limit("10/minute")
def retriggerNeutralization(
@ -356,20 +473,26 @@ def retriggerNeutralization(
) -> Dict[str, str]:
"""Re-trigger neutralization for a specific file."""
try:
fileId = retriggerData.get("fileId", "")
if not fileId:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="fileId is required"
)
return _retriggerNeutralizationBody(context, retriggerData.get("fileId", ""))
except HTTPException:
raise
except Exception as e:
logger.error(f"Error re-triggering neutralization: {str(e)}")
raise HTTPException(status_code=500, detail=str(e))
service = NeutralizationPlayground(
context.user,
str(context.mandateId) if context.mandateId else "",
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None
)
service.cleanupFileAttributes(fileId)
return {"message": f"Neutralization re-triggered for file {fileId}", "fileId": fileId}
@router.post("/{feature_instance_id}/retrigger", response_model=Dict[str, str])
@limiter.limit("10/minute")
def retriggerNeutralizationScoped(
request: Request,
feature_instance_id: str = Path(..., description="Workspace / feature instance id"),
retriggerData: Dict[str, str] = Body(...),
context: RequestContext = Depends(getRequestContext),
) -> Dict[str, str]:
"""Same as POST /retrigger; path includes instance id for workspace UI compatibility."""
_assertFeatureInstancePathMatchesContext(feature_instance_id, context)
try:
return _retriggerNeutralizationBody(context, retriggerData.get("fileId", ""))
except HTTPException:
raise
except Exception as e:

View file

@ -60,6 +60,12 @@ class NeutralizationService:
mandateId=serviceCenter.mandateId or dbApp.mandateId,
featureInstanceId=getattr(serviceCenter, 'featureInstanceId', None) or getattr(dbApp, 'featureInstanceId', None)
)
elif serviceCenter and getattr(serviceCenter, "user", None):
self.interfaceNeutralizer = getNeutralizerInterface(
currentUser=serviceCenter.user,
mandateId=getattr(serviceCenter, 'mandateId', None) or getattr(serviceCenter, 'mandate_id', None),
featureInstanceId=getattr(serviceCenter, 'featureInstanceId', None) or getattr(serviceCenter, 'feature_instance_id', None),
)
namesList = NamesToParse if isinstance(NamesToParse, list) else []
self.NamesToParse = namesList
@ -82,11 +88,213 @@ class NeutralizationService:
# Public API: process text or file
def processText(self, text: str) -> Dict[str, Any]:
"""Neutralize a raw text string and return a standard result dict."""
result = self._neutralizeText(text, 'text')
self._persistAttributes(result.get('mapping', {}), None)
return result
_NEUT_INSTRUCTION = (
"Analyze the following text and identify ALL sensitive content that must be neutralized:\n"
"1. Personal data (PII): names of persons, email addresses, phone numbers, "
"physical addresses, ID numbers, dates of birth, financial data (IBAN, account numbers), "
"social security numbers\n"
"2. Protected business logic: proprietary algorithms, trade secrets, confidential "
"processes, internal procedures, code snippets that reveal implementation details\n"
"3. Named entities: company names, product names, project names, brand names\n\n"
"Return ONLY a JSON array (no markdown, no explanation):\n"
'[{"text":"exact substring","type":"name|email|phone|address|id|financial|logic|company|product|location|other"}]\n\n'
"Rules:\n"
"- Every entry's 'text' must be an exact, verbatim substring of the input.\n"
"- Do NOT include generic words, common language constructs or non-sensitive terms.\n"
"- If nothing is sensitive, return [].\n\n"
)
_BYTES_PER_TOKEN = 3
_SELECTOR_MAX_RATIO = 0.8
_CHUNK_SAFETY_MARGIN = 0.9
def _resolveNeutModel(self):
"""Query the model registry for the best NEUTRALIZATION_TEXT model.
Returns the model object (with contextLength etc.) or None."""
try:
from modules.aicore.aicoreModelRegistry import modelRegistry
from modules.aicore.aicoreModelSelector import modelSelector as _modSel
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum
_models = modelRegistry.getAvailableModels()
_opts = AiCallOptions(operationType=OperationTypeEnum.NEUTRALIZATION_TEXT)
_failover = _modSel.getFailoverModelList("x", "", _opts, _models)
return _failover[0] if _failover else None
except Exception as _e:
logger.warning(f"_resolveNeutModel failed: {_e}")
return None
def _calcMaxChunkChars(self, model) -> int:
"""Derive the maximum text-chunk size (in characters) from the selected
model's contextLength, mirroring the rules in aicoreModelSelector:
promptTokens = promptBytes / 3 must be <= contextLength * 0.8
Subtract the instruction overhead and apply a safety margin."""
if not model or getattr(model, 'contextLength', 0) <= 0:
return 5000
_instructionBytes = len(self._NEUT_INSTRUCTION.encode('utf-8')) + 30
_maxPromptBytes = int(model.contextLength * self._SELECTOR_MAX_RATIO * self._BYTES_PER_TOKEN)
_maxChunkChars = int((_maxPromptBytes - _instructionBytes) * self._CHUNK_SAFETY_MARGIN)
return max(_maxChunkChars, 500)
@staticmethod
def _splitTextIntoChunks(text: str, maxChars: int) -> List[str]:
"""Split *text* into chunks of at most *maxChars*, preferring paragraph
then sentence boundaries so that the LLM sees coherent blocks."""
if len(text) <= maxChars:
return [text]
chunks: List[str] = []
remaining = text
while remaining:
if len(remaining) <= maxChars:
chunks.append(remaining)
break
_cut = maxChars
_para = remaining.rfind("\n\n", 0, _cut)
if _para > maxChars // 3:
_cut = _para + 2
else:
_nl = remaining.rfind("\n", 0, _cut)
if _nl > maxChars // 3:
_cut = _nl + 1
else:
_dot = remaining.rfind(". ", 0, _cut)
if _dot > maxChars // 3:
_cut = _dot + 2
else:
_sp = remaining.rfind(" ", 0, _cut)
if _sp > maxChars // 3:
_cut = _sp + 1
chunks.append(remaining[:_cut])
remaining = remaining[_cut:]
return chunks
async def _analyseChunk(self, aiService, chunkText: str) -> List[dict]:
"""Send one chunk to the NEUTRALIZATION_TEXT model, return raw findings list."""
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum
_prompt = self._NEUT_INSTRUCTION + "Text to analyze:\n---\n" + chunkText + "\n---"
_request = AiCallRequest(
prompt=_prompt,
options=AiCallOptions(operationType=OperationTypeEnum.NEUTRALIZATION_TEXT),
)
_response = await aiService.callAi(_request)
if not _response or not getattr(_response, 'content', None):
raise RuntimeError(
"Neutralization AI call returned no response "
"(no model available for NEUTRALIZATION_TEXT?)"
)
if getattr(_response, 'errorCount', 0) > 0 or getattr(_response, 'modelName', '') == 'error':
raise RuntimeError(
f"Neutralization AI call failed: {_response.content}"
)
_content = _response.content.strip()
if _content.startswith("```"):
_content = _content.split("\n", 1)[-1].rsplit("```", 1)[0].strip()
try:
return json.loads(_content)
except json.JSONDecodeError:
_bracket = _content.find("[")
if _bracket >= 0:
try:
return json.loads(_content[_bracket:])
except json.JSONDecodeError:
pass
return []
async def processTextAsync(self, text: str, fileId: Optional[str] = None) -> Dict[str, Any]:
"""AI-powered text neutralization with automatic chunking.
If *text* exceeds the safe token budget for the neutralization model
it is split into smaller chunks, each analysed separately. Findings
are merged and de-duplicated before placeholder replacement.
Regex patterns run as a supplementary pass to catch anything the
model missed.
"""
import uuid as _uuid
aiService = None
if self._getService:
try:
aiService = self._getService("ai")
except Exception:
pass
aiMapping: Dict[str, str] = {}
if not aiService or not hasattr(aiService, 'callAi'):
raise RuntimeError("Neutralization requires an AI service but none is available")
if text.strip():
_neutModel = self._resolveNeutModel()
_maxChunkChars = self._calcMaxChunkChars(_neutModel)
logger.info(
f"processTextAsync: model={getattr(_neutModel, 'name', '?')}, "
f"contextLength={getattr(_neutModel, 'contextLength', '?')} tokens, "
f"maxChunkChars={_maxChunkChars}"
)
_chunks = self._splitTextIntoChunks(text, _maxChunkChars)
if len(_chunks) > 1:
logger.info(
f"processTextAsync: text ({len(text)} chars) "
f"split into {len(_chunks)} chunk(s) of max {_maxChunkChars} chars"
)
for _chunkIdx, _chunkText in enumerate(_chunks):
_findings = await self._analyseChunk(aiService, _chunkText)
if not isinstance(_findings, list):
continue
for _f in _findings:
if not isinstance(_f, dict):
continue
_origText = _f.get("text", "")
_patType = _f.get("type", "other").lower()
if not _origText or _origText not in text:
continue
if _origText in aiMapping:
continue
_uid = str(_uuid.uuid4())
_placeholder = f"[{_patType}.{_uid}]"
aiMapping[_origText] = _placeholder
logger.info(f"AI neutralization found {len(aiMapping)} item(s)"
+ (f" across {len(_chunks)} chunk(s)" if len(_chunks) > 1 else ""))
neutralizedText = text
for _orig, _ph in sorted(aiMapping.items(), key=lambda x: -len(x[0])):
neutralizedText = neutralizedText.replace(_orig, _ph)
regexMapping: Dict[str, str] = {}
finalText = neutralizedText
allMapping = {**aiMapping, **regexMapping}
if allMapping:
_loop = asyncio.get_event_loop()
await _loop.run_in_executor(
None, self._persistAttributes, allMapping, fileId
)
logger.debug(f"processTextAsync: {len(allMapping)} attribute(s) persisted")
return {
'neutralized_text': finalText,
'mapping': allMapping,
'attributes': [
NeutralizationAttribute(original=k, placeholder=v)
for k, v in allMapping.items()
],
'processed_info': {'type': 'text', 'ai_findings': len(aiMapping), 'regex_findings': len(regexMapping)},
}
def processText(self, text: str, fileId: Optional[str] = None) -> Dict[str, Any]:
"""Sync wrapper around processTextAsync. Propagates errors."""
try:
return asyncio.run(self.processTextAsync(text, fileId))
except RuntimeError as _re:
if "cannot be called from a running event loop" in str(_re):
loop = asyncio.get_event_loop()
return loop.run_until_complete(self.processTextAsync(text, fileId))
raise
def processFile(self, fileId: str) -> Dict[str, Any]:
"""Neutralize a file referenced by its fileId using component interface.
@ -153,8 +361,7 @@ class NeutralizationService:
raise ValueError("Unable to decode file content as text.")
textContent = decoded
result = self._neutralizeText(textContent, textType)
self._persistAttributes(result.get('mapping', {}), fileId)
result = self.processText(textContent, fileId)
if fileName:
result['neutralized_file_name'] = f"neutralized_{fileName}"
result['file_id'] = fileId
@ -319,6 +526,22 @@ class NeutralizationService:
return False
return self.interfaceNeutralizer.deleteNeutralizationAttributes(fileId)
def getSnapshots(self):
if not self.interfaceNeutralizer:
return []
return self.interfaceNeutralizer.getSnapshots()
def clearSnapshots(self) -> int:
if not self.interfaceNeutralizer:
return 0
return self.interfaceNeutralizer.clearSnapshots()
def saveSnapshot(self, sourceLabel: str, neutralizedText: str, placeholderCount: int = 0):
if not self.interfaceNeutralizer:
logger.warning("saveSnapshot: interfaceNeutralizer is None — snapshot not stored")
return None
return self.interfaceNeutralizer.createSnapshot(sourceLabel, neutralizedText, placeholderCount)
def _persistAttributes(self, mapping: Dict[str, str], fileId: Optional[str]) -> None:
"""Persist mapping to DB for resolve to work. mapping: originalText -> placeholder e.g. '[email.uuid]'"""
if not self.interfaceNeutralizer or not mapping:
@ -393,7 +616,7 @@ class NeutralizationService:
except Exception as _imgErr:
logger.warning(f"Image check failed in binary file '{fileName}': {_imgErr}, removing (fail-safe)")
continue
nr = self._neutralizeText(str(data), 'text' if type_group != 'table' else 'csv')
nr = await self.processTextAsync(str(data), fileId)
proc = nr.get('processed_info', {}) or {}
if isinstance(proc, dict) and proc.get('type') == 'error':
neutralization_error = proc.get('error', 'Neutralization failed')
@ -402,7 +625,6 @@ class NeutralizationService:
all_mapping.update(mapping)
new_part = {**p, 'data': neu_text}
neutralized_parts.append(new_part)
self._persistAttributes(all_mapping, fileId)
# 3. PDF: Use in-place only; no fallback to render
if mimeType == "application/pdf":
@ -546,10 +768,31 @@ class NeutralizationService:
# Helper functions
def _neutralizeTextLight(self, text: str) -> Dict[str, Any]:
"""Regex-only supplementary pass using already-initialised processors.
Unlike ``_neutralizeText`` this does **no** DB I/O
(``_reloadNamesFromConfig`` is skipped) so it is safe to call from
an async context without blocking the event-loop or risking a
DB-connection-pool deadlock during parallel document processing.
"""
try:
data, mapping, replaced_fields, processed_info = self.textProcessor.processTextContent(text)
neutralized_text = str(data)
attributes = [NeutralizationAttribute(original=k, placeholder=v) for k, v in mapping.items()]
return NeutralizationResult(
neutralized_text=neutralized_text,
mapping=mapping,
attributes=attributes,
processed_info=processed_info,
).model_dump()
except Exception as e:
logger.warning(f"_neutralizeTextLight error: {e}")
return {'neutralized_text': text, 'mapping': {}, 'attributes': [], 'processed_info': {'type': 'error', 'error': str(e)}}
def _neutralizeText(self, text: str, textType: str = None) -> Dict[str, Any]:
"""Process text and return unified dict for API consumption."""
try:
# Reload names from config before processing to ensure we have the latest names
self._reloadNamesFromConfig()
# Auto-detect content type if not provided

View file

@ -689,6 +689,9 @@ async def _runWorkspaceAgent(
if allowedProviders:
aiService.services.allowedProviders = allowedProviders
logger.info(f"Workspace agent: allowedProviders={allowedProviders}")
else:
logger.debug("Workspace agent: no allowedProviders in request")
if requireNeutralization is not None:
ctx.requireNeutralization = requireNeutralization

View file

@ -690,8 +690,8 @@ def _registerCoreTools(registry: ToolRegistry, services):
if _fileNeedNeutralize:
try:
_nSvc = services.getService("neutralization") if hasattr(services, "getService") else None
if _nSvc and hasattr(_nSvc, 'processText'):
_nResult = _nSvc.processText(text)
if _nSvc and hasattr(_nSvc, 'processTextAsync'):
_nResult = await _nSvc.processTextAsync(text, fileId)
if _nResult and _nResult.get("neutralized_text"):
text = _nResult["neutralized_text"]
logger.debug(f"readFile: neutralized text for file {fileId}")
@ -3054,7 +3054,7 @@ def _registerCoreTools(registry: ToolRegistry, services):
if not neutralizationService.interfaceDbComponent:
neutralizationService.interfaceDbComponent = services.chat.interfaceDbComponent
if text:
result = neutralizationService.processText(text)
result = await neutralizationService.processTextAsync(text, fileId or None)
else:
result = neutralizationService.processFile(fileId)
if result:

View file

@ -181,13 +181,11 @@ class AiService:
_wasNeutralized = False
_excludedDocs: List[str] = []
if self._shouldNeutralize(request):
request, _wasNeutralized, _excludedDocs = self._neutralizeRequest(request)
request, _wasNeutralized, _excludedDocs = await self._neutralizeRequest(request)
if _excludedDocs:
logger.warning(f"Neutralization partial failures (continuing): {_excludedDocs}")
# Set billing callback on aiObjects BEFORE the AI call
# This callback is invoked by _callWithModel() after EVERY individual model call
# For parallel content parts (e.g., 200 MB doc), each model call creates its own transaction
logger.debug("callAi: neutralization phase done, starting main AI call")
self.aiObjects.billingCallback = self._createBillingCallback()
try:
@ -229,10 +227,11 @@ class AiService:
_wasNeutralized = False
_excludedDocs: List[str] = []
if self._shouldNeutralize(request):
request, _wasNeutralized, _excludedDocs = self._neutralizeRequest(request)
request, _wasNeutralized, _excludedDocs = await self._neutralizeRequest(request)
if _excludedDocs:
logger.warning(f"Neutralization partial failures in stream (continuing): {_excludedDocs}")
logger.debug("callAiStream: neutralization phase done, starting main AI stream")
self.aiObjects.billingCallback = self._createBillingCallback()
try:
async for chunk in self.aiObjects.callWithTextContextStream(request):
@ -557,6 +556,25 @@ detectedIntent-Werte:
# NEUTRALIZATION: Centralized prompt neutralization / response rehydration
# =========================================================================
async def _hasNeutralizationModel(self) -> bool:
"""Fast check: is at least one model available for NEUTRALIZATION_TEXT
given the current effective provider list? No AI call is made."""
try:
from modules.aicore.aicoreModelRegistry import modelRegistry
from modules.aicore.aicoreModelSelector import modelSelector as _modSel
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum
_models = modelRegistry.getAvailableModels()
_providers = self._calculateEffectiveProviders()
if _providers:
_models = [m for m in _models if m.connectorType in _providers]
_opts = AiCallOptions(operationType=OperationTypeEnum.NEUTRALIZATION_TEXT)
_failover = _modSel.getFailoverModelList("x", "", _opts, _models)
return bool(_failover)
except Exception as _e:
logger.warning(f"_hasNeutralizationModel check failed: {_e}")
return True
def _shouldNeutralize(self, request: AiCallRequest) -> bool:
"""Check if this AI request should have neutralization applied.
@ -566,11 +584,17 @@ detectedIntent-Werte:
3. Per-request (request.requireNeutralization)
No source can override another's True with False.
Neutralization calls themselves (NEUTRALIZATION_TEXT / NEUTRALIZATION_IMAGE)
are never re-neutralized (recursion guard).
"""
try:
if not request.prompt and not request.messages and not request.context:
return False
_opType = request.options.operationType if request.options else None
if _opType in (OperationTypeEnum.NEUTRALIZATION_TEXT, OperationTypeEnum.NEUTRALIZATION_IMAGE):
return False
_sources = []
# Source 1: Feature-Instance config
@ -600,11 +624,15 @@ detectedIntent-Werte:
logger.error(f"_shouldNeutralize check failed: {e} — defaulting to False")
return False
def _neutralizeRequest(self, request: AiCallRequest) -> Tuple[AiCallRequest, bool, List[str]]:
"""Neutralize the prompt text and messages in an AiCallRequest.
async def _neutralizeRequest(self, request: AiCallRequest) -> Tuple[AiCallRequest, bool, List[str]]:
"""Neutralize the prompt text and messages in an AiCallRequest (async).
Returns (modifiedRequest, wasNeutralized, excludedDocs).
Uses ``processTextAsync`` which calls AI with NEUTRALIZATION_TEXT
to identify PII, protected logic and names then applies regex as
supplementary pass.
FAILSAFE behaviour when ``requireNeutralization is True`` (explicit):
- Service unavailable raises (caller must not send raw data to AI).
- Prompt neutralization fails raises.
@ -619,7 +647,7 @@ detectedIntent-Werte:
excludedDocs: List[str] = []
neutralSvc = self._get_service("neutralization")
if not neutralSvc or not hasattr(neutralSvc, 'processText'):
if not neutralSvc or not hasattr(neutralSvc, 'processTextAsync'):
if _hardMode:
raise RuntimeError("Neutralization explicitly required but service unavailable — AI call BLOCKED")
logger.warning("Neutralization required by config but service unavailable — continuing without neutralization")
@ -627,13 +655,25 @@ detectedIntent-Werte:
return request, False, excludedDocs
_wasNeutralized = False
_snapshots: list = []
if _hardMode:
_hasNeutModel = await self._hasNeutralizationModel()
if not _hasNeutModel:
raise RuntimeError(
"Neutralisierung ist aktiviert, aber es ist kein AI-Modell für "
"NEUTRALIZATION_TEXT verfügbar. Bitte ein Modell für Neutralisierung "
"freigeben oder die Neutralisierung deaktivieren."
)
if request.prompt:
logger.debug(f"_neutralizeRequest: neutralizing prompt ({len(request.prompt)} chars)")
try:
result = neutralSvc.processText(request.prompt)
result = await neutralSvc.processTextAsync(request.prompt)
if result and result.get("neutralized_text"):
request.prompt = result["neutralized_text"]
_wasNeutralized = True
_snapshots.append(("Prompt", result["neutralized_text"], len(result.get("mapping", {}))))
logger.debug("Neutralized prompt in AiCallRequest")
else:
if _hardMode:
@ -649,11 +689,13 @@ detectedIntent-Werte:
excludedDocs.append(f"Prompt neutralization error: {e}")
if request.context:
logger.debug(f"_neutralizeRequest: neutralizing context ({len(request.context)} chars)")
try:
result = neutralSvc.processText(request.context)
result = await neutralSvc.processTextAsync(request.context)
if result and result.get("neutralized_text"):
request.context = result["neutralized_text"]
_wasNeutralized = True
_snapshots.append(("Kontext", result["neutralized_text"], len(result.get("mapping", {}))))
logger.debug("Neutralized context in AiCallRequest")
else:
if _hardMode:
@ -668,6 +710,9 @@ detectedIntent-Werte:
logger.warning(f"Neutralization of context failed: {e} — sending original context")
excludedDocs.append(f"Context neutralization error: {e}")
_msgCount = len(request.messages) if request.messages and isinstance(request.messages, list) else 0
if _msgCount:
logger.debug(f"_neutralizeRequest: neutralizing {_msgCount} message(s)")
if request.messages and isinstance(request.messages, list):
cleanMessages = []
for idx, msg in enumerate(request.messages):
@ -680,27 +725,33 @@ detectedIntent-Werte:
cleanMessages.append(msg)
continue
try:
result = neutralSvc.processText(content)
result = await neutralSvc.processTextAsync(content)
if result and result.get("neutralized_text"):
msg["content"] = result["neutralized_text"]
_wasNeutralized = True
_role = msg.get("role", "?")
_snapshots.append((f"Nachricht {idx+1} ({_role})", result["neutralized_text"], len(result.get("mapping", {}))))
cleanMessages.append(msg)
else:
if _hardMode:
logger.warning(f"Message[{idx}] neutralization empty — REMOVING message (hard mode)")
excludedDocs.append(f"Message[{idx}] neutralization failed; message REMOVED")
else:
logger.warning(f"Neutralization of message[{idx}] returned no neutralized_text — keeping original")
excludedDocs.append(f"Message[{idx}] neutralization failed; original kept")
cleanMessages.append(msg)
raise RuntimeError(
f"Neutralisierung von Nachricht {idx+1}/{_msgCount} schlug fehl "
f"(leere Antwort). Konversation kann nicht sicher gesendet werden."
)
logger.warning(f"Neutralization of message[{idx}] returned no neutralized_text — keeping original")
excludedDocs.append(f"Message[{idx}] neutralization failed; original kept")
cleanMessages.append(msg)
except RuntimeError:
raise
except Exception as e:
if _hardMode:
logger.warning(f"Message[{idx}] neutralization error — REMOVING message (hard mode): {e}")
excludedDocs.append(f"Message[{idx}] neutralization error; message REMOVED: {e}")
else:
logger.warning(f"Neutralization of message[{idx}] failed: {e} — keeping original")
excludedDocs.append(f"Message[{idx}] neutralization error: {e}")
cleanMessages.append(msg)
raise RuntimeError(
f"Neutralisierung von Nachricht {idx+1}/{_msgCount} schlug fehl: {e}. "
f"Konversation kann nicht sicher gesendet werden."
) from e
logger.warning(f"Neutralization of message[{idx}] failed: {e} — keeping original")
excludedDocs.append(f"Message[{idx}] neutralization error: {e}")
cleanMessages.append(msg)
elif isinstance(content, list):
_cleanParts = []
for _partIdx, _part in enumerate(content):
@ -710,23 +761,29 @@ detectedIntent-Werte:
_partType = _part.get("type", "")
if _partType == "text" and _part.get("text"):
try:
_result = neutralSvc.processText(_part["text"])
_result = await neutralSvc.processTextAsync(_part["text"])
if _result and _result.get("neutralized_text"):
_part["text"] = _result["neutralized_text"]
_wasNeutralized = True
_role = msg.get("role", "?")
_snapshots.append((f"Nachricht {idx+1}.{_partIdx+1} ({_role})", _result["neutralized_text"], len(_result.get("mapping", {}))))
_cleanParts.append(_part)
else:
if _hardMode:
logger.warning(f"Message[{idx}].content[{_partIdx}] text neutralization empty — REMOVING part")
excludedDocs.append(f"Message[{idx}].content[{_partIdx}] text removed")
else:
_cleanParts.append(_part)
raise RuntimeError(
f"Neutralisierung von Nachricht {idx+1}, Teil {_partIdx+1} "
f"schlug fehl (leere Antwort)."
)
_cleanParts.append(_part)
except RuntimeError:
raise
except Exception as e:
if _hardMode:
logger.warning(f"Message[{idx}].content[{_partIdx}] text neutralization error — REMOVING: {e}")
excludedDocs.append(f"Message[{idx}].content[{_partIdx}] text error: {e}")
else:
_cleanParts.append(_part)
raise RuntimeError(
f"Neutralisierung von Nachricht {idx+1}, Teil {_partIdx+1} "
f"schlug fehl: {e}"
) from e
_cleanParts.append(_part)
elif _partType == "image_url":
if _hardMode:
logger.warning(f"Message[{idx}].content[{_partIdx}] image_url — REMOVING (neutralization active)")
@ -738,12 +795,12 @@ detectedIntent-Werte:
if _cleanParts:
msg["content"] = _cleanParts
cleanMessages.append(msg)
elif _hardMode:
logger.warning(f"Message[{idx}] all parts removed — REMOVING message")
excludedDocs.append(f"Message[{idx}] fully removed after neutralization")
else:
cleanMessages.append(msg)
else:
cleanMessages.append(msg)
request.messages = cleanMessages
logger.debug(f"_neutralizeRequest: messages done, {len(cleanMessages)} kept of {_msgCount}")
if hasattr(request, 'contentParts') and request.contentParts:
_cleanParts = []
@ -752,10 +809,11 @@ detectedIntent-Werte:
_data = getattr(_cp, 'data', '') or ''
if _tg in ('text', 'table') and _data:
try:
_result = neutralSvc.processText(str(_data))
_result = await neutralSvc.processTextAsync(str(_data))
if _result and _result.get("neutralized_text"):
_cp.data = _result["neutralized_text"]
_wasNeutralized = True
_snapshots.append((f"Inhalt {_cpIdx+1} ({_tg})", _result["neutralized_text"], len(_result.get("mapping", {}))))
_cleanParts.append(_cp)
else:
if _hardMode:
@ -778,7 +836,18 @@ detectedIntent-Werte:
else:
_cleanParts.append(_cp)
request.contentParts = _cleanParts
logger.debug(f"_neutralizeRequest: contentParts done, {len(_cleanParts)} kept")
if _snapshots and _wasNeutralized:
try:
neutralSvc.clearSnapshots()
for _label, _text, _phCount in _snapshots:
neutralSvc.saveSnapshot(_label, _text, _phCount)
logger.debug(f"_neutralizeRequest: saved {len(_snapshots)} snapshot(s)")
except Exception as _snapErr:
logger.warning(f"_neutralizeRequest: could not save snapshots: {_snapErr}")
logger.info(f"_neutralizeRequest complete: neutralized={_wasNeutralized}, excluded={len(excludedDocs)}")
return request, _wasNeutralized, excludedDocs
def _rehydrateResponse(self, responseText: str) -> str:

View file

@ -11,6 +11,7 @@ import logging
import importlib
from typing import Dict, Type, List, Optional, Tuple
from .documentRendererBaseTemplate import BaseRenderer
from .codeRendererBaseTemplate import BaseCodeRenderer
logger = logging.getLogger(__name__)
@ -54,7 +55,7 @@ class RendererRegistry:
attr = getattr(module, attrName)
if (isinstance(attr, type) and
issubclass(attr, BaseRenderer) and
attr != BaseRenderer and
attr not in (BaseRenderer, BaseCodeRenderer) and
hasattr(attr, 'getSupportedFormats')):
self._registerRendererClass(attr)
@ -72,6 +73,8 @@ class RendererRegistry:
"""Register a renderer class keyed by (format, outputStyle)."""
try:
supportedFormats = rendererClass.getSupportedFormats()
if not supportedFormats:
return
priority = rendererClass.getPriority() if hasattr(rendererClass, 'getPriority') else 0
for formatName in supportedFormats:

View file

@ -158,7 +158,7 @@ class KnowledgeService:
if not _textContent:
continue
try:
_neutralResult = _neutralSvc.processText(_textContent)
_neutralResult = await _neutralSvc.processTextAsync(_textContent, fileId)
if _neutralResult and _neutralResult.get("neutralized_text"):
_obj["data"] = _neutralResult["neutralized_text"]
_neutralizedObjects.append(_obj)

View file

@ -169,7 +169,7 @@ async def neutralizeData(self, parameters: Dict[str, Any]) -> ActionResult:
f"Neutralizing part {len(neutralizedParts) + 1} of document {i+1}"
)
neutralizationResult = self.services.neutralization.processText(part.data)
neutralizationResult = await self.services.neutralization.processTextAsync(part.data)
if neutralizationResult and 'neutralized_text' in neutralizationResult:
neutralizedData = neutralizationResult['neutralized_text']