Merge pull request #106 from valueonag/refactor/service-migrations

Refactor/service migrations
This commit is contained in:
Patrick Motsch 2026-03-16 15:51:07 +01:00 committed by GitHub
commit 6154eb2553
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
199 changed files with 10502 additions and 38615 deletions

View file

@ -44,7 +44,7 @@ APP_FRONTEND_URL = http://localhost:5176
# AI configuration
Connector_AiOpenai_API_SECRET = DEV_ENC:Z0FBQUFBQnBaSnM4TWFRRmxVQmNQblVIYmc1Y0Q3aW9zZUtDWlNWdGZjbFpncGp2NHN2QjkxMWxibUJnZDBId252MWk5TXN3Yk14ajFIdi1CTkx2ZWx2QzF5OFR6LUx5azQ3dnNLaXJBOHNxc0tlWmtZcTFVelF4eXBSM2JkbHd2eTM0VHNXdHNtVUprZWtPVzctNlJsZHNmM20tU1N6Q1Q2cHFYSi1tNlhZNDNabTVuaEVGWmIydEhadTcyMlBURmw2aUJxOF9GTzR0dTZiNGZfOFlHaVpPZ1A1LXhhOEFtN1J5TEVNNWtMcGpyNkMzSl8xRnZsaTF1WTZrOUZmb0cxVURjSGFLS2dIYTQyZEJtTm90bEYxVWxNNXVPdTVjaVhYbXhxT3JsVDM5VjZMVFZKSE1tZnM9
Connector_AiAnthropic_API_SECRET = DEV_ENC:Z0FBQUFBQm8xSUpENmFBWG16STFQUVZxNzZZRzRLYTA4X3lRanF1VkF4cU45OExNMzlsQmdISGFxTUxud1dXODBKcFhMVG9KNjdWVnlTTFFROVc3NDlsdlNHLUJXeG41NDBHaXhHR0VHVWl5UW9RNkVWbmlhakRKVW5pM0R4VHk0LUw0TV9LdkljNHdBLXJua21NQkl2b3l4UkVkMGN1YjBrMmJEeWtMay1jbmxrYWJNbUV0aktCXzU1djR2d2RSQXZORTNwcG92ZUVvVGMtQzQzTTVncEZTRGRtZUFIZWQ0dz09
Connector_AiPerplexity_API_SECRET = DEV_ENC:Z0FBQUFBQm82Mzk2Q1MwZ0dNcUVBcUtuRDJIcTZkMXVvYnpjM3JEMzJiT1NKSHljX282ZDIyZTJYc09VSTdVNXAtOWU2UXp5S193NTk5dHJsWlFjRjhWektFOG1DVGY4ZUhHTXMzS0RPN1lNcF9nSlVWbW5BZ1hkZDVTejl6bVZNRFVvX29xamJidWRFMmtjQmkyRUQ2RUh6UTN1aWNPSUJBPT0=
Connector_AiPerplexity_API_SECRET = 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
Connector_AiTavily_API_SECRET = DEV_ENC:Z0FBQUFBQm8xSUpEQTdnUHMwd2pIaXNtMmtCTFREd0pyQXRKb1F5eGtHSnkyOGZiUnlBOFc0b3Vzcndrc3ViRm1nMDJIOEZKYWxqdWNkZGh5N0Z4R0JlQmxXSG5pVnJUR2VYckZhMWNMZ1FNeXJ3enJLVlpiblhOZTNleUg3ZzZyUzRZanFSeDlVMkI=
Connector_AiPrivateLlm_API_SECRET = DEV_ENC:Z0FBQUFBQnBudkpGRHM5eFdUVmVZU1R1cHBwN1RlMUx4T0NlLTJLUFFVX3J2OElDWFpuZmJHVmp4Z3BNNWMwZUVVZUd2TFhRSjVmVkVlcFlVRWtybXh0ZHloZ01ZcnVvX195YjdlWVdEcjZSWFFTTlNBWUlaTlNoLWhqVFBIb0thVlBiaWhjYjFQOFY=
Connector_AiMistral_API_SECRET = DEV_ENC:Z0FBQUFBQnBudkpGeEQxYUIxOHhia0JlQWpWQ2dWQWZzY3l6SWwyUnJoR1hRQWloX2lxb2lGNkc4UnA4U2tWNjJaYzB1d1hvNG9fWUp1N3V4OW9FMGhaWVhjSlVwWEc1X2loVDBSZDEtdHdfcTA5QkcxQTR4OHc4RkRzclJrU2d1RFZpNDJkRDRURlE=

View file

@ -44,7 +44,7 @@ APP_FRONTEND_URL = https://nyla-int.poweron-center.net
# AI configuration
Connector_AiOpenai_API_SECRET = INT_ENC:Z0FBQUFBQnBaSnM4MENkQ2xJVmE5WFZKUkh2SHJFby1YVXN3ZmVxRkptS3ZWRmlwdU93ZEJjSjlMV2NGbU5mS3NCdmFfcmFYTEJNZXFIQ3ozTWE4ZC1pemlQNk9wbjU1d3BPS0ZCTTZfOF8yWmVXMWx0TU1DamlJLVFhSTJXclZsY3hMVWlPcXVqQWtMdER4T252NHZUWEhUOTdIN1VGR3ltazEweXFqQ0lvb0hYWmxQQnpxb0JwcFNhRDNGWXdoRTVJWm9FalZpTUF5b1RqZlRaYnVKYkp0NWR5Vko1WWJ0Wmg2VWJzYXZ0Z3Q4UkpsTldDX2dsekhKMmM4YjRoa2RwemMwYVQwM2cyMFlvaU5mOTVTWGlROU8xY2ZVRXlxZzJqWkxURWlGZGI2STZNb0NpdEtWUnM9
Connector_AiAnthropic_API_SECRET = INT_ENC:Z0FBQUFBQm8xSVRjT1ZlRWVJdVZMT3ljSFJDcFdxRFBRVkZhS204NnN5RDBlQ0tpenhTM0FFVktuWW9mWHNwRWx2dHB0eDBSZ0JFQnZKWlp6c01pVGREWHd1eGpERnU0Q2xhaks1clQ1ZXVsdnd2ZzhpNXNQS1BhY3FjSkdkVEhHalNaRGR4emhpakZncnpDQUVxOHVXQzVUWmtQc0FsYmFwTF9TSG5FOUFtWk5Ick1NcHFvY2s1T1c2WXlRUFFJZnh6TWhuaVpMYmppcDR0QUx0a0R6RXlwbGRYb1R4dzJkUT09
Connector_AiPerplexity_API_SECRET = INT_ENC:Z0FBQUFBQm82Mzk2UWZJdUFhSW8yc3RKc0tKRXphd0xWMkZOVlFpSGZ4SGhFWnk0cTF5VjlKQVZjdS1QSWdkS0pUSWw4OFU5MjUxdTVQel9aeWVIZTZ5TXRuVmFkZG0zWEdTOGdHMHpsTzI0TGlWYURKU1Q0VVpKTlhxUk5FTmN6SUJScDZ3ZldIaUJZcWpaQVRiSEpyQm9tRTNDWk9KTnZBPT0=
Connector_AiPerplexity_API_SECRET = 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
Connector_AiTavily_API_SECRET = INT_ENC:Z0FBQUFBQm8xSVRkdkJMTDY0akhXNzZDWHVYSEt1cDZoOWEzSktneHZEV2JndTNmWlNSMV9KbFNIZmQzeVlrNE5qUEIwcUlBSGM1a0hOZ3J6djIyOVhnZzI3M1dIUkdicl9FVXF3RGktMmlEYmhnaHJfWTdGUkktSXVUSGdQMC1vSEV6VE8zR2F1SVk=
Connector_AiPrivateLlm_API_SECRET = INT_ENC:Z0FBQUFBQnBudkpGSjZ1NWh0aWc1R3Z4MHNaeS1HamtUbndhcUZFZDlqUDhjSmg5eHFfdlVkU0RsVkJ2UVRaMWs3aWhraG5jSlc0YkxNWHVmR2JoSW5ENFFCdkJBM0VienlKSnhzNnBKbTJOUTFKczRfWlQ3bWpmUkRTT1I1OGNUSTlQdExacGRpeXg=
Connector_AiMistral_API_SECRET = INT_ENC:Z0FBQUFBQnBudkpGZTNtZ1E4TWIxSEU1OUlreUpxZkJIR0Vxcm9xRHRUbnBxbTQ1cXlkbnltWkJVdTdMYWZ4c3Fsam42TERWUTVhNzZFMU9xVjdyRGFCYml6bmZsZFd2YmJzemlrSWN6Q3o3X0NXX2xXNUQteTNONHdKYzJ5YVpLLWdhU2JhSTJQZnI=

View file

@ -44,7 +44,7 @@ APP_FRONTEND_URL = https://nyla.poweron-center.net
# AI configuration
Connector_AiOpenai_API_SECRET = PROD_ENC:Z0FBQUFBQnBaSnM4TWJOVm4xVkx6azRlNDdxN3UxLUdwY2hhdGYxRGp4VFJqYXZIcmkxM1ZyOWV2M0Z4MHdFNkVYQ0ROb1d6LUZFUEdvMHhLMEtXYVBCRzM5TlYyY3ROYWtJRk41cDZxd0tYYi00MjVqMTh4QVcyTXl0bmVocEFHbXQwREpwNi1vODdBNmwzazE5bkpNelE2WXpvblIzWlQwbGdEelI2WXFqT1RibXVHcjNWbVhwYzBOM25XTzNmTDAwUjRvYk4yNjIyZHc5c2RSZzREQUFCdUwyb0ZuOXN1dzI2c2FKdXI4NGxEbk92czZWamJXU3ZSbUlLejZjRklRRk4tLV9aVUFZekI2bTU4OHYxNTUybDg3RVo0ZTh6dXNKRW5GNXVackZvcm9laGI0X3R6V3M9
Connector_AiAnthropic_API_SECRET = PROD_ENC:Z0FBQUFBQnBDM1Z3TnhYdlhSLW5RbXJyMHFXX0V0bHhuTDlTaFJsRDl2dTdIUTFtVFAwTE8tY3hLbzNSMnVTLXd3RUZualN3MGNzc1kwOTIxVUN2WW1rYi1TendFRVVBSVNqRFVjckEzNExyTGNaUkJLMmozazUwemI1cnhrcEtZVXJrWkdaVFFramp3MWZ6RmY2aGlRMXVEYjM2M3ZlbmxMdnNCRDM1QWR0Wmd6MWVnS1I1c01nV3hRLXg3d2NTZXVfTi1Wdm16UnRyNGsyRTZ0bG9TQ1g1OFB5Z002bmQ3QT09
Connector_AiPerplexity_API_SECRET = PROD_ENC:Z0FBQUFBQm82Mzk2Q1FGRkJEUkI4LXlQbHYzT2RkdVJEcmM4WGdZTWpJTEhoeUF1NW5LUVpJdDBYN3k1WFN4a2FQSWJSQmd0U0xJbzZDTmFFN05FcXl0Z3V1OEpsZjYydV94TXVjVjVXRTRYSWdLMkd5XzZIbFV6emRCZHpuOUpQeThadE5xcDNDVGV1RHJrUEN0c1BBYXctZFNWcFRuVXhRPT0=
Connector_AiPerplexity_API_SECRET = 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
Connector_AiTavily_API_SECRET = PROD_ENC:Z0FBQUFBQnBDM1Z3NmItcDh6V0JpcE5Jc0NlUWZqcmllRHB5eDlNZmVnUlNVenhNTm5xWExzbjJqdE1GZ0hTSUYtb2dvdWNhTnlQNmVWQ2NGVDgwZ0MwMWZBMlNKWEhzdlF3TlZzTXhCZWM4Z1Uwb18tSTRoU1JBVTVkSkJHOTJwX291b3dPaVphVFg=
Connector_AiPrivateLlm_API_SECRET = PROD_ENC:Z0FBQUFBQnBudkpGanZ6U3pzZWkwXzVPWGtIQ040XzFrTXc5QWRnazdEeEktaUJ0akJmNnEzbWUzNHczLTJfc2dIdzBDY0FTaXZYcDhxNFdNbTNtbEJTb2VRZ0ZYd05hdlNLR1h6SUFzVml2Z1FLY1BjTl90UWozUGxtak1URnhhZmNDRWFTb0dKVUo=
Connector_AiMistral_API_SECRET = PROD_ENC:Z0FBQUFBQnBudkpGc2tQc2lvMk1YZk01Q1dob1U5cnR0dG03WWE3WkpoOWo0SEpvLU9Rc2lCNDExdy1wZExaN3lpT2FEQkxnaHRmWmZUUUZUUUJmblZreGlpaFpOdnFhbzlEd1RsVVJtX216cmhxTm5BcTN2eUZ2T054cDE5bmlEamJ3NGR6MVpFQnA=

View file

@ -12,8 +12,8 @@ IMPORTANT: Model Registration Requirements
"""
from abc import ABC, abstractmethod
from typing import List, Dict, Any, Optional
from modules.datamodels.datamodelAi import AiModel
from typing import List, Dict, Any, Optional, AsyncGenerator, Union
from modules.datamodels.datamodelAi import AiModel, AiModelCall, AiModelResponse
class BaseConnectorAi(ABC):
@ -102,3 +102,24 @@ class BaseConnectorAi(ABC):
"""Get only available models."""
models = self.getCachedModels()
return [model for model in models if model.isAvailable]
async def callAiBasicStream(self, modelCall: AiModelCall) -> AsyncGenerator[Union[str, AiModelResponse], None]:
    """Fallback streaming entry point built on the non-streaming call.

    Awaits ``self.callAiBasic(modelCall)`` once. If the resulting response has
    truthy ``content``, that content is yielded as a single ``str`` chunk; the
    response object itself is always yielded last.

    Connectors with real streaming support are expected to override this.
    """
    finalResponse = await self.callAiBasic(modelCall)
    text = finalResponse.content
    if text:
        # Emit the whole answer as one "delta" so consumers see the same
        # str-then-response sequence as with a true streaming connector.
        yield text
    yield finalResponse
async def callEmbedding(self, modelCall: AiModelCall) -> AiModelResponse:
    """Embedding hook for connectors; this base version always fails.

    Connectors that support embeddings override this method. Per the contract
    stated for overrides, texts are read from ``modelCall.embeddingInput`` and
    the vectors are returned in ``metadata["embeddings"]`` of the response.

    Raises:
        NotImplementedError: always, naming the concrete connector class.
    """
    connectorName = self.__class__.__name__
    raise NotImplementedError(f"{connectorName} does not support embeddings")

View file

@ -1,9 +1,10 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
import json
import logging
import httpx
import os
from typing import Dict, Any, List
from typing import Dict, Any, List, AsyncGenerator, Union
from fastapi import HTTPException
from modules.shared.configuration import APP_CONFIG
from .aicoreBase import BaseConnectorAi
@ -61,13 +62,15 @@ class AiAnthropic(BaseConnectorAi):
speedRating=6, # Slower due to high-quality processing
qualityRating=10, # Best quality available
functionCall=self.callAiBasic,
functionCallStream=self.callAiBasicStream,
priority=PriorityEnum.QUALITY,
processingMode=ProcessingModeEnum.DETAILED,
operationTypes=createOperationTypeRatings(
(OperationTypeEnum.PLAN, 9),
(OperationTypeEnum.DATA_ANALYSE, 9),
(OperationTypeEnum.DATA_GENERATE, 9),
(OperationTypeEnum.DATA_EXTRACT, 8)
(OperationTypeEnum.DATA_EXTRACT, 8),
(OperationTypeEnum.AGENT, 9),
),
version="claude-sonnet-4-5-20250929",
calculatepriceCHF=lambda processingTime, bytesSent, bytesReceived: (bytesSent / 4 / 1000) * 0.003 + (bytesReceived / 4 / 1000) * 0.015
@ -85,13 +88,15 @@ class AiAnthropic(BaseConnectorAi):
speedRating=9, # Very fast, lightweight model
qualityRating=8, # Good quality, cost-efficient
functionCall=self.callAiBasic,
functionCallStream=self.callAiBasicStream,
priority=PriorityEnum.SPEED,
processingMode=ProcessingModeEnum.BASIC,
operationTypes=createOperationTypeRatings(
(OperationTypeEnum.PLAN, 8),
(OperationTypeEnum.DATA_ANALYSE, 8),
(OperationTypeEnum.DATA_GENERATE, 8),
(OperationTypeEnum.DATA_EXTRACT, 7)
(OperationTypeEnum.DATA_EXTRACT, 7),
(OperationTypeEnum.AGENT, 7),
),
version="claude-haiku-4-5-20251001",
calculatepriceCHF=lambda processingTime, bytesSent, bytesReceived: (bytesSent / 4 / 1000) * 0.001 + (bytesReceived / 4 / 1000) * 0.005
@ -109,13 +114,15 @@ class AiAnthropic(BaseConnectorAi):
speedRating=5, # Moderate latency, most capable
qualityRating=10, # Top-tier intelligence
functionCall=self.callAiBasic,
functionCallStream=self.callAiBasicStream,
priority=PriorityEnum.QUALITY,
processingMode=ProcessingModeEnum.DETAILED,
operationTypes=createOperationTypeRatings(
(OperationTypeEnum.PLAN, 10),
(OperationTypeEnum.DATA_ANALYSE, 10),
(OperationTypeEnum.DATA_ANALYSE, 8),
(OperationTypeEnum.DATA_GENERATE, 10),
(OperationTypeEnum.DATA_EXTRACT, 9)
(OperationTypeEnum.DATA_EXTRACT, 9),
(OperationTypeEnum.AGENT, 10),
),
version="claude-opus-4-6",
calculatepriceCHF=lambda processingTime, bytesSent, bytesReceived: (bytesSent / 4 / 1000) * 0.005 + (bytesReceived / 4 / 1000) * 0.025
@ -158,8 +165,6 @@ class AiAnthropic(BaseConnectorAi):
HTTPException: For errors in API communication
"""
try:
# Extract parameters from modelCall
messages = modelCall.messages
model = modelCall.model
options = modelCall.options
temperature = getattr(options, "temperature", None)
@ -167,44 +172,8 @@ class AiAnthropic(BaseConnectorAi):
temperature = model.temperature
maxTokens = model.maxTokens
# Transform OpenAI-style messages to Anthropic format:
# - Move any 'system' role content to top-level 'system'
# - Keep only 'user'/'assistant' messages in the list
system_contents: List[str] = []
converted_messages: List[Dict[str, Any]] = []
for m in messages:
role = m.get("role")
content = m.get("content", "")
if role == "system":
# Collect system content; Anthropic expects top-level 'system'
if isinstance(content, list):
# Join text parts if provided as blocks
joined = "\n\n".join(
[
(part.get("text") if isinstance(part, dict) else str(part))
for part in content
]
)
system_contents.append(joined)
else:
system_contents.append(str(content))
continue
# For Anthropic, content can be a string; pass through strings, collapse blocks
if isinstance(content, list):
# Collapse to text if blocks are provided
collapsed = "\n\n".join(
[
(part.get("text") if isinstance(part, dict) else str(part))
for part in content
]
)
converted_messages.append({"role": role, "content": collapsed})
else:
converted_messages.append({"role": role, "content": content})
converted_messages, system_prompt = _convertMessagesForAnthropic(modelCall.messages)
system_prompt = "\n\n".join([s for s in system_contents if s]) if system_contents else None
# Create Anthropic API payload
payload: Dict[str, Any] = {
"model": model.name,
"messages": converted_messages,
@ -218,6 +187,13 @@ class AiAnthropic(BaseConnectorAi):
if system_prompt:
payload["system"] = system_prompt
if modelCall.tools:
payload["tools"] = _convertToolsToAnthropicFormat(modelCall.tools)
if modelCall.toolChoice:
payload["tool_choice"] = modelCall.toolChoice
else:
payload["tool_choice"] = {"type": "auto"}
response = await self.httpClient.post(
model.apiUrl,
json=payload
@ -244,29 +220,39 @@ class AiAnthropic(BaseConnectorAi):
# Parse response
anthropicResponse = response.json()
# Extract content from response
# Extract content and tool_use blocks from response
content = ""
toolCalls = []
if "content" in anthropicResponse:
if isinstance(anthropicResponse["content"], list):
# Content is a list of parts (in newer API versions)
for part in anthropicResponse["content"]:
if part.get("type") == "text":
content += part.get("text", "")
elif part.get("type") == "tool_use":
toolCalls.append({
"id": part.get("id", ""),
"type": "function",
"function": {
"name": part.get("name", ""),
"arguments": json.dumps(part.get("input", {})) if isinstance(part.get("input"), dict) else str(part.get("input", "{}"))
}
})
else:
# Direct content as string (in older API versions)
content = anthropicResponse["content"]
# Debug logging for empty responses
if not content or content.strip() == "":
if not content and not toolCalls:
logger.warning(f"Anthropic API returned empty content. Full response: {anthropicResponse}")
content = "[Anthropic API returned empty response]"
# Return standardized response
metadata = {"response_id": anthropicResponse.get("id", "")}
if toolCalls:
metadata["toolCalls"] = toolCalls
return AiModelResponse(
content=content,
success=True,
modelId=model.name,
metadata={"response_id": anthropicResponse.get("id", "")}
metadata=metadata
)
except Exception as e:
@ -279,6 +265,101 @@ class AiAnthropic(BaseConnectorAi):
logger.error(error_detail, exc_info=True)
raise HTTPException(status_code=500, detail=error_detail)
async def callAiBasicStream(self, modelCall: AiModelCall) -> AsyncGenerator[Union[str, AiModelResponse], None]:
    """Stream an Anthropic response.

    Yields every text delta as a ``str`` while the model generates, then one
    final ``AiModelResponse`` holding the concatenated text. When the model
    requested tools, the final response carries ``metadata["toolCalls"]`` in
    the OpenAI-style ``{"id", "type": "function", "function": {...}}`` shape.

    Args:
        modelCall: Call description; ``model``, ``options``, ``messages``,
            ``tools`` and ``toolChoice`` are read from it.

    Raises:
        HTTPException: status 500 for a non-200 HTTP response, for an
            ``error`` event received inside the stream, and for any other
            failure while streaming.
    """
    try:
        model = modelCall.model
        options = modelCall.options
        temperature = getattr(options, "temperature", None)
        if temperature is None:
            temperature = model.temperature

        converted, system_prompt = _convertMessagesForAnthropic(modelCall.messages)

        payload: Dict[str, Any] = {
            "model": model.name,
            "messages": converted,
            "temperature": temperature,
            "max_tokens": model.maxTokens,
            "stream": True,
        }
        if system_prompt:
            payload["system"] = system_prompt
        if modelCall.tools:
            payload["tools"] = _convertToolsToAnthropicFormat(modelCall.tools)
            # NOTE(review): toolChoice is forwarded unchanged; presumably callers
            # already pass it in Anthropic's {"type": ...} form - confirm.
            payload["tool_choice"] = modelCall.toolChoice or {"type": "auto"}

        fullContent = ""
        # Tool-use blocks keyed by their content-block index in the stream.
        toolUseBlocks: Dict[int, Dict[str, Any]] = {}
        currentToolIdx = -1

        async with self.httpClient.stream("POST", model.apiUrl, json=payload) as response:
            if response.status_code != 200:
                body = await response.aread()
                raise HTTPException(status_code=500, detail=f"Anthropic stream error: {response.status_code} - {body.decode()}")

            async for line in response.aiter_lines():
                # Only "data: ..." lines carry JSON payloads; everything else is skipped.
                if not line.startswith("data: "):
                    continue
                try:
                    event = json.loads(line[6:])
                except json.JSONDecodeError:
                    continue
                if not isinstance(event, dict):
                    # A JSON scalar/array is not a usable event; ignore it.
                    continue

                eventType = event.get("type", "")

                if eventType == "content_block_start":
                    block = event.get("content_block") or {}
                    idx = event.get("index", 0)
                    if block.get("type") == "tool_use":
                        currentToolIdx = idx
                        toolUseBlocks[idx] = {
                            "id": block.get("id", ""),
                            "name": block.get("name", ""),
                            "arguments": "",
                        }

                elif eventType == "content_block_delta":
                    delta = event.get("delta") or {}
                    if delta.get("type") == "text_delta":
                        text = delta.get("text", "")
                        fullContent += text
                        yield text
                    elif delta.get("type") == "input_json_delta":
                        # Tool input arrives as JSON fragments that are concatenated.
                        idx = event.get("index", currentToolIdx)
                        if idx in toolUseBlocks:
                            toolUseBlocks[idx]["arguments"] += delta.get("partial_json", "")

                elif eventType == "error":
                    # An error event means the generation failed mid-stream; surface it
                    # instead of reporting a truncated answer as a success.
                    errorInfo = event.get("error") or {}
                    if not isinstance(errorInfo, dict):
                        errorInfo = {"message": str(errorInfo)}
                    raise HTTPException(
                        status_code=500,
                        detail=f"Anthropic stream error: {errorInfo.get('type', 'unknown')} - {errorInfo.get('message', '')}",
                    )

                elif eventType == "message_stop":
                    break

        metadata: Dict[str, Any] = {}
        if toolUseBlocks:
            metadata["toolCalls"] = [
                {
                    "id": toolUseBlocks[idx]["id"],
                    "type": "function",
                    "function": {
                        "name": toolUseBlocks[idx]["name"],
                        # A tool called without input produces no JSON fragments; emit
                        # "{}" so the arguments string is always parseable JSON.
                        "arguments": toolUseBlocks[idx]["arguments"] or "{}",
                    },
                }
                for idx in sorted(toolUseBlocks)
            ]

        yield AiModelResponse(
            content=fullContent,
            success=True,
            modelId=model.name,
            metadata=metadata,
        )

    except HTTPException:
        # Already in the shape callers expect; do not re-wrap.
        raise
    except Exception as e:
        logger.error(f"Error streaming Anthropic API: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Error streaming Anthropic API: {e}")
async def callAiImage(self, modelCall: AiModelCall) -> AiModelResponse:
"""
Analyzes an image using Anthropic's vision capabilities using standardized pattern.
@ -331,6 +412,20 @@ class AiAnthropic(BaseConnectorAi):
mimeType = parts[0].replace("data:", "")
base64Data = parts[1]
import base64 as _b64
try:
rawHead = _b64.b64decode(base64Data[:32])
if rawHead[:3] == b"\xff\xd8\xff":
mimeType = "image/jpeg"
elif rawHead[:8] == b"\x89PNG\r\n\x1a\n":
mimeType = "image/png"
elif rawHead[:4] == b"GIF8":
mimeType = "image/gif"
elif rawHead[:4] == b"RIFF" and rawHead[8:12] == b"WEBP":
mimeType = "image/webp"
except Exception:
pass
# Convert to Anthropic's vision format
anthropicMessages = [{
"role": "user",
@ -425,3 +520,100 @@ class AiAnthropic(BaseConnectorAi):
success=False,
error=f"Error during image analysis: {str(e)}"
)
def _convertMessagesForAnthropic(messages: List[Dict[str, Any]]):
"""Convert OpenAI-style messages to Anthropic format. Returns (messages, system_prompt)."""
system_contents: List[str] = []
converted_messages: List[Dict[str, Any]] = []
pendingToolResults: List[Dict[str, Any]] = []
def _flush():
if not pendingToolResults:
return
converted_messages.append({"role": "user", "content": list(pendingToolResults)})
pendingToolResults.clear()
def _collapse(content):
if isinstance(content, list):
return "\n\n".join(
(part.get("text") if isinstance(part, dict) else str(part))
for part in content
)
return str(content) if content else ""
for m in messages:
role = m.get("role")
content = m.get("content", "")
if role == "system":
system_contents.append(_collapse(content))
continue
if role == "tool":
pendingToolResults.append({
"type": "tool_result",
"tool_use_id": m.get("tool_call_id", ""),
"content": str(content) if content else "",
})
continue
_flush()
if role == "assistant" and m.get("tool_calls"):
contentBlocks = []
textPart = _collapse(content)
if textPart:
contentBlocks.append({"type": "text", "text": textPart})
for tc in m["tool_calls"]:
fn = tc.get("function", {})
inputData = fn.get("arguments", "{}")
if isinstance(inputData, str):
try:
inputData = json.loads(inputData)
except (json.JSONDecodeError, ValueError):
inputData = {}
contentBlocks.append({
"type": "tool_use",
"id": tc.get("id", ""),
"name": fn.get("name", ""),
"input": inputData,
})
converted_messages.append({"role": "assistant", "content": contentBlocks})
continue
converted_messages.append({"role": role, "content": _collapse(content)})
_flush()
merged: List[Dict[str, Any]] = []
for msg in converted_messages:
if merged and merged[-1]["role"] == msg["role"]:
prev = merged[-1]
pc, nc = prev["content"], msg["content"]
if isinstance(pc, str) and isinstance(nc, str):
prev["content"] = pc + "\n\n" + nc
elif isinstance(pc, list) and isinstance(nc, list):
prev["content"] = pc + nc
elif isinstance(pc, str) and isinstance(nc, list):
prev["content"] = [{"type": "text", "text": pc}] + nc
elif isinstance(pc, list) and isinstance(nc, str):
prev["content"] = pc + [{"type": "text", "text": nc}]
else:
merged.append(msg)
system_prompt = "\n\n".join([s for s in system_contents if s]) if system_contents else None
return merged, system_prompt
def _convertToolsToAnthropicFormat(openaiTools: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Convert OpenAI-style tool definitions to Anthropic format."""
anthropicTools = []
for tool in openaiTools:
if tool.get("type") == "function":
fn = tool["function"]
anthropicTools.append({
"name": fn["name"],
"description": fn.get("description", ""),
"input_schema": fn.get("parameters", {"type": "object", "properties": {}})
})
return anthropicTools

View file

@ -1,8 +1,9 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
import logging
import json as _json
import httpx
from typing import List
from typing import List, Dict, Any, AsyncGenerator, Union
from fastapi import HTTPException
from modules.shared.configuration import APP_CONFIG
from .aicoreBase import BaseConnectorAi
@ -66,13 +67,15 @@ class AiMistral(BaseConnectorAi):
speedRating=8, # Good speed for complex tasks
qualityRating=9, # High quality
functionCall=self.callAiBasic,
functionCallStream=self.callAiBasicStream,
priority=PriorityEnum.BALANCED,
processingMode=ProcessingModeEnum.ADVANCED,
operationTypes=createOperationTypeRatings(
(OperationTypeEnum.PLAN, 9),
(OperationTypeEnum.DATA_ANALYSE, 9),
(OperationTypeEnum.DATA_GENERATE, 9),
(OperationTypeEnum.DATA_EXTRACT, 8)
(OperationTypeEnum.DATA_EXTRACT, 8),
(OperationTypeEnum.AGENT, 8),
),
version="mistral-large-latest",
calculatepriceCHF=lambda processingTime, bytesSent, bytesReceived: (bytesSent / 4 / 1000) * 0.0005 + (bytesReceived / 4 / 1000) * 0.0015
@ -90,17 +93,40 @@ class AiMistral(BaseConnectorAi):
speedRating=9, # Very fast, lightweight model
qualityRating=7, # Good quality, cost-efficient
functionCall=self.callAiBasic,
functionCallStream=self.callAiBasicStream,
priority=PriorityEnum.SPEED,
processingMode=ProcessingModeEnum.BASIC,
operationTypes=createOperationTypeRatings(
(OperationTypeEnum.PLAN, 7),
(OperationTypeEnum.DATA_ANALYSE, 7),
(OperationTypeEnum.DATA_GENERATE, 8),
(OperationTypeEnum.DATA_EXTRACT, 7)
(OperationTypeEnum.DATA_EXTRACT, 7),
(OperationTypeEnum.AGENT, 6),
),
version="mistral-small-latest",
calculatepriceCHF=lambda processingTime, bytesSent, bytesReceived: (bytesSent / 4 / 1000) * 0.00006 + (bytesReceived / 4 / 1000) * 0.00018
),
AiModel(
name="mistral-embed",
displayName="Mistral Embed",
connectorType="mistral",
apiUrl="https://api.mistral.ai/v1/embeddings",
temperature=0.0,
maxTokens=0,
contextLength=8192,
costPer1kTokensInput=0.0001, # $0.10/M tokens
costPer1kTokensOutput=0.0,
speedRating=10,
qualityRating=7,
functionCall=self.callEmbedding,
priority=PriorityEnum.COST,
processingMode=ProcessingModeEnum.BASIC,
operationTypes=createOperationTypeRatings(
(OperationTypeEnum.EMBEDDING, 8)
),
version="mistral-embed",
calculatepriceCHF=lambda processingTime, bytesSent, bytesReceived: (bytesSent / 4 / 1000) * 0.0001
),
AiModel(
name="mistral-large-latest",
displayName="Mistral Large 3 Vision",
@ -216,6 +242,104 @@ class AiMistral(BaseConnectorAi):
logger.error(f"Error calling Mistral API: {str(e)}")
raise HTTPException(status_code=500, detail=f"Error calling Mistral API: {str(e)}")
async def callAiBasicStream(self, modelCall: AiModelCall) -> AsyncGenerator[Union[str, AiModelResponse], None]:
    """Stream a Mistral response.

    Yields each non-empty text delta as a ``str``, then one final
    ``AiModelResponse`` with the concatenated text and empty metadata.

    Args:
        modelCall: Call description; ``model``, ``options`` and ``messages``
            are read from it.

    Raises:
        HTTPException: status 500 for a non-200 HTTP response and for any
            other failure while streaming.
    """
    try:
        model = modelCall.model
        options = modelCall.options
        temperature = getattr(options, "temperature", None)
        if temperature is None:
            temperature = model.temperature

        payload: Dict[str, Any] = {
            "model": model.name,
            "messages": modelCall.messages,
            "temperature": temperature,
            "max_tokens": model.maxTokens,
            "stream": True,
        }

        fullContent = ""

        async with self.httpClient.stream("POST", model.apiUrl, json=payload) as response:
            if response.status_code != 200:
                body = await response.aread()
                raise HTTPException(status_code=500, detail=f"Mistral stream error: {response.status_code} - {body.decode()}")

            async for line in response.aiter_lines():
                # Only "data: ..." lines carry payloads.
                if not line.startswith("data: "):
                    continue
                data = line[6:]
                if data.strip() == "[DONE]":
                    break
                try:
                    chunk = _json.loads(data)
                except _json.JSONDecodeError:
                    continue
                if not isinstance(chunk, dict):
                    continue

                # A chunk may carry an empty "choices" list or a null delta;
                # indexing [0] unconditionally would abort the whole stream.
                choices = chunk.get("choices") or []
                if not choices:
                    continue
                delta = choices[0].get("delta") or {}
                text = delta.get("content")
                # Only plain-string content can be concatenated and yielded.
                if isinstance(text, str) and text:
                    fullContent += text
                    yield text

        yield AiModelResponse(
            content=fullContent,
            success=True,
            modelId=model.name,
            metadata={},
        )

    except HTTPException:
        # Already in the shape callers expect; do not re-wrap.
        raise
    except Exception as e:
        logger.error(f"Error streaming Mistral API: {e}")
        raise HTTPException(status_code=500, detail=f"Error streaming Mistral API: {e}")
async def callEmbedding(self, modelCall: AiModelCall) -> AiModelResponse:
    """Generate embeddings via the Mistral Embeddings API.

    Reads texts from ``modelCall.embeddingInput`` and posts them to
    ``modelCall.model.apiUrl``.

    Returns:
        ``AiModelResponse`` with the vectors in ``metadata["embeddings"]``
        (ordered by the ``index`` field of each returned item) and token usage
        in ``tokensUsed``. When no input texts are given, a response with
        ``success=False`` is returned without calling the API.

    Raises:
        RateLimitExceededException: the API answered with status 429.
        HTTPException: status 500 for any other non-200 answer or failure.
    """
    try:
        model = modelCall.model
        texts = modelCall.embeddingInput or []
        if not texts:
            return AiModelResponse(
                content="", success=False, error="No embeddingInput provided"
            )

        payload = {"model": model.name, "input": texts}
        response = await self.httpClient.post(model.apiUrl, json=payload)

        if response.status_code != 200:
            errorMessage = f"Mistral Embedding API error: {response.status_code} - {response.text}"
            logger.error(errorMessage)
            if response.status_code == 429:
                raise RateLimitExceededException(f"Rate limit exceeded for {model.name}")
            raise HTTPException(status_code=500, detail=errorMessage)

        responseJson = response.json()
        # Order by the item's own index so vector i always belongs to texts[i];
        # the sort is stable, so items without an index keep their order.
        items = sorted(responseJson["data"], key=lambda item: item.get("index", 0))
        embeddings = [item["embedding"] for item in items]
        usage = responseJson.get("usage", {})

        return AiModelResponse(
            content="",
            success=True,
            modelId=model.name,
            tokensUsed={
                "input": usage.get("prompt_tokens", 0),
                "output": 0,
                "total": usage.get("total_tokens", 0),
            },
            metadata={"embeddings": embeddings},
        )

    except RateLimitExceededException:
        raise
    except HTTPException:
        # Raised above with a complete message that was already logged;
        # re-wrapping it would duplicate both the log entry and the detail text.
        raise
    except Exception as e:
        logger.error(f"Error calling Mistral Embedding API: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Error calling Mistral Embedding API: {str(e)}")
async def callAiImage(self, modelCall: AiModelCall) -> AiModelResponse:
"""
Analyzes an image with the Mistral Vision API using standardized pattern.

View file

@ -1,8 +1,9 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
import logging
import json as _json
import httpx
from typing import List
from typing import List, Dict, Any, AsyncGenerator, Union
from fastapi import HTTPException
from modules.shared.configuration import APP_CONFIG
from .aicoreBase import BaseConnectorAi
@ -67,13 +68,15 @@ class AiOpenai(BaseConnectorAi):
speedRating=8, # Good speed for complex tasks
qualityRating=10, # High quality
functionCall=self.callAiBasic,
functionCallStream=self.callAiBasicStream,
priority=PriorityEnum.BALANCED,
processingMode=ProcessingModeEnum.ADVANCED,
operationTypes=createOperationTypeRatings(
(OperationTypeEnum.PLAN, 9),
(OperationTypeEnum.DATA_ANALYSE, 10),
(OperationTypeEnum.DATA_GENERATE, 10),
(OperationTypeEnum.DATA_EXTRACT, 7)
(OperationTypeEnum.DATA_EXTRACT, 7),
(OperationTypeEnum.AGENT, 9),
),
version="gpt-4o",
calculatepriceCHF=lambda processingTime, bytesSent, bytesReceived: (bytesSent / 4 / 1000) * 0.0025 + (bytesReceived / 4 / 1000) * 0.01
@ -92,13 +95,15 @@ class AiOpenai(BaseConnectorAi):
speedRating=9, # Very fast
qualityRating=8, # Good quality, replaces gpt-3.5-turbo
functionCall=self.callAiBasic,
functionCallStream=self.callAiBasicStream,
priority=PriorityEnum.SPEED,
processingMode=ProcessingModeEnum.BASIC,
operationTypes=createOperationTypeRatings(
(OperationTypeEnum.PLAN, 8),
(OperationTypeEnum.DATA_ANALYSE, 8),
(OperationTypeEnum.DATA_GENERATE, 9),
(OperationTypeEnum.DATA_EXTRACT, 7)
(OperationTypeEnum.DATA_EXTRACT, 7),
(OperationTypeEnum.AGENT, 8),
),
version="gpt-4o-mini",
calculatepriceCHF=lambda processingTime, bytesSent, bytesReceived: (bytesSent / 4 / 1000) * 0.00015 + (bytesReceived / 4 / 1000) * 0.0006
@ -125,6 +130,48 @@ class AiOpenai(BaseConnectorAi):
version="gpt-4o",
calculatepriceCHF=lambda processingTime, bytesSent, bytesReceived: (bytesSent / 4 / 1000) * 0.0025 + (bytesReceived / 4 / 1000) * 0.01
),
AiModel(
name="text-embedding-3-small",
displayName="OpenAI Embedding Small",
connectorType="openai",
apiUrl="https://api.openai.com/v1/embeddings",
temperature=0.0,
maxTokens=0,
contextLength=8191,
costPer1kTokensInput=0.00002, # $0.02/M tokens
costPer1kTokensOutput=0.0,
speedRating=10,
qualityRating=8,
functionCall=self.callEmbedding,
priority=PriorityEnum.COST,
processingMode=ProcessingModeEnum.BASIC,
operationTypes=createOperationTypeRatings(
(OperationTypeEnum.EMBEDDING, 10)
),
version="text-embedding-3-small",
calculatepriceCHF=lambda processingTime, bytesSent, bytesReceived: (bytesSent / 4 / 1000) * 0.00002
),
AiModel(
name="text-embedding-3-large",
displayName="OpenAI Embedding Large",
connectorType="openai",
apiUrl="https://api.openai.com/v1/embeddings",
temperature=0.0,
maxTokens=0,
contextLength=8191,
costPer1kTokensInput=0.00013, # $0.13/M tokens
costPer1kTokensOutput=0.0,
speedRating=9,
qualityRating=10,
functionCall=self.callEmbedding,
priority=PriorityEnum.QUALITY,
processingMode=ProcessingModeEnum.ADVANCED,
operationTypes=createOperationTypeRatings(
(OperationTypeEnum.EMBEDDING, 10)
),
version="text-embedding-3-large",
calculatepriceCHF=lambda processingTime, bytesSent, bytesReceived: (bytesSent / 4 / 1000) * 0.00013
),
AiModel(
name="dall-e-3",
displayName="OpenAI DALL-E 3",
@ -179,6 +226,10 @@ class AiOpenai(BaseConnectorAi):
"max_tokens": maxTokens
}
if modelCall.tools:
payload["tools"] = modelCall.tools
payload["tool_choice"] = modelCall.toolChoice or "auto"
response = await self.httpClient.post(
model.apiUrl,
json=payload
@ -218,22 +269,150 @@ class AiOpenai(BaseConnectorAi):
raise HTTPException(status_code=500, detail=error_message)
responseJson = response.json()
content = responseJson["choices"][0]["message"]["content"]
choiceMessage = responseJson["choices"][0]["message"]
content = choiceMessage.get("content") or ""
metadata = {"response_id": responseJson.get("id", "")}
if choiceMessage.get("tool_calls"):
metadata["toolCalls"] = choiceMessage["tool_calls"]
return AiModelResponse(
content=content,
success=True,
modelId=model.name,
metadata={"response_id": responseJson.get("id", "")}
metadata=metadata
)
except ContextLengthExceededException:
# Re-raise context length exceptions without wrapping
raise
except Exception as e:
logger.error(f"Error calling OpenAI API: {str(e)}")
raise HTTPException(status_code=500, detail=f"Error calling OpenAI API: {str(e)}")
async def callAiBasicStream(self, modelCall: AiModelCall) -> AsyncGenerator[Union[str, AiModelResponse], None]:
    """Stream an OpenAI response.

    Yields each non-empty text delta as a ``str``, then one final
    ``AiModelResponse`` with the concatenated text. Tool-call deltas are
    accumulated per ``index`` and returned in ``metadata["toolCalls"]``.

    Args:
        modelCall: Call description; ``model``, ``options``, ``messages``,
            ``tools`` and ``toolChoice`` are read from it.

    Raises:
        HTTPException: status 500 for a non-200 HTTP response and for any
            other failure while streaming.
    """
    try:
        model = modelCall.model
        options = modelCall.options
        temperature = getattr(options, "temperature", None)
        if temperature is None:
            temperature = model.temperature

        payload: Dict[str, Any] = {
            "model": model.name,
            "messages": modelCall.messages,
            "temperature": temperature,
            "max_tokens": model.maxTokens,
            "stream": True,
        }
        if modelCall.tools:
            payload["tools"] = modelCall.tools
            payload["tool_choice"] = modelCall.toolChoice or "auto"

        fullContent = ""
        # Partially received tool calls keyed by their index in the delta stream.
        toolCallsAccum: Dict[int, Dict[str, Any]] = {}

        async with self.httpClient.stream("POST", model.apiUrl, json=payload) as response:
            if response.status_code != 200:
                body = await response.aread()
                raise HTTPException(status_code=500, detail=f"OpenAI stream error: {response.status_code} - {body.decode()}")

            async for line in response.aiter_lines():
                # Only "data: ..." lines carry payloads.
                if not line.startswith("data: "):
                    continue
                data = line[6:]
                if data.strip() == "[DONE]":
                    break
                try:
                    chunk = _json.loads(data)
                except _json.JSONDecodeError:
                    continue
                if not isinstance(chunk, dict):
                    continue

                # A chunk may carry an empty "choices" list or a null delta;
                # indexing [0] unconditionally would abort the whole stream.
                choices = chunk.get("choices") or []
                if not choices:
                    continue
                delta = choices[0].get("delta") or {}

                text = delta.get("content")
                if text:
                    fullContent += text
                    yield text

                # "tool_calls" may be absent or null; treat both as "no tool calls".
                for tcDelta in delta.get("tool_calls") or []:
                    idx = tcDelta.get("index", 0)
                    if idx not in toolCallsAccum:
                        toolCallsAccum[idx] = {
                            "id": tcDelta.get("id", ""),
                            "type": "function",
                            "function": {"name": "", "arguments": ""},
                        }
                    if tcDelta.get("id"):
                        toolCallsAccum[idx]["id"] = tcDelta["id"]
                    fn = tcDelta.get("function") or {}
                    if fn.get("name"):
                        toolCallsAccum[idx]["function"]["name"] = fn["name"]
                    if fn.get("arguments"):
                        # Arguments arrive as string fragments that are concatenated.
                        toolCallsAccum[idx]["function"]["arguments"] += fn["arguments"]

        metadata: Dict[str, Any] = {}
        if toolCallsAccum:
            metadata["toolCalls"] = [toolCallsAccum[i] for i in sorted(toolCallsAccum)]

        yield AiModelResponse(
            content=fullContent,
            success=True,
            modelId=model.name,
            metadata=metadata,
        )

    except HTTPException:
        # Already in the shape callers expect; do not re-wrap.
        raise
    except Exception as e:
        logger.error(f"Error streaming OpenAI API: {e}")
        raise HTTPException(status_code=500, detail=f"Error streaming OpenAI API: {e}")
async def callEmbedding(self, modelCall: AiModelCall) -> AiModelResponse:
    """Generate embeddings via the OpenAI Embeddings API.

    Reads the input texts from ``modelCall.embeddingInput`` and POSTs them,
    together with the model name, to ``modelCall.model.apiUrl``.

    Returns:
        AiModelResponse with the vectors in ``metadata["embeddings"]`` (one entry
        per item of the response's ``data`` list) and the token usage in
        ``tokensUsed``. When no input texts are given, an unsuccessful response
        is returned without calling the API.

    Raises:
        RateLimitExceededException: the API answered with HTTP 429.
        HTTPException: any other non-200 answer, or an unexpected failure
            (always reported with status 500).
    """
    try:
        model = modelCall.model
        texts = modelCall.embeddingInput or []
        if not texts:
            return AiModelResponse(
                content="", success=False, error="No embeddingInput provided"
            )

        payload = {"model": model.name, "input": texts}
        response = await self.httpClient.post(model.apiUrl, json=payload)

        if response.status_code != 200:
            errorMessage = f"OpenAI Embedding API error: {response.status_code} - {response.text}"
            logger.error(errorMessage)
            if response.status_code == 429:
                raise RateLimitExceededException(f"Rate limit exceeded for {model.name}")
            raise HTTPException(status_code=500, detail=errorMessage)

        responseJson = response.json()
        embeddings = [item["embedding"] for item in responseJson["data"]]
        usage = responseJson.get("usage", {})
        return AiModelResponse(
            content="",
            success=True,
            modelId=model.name,
            tokensUsed={
                "input": usage.get("prompt_tokens", 0),
                "output": 0,  # embedding calls produce no completion tokens
                "total": usage.get("total_tokens", 0),
            },
            metadata={"embeddings": embeddings},
        )
    except (RateLimitExceededException, ContextLengthExceededException, HTTPException):
        # These are already well-formed errors (the HTTPException above has been
        # logged): propagate them unchanged instead of wrapping them a second time.
        raise
    except Exception as e:
        logger.error(f"Error calling OpenAI Embedding API: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Error calling OpenAI Embedding API: {str(e)}")
async def callAiImage(self, modelCall: AiModelCall) -> AiModelResponse:
"""
Analyzes an image with the OpenAI Vision API using standardized pattern.

View file

@ -288,7 +288,16 @@ class AiTavily(BaseConnectorAi):
if maxResults < minResults or maxResults > maxAllowedResults:
raise ValueError(f"maxResults must be between {minResults} and {maxAllowedResults}")
# Perform actual API call
# Tavily enforces a 400-character query limit
TAVILY_MAX_QUERY_LENGTH = 400
if len(query) > TAVILY_MAX_QUERY_LENGTH:
truncated = query[:TAVILY_MAX_QUERY_LENGTH]
lastSpace = truncated.rfind(' ')
if lastSpace > TAVILY_MAX_QUERY_LENGTH // 2:
truncated = truncated[:lastSpace]
logger.warning(f"Tavily query truncated from {len(query)} to {len(truncated)} chars")
query = truncated
# Build kwargs only for provided options to avoid API rejections
kwargs: dict = {"query": query, "max_results": maxResults}
if searchDepth is not None:

View file

@ -41,6 +41,11 @@ class SystemTable(BaseModel):
)
def _isVectorType(sqlType: str) -> bool:
"""Check if a SQL type string represents a pgvector column."""
return sqlType.upper().startswith("VECTOR")
def _isJsonbType(fieldType) -> bool:
"""Check if a type should be stored as JSONB in PostgreSQL."""
# Direct dict or list
@ -70,20 +75,26 @@ def _isJsonbType(fieldType) -> bool:
def _get_model_fields(model_class) -> Dict[str, str]:
"""Get all fields from Pydantic model and map to SQL types."""
# Pydantic v2
"""Get all fields from Pydantic model and map to SQL types.
Supports explicit db_type override via json_schema_extra={"db_type": "vector(1536)"}.
This enables pgvector columns without special-casing field names.
"""
model_fields = model_class.model_fields
fields = {}
for field_name, field_info in model_fields.items():
# Pydantic v2
field_type = field_info.annotation
# Explicit db_type override (e.g. vector columns)
extra = field_info.json_schema_extra
if extra and isinstance(extra, dict) and "db_type" in extra:
fields[field_name] = extra["db_type"]
continue
# Check for JSONB fields (Dict, List, or complex types)
# Purely type-based detection - no hardcoded field names
if _isJsonbType(field_type):
fields[field_name] = "JSONB"
# Simple type mapping
elif field_type in (str, type(None)) or (
get_origin(field_type) is Union and type(None) in get_args(field_type)
):
@ -95,11 +106,45 @@ def _get_model_fields(model_class) -> Dict[str, str]:
elif field_type == bool:
fields[field_name] = "BOOLEAN"
else:
fields[field_name] = "TEXT" # Default to TEXT
fields[field_name] = "TEXT"
return fields
def _parseRecordFields(record: Dict[str, Any], fields: Dict[str, str], context: str = "") -> None:
"""Parse record fields in-place: numeric typing, vector parsing, JSONB deserialization."""
import json as _json
for fieldName, fieldType in fields.items():
if fieldName not in record:
continue
value = record[fieldName]
if fieldType in ("DOUBLE PRECISION", "INTEGER") and value is not None:
try:
record[fieldName] = float(value) if fieldType == "DOUBLE PRECISION" else int(value)
except (ValueError, TypeError):
logger.warning(f"Could not convert {fieldName} to {fieldType} ({context}): {value}")
elif _isVectorType(fieldType) and value is not None:
if isinstance(value, str):
try:
record[fieldName] = [float(v) for v in value.strip("[]").split(",")]
except (ValueError, TypeError):
logger.warning(f"Could not parse vector field {fieldName} ({context})")
elif isinstance(value, list):
pass # already a list
elif fieldType == "JSONB" and value is not None:
try:
if isinstance(value, str):
record[fieldName] = _json.loads(value)
elif not isinstance(value, (dict, list)):
record[fieldName] = _json.loads(str(value))
except (_json.JSONDecodeError, TypeError, ValueError):
logger.warning(f"Could not parse JSONB field {fieldName}, keeping as string ({context})")
# Cache connectors by (host, database, port) to avoid duplicate inits for same database.
# Thread safety: _connector_cache_lock protects cache access. userId is request-scoped via
# contextvars to avoid races when concurrent requests share the same connector.
@ -187,6 +232,9 @@ class DatabaseConnector:
# Thread safety
self._lock = threading.Lock()
# pgvector extension state
self._vectorExtensionEnabled = False
# Initialize system table
self._systemTableName = "_system"
self._initializeSystemTable()
@ -500,10 +548,32 @@ class DatabaseConnector:
self.connection.rollback()
return False
def _ensureVectorExtension(self) -> bool:
"""Enable pgvector extension if not already enabled. Called lazily on first vector table."""
if self._vectorExtensionEnabled:
return True
try:
self._ensure_connection()
with self.connection.cursor() as cursor:
cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")
self.connection.commit()
self._vectorExtensionEnabled = True
logger.info("pgvector extension enabled")
return True
except Exception as e:
logger.error(f"Failed to enable pgvector extension: {e}")
if hasattr(self, "connection") and self.connection:
self.connection.rollback()
return False
def _create_table_from_model(self, cursor, table: str, model_class: type) -> None:
"""Create table with columns matching Pydantic model fields."""
fields = _get_model_fields(model_class)
# Enable pgvector if any field uses vector type
if any(_isVectorType(sqlType) for sqlType in fields.values()):
self._ensureVectorExtension()
# Build column definitions with quoted identifiers to preserve exact case
columns = ['"id" VARCHAR(255) PRIMARY KEY']
for field_name, sql_type in fields.items():
@ -576,28 +646,25 @@ class DatabaseConnector:
elif hasattr(value, "value"):
value = value.value
# Handle vector fields (pgvector) - convert List[float] to string
elif col in fields and _isVectorType(fields[col]) and value is not None:
if isinstance(value, list):
value = f"[{','.join(str(v) for v in value)}]"
# Handle JSONB fields - ensure proper JSON format for PostgreSQL
elif col in fields and fields[col] == "JSONB" and value is not None:
import json
if isinstance(value, (dict, list)):
# Convert Python objects to JSON string for PostgreSQL JSONB
value = json.dumps(value)
elif isinstance(value, str):
# Validate that it's valid JSON, if not, try to parse and re-serialize
try:
# Test if it's already valid JSON
json.loads(value)
# If successful, keep as is
pass
except (json.JSONDecodeError, TypeError):
# If not valid JSON, convert to JSON string
value = json.dumps(value)
elif hasattr(value, 'model_dump'):
# Handle Pydantic models
value = json.dumps(value.model_dump())
else:
# Convert other types to JSON
value = json.dumps(value)
values.append(value)
@ -635,46 +702,7 @@ class DatabaseConnector:
record = dict(row)
fields = _get_model_fields(model_class)
# Ensure numeric fields are properly typed and parse JSONB fields
for field_name, field_type in fields.items():
# Ensure numeric fields (float/int) are properly typed
# psycopg2 may return them as strings in some environments (e.g., Azure PostgreSQL)
if field_type in ("DOUBLE PRECISION", "INTEGER") and field_name in record:
value = record[field_name]
if value is not None:
try:
if field_type == "DOUBLE PRECISION":
record[field_name] = float(value)
elif field_type == "INTEGER":
record[field_name] = int(value)
except (ValueError, TypeError):
# If conversion fails, log warning but keep original value
logger.warning(
f"Could not convert {field_name} to {field_type} for record {recordId}: {value}"
)
elif (
field_type == "JSONB"
and field_name in record
and record[field_name] is not None
):
import json
try:
if isinstance(record[field_name], str):
# Parse JSON string back to Python object
record[field_name] = json.loads(record[field_name])
elif isinstance(record[field_name], (dict, list)):
# Already a Python object, keep as is
pass
else:
# Try to parse as JSON
record[field_name] = json.loads(str(record[field_name]))
except (json.JSONDecodeError, TypeError, ValueError):
# If parsing fails, keep as string
logger.warning(
f"Could not parse JSONB field {field_name}, keeping as string: {record[field_name]}"
)
pass
_parseRecordFields(record, fields, f"record {recordId}")
return record
except Exception as e:
@ -737,55 +765,24 @@ class DatabaseConnector:
cursor.execute(f'SELECT * FROM "{table}" ORDER BY "id"')
records = [dict(row) for row in cursor.fetchall()]
# Handle JSONB fields for all records
fields = _get_model_fields(model_class)
model_fields = model_class.model_fields # Get Pydantic model fields
modelFields = model_class.model_fields
for record in records:
for field_name, field_type in fields.items():
if field_type == "JSONB" and field_name in record:
if record[field_name] is None:
# Generic type-based default: List types -> [], Dict types -> {}
# Interfaces handle domain-specific defaults
field_info = model_fields.get(field_name)
if field_info:
field_annotation = field_info.annotation
# Check if it's a List type
if (field_annotation == list or
(hasattr(field_annotation, "__origin__") and
field_annotation.__origin__ is list)):
record[field_name] = []
# Check if it's a Dict type
elif (field_annotation == dict or
(hasattr(field_annotation, "__origin__") and
field_annotation.__origin__ is dict)):
record[field_name] = {}
else:
record[field_name] = None
else:
record[field_name] = None
else:
import json
try:
if isinstance(record[field_name], str):
# Parse JSON string back to Python object
record[field_name] = json.loads(
record[field_name]
)
elif isinstance(record[field_name], (dict, list)):
# Already a Python object, keep as is
pass
else:
# Try to parse as JSON
record[field_name] = json.loads(
str(record[field_name])
)
except (json.JSONDecodeError, TypeError, ValueError):
# If parsing fails, keep as string
logger.warning(
f"Could not parse JSONB field {field_name}, keeping as string: {record[field_name]}"
)
pass
_parseRecordFields(record, fields, f"table {table}")
# Set type-aware defaults for NULL JSONB fields
for fieldName, fieldType in fields.items():
if fieldType == "JSONB" and fieldName in record and record[fieldName] is None:
fieldInfo = modelFields.get(fieldName)
if fieldInfo:
fieldAnnotation = fieldInfo.annotation
if (fieldAnnotation == list or
(hasattr(fieldAnnotation, "__origin__") and
fieldAnnotation.__origin__ is list)):
record[fieldName] = []
elif (fieldAnnotation == dict or
(hasattr(fieldAnnotation, "__origin__") and
fieldAnnotation.__origin__ is dict)):
record[fieldName] = {}
return records
except Exception as e:
@ -936,70 +933,23 @@ class DatabaseConnector:
cursor.execute(query, where_values)
records = [dict(row) for row in cursor.fetchall()]
# Handle JSONB fields and ensure numeric types are correct
fields = _get_model_fields(model_class)
model_fields = model_class.model_fields # Get Pydantic model fields
modelFields = model_class.model_fields
for record in records:
for field_name, field_type in fields.items():
# Ensure numeric fields (float/int) are properly typed
# psycopg2 may return them as strings in some environments (e.g., Azure PostgreSQL)
if field_type in ("DOUBLE PRECISION", "INTEGER") and field_name in record:
value = record[field_name]
if value is not None:
try:
if field_type == "DOUBLE PRECISION":
record[field_name] = float(value)
elif field_type == "INTEGER":
record[field_name] = int(value)
except (ValueError, TypeError):
# If conversion fails, log warning but keep original value
logger.warning(
f"Could not convert {field_name} to {field_type} for record {record.get('id', 'unknown')}: {value}"
)
elif field_type == "JSONB" and field_name in record:
if record[field_name] is None:
# Generic type-based default: List types -> [], Dict types -> {}
# Interfaces handle domain-specific defaults
field_info = model_fields.get(field_name)
if field_info:
field_annotation = field_info.annotation
# Check if it's a List type
if (field_annotation == list or
(hasattr(field_annotation, "__origin__") and
field_annotation.__origin__ is list)):
record[field_name] = []
# Check if it's a Dict type
elif (field_annotation == dict or
(hasattr(field_annotation, "__origin__") and
field_annotation.__origin__ is dict)):
record[field_name] = {}
else:
record[field_name] = None
else:
record[field_name] = None
else:
import json
try:
if isinstance(record[field_name], str):
# Parse JSON string back to Python object
record[field_name] = json.loads(
record[field_name]
)
elif isinstance(record[field_name], (dict, list)):
# Already a Python object, keep as is
pass
else:
# Try to parse as JSON
record[field_name] = json.loads(
str(record[field_name])
)
except (json.JSONDecodeError, TypeError, ValueError):
# If parsing fails, keep as string
logger.warning(
f"Could not parse JSONB field {field_name}, keeping as string: {record[field_name]}"
)
pass
_parseRecordFields(record, fields, f"table {table}")
for fieldName, fieldType in fields.items():
if fieldType == "JSONB" and fieldName in record and record[fieldName] is None:
fieldInfo = modelFields.get(fieldName)
if fieldInfo:
fieldAnnotation = fieldInfo.annotation
if (fieldAnnotation == list or
(hasattr(fieldAnnotation, "__origin__") and
fieldAnnotation.__origin__ is list)):
record[fieldName] = []
elif (fieldAnnotation == dict or
(hasattr(fieldAnnotation, "__origin__") and
fieldAnnotation.__origin__ is dict)):
record[fieldName] = {}
# If fieldFilter is available, reduce the fields
if fieldFilter and isinstance(fieldFilter, list):
@ -1080,7 +1030,10 @@ class DatabaseConnector:
existingRecord.update(record)
# Save updated record
self._saveRecord(model_class, recordId, existingRecord)
saved = self._saveRecord(model_class, recordId, existingRecord)
if not saved:
table = model_class.__name__
raise ValueError(f"Failed to save record {recordId} to table {table}")
return existingRecord
def recordDelete(self, model_class: type, recordId: str) -> bool:
@ -1127,6 +1080,85 @@ class DatabaseConnector:
initialId = systemData.get(table)
return initialId
def semanticSearch(
    self,
    modelClass: type,
    vectorColumn: str,
    queryVector: List[float],
    limit: int = 10,
    recordFilter: Dict[str, Any] = None,
    minScore: float = None,
) -> List[Dict[str, Any]]:
    """Semantic search using pgvector cosine distance.

    Args:
        modelClass: Pydantic model class for the table (the table name is the class name).
        vectorColumn: Name of the vector column to search.
        queryVector: Query vector as List[float].
        limit: Maximum number of results.
        recordFilter: Additional WHERE filters (field: value). ``None`` becomes
            ``IS NULL``, a list/tuple becomes ``= ANY(...)`` (an empty one matches
            nothing), anything else becomes an equality test.
        minScore: Minimum cosine similarity (0.0 - 1.0).

    Returns:
        List of records with an added '_score' field (cosine similarity),
        sorted by similarity descending. An empty list is returned when the
        table cannot be ensured or when the query fails (the error is logged
        and the transaction is rolled back).
    """
    table = modelClass.__name__
    try:
        if not self._ensureTableExists(modelClass):
            return []

        # pgvector text representation of the query vector: "[v1,v2,...]"
        vectorStr = f"[{','.join(str(v) for v in queryVector)}]"

        whereConditions = []
        whereValues = []
        if recordFilter:
            for field, value in recordFilter.items():
                if value is None:
                    whereConditions.append(f'"{field}" IS NULL')
                elif isinstance(value, (list, tuple)):
                    if not value:
                        # Empty IN-list: no row can match
                        whereConditions.append("1 = 0")
                    else:
                        whereConditions.append(f'"{field}" = ANY(%s)')
                        whereValues.append(list(value))
                else:
                    whereConditions.append(f'"{field}" = %s')
                    whereValues.append(value)

        if minScore is not None:
            # similarity = 1 - cosine distance
            whereConditions.append(
                f'1 - ("{vectorColumn}" <=> %s::vector) >= %s'
            )
            whereValues.extend([vectorStr, minScore])

        whereClause = ""
        if whereConditions:
            whereClause = " WHERE " + " AND ".join(whereConditions)

        query = (
            f'SELECT *, 1 - ("{vectorColumn}" <=> %s::vector) AS "_score" '
            f'FROM "{table}"{whereClause} '
            f'ORDER BY "{vectorColumn}" <=> %s::vector '
            f'LIMIT %s'
        )
        # Placeholder order: SELECT vector, WHERE values, ORDER BY vector, LIMIT
        params = [vectorStr] + whereValues + [vectorStr, limit]

        with self.connection.cursor() as cursor:
            cursor.execute(query, params)
            records = [dict(row) for row in cursor.fetchall()]

        fields = _get_model_fields(modelClass)
        for record in records:
            _parseRecordFields(record, fields, f"semanticSearch {table}")
        return records
    except Exception as e:
        logger.error(f"Error in semantic search on {table}: {e}")
        # A failed statement leaves the open transaction in an aborted state.
        # Roll back (as the other error handlers of this connector do) so the
        # connection stays usable for subsequent queries.
        connection = getattr(self, "connection", None)
        if connection:
            try:
                connection.rollback()
            except Exception as rollbackError:
                logger.error(f"Rollback after failed semantic search on {table} failed: {rollbackError}")
        return []
def close(self):
"""Close the database connection."""
if (
@ -1141,5 +1173,4 @@ class DatabaseConnector:
try:
self.close()
except Exception:
# Ignore errors during cleanup
pass

View file

@ -0,0 +1,54 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Abstract base classes for the Provider-Connector architecture (1:n).
One ProviderConnector per vendor (e.g. MsftConnector, GoogleConnector).
Each ProviderConnector exposes n ServiceAdapters (e.g. SharepointAdapter, OutlookAdapter).
All ServiceAdapters share the same access token from the UserConnection.
"""
from abc import ABC, abstractmethod
from typing import List, Optional
class ServiceAdapter(ABC):
    """Uniform set of operations for one single service of a provider.

    Concrete adapters must implement all four coroutines declared here.
    """

    @abstractmethod
    async def browse(self, path: str, filter: Optional[str] = None) -> list:
        """Return the items (files/folders) found at ``path``.

        ``filter`` is an optional string; its interpretation is left to the adapter.
        """

    @abstractmethod
    async def download(self, path: str) -> bytes:
        """Fetch the file at ``path`` and return its raw content."""

    @abstractmethod
    async def upload(self, path: str, data: bytes, fileName: str) -> dict:
        """Store ``data`` as ``fileName`` at ``path`` and return the created entry's metadata."""

    @abstractmethod
    async def search(self, query: str, path: Optional[str] = None) -> list:
        """Return the items that match ``query``; ``path`` is optional."""
class ProviderConnector(ABC):
    """Connector for exactly one provider.

    Holds the UserConnection together with its access token and hands out
    the ServiceAdapters for the n services of that provider.
    """

    def __init__(self, connection, accessToken: str):
        # Both values are stored as given; adapters read the token from here.
        self.accessToken = accessToken
        self.connection = connection

    @abstractmethod
    def getAvailableServices(self) -> List[str]:
        """Return the names of the services this provider offers."""

    @abstractmethod
    def getServiceAdapter(self, service: str) -> ServiceAdapter:
        """Return the ServiceAdapter that belongs to the service named ``service``."""

View file

@ -0,0 +1,94 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""ConnectorResolver -- resolves a connectionId to the correct ProviderConnector and ServiceAdapter.
Registry maps authority values to ProviderConnector classes.
The resolver loads the UserConnection, obtains a fresh token via SecurityService,
and instantiates the appropriate connector.
"""
import logging
from typing import Dict, Any, Type, Optional
from modules.connectors.connectorProviderBase import ProviderConnector, ServiceAdapter
logger = logging.getLogger(__name__)
class ConnectorResolver:
    """Turns a connectionId into a ProviderConnector (with fresh token) and further into a ServiceAdapter."""

    # Authority value -> ProviderConnector class; shared by all resolver instances.
    _providerRegistry: Dict[str, Type[ProviderConnector]] = {}

    def __init__(self, securityService, dbInterface):
        """
        Args:
            securityService: SecurityService instance (used for getFreshToken)
            dbInterface: DB interface offering getUserConnection(connectionId)
                or, alternatively, loadRecord(modelClass, recordId)
        """
        self._db = dbInterface
        self._security = securityService
        self._ensureRegistered()

    def _ensureRegistered(self):
        """Fill the class-level registry with the known providers, once.

        Providers whose module cannot be imported are skipped (and logged).
        Nothing happens when the registry already has at least one entry.
        """
        registry = ConnectorResolver._providerRegistry
        if registry:
            return

        try:
            from modules.connectors.providerMsft.connectorMsft import MsftConnector
            registry["msft"] = MsftConnector
        except ImportError:
            logger.warning("MsftConnector not available")

        try:
            from modules.connectors.providerGoogle.connectorGoogle import GoogleConnector
            registry["google"] = GoogleConnector
        except ImportError:
            logger.debug("GoogleConnector not available (stub)")

        try:
            from modules.connectors.providerFtp.connectorFtp import FtpConnector
            registry["local:ftp"] = FtpConnector
        except ImportError:
            logger.debug("FtpConnector not available (stub)")

    async def resolve(self, connectionId: str) -> ProviderConnector:
        """Build the ProviderConnector for ``connectionId`` using a fresh access token.

        Raises:
            ValueError: the connection is unknown, has no authority, no provider is
                registered for its authority, or no valid token is available.
        """
        userConnection = await self._loadConnection(connectionId)
        if not userConnection:
            raise ValueError(f"UserConnection not found: {connectionId}")

        authority = getattr(userConnection, "authority", None)
        if not authority:
            raise ValueError(f"Connection {connectionId} has no authority")

        # Enum-like authorities expose .value; anything else is stringified.
        if hasattr(authority, "value"):
            authorityStr = authority.value
        else:
            authorityStr = str(authority)

        connectorClass = self._providerRegistry.get(authorityStr)
        if not connectorClass:
            raise ValueError(f"No ProviderConnector registered for authority: {authorityStr}")

        freshToken = self._security.getFreshToken(connectionId)
        if not freshToken or not freshToken.tokenAccess:
            raise ValueError(f"No valid token for connection {connectionId}")

        return connectorClass(userConnection, freshToken.tokenAccess)

    async def resolveService(self, connectionId: str, service: str) -> ServiceAdapter:
        """Return the ServiceAdapter named ``service`` of the provider behind ``connectionId``.

        Raises:
            ValueError: the provider does not list ``service`` among its services
                (or resolving the connection failed, see resolve()).
        """
        connector = await self.resolve(connectionId)
        available = connector.getAvailableServices()
        if service in available:
            return connector.getServiceAdapter(service)
        raise ValueError(f"Service '{service}' not available. Options: {available}")

    async def _loadConnection(self, connectionId: str) -> Optional[Any]:
        """Fetch the UserConnection from the DB interface; None when unavailable or on error."""
        db = self._db
        try:
            if hasattr(db, "getUserConnection"):
                return db.getUserConnection(connectionId)
            if hasattr(db, "loadRecord"):
                from modules.datamodels.datamodelUam import UserConnection
                return db.loadRecord(UserConnection, connectionId)
        except Exception as e:
            logger.error(f"Failed to load connection {connectionId}: {e}")
        return None

View file

@ -1,4 +1,3 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""FTP/SFTP Provider Connector stub."""

View file

@ -0,0 +1,48 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""FTP/SFTP ProviderConnector stub.
Implements the ProviderConnector interface for FTP/SFTP file access.
Full implementation follows when FTP integration is prioritized.
"""
import logging
from typing import List, Optional
from modules.connectors.connectorProviderBase import ProviderConnector, ServiceAdapter
from modules.datamodels.datamodelDataSource import ExternalEntry
logger = logging.getLogger(__name__)
class FtpFilesAdapter(ServiceAdapter):
    """Placeholder ServiceAdapter for FTP files.

    No operation talks to a server yet: every method returns an empty result
    or an error marker.
    """

    def __init__(self, accessToken: str):
        self._accessToken = accessToken

    async def browse(self, path: str, filter: Optional[str] = None) -> List[ExternalEntry]:
        """Stub: logs the requested path and returns no entries."""
        logger.info(f"FTP browse stub: {path}")
        noEntries: List[ExternalEntry] = []
        return noEntries

    async def download(self, path: str) -> bytes:
        """Stub: logs the requested path and returns empty content."""
        logger.info(f"FTP download stub: {path}")
        return bytes()

    async def upload(self, path: str, data: bytes, fileName: str) -> dict:
        """Stub: always reports that upload is not implemented."""
        return dict(error="FTP upload not yet implemented")

    async def search(self, query: str, path: Optional[str] = None) -> List[ExternalEntry]:
        """Stub: never finds anything."""
        return list()
class FtpConnector(ProviderConnector):
    """ProviderConnector for FTP: one connection, one service ("files")."""

    def getAvailableServices(self) -> List[str]:
        """Return the single supported service name."""
        return ["files"]

    def getServiceAdapter(self, service: str) -> ServiceAdapter:
        """Return the files adapter; any other service name raises ValueError."""
        if service == "files":
            return FtpFilesAdapter(self.accessToken)
        raise ValueError(f"FTP only supports 'files' service, got '{service}'")

View file

@ -0,0 +1,3 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Google Provider Connector -- 1 Connection : n Services (Drive, Gmail)."""

View file

@ -0,0 +1,232 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Google ProviderConnector -- Drive and Gmail via Google OAuth."""
import logging
from typing import Any, Dict, List, Optional
import aiohttp
from modules.connectors.connectorProviderBase import ProviderConnector, ServiceAdapter
from modules.datamodels.datamodelDataSource import ExternalEntry
logger = logging.getLogger(__name__)
_DRIVE_BASE = "https://www.googleapis.com/drive/v3"
_GMAIL_BASE = "https://gmail.googleapis.com/gmail/v1"
async def _googleGet(token: str, url: str) -> Dict[str, Any]:
    """GET ``url`` with a Bearer token and return the decoded JSON body.

    Never raises: on a status other than 200/201, or on any exception, a dict
    with a single "error" key describing the problem is returned instead.
    The whole request is limited to 20 seconds.
    """
    authHeaders = {"Authorization": f"Bearer {token}"}
    requestTimeout = aiohttp.ClientTimeout(total=20)
    try:
        async with aiohttp.ClientSession(timeout=requestTimeout) as session:
            async with session.get(url, headers=authHeaders) as resp:
                if resp.status not in (200, 201):
                    errorText = await resp.text()
                    # Log and return only a prefix of the body to keep messages short
                    logger.warning(f"Google API {resp.status}: {errorText[:300]}")
                    return {"error": f"{resp.status}: {errorText[:200]}"}
                return await resp.json()
    except Exception as e:
        return {"error": str(e)}
class DriveAdapter(ServiceAdapter):
    """Google Drive ServiceAdapter -- browse, download and search files and folders.

    Paths are Drive ids: "/<fileOrFolderId>"; an empty path means the "root" folder
    when browsing.
    """

    # Native Google formats cannot be fetched with alt=media; they are exported
    # to the MIME type listed here instead.
    _EXPORT_MIME_MAP = {
        "application/vnd.google-apps.document": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.google-apps.spreadsheet": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.google-apps.presentation": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/vnd.google-apps.drawing": "application/pdf",
    }

    def __init__(self, accessToken: str):
        self._token = accessToken

    @staticmethod
    def _escapeQueryValue(value: str) -> str:
        """Escape a value for use inside single quotes of a Drive ``q`` expression.

        Backslashes are escaped first so the escapes added for quotes are not doubled.
        """
        return value.replace("\\", "\\\\").replace("'", "\\'")

    @staticmethod
    def _buildUrl(*segments: str, params: Optional[Dict[str, Any]] = None) -> str:
        """Build a Drive API URL with percent-encoded path segments and query parameters."""
        from urllib.parse import quote, urlencode

        url = "/".join([_DRIVE_BASE] + [quote(str(s), safe="") for s in segments])
        if params:
            # quote_via=quote encodes spaces as %20 and leaves no raw '&', '#', '+' behind
            url += "?" + urlencode(params, quote_via=quote)
        return url

    async def browse(self, path: str, filter: Optional[str] = None) -> List[ExternalEntry]:
        """List the non-trashed children of the folder given by ``path`` (max. 100).

        Returns an empty list when the API call fails. ``filter`` is not used.
        """
        folderId = (path or "").strip("/") or "root"
        query = f"'{self._escapeQueryValue(folderId)}' in parents and trashed=false"
        url = self._buildUrl("files", params={
            "q": query,
            "fields": "files(id,name,mimeType,size,modifiedTime,parents)",
            "pageSize": 100,
            "orderBy": "folder,name",
        })
        result = await _googleGet(self._token, url)
        if "error" in result:
            logger.warning(f"Google Drive browse failed: {result['error']}")
            return []

        entries = []
        for f in result.get("files", []):
            isFolder = f.get("mimeType") == "application/vnd.google-apps.folder"
            entries.append(ExternalEntry(
                name=f.get("name", ""),
                path=f"/{f.get('id', '')}",
                isFolder=isFolder,
                size=int(f.get("size", 0)) if f.get("size") else None,
                mimeType=f.get("mimeType") if not isFolder else None,
                metadata={"id": f.get("id"), "modifiedTime": f.get("modifiedTime")},
            ))
        return entries

    async def download(self, path: str) -> bytes:
        """Download the file given by ``path`` and return its bytes.

        A direct download is tried first. If that does not answer 200, the file's
        metadata is read and, for the native Google formats in ``_EXPORT_MIME_MAP``,
        the file is exported instead. Returns ``b""`` for an empty path and on
        every failure.
        """
        fileId = (path or "").strip("/")
        if not fileId:
            return b""

        headers = {"Authorization": f"Bearer {self._token}"}
        timeout = aiohttp.ClientTimeout(total=60)
        try:
            async with aiohttp.ClientSession(timeout=timeout) as session:
                # Try direct download first
                url = self._buildUrl("files", fileId, params={"alt": "media"})
                async with session.get(url, headers=headers) as resp:
                    if resp.status == 200:
                        return await resp.read()
                    logger.debug(f"Google Drive direct download returned {resp.status} for {fileId}")

                # Direct download failed: check if it's a native Google file that needs export
                metaUrl = self._buildUrl("files", fileId, params={"fields": "mimeType,name"})
                async with session.get(metaUrl, headers=headers) as metaResp:
                    if metaResp.status != 200:
                        logger.warning(f"Google Drive metadata fetch failed ({metaResp.status}) for {fileId}")
                        return b""
                    meta = await metaResp.json()

                fileMime = meta.get("mimeType", "")
                fileName = meta.get("name", fileId)
                exportMime = self._EXPORT_MIME_MAP.get(fileMime)
                if not exportMime:
                    logger.warning(f"Google Drive: unsupported mimeType '{fileMime}' for file '{fileName}' ({fileId})")
                    return b""

                exportUrl = self._buildUrl("files", fileId, "export", params={"mimeType": exportMime})
                logger.info(f"Google Drive: exporting '{fileName}' as {exportMime}")
                async with session.get(exportUrl, headers=headers) as exportResp:
                    if exportResp.status == 200:
                        return await exportResp.read()
                    logger.warning(f"Google Drive export failed ({exportResp.status}) for '{fileName}'")
        except Exception as e:
            logger.error(f"Google Drive download failed for {fileId}: {e}")
        return b""

    async def upload(self, path: str, data: bytes, fileName: str) -> dict:
        """Not implemented yet; always returns an error marker."""
        return {"error": "Google Drive upload not yet implemented"}

    async def search(self, query: str, path: Optional[str] = None) -> List[ExternalEntry]:
        """Find non-trashed files whose name contains ``query`` (max. 25).

        When ``path`` is given, only direct children of that folder are searched.
        Returns an empty list when the API call fails.
        """
        safeQuery = self._escapeQueryValue(query)
        folderId = (path or "").strip("/")
        qParts = [f"name contains '{safeQuery}'", "trashed=false"]
        if folderId:
            qParts.append(f"'{self._escapeQueryValue(folderId)}' in parents")
        qStr = " and ".join(qParts)

        url = self._buildUrl("files", params={
            "q": qStr,
            "fields": "files(id,name,mimeType,size)",
            "pageSize": 25,
        })
        logger.debug(f"Google Drive search: q={qStr}")
        result = await _googleGet(self._token, url)
        if "error" in result:
            return []
        return [
            ExternalEntry(
                name=f.get("name", ""),
                path=f"/{f.get('id', '')}",
                isFolder=f.get("mimeType") == "application/vnd.google-apps.folder",
                size=int(f.get("size", 0)) if f.get("size") else None,
            )
            for f in result.get("files", [])
        ]
class GmailAdapter(ServiceAdapter):
    """Gmail ServiceAdapter -- browse labels and messages, search messages.

    Paths: "" lists the labels, "/<labelId>" lists the messages of a label.
    """

    def __init__(self, accessToken: str):
        self._token = accessToken

    @staticmethod
    def _buildUrl(*segments: str, params: Optional[list] = None) -> str:
        """Build a Gmail API URL with percent-encoded path segments and query parameters.

        ``params`` is a list of (name, value) pairs so a name may occur repeatedly.
        """
        from urllib.parse import quote, urlencode

        url = "/".join([_GMAIL_BASE] + [quote(str(s), safe="") for s in segments])
        if params:
            # quote_via=quote encodes spaces as %20 and leaves no raw '&', '#', '+' behind
            url += "?" + urlencode(params, quote_via=quote)
        return url

    async def browse(self, path: str, filter: Optional[str] = None) -> list:
        """List labels (empty path) or the first 25 messages of the label in ``path``.

        Labels: user labels plus a fixed selection of system labels, system
        labels first, then by name. Messages: one entry per message with subject,
        sender, date and snippet; a message whose details cannot be loaded is
        listed as "Message <id>". Returns an empty list when the listing call
        fails. ``filter`` is not used.
        """
        cleanPath = (path or "").strip("/")
        if not cleanPath:
            url = self._buildUrl("users", "me", "labels")
            result = await _googleGet(self._token, url)
            if "error" in result:
                logger.warning(f"Gmail labels failed: {result['error']}")
                return []

            # System labels that are shown; all other system labels are hidden
            _SYSTEM_LABELS = {"INBOX", "SENT", "DRAFT", "TRASH", "SPAM", "STARRED", "IMPORTANT"}
            labels = []
            for lbl in result.get("labels", []):
                labelId = lbl.get("id", "")
                labelName = lbl.get("name", labelId)
                if lbl.get("type") == "system" and labelId not in _SYSTEM_LABELS:
                    continue
                labels.append(ExternalEntry(
                    name=labelName,
                    path=f"/{labelId}",
                    isFolder=True,
                    metadata={"id": labelId, "type": lbl.get("type", "")},
                ))
            labels.sort(key=lambda e: (0 if e.metadata.get("type") == "system" else 1, e.name))
            return labels

        url = self._buildUrl("users", "me", "messages", params=[
            ("labelIds", cleanPath),
            ("maxResults", 25),
        ])
        result = await _googleGet(self._token, url)
        if "error" in result:
            return []

        entries = []
        for msg in result.get("messages", [])[:25]:
            msgId = msg.get("id", "")
            # Only the headers needed for the listing are requested
            detailUrl = self._buildUrl("users", "me", "messages", msgId, params=[
                ("format", "metadata"),
                ("metadataHeaders", "Subject"),
                ("metadataHeaders", "From"),
                ("metadataHeaders", "Date"),
            ])
            detail = await _googleGet(self._token, detailUrl)
            if "error" in detail:
                entries.append(ExternalEntry(name=f"Message {msgId}", path=f"/{cleanPath}/{msgId}", isFolder=False))
                continue
            headers = {h.get("name", ""): h.get("value", "") for h in detail.get("payload", {}).get("headers", [])}
            entries.append(ExternalEntry(
                name=headers.get("Subject", "(no subject)"),
                path=f"/{cleanPath}/{msgId}",
                isFolder=False,
                metadata={
                    "id": msgId,
                    "from": headers.get("From", ""),
                    "date": headers.get("Date", ""),
                    "snippet": detail.get("snippet", ""),
                },
            ))
        return entries

    async def download(self, path: str) -> bytes:
        """Not supported; always returns empty content."""
        return b""

    async def upload(self, path: str, data: bytes, fileName: str) -> dict:
        """Not applicable for Gmail; always returns an error marker."""
        return {"error": "Gmail upload not applicable"}

    async def search(self, query: str, path: Optional[str] = None) -> list:
        """Search messages with the Gmail query ``query`` (max. 10 results).

        Only message ids are returned (as "Message <id>" entries); ``path`` is
        not used. Returns an empty list when the API call fails.
        """
        url = self._buildUrl("users", "me", "messages", params=[
            ("q", query),
            ("maxResults", 10),
        ])
        result = await _googleGet(self._token, url)
        if "error" in result:
            return []
        return [
            ExternalEntry(
                name=f"Message {m.get('id', '')}",
                path=f"/{m.get('id', '')}",
                isFolder=False,
                metadata={"id": m.get("id")},
            )
            for m in result.get("messages", [])
        ]
class GoogleConnector(ProviderConnector):
    """ProviderConnector for Google: one connection serves Drive and Gmail."""

    # Service name -> adapter class; every adapter is built from the access token alone.
    _SERVICE_MAP = {"drive": DriveAdapter, "gmail": GmailAdapter}

    def getAvailableServices(self) -> List[str]:
        """Return the names of the supported services."""
        return [serviceName for serviceName in self._SERVICE_MAP]

    def getServiceAdapter(self, service: str) -> ServiceAdapter:
        """Instantiate the adapter for ``service``; unknown names raise ValueError."""
        adapterClass = self._SERVICE_MAP.get(service)
        if adapterClass:
            return adapterClass(self.accessToken)
        raise ValueError(f"Unknown Google service: {service}. Available: {list(self._SERVICE_MAP.keys())}")

View file

@ -0,0 +1,3 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Microsoft Provider Connector -- 1 Connection : n Services (SharePoint, Outlook, Teams, OneDrive)."""

View file

@ -0,0 +1,459 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Microsoft ProviderConnector -- one MSFT connection serves SharePoint, Outlook, Teams, OneDrive.
All ServiceAdapters share the same OAuth access token obtained from the
UserConnection (authority=msft).
"""
import asyncio
import logging
from typing import Dict, Any, List, Optional
from urllib.parse import quote

import aiohttp

from modules.connectors.connectorProviderBase import ProviderConnector, ServiceAdapter
from modules.datamodels.datamodelDataSource import ExternalEntry
logger = logging.getLogger(__name__)
_GRAPH_BASE = "https://graph.microsoft.com/v1.0"
class _GraphApiMixin:
    """Graph API plumbing shared by every MSFT service adapter.

    Holds the OAuth access token and exposes one thin coroutine per HTTP verb.
    """

    def __init__(self, accessToken: str):
        self._accessToken = accessToken

    async def _graphGet(self, endpoint: str) -> Dict[str, Any]:
        """Issue a GET against ``endpoint`` (relative to the Graph base URL)."""
        response = await _makeGraphCall(self._accessToken, endpoint, "GET")
        return response

    async def _graphPost(self, endpoint: str, data: Any = None) -> Dict[str, Any]:
        """Issue a POST against ``endpoint`` with an optional request body."""
        response = await _makeGraphCall(self._accessToken, endpoint, "POST", data)
        return response

    async def _graphPut(self, endpoint: str, data: bytes = None) -> Dict[str, Any]:
        """Issue a PUT against ``endpoint`` with an optional request body."""
        response = await _makeGraphCall(self._accessToken, endpoint, "PUT", data)
        return response

    async def _graphDelete(self, endpoint: str) -> Dict[str, Any]:
        """Issue a DELETE against ``endpoint``."""
        response = await _makeGraphCall(self._accessToken, endpoint, "DELETE")
        return response

    async def _graphDownload(self, endpoint: str) -> Optional[bytes]:
        """Fetch the raw response body of ``endpoint``.

        Returns the bytes on HTTP 200. Any other status, and any exception,
        is logged and yields None.
        """
        targetUrl = f"{_GRAPH_BASE}/{endpoint.lstrip('/')}"
        authHeaders = {"Authorization": f"Bearer {self._accessToken}"}
        downloadTimeout = aiohttp.ClientTimeout(total=60)
        try:
            async with aiohttp.ClientSession(timeout=downloadTimeout) as session:
                async with session.get(targetUrl, headers=authHeaders) as resp:
                    if resp.status != 200:
                        logger.error(f"Download failed {resp.status}: {await resp.text()}")
                        return None
                    return await resp.read()
        except Exception as e:
            logger.error(f"Graph download error: {e}")
            return None
async def _makeGraphCall(
    token: str, endpoint: str, method: str = "GET", data: Any = None
) -> Dict[str, Any]:
    """Execute a single Microsoft Graph API call.

    Args:
        token: OAuth bearer token placed in the Authorization header.
        endpoint: Path relative to ``_GRAPH_BASE``; a leading slash is optional.
        method: One of "GET", "POST", "PUT", "DELETE" (upper case).
        data: Optional request body. ``bytes`` on a PUT are sent as
            ``application/octet-stream``; a ``dict`` or ``list`` is serialised
            to JSON; anything else is handed to aiohttp unchanged under a JSON
            content type.

    Returns:
        The dict produced by ``_handleResponse``, ``{}`` for a DELETE answered
        with 200/204, or ``{"error": ...}`` for an unsupported method, a
        timeout, or any other exception raised while sending.
    """
    import json

    if method not in ("GET", "POST", "PUT", "DELETE"):
        return {"error": f"Unsupported method: {method}"}

    url = f"{_GRAPH_BASE}/{endpoint.lstrip('/')}"
    isBinaryUpload = method == "PUT" and isinstance(data, bytes)
    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/octet-stream" if isBinaryUpload else "application/json",
    }
    timeout = aiohttp.ClientTimeout(total=30)
    try:
        kwargs: Dict[str, Any] = {"headers": headers}
        if isinstance(data, (dict, list)):
            # aiohttp would form-encode a dict passed as data=, contradicting the
            # JSON Content-Type above; serialise it explicitly instead.
            kwargs["data"] = json.dumps(data).encode("utf-8")
        elif data is not None:
            kwargs["data"] = data
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.request(method, url, **kwargs) as resp:
                # A successful DELETE carries no body worth parsing.
                if method == "DELETE" and resp.status in (200, 204):
                    return {}
                return await _handleResponse(resp)
    except asyncio.TimeoutError:
        return {"error": f"Graph API timeout: {endpoint}"}
    except Exception as e:
        return {"error": f"Graph API error: {e}"}
async def _handleResponse(resp: aiohttp.ClientResponse) -> Dict[str, Any]:
    """Translate a Graph HTTP response into a result dict.

    Returns:
        The parsed JSON body for 200/201, an empty dict for the body-less
        success codes 202/204, and ``{"error": "<status>: <body>"}`` for
        everything else (which is also logged).
    """
    # Graph answers actions such as me/sendMail with 202 Accepted and no body;
    # treating that as a failure made successful sends look like errors.
    if resp.status in (202, 204):
        return {}
    if resp.status in (200, 201):
        return await resp.json()
    errorText = await resp.text()
    logger.error(f"Graph API {resp.status}: {errorText}")
    return {"error": f"{resp.status}: {errorText}"}
def _graphItemToExternalEntry(item: Dict[str, Any], basePath: str = "") -> ExternalEntry:
isFolder = "folder" in item
return ExternalEntry(
name=item.get("name", ""),
path=f"{basePath}/{item.get('name', '')}" if basePath else item.get("name", ""),
isFolder=isFolder,
size=item.get("size"),
mimeType=item.get("file", {}).get("mimeType") if not isFolder else None,
lastModified=None,
metadata={
"id": item.get("id"),
"webUrl": item.get("webUrl"),
"childCount": item.get("folder", {}).get("childCount") if isFolder else None,
},
)
# ---------------------------------------------------------------------------
# SharePoint Adapter
# ---------------------------------------------------------------------------
class SharepointAdapter(_GraphApiMixin, ServiceAdapter):
    """ServiceAdapter for SharePoint (files, sites) via Microsoft Graph.

    Folder and file paths are percent-encoded before being placed in a Graph
    URL so that names containing "#", "?" or "%" address the intended item.
    """

    async def browse(self, path: str, filter: Optional[str] = None) -> List[ExternalEntry]:
        """List items in a SharePoint folder.

        Path format: /sites/<siteId>/<FolderPath>
        Root "/" (or a path without a site ID) lists available sites via discovery.

        Args:
            path: Location to list.
            filter: Optional glob pattern applied to entry names.

        Returns:
            The matching entries, or an empty list when Graph reports an error.
        """
        if not path or path == "/":
            return await self._discoverSites()

        siteId, folderPath = _parseSharepointPath(path)
        if not siteId:
            return await self._discoverSites()

        if not folderPath or folderPath == "/":
            endpoint = f"sites/{siteId}/drive/root/children"
        else:
            cleanPath = quote(folderPath.lstrip("/"))
            endpoint = f"sites/{siteId}/drive/root:/{cleanPath}:/children"

        result = await self._graphGet(endpoint)
        if "error" in result:
            logger.warning(f"SharePoint browse failed: {result['error']}")
            return []

        entries = [_graphItemToExternalEntry(item, path) for item in result.get("value", [])]
        if filter:
            entries = [e for e in entries if _matchFilter(e, filter)]
        return entries

    async def _discoverSites(self) -> List[ExternalEntry]:
        """Discover accessible SharePoint sites (at most 50 are requested).

        Sites without a displayName are skipped. Returns an empty list when
        Graph reports an error.
        """
        result = await self._graphGet("sites?search=*&$top=50")
        if "error" in result:
            logger.warning(f"SharePoint site discovery failed: {result['error']}")
            return []
        return [
            ExternalEntry(
                name=s.get("displayName") or s.get("name", ""),
                path=f"/sites/{s.get('id', '')}",
                isFolder=True,
                metadata={
                    "id": s.get("id"),
                    "webUrl": s.get("webUrl"),
                    "description": s.get("description", ""),
                },
            )
            for s in result.get("value", [])
            if s.get("displayName")
        ]

    async def download(self, path: str) -> bytes:
        """Download the file at /sites/<siteId>/<filePath>.

        Returns empty bytes when the path lacks a site ID or file path, or
        when the download fails.
        """
        siteId, filePath = _parseSharepointPath(path)
        if not siteId or not filePath:
            return b""
        cleanPath = quote(filePath.strip("/"))
        endpoint = f"sites/{siteId}/drive/root:/{cleanPath}:/content"
        data = await self._graphDownload(endpoint)
        return data or b""

    async def upload(self, path: str, data: bytes, fileName: str) -> dict:
        """Upload ``data`` as ``fileName`` into the folder addressed by ``path``.

        Returns the Graph result dict, or an error dict when ``path`` carries
        no site ID.
        """
        siteId, folderPath = _parseSharepointPath(path)
        if not siteId:
            return {"error": "Invalid SharePoint path"}
        cleanFolder = (folderPath or "").strip("/")
        uploadPath = f"{cleanFolder}/{fileName}" if cleanFolder else fileName
        endpoint = f"sites/{siteId}/drive/root:/{quote(uploadPath)}:/content"
        result = await self._graphPut(endpoint, data)
        return result

    async def search(self, query: str, path: Optional[str] = None) -> List[ExternalEntry]:
        """Search the drive of the site named in ``path``.

        Returns an empty list when ``path`` has no site ID or Graph reports an
        error. Result entries carry the bare item name as their path.
        """
        siteId, _ = _parseSharepointPath(path or "")
        if not siteId:
            return []
        # Double single quotes for the quoted literal, then percent-encode so
        # "#", "?", "/" and "%" in the search text cannot break the URL.
        safeQuery = quote(query.replace("'", "''"), safe="")
        endpoint = f"sites/{siteId}/drive/root/search(q='{safeQuery}')"
        result = await self._graphGet(endpoint)
        if "error" in result:
            return []
        return [_graphItemToExternalEntry(item) for item in result.get("value", [])]
# ---------------------------------------------------------------------------
# Outlook Adapter
# ---------------------------------------------------------------------------
class OutlookAdapter(_GraphApiMixin, ServiceAdapter):
    """ServiceAdapter for Outlook mail via Microsoft Graph."""

    @staticmethod
    def _senderAddress(message: Dict[str, Any]) -> Optional[str]:
        """Return the sender address of a Graph message, tolerating a missing or null ``from``."""
        sender = message.get("from") or {}
        return (sender.get("emailAddress") or {}).get("address")

    async def browse(self, path: str, filter: Optional[str] = None) -> List[ExternalEntry]:
        """List mail folders or messages.

        path = "" or "/"     list mail folders
        path = "/<folderId>" list the 25 most recently received messages

        ``filter`` is accepted for interface compatibility and not applied.
        Returns an empty list when Graph reports an error.
        """
        if not path or path == "/":
            result = await self._graphGet("me/mailFolders")
            if "error" in result:
                return []
            return [
                ExternalEntry(
                    name=f.get("displayName", ""),
                    path=f"/{f.get('id', '')}",
                    isFolder=True,
                    metadata={"id": f.get("id"), "totalItemCount": f.get("totalItemCount")},
                )
                for f in result.get("value", [])
            ]

        folderId = path.strip("/")
        endpoint = f"me/mailFolders/{folderId}/messages?$top=25&$orderby=receivedDateTime desc"
        result = await self._graphGet(endpoint)
        if "error" in result:
            return []
        return [
            ExternalEntry(
                name=m.get("subject", "(no subject)"),
                path=f"{path}/{m.get('id', '')}",
                isFolder=False,
                metadata={
                    "id": m.get("id"),
                    "from": self._senderAddress(m),
                    "receivedDateTime": m.get("receivedDateTime"),
                    "hasAttachments": m.get("hasAttachments", False),
                },
            )
            for m in result.get("value", [])
        ]

    async def download(self, path: str) -> bytes:
        """Download a mail message as UTF-8 JSON bytes.

        The message ID is the last segment of ``path``. Returns empty bytes
        when Graph reports an error.
        """
        import json
        messageId = path.strip("/").split("/")[-1]
        result = await self._graphGet(f"me/messages/{messageId}")
        if "error" in result:
            return b""
        return json.dumps(result, ensure_ascii=False).encode("utf-8")

    async def upload(self, path: str, data: bytes, fileName: str) -> dict:
        """Not applicable for Outlook in the file sense; always returns an error dict."""
        return {"error": "Upload not supported for Outlook"}

    async def search(self, query: str, path: Optional[str] = None) -> List[ExternalEntry]:
        """Search all messages of the mailbox (at most 25 hits are requested).

        ``path`` is accepted for interface compatibility and not used.
        Returns an empty list when Graph reports an error.
        """
        # Percent-encode the user text so "&", "#" or "+" cannot terminate or
        # alter the $search parameter.
        # NOTE(review): single quotes are doubled as before; confirm whether
        # $search expects that escaping inside its double-quoted value.
        safeQuery = quote(query.replace("'", "''"), safe="")
        endpoint = f"me/messages?$search=\"{safeQuery}\"&$top=25"
        result = await self._graphGet(endpoint)
        if "error" in result:
            return []
        return [
            ExternalEntry(
                name=m.get("subject", "(no subject)"),
                path=f"/search/{m.get('id', '')}",
                isFolder=False,
                metadata={
                    "id": m.get("id"),
                    "from": self._senderAddress(m),
                    "receivedDateTime": m.get("receivedDateTime"),
                },
            )
            for m in result.get("value", [])
        ]

    async def sendMail(
        self, to: List[str], subject: str, body: str,
        cc: Optional[List[str]] = None, attachments: Optional[List[Dict]] = None
    ) -> Dict[str, Any]:
        """Send a plain-text email via Microsoft Graph.

        Args:
            to: Recipient addresses.
            subject: Subject line.
            body: Plain-text body.
            cc: Optional carbon-copy addresses.
            attachments: Currently NOT transmitted; a warning is logged when
                any are supplied.

        Returns:
            ``{"success": True}``, or the error dict reported by the Graph call.
        """
        import json
        if attachments:
            # Make the drop visible instead of discarding attachments silently.
            logger.warning(f"sendMail: {len(attachments)} attachment(s) ignored -- not supported yet")
        message: Dict[str, Any] = {
            "subject": subject,
            "body": {"contentType": "Text", "content": body},
            "toRecipients": [{"emailAddress": {"address": addr}} for addr in to],
        }
        if cc:
            message["ccRecipients"] = [{"emailAddress": {"address": addr}} for addr in cc]
        payload = json.dumps({"message": message, "saveToSentItems": True}).encode("utf-8")
        result = await self._graphPost("me/sendMail", payload)
        if "error" in result:
            return result
        return {"success": True}
# ---------------------------------------------------------------------------
# Teams Adapter (browse only; download, upload and search are stubs)
# ---------------------------------------------------------------------------
class TeamsAdapter(_GraphApiMixin, ServiceAdapter):
    """ServiceAdapter for Microsoft Teams -- lists joined teams and their channels."""

    async def browse(self, path: str, filter: Optional[str] = None) -> list:
        """Walk the team hierarchy.

        An empty path lists the joined teams, "/<teamId>" lists that team's
        channels, and anything deeper yields an empty list. ``filter`` is not
        applied. Graph errors also yield an empty list.
        """
        trimmed = (path or "").strip("/")

        if trimmed:
            teamId, slash, _ = trimmed.partition("/")
            if slash:
                # Nothing is listed below channel level.
                return []
            channelResult = await self._graphGet(f"teams/{teamId}/channels")
            if "error" in channelResult:
                return []
            channels = []
            for ch in channelResult.get("value", []):
                channels.append(ExternalEntry(
                    name=ch.get("displayName", ""),
                    path=f"/{teamId}/{ch.get('id', '')}",
                    isFolder=True,
                    metadata={"id": ch.get("id"), "membershipType": ch.get("membershipType", "")},
                ))
            return channels

        teamResult = await self._graphGet("me/joinedTeams")
        if "error" in teamResult:
            logger.warning(f"Teams browse failed: {teamResult['error']}")
            return []
        teams = []
        for t in teamResult.get("value", []):
            teams.append(ExternalEntry(
                name=t.get("displayName", ""),
                path=f"/{t.get('id', '')}",
                isFolder=True,
                metadata={"id": t.get("id"), "description": t.get("description", "")},
            ))
        return teams

    async def download(self, path: str) -> bytes:
        """Not implemented; always returns empty bytes."""
        return bytes()

    async def upload(self, path: str, data: bytes, fileName: str) -> dict:
        """Not implemented; always returns an error dict."""
        return dict(error="Teams upload not implemented")

    async def search(self, query: str, path: Optional[str] = None) -> list:
        """Not implemented; always returns an empty list."""
        return list()
# ---------------------------------------------------------------------------
# OneDrive Adapter (personal drive; mirrors the SharePoint adapter)
# ---------------------------------------------------------------------------
class OneDriveAdapter(_GraphApiMixin, ServiceAdapter):
    """ServiceAdapter for the signed-in user's OneDrive (``me/drive``) via Microsoft Graph.

    Paths are relative to the drive root and are percent-encoded before being
    placed in a Graph URL, so names containing "#", "?" or "%" address the
    intended item.
    """

    async def browse(self, path: str, filter: Optional[str] = None) -> List[ExternalEntry]:
        """List the children of ``path`` (the drive root when empty or "/").

        ``filter`` is an optional glob pattern applied to entry names.
        Returns an empty list when Graph reports an error.
        """
        cleanPath = (path or "").strip("/")
        if not cleanPath:
            endpoint = "me/drive/root/children"
        else:
            endpoint = f"me/drive/root:/{quote(cleanPath)}:/children"
        result = await self._graphGet(endpoint)
        if "error" in result:
            return []
        entries = [_graphItemToExternalEntry(item, path) for item in result.get("value", [])]
        if filter:
            entries = [e for e in entries if _matchFilter(e, filter)]
        return entries

    async def download(self, path: str) -> bytes:
        """Download the file at ``path``; empty bytes for an empty path or a failed download."""
        cleanPath = (path or "").strip("/")
        if not cleanPath:
            return b""
        data = await self._graphDownload(f"me/drive/root:/{quote(cleanPath)}:/content")
        return data or b""

    async def upload(self, path: str, data: bytes, fileName: str) -> dict:
        """Upload ``data`` as ``fileName`` into the folder ``path`` and return the Graph result dict."""
        cleanPath = (path or "").strip("/")
        uploadPath = f"{cleanPath}/{fileName}" if cleanPath else fileName
        endpoint = f"me/drive/root:/{quote(uploadPath)}:/content"
        return await self._graphPut(endpoint, data)

    async def search(self, query: str, path: Optional[str] = None) -> List[ExternalEntry]:
        """Search the whole drive; ``path`` is accepted for interface compatibility and not used.

        Returns an empty list when Graph reports an error.
        """
        # Double single quotes for the quoted literal, then percent-encode so
        # "#", "?", "/" and "%" in the search text cannot break the URL.
        safeQuery = quote(query.replace("'", "''"), safe="")
        endpoint = f"me/drive/root/search(q='{safeQuery}')"
        result = await self._graphGet(endpoint)
        if "error" in result:
            return []
        return [_graphItemToExternalEntry(item) for item in result.get("value", [])]
# ---------------------------------------------------------------------------
# MsftConnector (1:n)
# ---------------------------------------------------------------------------
class MsftConnector(ProviderConnector):
    """Microsoft ProviderConnector -- one connection fans out to several services."""

    # Service name -> adapter class instantiated by getServiceAdapter().
    _SERVICE_MAP = {
        "sharepoint": SharepointAdapter,
        "outlook": OutlookAdapter,
        "teams": TeamsAdapter,
        "onedrive": OneDriveAdapter,
    }

    def getAvailableServices(self) -> List[str]:
        """Return the service names this connector can build adapters for."""
        return [serviceName for serviceName in self._SERVICE_MAP]

    def getServiceAdapter(self, service: str) -> ServiceAdapter:
        """Instantiate the adapter registered for ``service`` with this connection's token.

        Raises:
            ValueError: if ``service`` is not a key of ``_SERVICE_MAP``.
        """
        adapterClass = self._SERVICE_MAP.get(service)
        if adapterClass:
            return adapterClass(self.accessToken)
        raise ValueError(f"Unknown MSFT service: {service}. Available: {list(self._SERVICE_MAP.keys())}")
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _parseSharepointPath(path: str) -> tuple:
"""Parse a SharePoint path into (siteId, innerPath).
Expected format: /sites/<siteId>/<innerPath>
Also accepts bare siteId if no /sites/ prefix.
"""
if not path:
return ("", "")
clean = path.strip("/")
if clean.startswith("sites/"):
parts = clean.split("/", 2)
siteId = parts[1] if len(parts) > 1 else ""
innerPath = parts[2] if len(parts) > 2 else ""
return (siteId, innerPath)
parts = clean.split("/", 1)
return (parts[0], parts[1] if len(parts) > 1 else "")
def _matchFilter(entry: ExternalEntry, pattern: str) -> bool:
"""Simple glob-like filter (supports * wildcard)."""
import fnmatch
return fnmatch.fnmatch(entry.name.lower(), pattern.lower())

View file

@ -26,6 +26,12 @@ class OperationTypeEnum(str, Enum):
WEB_SEARCH_DATA = "webSearch" # Returns list of URLs only
WEB_CRAWL = "webCrawl" # Web crawl for a given URL
# Agent Operations
AGENT = "agent" # Agent loop: reasoning + tool use
# Embedding Operations
EMBEDDING = "embedding" # Text → vector conversion for semantic search
# Speech Operations (dedicated pipeline, bypasses standard model selection)
SPEECH_TEAMS = "speechTeams" # Teams Meeting AI analysis: decide if/how to respond
@ -102,6 +108,7 @@ class AiModel(BaseModel):
# Function reference (not serialized)
functionCall: Optional[Callable] = Field(default=None, exclude=True, description="Function to call for this model")
functionCallStream: Optional[Callable] = Field(default=None, exclude=True, description="Streaming function: yields str deltas, then final AiModelResponse")
calculatepriceCHF: Optional[Callable] = Field(default=None, exclude=True, description="Function to calculate price in USD")
# Selection criteria - capabilities with ratings
@ -155,10 +162,12 @@ class AiCallOptions(BaseModel):
class AiCallRequest(BaseModel):
"""Centralized AI call request payload for interface use."""
prompt: str = Field(description="The user prompt")
prompt: str = Field(default="", description="The user prompt")
context: Optional[str] = Field(default=None, description="Optional external context (e.g., extracted docs)")
options: AiCallOptions = Field(default_factory=AiCallOptions)
contentParts: Optional[List['ContentPart']] = None # NEW: Content parts for model-aware chunking
contentParts: Optional[List['ContentPart']] = None # Content parts for model-aware chunking
messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="OpenAI-style messages for multi-turn agent conversations")
tools: Optional[List[Dict[str, Any]]] = Field(default=None, description="Tool definitions for native function calling")
class AiCallResponse(BaseModel):
@ -172,14 +181,19 @@ class AiCallResponse(BaseModel):
bytesSent: int = Field(default=0, description="Input data size in bytes")
bytesReceived: int = Field(default=0, description="Output data size in bytes")
errorCount: int = Field(default=0, description="0 for success, 1+ for errors")
toolCalls: Optional[List[Dict[str, Any]]] = Field(default=None, description="Tool calls from native function calling")
metadata: Optional[Dict[str, Any]] = Field(default=None, description="Additional response metadata (e.g. embeddings vectors)")
class AiModelCall(BaseModel):
"""Standardized input for AI model calls."""
messages: List[Dict[str, Any]] = Field(description="Messages in OpenAI format (role, content)")
messages: List[Dict[str, Any]] = Field(default_factory=list, description="Messages in OpenAI format (role, content)")
model: Optional[AiModel] = Field(default=None, description="The AI model being called")
options: AiCallOptions = Field(default_factory=AiCallOptions, description="Additional model-specific options")
tools: Optional[List[Dict[str, Any]]] = Field(default=None, description="Tool definitions for native function calling")
toolChoice: Optional[Any] = Field(default=None, description="Tool choice: 'auto', 'none', or specific tool")
embeddingInput: Optional[List[str]] = Field(default=None, description="Input texts for embedding models (used instead of messages)")
model_config = ConfigDict(arbitrary_types_allowed=True)

View file

@ -124,6 +124,12 @@ class BillingTransaction(BaseModel):
aicoreModel: Optional[str] = Field(None, description="AICore model name (e.g., claude-4-sonnet, gpt-4o)")
createdByUserId: Optional[str] = Field(None, description="User who created/caused this transaction")
# AI call metadata (for per-call analytics)
processingTime: Optional[float] = Field(None, description="Processing time in seconds")
bytesSent: Optional[int] = Field(None, description="Bytes sent to AI model")
bytesReceived: Optional[int] = Field(None, description="Bytes received from AI model")
errorCount: Optional[int] = Field(None, description="Number of errors in this call")
registerModelLabels(
"BillingTransaction",

View file

@ -1,6 +1,6 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Chat models: ChatWorkflow, ChatMessage, ChatLog, ChatStat, ChatDocument."""
"""Chat models: ChatWorkflow, ChatMessage, ChatLog, ChatDocument."""
from typing import List, Dict, Any, Optional
from enum import Enum
@ -10,44 +10,6 @@ from modules.shared.timeUtils import getUtcTimestamp
import uuid
class ChatStat(BaseModel):
"""Statistics for chat operations. User-owned, no mandate context."""
model_config = {"populate_by_name": True, "extra": "allow"} # Allow DB system fields
id: str = Field(
default_factory=lambda: str(uuid.uuid4()), description="Primary key"
)
workflowId: Optional[str] = Field(
None, description="Foreign key to workflow (for workflow stats)"
)
processingTime: Optional[float] = Field(
None, description="Processing time in seconds"
)
bytesSent: Optional[int] = Field(None, description="Number of bytes sent")
bytesReceived: Optional[int] = Field(None, description="Number of bytes received")
errorCount: Optional[int] = Field(None, description="Number of errors encountered")
process: Optional[str] = Field(None, description="The process that delivers the stats data (e.g. 'action.outlook.readMails', 'ai.process.document.name')")
engine: Optional[str] = Field(None, description="The engine used (e.g. 'ai.anthropic.35', 'ai.tavily.basic', 'renderer.docx')")
priceCHF: Optional[float] = Field(None, description="Calculated price in USD for the operation")
registerModelLabels(
"ChatStat",
{"en": "Chat Statistics", "fr": "Statistiques de chat"},
{
"id": {"en": "ID", "fr": "ID"},
"workflowId": {"en": "Workflow ID", "fr": "ID du workflow"},
"processingTime": {"en": "Processing Time", "fr": "Temps de traitement"},
"bytesSent": {"en": "Bytes Sent", "fr": "Octets envoyés"},
"bytesReceived": {"en": "Bytes Received", "fr": "Octets reçus"},
"errorCount": {"en": "Error Count", "fr": "Nombre d'erreurs"},
"process": {"en": "Process", "fr": "Processus"},
"engine": {"en": "Engine", "fr": "Moteur"},
"priceCHF": {"en": "Price CHF", "fr": "Prix CHF"},
},
)
class ChatLog(BaseModel):
"""Log entries for chat workflows. User-owned, no mandate context."""
id: str = Field(
@ -322,7 +284,6 @@ class ChatWorkflow(BaseModel):
startedAt: float = Field(default_factory=getUtcTimestamp, description="When the workflow started (UTC timestamp in seconds)", json_schema_extra={"frontend_type": "timestamp", "frontend_readonly": True, "frontend_required": False})
logs: List[ChatLog] = Field(default_factory=list, description="Workflow logs", json_schema_extra={"frontend_type": "text", "frontend_readonly": True, "frontend_required": False})
messages: List[ChatMessage] = Field(default_factory=list, description="Messages in the workflow", json_schema_extra={"frontend_type": "text", "frontend_readonly": True, "frontend_required": False})
stats: List[ChatStat] = Field(default_factory=list, description="Workflow statistics list", json_schema_extra={"frontend_type": "text", "frontend_readonly": True, "frontend_required": False})
tasks: list = Field(default_factory=list, description="List of tasks in the workflow", json_schema_extra={"frontend_type": "text", "frontend_readonly": True, "frontend_required": False})
workflowMode: WorkflowModeEnum = Field(default=WorkflowModeEnum.WORKFLOW_DYNAMIC, description="Workflow mode selector", json_schema_extra={"frontend_type": "select", "frontend_readonly": False, "frontend_required": False, "frontend_options": [
{

View file

@ -0,0 +1,58 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Content Object data models for the container and content extraction pipeline.
Physical layer: Container hierarchy (ZIP, Folder, File)
Logical layer: Scalar content objects (text, image, videostream, audiostream, other)
The entire extraction pipeline up to ContentObjects runs without AI.
"""
from typing import Dict, Any, List, Optional
from pydantic import BaseModel, Field
import uuid
class ContainerLimitError(Exception):
    """Signals that container extraction went past a safety limit (size, depth, file count)."""
class ContentContextRef(BaseModel):
    """Reference to the origin context within a container/file.

    Only ``containerPath`` is required; every other locator is optional and
    defaults to an empty string or None.
    """
    # Required: there is no default, so the model cannot be built without it.
    containerPath: str = Field(description="e.g. 'archiv.zip/folder-a/report.pdf'")
    location: str = Field(default="", description="e.g. 'page:5/region:bottomLeft'")
    label: Optional[str] = Field(default=None, description="e.g. 'Abbildung 3: Uebersicht'")
    # Format-specific locators; each description names the formats it applies to.
    pageIndex: Optional[int] = Field(default=None, description="Page number (PDF, DOCX)")
    sectionId: Optional[str] = Field(default=None, description="Section/Heading ID")
    sheetName: Optional[str] = Field(default=None, description="Sheet name (XLSX)")
    slideIndex: Optional[int] = Field(default=None, description="Slide number (PPTX)")
class ContentObject(BaseModel):
    """Scalar content object extracted from a file. No AI involved.

    ``fileId`` and ``contentType`` are required. ``id`` defaults to a fresh
    UUID4 string and ``contextRef`` to a reference with an empty container path.
    """
    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    fileId: str = Field(description="FK to the physical file")
    contentType: str = Field(description="text, image, videostream, audiostream, other")
    data: str = Field(default="", description="Content data (text, base64, URL)")
    # ContentContextRef requires containerPath, so the bare class cannot serve as
    # a zero-argument factory: omitting contextRef used to raise a validation error.
    contextRef: ContentContextRef = Field(default_factory=lambda: ContentContextRef(containerPath=""))
    metadata: Dict[str, Any] = Field(default_factory=dict)
    sequence: int = Field(default=0, description="Order within the context")
class ContentObjectSummary(BaseModel):
    """Compact description of a content object for the FileContentIndex.

    ``id`` and ``contentType`` are required. The size-related fields are
    optional and only meaningful for the content types named in their
    descriptions.
    """
    id: str = Field(description="Content object ID")
    contentType: str = Field(description="text, image, videostream, audiostream, other")
    # ContentContextRef requires containerPath, so the bare class cannot serve as
    # a zero-argument factory: omitting contextRef used to raise a validation error.
    contextRef: ContentContextRef = Field(default_factory=lambda: ContentContextRef(containerPath=""))
    charCount: Optional[int] = Field(default=None, description="Only for text")
    dimensions: Optional[str] = Field(default=None, description="Only for image/video (e.g. '1920x1080')")
    duration: Optional[float] = Field(default=None, description="Only for audio/video (seconds)")
class FileEntry(BaseModel):
    """A file extracted from a container (ZIP, TAR, Folder).

    All four fields are required; none has a default.
    """
    path: str = Field(description="Relative path within the container")
    data: bytes = Field(description="File content bytes")
    mimeType: str = Field(description="Detected MIME type")
    # NOTE(review): presumably equals len(data); nothing here enforces it -- confirm at the producer.
    size: int = Field(description="File size in bytes")

View file

@ -0,0 +1,58 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""DataSource and ExternalEntry models for external data integration.
DataSource links a UserConnection to an external path (SharePoint folder,
Google Drive folder, FTP directory, etc.) for agent-accessible data containers.
"""
from typing import Dict, Any, Optional
from pydantic import BaseModel, Field
from modules.shared.attributeUtils import registerModelLabels
from modules.shared.timeUtils import getUtcTimestamp
import uuid
class DataSource(BaseModel):
    """Configured external data source linked to a UserConnection.

    ``connectionId``, ``sourceType``, ``path`` and ``label`` are required.
    ``id`` defaults to a fresh UUID4 string and ``createdAt`` to the value of
    ``getUtcTimestamp()`` at creation.
    """
    id: str = Field(default_factory=lambda: str(uuid.uuid4()), description="Primary key")
    connectionId: str = Field(description="FK to UserConnection")
    # Free-form string; the description lists the expected values but nothing validates them here.
    sourceType: str = Field(description="sharepointFolder, googleDriveFolder, outlookFolder, ftpFolder")
    path: str = Field(description="External path (e.g. '/sites/MySite/Documents/Reports')")
    label: str = Field(description="User-visible label")
    # Optional scoping; None means the source is not tied to that scope.
    featureInstanceId: Optional[str] = Field(default=None, description="Scoped to feature instance")
    mandateId: Optional[str] = Field(default=None, description="Mandate scope")
    userId: str = Field(default="", description="Owner user ID")
    autoSync: bool = Field(default=False, description="Automatically sync on schedule")
    # None until a first sync has been recorded.
    lastSynced: Optional[float] = Field(default=None, description="Last sync timestamp")
    createdAt: float = Field(default_factory=getUtcTimestamp, description="Creation timestamp")
# Register the en/de/fr labels for the DataSource model (second argument) and
# for each of its fields (third argument, keyed by field name).
# NOTE(review): presumably consumed for UI display -- confirm in modules.shared.attributeUtils.
registerModelLabels(
    "DataSource",
    {"en": "Data Source", "de": "Datenquelle", "fr": "Source de données"},
    {
        "id": {"en": "ID", "de": "ID", "fr": "ID"},
        "connectionId": {"en": "Connection ID", "de": "Verbindungs-ID", "fr": "ID de connexion"},
        "sourceType": {"en": "Source Type", "de": "Quellentyp", "fr": "Type de source"},
        "path": {"en": "Path", "de": "Pfad", "fr": "Chemin"},
        "label": {"en": "Label", "de": "Bezeichnung", "fr": "Libellé"},
        "featureInstanceId": {"en": "Feature Instance", "de": "Feature-Instanz", "fr": "Instance de fonctionnalité"},
        "mandateId": {"en": "Mandate ID", "de": "Mandanten-ID", "fr": "ID du mandat"},
        "userId": {"en": "User ID", "de": "Benutzer-ID", "fr": "ID utilisateur"},
        "autoSync": {"en": "Auto Sync", "de": "Auto-Sync", "fr": "Synchro auto"},
        "lastSynced": {"en": "Last Synced", "de": "Letzter Sync", "fr": "Dernier sync"},
        "createdAt": {"en": "Created At", "de": "Erstellt am", "fr": "Créé le"},
    },
)
class ExternalEntry(BaseModel):
    """An item (file or folder) from an external data source.

    Only ``name`` and ``path`` are required; the remaining fields default to
    False, None or an empty dict.
    """
    name: str = Field(description="Item name")
    path: str = Field(description="Full path within the source")
    isFolder: bool = Field(default=False, description="True if directory/folder")
    size: Optional[int] = Field(default=None, description="File size in bytes")
    mimeType: Optional[str] = Field(default=None, description="MIME type (files only)")
    lastModified: Optional[float] = Field(default=None, description="Last modification timestamp")
    # Free-form bag; each provider adapter decides which keys it stores here.
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Provider-specific metadata")

View file

@ -73,7 +73,7 @@ class ExtractionOptions(BaseModel):
"""Options for document extraction and processing with clear data structures."""
# Core extraction parameters
prompt: str = Field(description="Extraction prompt for AI processing")
prompt: str = Field(default="", description="Extraction prompt for AI processing")
processDocumentsIndividually: bool = Field(default=True, description="Process each document separately")
# Image processing parameters
@ -81,7 +81,7 @@ class ExtractionOptions(BaseModel):
imageQuality: int = Field(default=85, ge=1, le=100, description="Image quality (1-100)")
# Merging strategy
mergeStrategy: MergeStrategy = Field(description="Strategy for merging extraction results")
mergeStrategy: MergeStrategy = Field(default_factory=MergeStrategy, description="Strategy for merging extraction results")
# Optional chunking parameters (for backward compatibility)
chunkAllowed: Optional[bool] = Field(default=None, description="Whether chunking is allowed")

View file

@ -0,0 +1,32 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""FileFolder: hierarchical folder structure for file organization."""
from typing import Optional
from pydantic import BaseModel, Field
from modules.shared.attributeUtils import registerModelLabels
from modules.shared.timeUtils import getUtcTimestamp
import uuid
class FileFolder(BaseModel):
    """A folder in the hierarchical file organisation.

    Only ``name`` is required. Folders form a tree through ``parentId``
    (None marks a root folder). ``id`` defaults to a fresh UUID4 string and
    ``createdAt`` to the value of ``getUtcTimestamp()`` at creation.
    """
    id: str = Field(default_factory=lambda: str(uuid.uuid4()), description="Primary key", json_schema_extra={"frontend_type": "text", "frontend_readonly": True, "frontend_required": False})
    name: str = Field(description="Folder name", json_schema_extra={"frontend_type": "text", "frontend_readonly": False, "frontend_required": True})
    parentId: Optional[str] = Field(default=None, description="Parent folder ID (null = root)", json_schema_extra={"frontend_type": "text", "frontend_readonly": False, "frontend_required": False})
    # Context fields are marked read-only for the frontend in their schema extras.
    mandateId: Optional[str] = Field(default=None, description="Mandate context", json_schema_extra={"frontend_type": "text", "frontend_readonly": True, "frontend_required": False})
    featureInstanceId: Optional[str] = Field(default=None, description="Feature instance context", json_schema_extra={"frontend_type": "text", "frontend_readonly": True, "frontend_required": False})
    createdAt: float = Field(default_factory=getUtcTimestamp, description="Creation timestamp", json_schema_extra={"frontend_type": "timestamp", "frontend_readonly": True, "frontend_required": False})
# Register the en/fr labels for the FileFolder model (second argument) and for
# each of its fields (third argument, keyed by field name).
# NOTE(review): presumably consumed for UI display -- confirm in modules.shared.attributeUtils.
registerModelLabels(
    "FileFolder",
    {"en": "File Folder", "fr": "Dossier de fichiers"},
    {
        "id": {"en": "ID", "fr": "ID"},
        "name": {"en": "Name", "fr": "Nom"},
        "parentId": {"en": "Parent Folder", "fr": "Dossier parent"},
        "mandateId": {"en": "Mandate ID", "fr": "ID du mandat"},
        "featureInstanceId": {"en": "Feature Instance ID", "fr": "ID de l'instance"},
        "createdAt": {"en": "Created At", "fr": "Créé le"},
    },
)

View file

@ -2,7 +2,7 @@
# All rights reserved.
"""File-related datamodels: FileItem, FilePreview, FileData."""
from typing import Dict, Any, Optional, Union
from typing import Dict, Any, List, Optional, Union
from pydantic import BaseModel, ConfigDict, Field
from modules.shared.attributeUtils import registerModelLabels
from modules.shared.timeUtils import getUtcTimestamp
@ -20,6 +20,10 @@ class FileItem(BaseModel):
fileHash: str = Field(description="Hash of the file", json_schema_extra={"frontend_type": "text", "frontend_readonly": True, "frontend_required": False})
fileSize: int = Field(description="Size of the file in bytes", json_schema_extra={"frontend_type": "integer", "frontend_readonly": True, "frontend_required": False})
creationDate: float = Field(default_factory=getUtcTimestamp, description="Date when the file was created (UTC timestamp in seconds)", json_schema_extra={"frontend_type": "timestamp", "frontend_readonly": True, "frontend_required": False})
tags: Optional[List[str]] = Field(default=None, description="Tags for categorization and search", json_schema_extra={"frontend_type": "tags", "frontend_readonly": False, "frontend_required": False})
folderId: Optional[str] = Field(default=None, description="ID of the parent folder", json_schema_extra={"frontend_type": "text", "frontend_readonly": False, "frontend_required": False})
description: Optional[str] = Field(default=None, description="User-provided description of the file", json_schema_extra={"frontend_type": "textarea", "frontend_readonly": False, "frontend_required": False})
status: Optional[str] = Field(default=None, description="Processing status: pending, extracted, embedding, indexed, failed", json_schema_extra={"frontend_type": "text", "frontend_readonly": True, "frontend_required": False})
registerModelLabels(
"FileItem",
@ -33,6 +37,10 @@ registerModelLabels(
"fileHash": {"en": "File Hash", "fr": "Hash du fichier"},
"fileSize": {"en": "File Size", "fr": "Taille du fichier"},
"creationDate": {"en": "Creation Date", "fr": "Date de création"},
"tags": {"en": "Tags", "fr": "Tags"},
"folderId": {"en": "Folder ID", "fr": "ID du dossier"},
"description": {"en": "Description", "fr": "Description"},
"status": {"en": "Status", "fr": "Statut"},
},
)

View file

@ -0,0 +1,130 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Knowledge Store data models: FileContentIndex, ContentChunk, WorkflowMemory.
These models support the 3-tier RAG architecture:
- Shared Layer: mandateId-scoped, isShared=True
- Instance Layer: userId + featureInstanceId-scoped
- Workflow Layer: workflowId-scoped (WorkflowMemory)
Vector fields use json_schema_extra={"db_type": "vector(1536)"} for pgvector.
"""
from typing import Dict, Any, List, Optional
from pydantic import BaseModel, Field
from modules.shared.attributeUtils import registerModelLabels
from modules.shared.timeUtils import getUtcTimestamp
import uuid
class FileContentIndex(BaseModel):
    """Structural index of a file's content objects. Created without AI.
    Lives in the Instance Layer; optionally promoted to Shared Layer via isShared."""

    # --- identity and scoping -------------------------------------------------
    id: str = Field(
        default_factory=lambda: str(uuid.uuid4()),
        description="Primary key (typically = fileId)",
    )
    userId: str = Field(description="Owner user ID")
    # Both scope fields default to an empty string rather than None.
    featureInstanceId: str = Field(default="", description="Feature instance scope")
    mandateId: str = Field(default="", description="Mandate scope")
    isShared: bool = Field(
        default=False,
        description="Visible in Shared Layer for all mandate users",
    )

    # --- source file ----------------------------------------------------------
    fileName: str = Field(description="Original file name")
    mimeType: str = Field(description="MIME type of the file")
    containerPath: Optional[str] = Field(
        default=None,
        description="Path within a container (e.g. 'archive.zip/folder/report.pdf')",
    )

    # --- extraction result ----------------------------------------------------
    totalObjects: int = Field(default=0, description="Total number of content objects extracted")
    totalSize: int = Field(default=0, description="Total size of all content objects in bytes")
    structure: Dict[str, Any] = Field(
        default_factory=dict,
        description="Structural overview (pages, sections, hierarchy)",
    )
    objectSummary: List[Dict[str, Any]] = Field(
        default_factory=list,
        description="Compact summary per content object",
    )
    extractedAt: float = Field(default_factory=getUtcTimestamp, description="Extraction timestamp")
    # Free-form string; the allowed values are only listed in the description, not enforced here.
    status: str = Field(
        default="pending",
        description="Processing status: pending, extracted, embedding, indexed, failed",
    )


# English/French display labels for the model itself and for each of its fields.
registerModelLabels(
    "FileContentIndex",
    {"en": "File Content Index", "fr": "Index du contenu de fichier"},
    {
        "id": {"en": "ID", "fr": "ID"},
        "userId": {"en": "User ID", "fr": "ID utilisateur"},
        "featureInstanceId": {"en": "Feature Instance ID", "fr": "ID de l'instance"},
        "mandateId": {"en": "Mandate ID", "fr": "ID du mandat"},
        "isShared": {"en": "Shared", "fr": "Partagé"},
        "fileName": {"en": "File Name", "fr": "Nom de fichier"},
        "mimeType": {"en": "MIME Type", "fr": "Type MIME"},
        "containerPath": {"en": "Container Path", "fr": "Chemin du conteneur"},
        "totalObjects": {"en": "Total Objects", "fr": "Nombre total d'objets"},
        "totalSize": {"en": "Total Size", "fr": "Taille totale"},
        "structure": {"en": "Structure", "fr": "Structure"},
        "objectSummary": {"en": "Object Summary", "fr": "Résumé des objets"},
        "extractedAt": {"en": "Extracted At", "fr": "Extrait le"},
        "status": {"en": "Status", "fr": "Statut"},
    },
)
class ContentChunk(BaseModel):
    """Persisted content chunk with embedding vector. Reusable across workflows.
    Scalar content object (or chunk thereof) with pgvector embedding."""

    # --- identity and references ----------------------------------------------
    id: str = Field(
        default_factory=lambda: str(uuid.uuid4()),
        description="Primary key",
    )
    contentObjectId: str = Field(
        description="Reference to the content object within FileContentIndex",
    )
    fileId: str = Field(description="FK to the source file")
    userId: str = Field(description="Owner user ID")
    featureInstanceId: str = Field(default="", description="Feature instance scope")

    # --- payload --------------------------------------------------------------
    # Free-form string; the allowed content types are only listed in the description.
    contentType: str = Field(
        description="Content type: text, image, videostream, audiostream, other",
    )
    data: str = Field(description="Content data (text, base64, URL)")
    contextRef: Dict[str, Any] = Field(
        default_factory=dict,
        description="Context reference (page, position, label)",
    )
    summary: Optional[str] = Field(
        default=None,
        description="AI-generated summary (on demand)",
    )
    chunkMetadata: Dict[str, Any] = Field(default_factory=dict, description="Additional metadata")

    # --- vector ---------------------------------------------------------------
    # Optional at the model level: the "NOT NULL for text chunks" rule in the
    # description is not validated by this class.
    embedding: Optional[List[float]] = Field(
        default=None,
        description="pgvector embedding (NOT NULL for text chunks)",
        json_schema_extra={"db_type": "vector(1536)"},
    )


# English/French display labels for the model itself and for each of its fields.
registerModelLabels(
    "ContentChunk",
    {"en": "Content Chunk", "fr": "Fragment de contenu"},
    {
        "id": {"en": "ID", "fr": "ID"},
        "contentObjectId": {"en": "Content Object ID", "fr": "ID de l'objet de contenu"},
        "fileId": {"en": "File ID", "fr": "ID du fichier"},
        "userId": {"en": "User ID", "fr": "ID utilisateur"},
        "featureInstanceId": {"en": "Feature Instance ID", "fr": "ID de l'instance"},
        "contentType": {"en": "Content Type", "fr": "Type de contenu"},
        "data": {"en": "Data", "fr": "Données"},
        "contextRef": {"en": "Context Reference", "fr": "Référence contextuelle"},
        "summary": {"en": "Summary", "fr": "Résumé"},
        "chunkMetadata": {"en": "Metadata", "fr": "Métadonnées"},
        "embedding": {"en": "Embedding", "fr": "Vecteur d'embedding"},
    },
)
class WorkflowMemory(BaseModel):
    """Workflow-scoped key-value cache for entities and facts.
    Extracted during agent rounds, persisted for cross-round and cross-workflow reuse."""

    # --- identity and scoping -------------------------------------------------
    id: str = Field(
        default_factory=lambda: str(uuid.uuid4()),
        description="Primary key",
    )
    workflowId: str = Field(description="FK to the workflow")
    userId: str = Field(description="Owner user ID")
    featureInstanceId: str = Field(default="", description="Feature instance scope")

    # --- stored fact ----------------------------------------------------------
    key: str = Field(description="Key identifier (e.g. 'entity:companyName')")
    value: str = Field(description="Extracted value")
    # Free-form string; the known origins are only listed in the description.
    source: str = Field(
        default="extraction",
        description="Origin: extraction, tool, conversation, summary",
    )
    createdAt: float = Field(default_factory=getUtcTimestamp, description="Creation timestamp")

    # --- vector ---------------------------------------------------------------
    embedding: Optional[List[float]] = Field(
        default=None,
        description="Optional embedding for semantic lookup",
        json_schema_extra={"db_type": "vector(1536)"},
    )


# English/French display labels for the model itself and for each of its fields.
registerModelLabels(
    "WorkflowMemory",
    {"en": "Workflow Memory", "fr": "Mémoire de workflow"},
    {
        "id": {"en": "ID", "fr": "ID"},
        "workflowId": {"en": "Workflow ID", "fr": "ID du workflow"},
        "userId": {"en": "User ID", "fr": "ID utilisateur"},
        "featureInstanceId": {"en": "Feature Instance ID", "fr": "ID de l'instance"},
        "key": {"en": "Key", "fr": "Clé"},
        "value": {"en": "Value", "fr": "Valeur"},
        "source": {"en": "Source", "fr": "Source"},
        "createdAt": {"en": "Created At", "fr": "Créé le"},
        "embedding": {"en": "Embedding", "fr": "Vecteur d'embedding"},
    },
)

View file

@ -2,6 +2,7 @@
# All rights reserved.
"""Voice settings datamodel."""
from typing import Dict, Any, Optional
from pydantic import BaseModel, Field
from modules.shared.attributeUtils import registerModelLabels
from modules.shared.timeUtils import getUtcTimestamp
@ -16,6 +17,7 @@ class VoiceSettings(BaseModel):
sttLanguage: str = Field(default="de-DE", description="Speech-to-Text language", json_schema_extra={"frontend_type": "select", "frontend_readonly": False, "frontend_required": True})
ttsLanguage: str = Field(default="de-DE", description="Text-to-Speech language", json_schema_extra={"frontend_type": "select", "frontend_readonly": False, "frontend_required": True})
ttsVoice: str = Field(default="de-DE-KatjaNeural", description="Text-to-Speech voice", json_schema_extra={"frontend_type": "select", "frontend_readonly": False, "frontend_required": True})
ttsVoiceMap: Dict[str, Any] = Field(default_factory=dict, description="Per-language voice mapping, e.g. {'de-DE': {'voiceName': 'de-DE-Wavenet-A'}, 'en-US': {'voiceName': 'en-US-Wavenet-C'}}", json_schema_extra={"frontend_type": "json", "frontend_readonly": False, "frontend_required": False})
translationEnabled: bool = Field(default=True, description="Whether translation is enabled", json_schema_extra={"frontend_type": "checkbox", "frontend_readonly": False, "frontend_required": False})
targetLanguage: str = Field(default="en-US", description="Target language for translation", json_schema_extra={"frontend_type": "select", "frontend_readonly": False, "frontend_required": False})
creationDate: float = Field(default_factory=getUtcTimestamp, description="Date when the settings were created (UTC timestamp in seconds)", json_schema_extra={"frontend_type": "timestamp", "frontend_readonly": True, "frontend_required": False})
@ -33,6 +35,7 @@ registerModelLabels(
"sttLanguage": {"en": "STT Language", "fr": "Langue STT"},
"ttsLanguage": {"en": "TTS Language", "fr": "Langue TTS"},
"ttsVoice": {"en": "TTS Voice", "fr": "Voix TTS"},
"ttsVoiceMap": {"en": "TTS Voice Map", "fr": "Carte des voix TTS"},
"translationEnabled": {"en": "Translation Enabled", "fr": "Traduction activée"},
"targetLanguage": {"en": "Target Language", "fr": "Langue cible"},
"creationDate": {"en": "Creation Date", "fr": "Date de création"},

View file

@ -180,7 +180,7 @@ def getAutomationServices(
for spec in REQUIRED_SERVICES:
key = spec["serviceKey"]
try:
svc = getService(key, ctx, legacy_hub=None)
svc = getService(key, ctx)
setattr(hub, key, svc)
except Exception as e:
logger.warning(f"Could not resolve service '{key}' for automation: {e}")

View file

@ -21,6 +21,7 @@ from modules.datamodels.datamodelChat import ChatWorkflow, ChatMessage, ChatLog
from modules.datamodels.datamodelPagination import PaginationParams, PaginatedResponse, PaginationMetadata, normalize_pagination_dict
from modules.shared.attributeUtils import getModelAttributeDefinitions
from modules.interfaces import interfaceDbChat
from modules.interfaces.interfaceDbBilling import getInterface as _getBillingInterface
# Configure logger
logger = logging.getLogger(__name__)
@ -682,7 +683,9 @@ def get_automation_workflow_chat_data(
workflow = chatInterface.getWorkflow(workflowId)
if not workflow:
raise HTTPException(status_code=404, detail=f"Workflow {workflowId} not found")
return chatInterface.getUnifiedChatData(workflowId, afterTimestamp)
billingInterface = _getBillingInterface(context.user, context.mandateId)
workflowCost = billingInterface.getWorkflowCost(workflowId)
return chatInterface.getUnifiedChatData(workflowId, afterTimestamp, workflowCost=workflowCost)
except HTTPException:
raise
except Exception as e:

View file

@ -1291,17 +1291,6 @@ class ChatObjects:
logger.error(f"Error updating message {messageId}: {str(e)}", exc_info=True)
raise ValueError(f"Error updating message {messageId}: {str(e)}")
def createStat(self, statData: Dict[str, Any]):
"""Create stat record. Compatibility with ChatService; stats may not be persisted in chatbot schema."""
from modules.datamodels.datamodelChat import ChatStat
stat = ChatStat(**statData)
try:
created = self.db.recordCreate(ChatStat, statData)
return ChatStat(**created)
except Exception as e:
logger.debug(f"createStat: not persisting (chatbot schema): {e}")
return stat
def deleteMessage(self, conversationId: str, messageId: str) -> bool:
"""Deletes a conversation message and related data if user has access."""
try:

View file

@ -179,7 +179,7 @@ def getChatbotServices(
for spec in REQUIRED_SERVICES:
key = spec["serviceKey"]
try:
svc = getService(key, ctx, legacy_hub=None)
svc = getService(key, ctx)
setattr(hub, key, svc)
except Exception as e:
logger.warning(f"Could not resolve service '{key}' for chatbot: {e}")
@ -197,7 +197,7 @@ def getChatStreamingHelper():
from modules.serviceCenter.context import ServiceCenterContext
# Minimal context - streaming service only needs it for resolver
ctx = ServiceCenterContext(user=__get_placeholder_user(), mandate_id=None, feature_instance_id=None)
streaming = getService("streaming", ctx, legacy_hub=None)
streaming = getService("streaming", ctx)
return streaming.getChatStreamingHelper() if streaming else None
@ -219,7 +219,7 @@ def getEventManager(user, mandateId: Optional[str] = None, featureInstanceId: Op
mandate_id=mandateId,
feature_instance_id=featureInstanceId,
)
streaming = getService("streaming", ctx, legacy_hub=None)
streaming = getService("streaming", ctx)
return streaming.getEventManager()
@ -306,12 +306,12 @@ def getChatbotServices(
Uses interfaceFeatureChatbot (ChatObjects) for interfaceDbChat to avoid
duplicate DB init - chatProcess reuses hub.interfaceDbChat.
"""
from modules.services import PublicService
from modules.serviceHub import PublicService
from modules.interfaces.interfaceDbApp import getInterface as getAppInterface
from modules.features.chatbot.interfaceFeatureChatbot import getInterface as getChatbotInterface
from modules.services.serviceChat.mainServiceChat import ChatService
from modules.services.serviceAi.mainServiceAi import AiService
from modules.services.serviceStreaming.mainServiceStreaming import StreamingService
from modules.serviceCenter.services.serviceChat.mainServiceChat import ChatService
from modules.serviceCenter.services.serviceAi.mainServiceAi import AiService
from modules.serviceCenter.core.serviceStreaming.mainServiceStreaming import StreamingService
hub = _ChatbotServiceHub()
hub.user = user
@ -344,7 +344,7 @@ def getChatbotServices(
feature_instance_id=featureInstanceId,
workflow=_workflow,
)
hub.billing = getService("billing", ctx, legacy_hub=None)
hub.billing = getService("billing", ctx)
except Exception as e:
logger.warning(f"Could not resolve billing service for chatbot: {e}")
hub.billing = None

View file

@ -135,11 +135,3 @@ class ChatPlaygroundObjects:
def createLog(self, log) -> Dict[str, Any]:
"""Create a new log entry."""
return self._chatInterface.createLog(log)
def getStats(self, workflowId: str) -> List[Dict[str, Any]]:
"""Get stats for a workflow."""
return self._chatInterface.getStats(workflowId)
def createStat(self, stat) -> Dict[str, Any]:
"""Create a new stat entry."""
return self._chatInterface.createStat(stat)

View file

@ -158,7 +158,7 @@ def getChatplaygroundServices(
for spec in REQUIRED_SERVICES:
key = spec["serviceKey"]
try:
svc = getService(key, ctx, legacy_hub=None)
svc = getService(key, ctx)
setattr(hub, key, svc)
except Exception as e:
logger.warning(f"Could not resolve service '{key}' for chatplayground: {e}")

View file

@ -15,6 +15,7 @@ from modules.auth import limiter, getRequestContext, RequestContext
# Import interfaces
from modules.interfaces import interfaceDbChat
from modules.interfaces.interfaceDbBilling import getInterface as _getBillingInterface
# Import models
from modules.datamodels.datamodelChat import (
@ -220,9 +221,11 @@ def get_workflow_chat_data(
detail=f"Workflow with ID {workflowId} not found"
)
# Get unified chat data
chatData = chatInterface.getUnifiedChatData(workflowId, afterTimestamp)
# Get workflow cost from billing transactions (single source of truth)
billingInterface = _getBillingInterface(context.user, context.mandateId)
workflowCost = billingInterface.getWorkflowCost(workflowId)
chatData = chatInterface.getUnifiedChatData(workflowId, afterTimestamp, workflowCost=workflowCost)
return chatData
except HTTPException:

View file

@ -17,7 +17,7 @@ from modules.auth import limiter, getRequestContext, RequestContext
from modules.interfaces import interfaceDbChat, interfaceDbManagement
from modules.interfaces.interfaceAiObjects import AiObjects
from modules.datamodels.datamodelChat import UserInputRequest
from modules.services.serviceStreaming import get_event_manager
from modules.serviceCenter.core.serviceStreaming import get_event_manager
from modules.features.codeeditor import codeEditorProcessor, fileContextManager
from modules.features.codeeditor.datamodelCodeeditor import FileEditProposal, EditStatusEnum

View file

@ -1011,7 +1011,7 @@ class CommcoachService:
async def _callAi(self, systemPrompt: str, userPrompt: str):
"""Call the AI service with the given prompts."""
from modules.services.serviceAi.mainServiceAi import AiService
from modules.serviceCenter.services.serviceAi.mainServiceAi import AiService
serviceContext = type('Ctx', (), {
'user': self.currentUser,

View file

@ -7,7 +7,7 @@ from urllib.parse import urlparse, unquote
from modules.datamodels.datamodelUam import User
from .datamodelFeatureNeutralizer import DataNeutralizerAttributes, DataNeutraliserConfig
from modules.services import getInterface as getServices
from modules.serviceHub import getInterface as getServices
logger = logging.getLogger(__name__)
@ -205,7 +205,7 @@ class NeutralizationPlayground:
async def processSharepointFiles(self, sourcePath: str, targetPath: str) -> Dict[str, Any]:
"""Process files from SharePoint source path and store neutralized files in target path"""
from modules.services.serviceSharepoint.mainServiceSharepoint import SharepointService
from modules.serviceCenter.services.serviceSharepoint.mainServiceSharepoint import SharepointService
processor = SharepointProcessor(self.currentUser, self.services)
return await processor.processSharepointFiles(sourcePath, targetPath)

View file

@ -262,8 +262,8 @@ class NeutralizationService:
fileId: Optional[str]
) -> Dict[str, Any]:
"""Extract -> neutralize -> adapt -> generate for PDF/DOCX/XLSX/PPTX."""
from modules.services.serviceExtraction.mainServiceExtraction import ExtractionService
from modules.services.serviceExtraction.subPipeline import runExtraction
from modules.serviceCenter.services.serviceExtraction.mainServiceExtraction import ExtractionService
from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
from modules.datamodels.datamodelExtraction import ExtractionOptions, MergeStrategy
# Ensure registries exist
@ -405,10 +405,10 @@ class NeutralizationService:
def _getRendererForMime(self, mimeType: str):
"""Get renderer instance and output mime for the given input MIME type."""
from modules.services.serviceGeneration.renderers.rendererPdf import RendererPdf
from modules.services.serviceGeneration.renderers.rendererDocx import RendererDocx
from modules.services.serviceGeneration.renderers.rendererXlsx import RendererXlsx
from modules.services.serviceGeneration.renderers.rendererPptx import RendererPptx
from modules.serviceCenter.services.serviceGeneration.renderers.rendererPdf import RendererPdf
from modules.serviceCenter.services.serviceGeneration.renderers.rendererDocx import RendererDocx
from modules.serviceCenter.services.serviceGeneration.renderers.rendererXlsx import RendererXlsx
from modules.serviceCenter.services.serviceGeneration.renderers.rendererPptx import RendererPptx
mime_map = {
"application/pdf": (RendererPdf, "application/pdf"),

View file

@ -284,7 +284,7 @@ from .datamodelFeatureRealEstate import (
Land,
DokumentTyp,
)
from modules.services import getInterface as getServices
from modules.serviceHub import getInterface as getServices
from .interfaceFeatureRealEstate import getInterface as getRealEstateInterface
from modules.interfaces.interfaceDbManagement import getInterface as getComponentInterface
from modules.connectors.connectorSwissTopoMapServer import SwissTopoMapServerConnector

View file

@ -843,7 +843,7 @@ async def testVoice(
):
"""Test TTS voice with AI-generated sample text in the correct language."""
from modules.interfaces.interfaceVoiceObjects import getVoiceInterface
from modules.services.serviceAi.mainServiceAi import AiService
from modules.serviceCenter.services.serviceAi.mainServiceAi import AiService
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum
mandateId = _validateInstanceAccess(instanceId, context)

View file

@ -1062,7 +1062,7 @@ class TeamsbotService:
# Call SPEECH_TEAMS
try:
from modules.services.serviceAi.mainServiceAi import AiService
from modules.serviceCenter.services.serviceAi.mainServiceAi import AiService
# Create minimal service context for AI billing
serviceContext = _ServiceContext(self.currentUser, self.mandateId, self.instanceId)
@ -1684,7 +1684,7 @@ class TeamsbotService:
"""Summarize a long user-provided session context to its essential points.
This reduces token usage in every subsequent AI call."""
try:
from modules.services.serviceAi.mainServiceAi import AiService
from modules.serviceCenter.services.serviceAi.mainServiceAi import AiService
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum
serviceContext = _ServiceContext(self.currentUser, self.mandateId, self.instanceId)
@ -1738,7 +1738,7 @@ class TeamsbotService:
lines.append(f"[{speaker}]: {text}")
textToSummarize = "\n".join(lines)
from modules.services.serviceAi.mainServiceAi import AiService
from modules.serviceCenter.services.serviceAi.mainServiceAi import AiService
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum
serviceContext = _ServiceContext(self.currentUser, self.mandateId, self.instanceId)
@ -1783,7 +1783,7 @@ class TeamsbotService:
for t in transcripts
)
from modules.services.serviceAi.mainServiceAi import AiService
from modules.serviceCenter.services.serviceAi.mainServiceAi import AiService
serviceContext = _ServiceContext(self.currentUser, self.mandateId, self.instanceId)
aiService = AiService(serviceCenter=serviceContext)

View file

@ -188,7 +188,7 @@ def get_mime_type_options(
"""Get supported MIME types from the document extraction service.
Returns: [{ value: "mime/type", label: "Description" }]
"""
from modules.services.serviceExtraction.subRegistry import ExtractorRegistry
from modules.serviceCenter.services.serviceExtraction.subRegistry import ExtractorRegistry
registry = ExtractorRegistry()
formats = registry.getSupportedFormats()

View file

@ -0,0 +1,3 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Unified AI Workspace feature -- merges Codeeditor, Chatbot, and Playground."""

View file

@ -0,0 +1,255 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Workspace Feature Container - Main Module.
Handles feature initialization and RBAC catalog registration.
Unified AI Workspace combining Codeeditor, Chatbot, and Playground capabilities.
"""
import logging
from typing import Dict, List, Any
# Module-scoped logger, named after this module.
logger = logging.getLogger(__name__)

# Identifier passed as featureCode to every catalog and role call in this module.
FEATURE_CODE = "workspace"

# Display name per language; identical in all three languages.
FEATURE_LABEL = {
    "en": "AI Workspace",
    "de": "AI Workspace",
    "fr": "AI Workspace",
}

# Icon identifier reported in the feature definition.
FEATURE_ICON = "mdi-brain"
# UI objects registered in the RBAC catalog for this feature.
# Each entry carries a catalog key, a label per language and free-form metadata.
UI_OBJECTS = [
    {
        "objectKey": "ui.feature.workspace.dashboard",
        "label": {"en": "Dashboard", "de": "Dashboard", "fr": "Tableau de bord"},
        "meta": {"area": "dashboard"}
    },
    {
        "objectKey": "ui.feature.workspace.settings",
        # French label now carries its diacritic, like the other French labels in the code base.
        "label": {"en": "Settings", "de": "Einstellungen", "fr": "Paramètres"},
        "meta": {"area": "settings"}
    },
]
# Resource (API endpoint) objects registered in the RBAC catalog for this feature.
# "meta" records the endpoint template and HTTP method each key stands for.
# French labels carry their diacritics, like the other French labels in the code base.
RESOURCE_OBJECTS = [
    {
        "objectKey": "resource.feature.workspace.start",
        "label": {"en": "Start Agent", "de": "Agent starten", "fr": "Démarrer agent"},
        "meta": {"endpoint": "/api/workspace/{instanceId}/start/stream", "method": "POST"}
    },
    {
        "objectKey": "resource.feature.workspace.stop",
        "label": {"en": "Stop Agent", "de": "Agent stoppen", "fr": "Arrêter agent"},
        "meta": {"endpoint": "/api/workspace/{instanceId}/{workflowId}/stop", "method": "POST"}
    },
    {
        "objectKey": "resource.feature.workspace.files",
        "label": {"en": "Manage Files", "de": "Dateien verwalten", "fr": "Gérer fichiers"},
        "meta": {"endpoint": "/api/workspace/{instanceId}/files", "method": "GET"}
    },
    {
        "objectKey": "resource.feature.workspace.folders",
        "label": {"en": "Manage Folders", "de": "Ordner verwalten", "fr": "Gérer dossiers"},
        "meta": {"endpoint": "/api/workspace/{instanceId}/folders", "method": "GET"}
    },
    {
        "objectKey": "resource.feature.workspace.datasources",
        "label": {"en": "Data Sources", "de": "Datenquellen", "fr": "Sources de données"},
        "meta": {"endpoint": "/api/workspace/{instanceId}/datasources", "method": "GET"}
    },
    {
        "objectKey": "resource.feature.workspace.voice",
        "label": {"en": "Voice Input/Output", "de": "Spracheingabe/-ausgabe", "fr": "Entrée/sortie vocale"},
        "meta": {"endpoint": "/api/workspace/{instanceId}/voice/*", "method": "POST"}
    },
]
# Template roles for this feature with the access rules each one receives.
# In the rules, "item": None is used where a rule is not tied to a single catalog key.
# NOTE(review): the scope letters "n" / "m" / "a" on DATA rules are interpreted elsewhere;
# presumably none / mine / all. Confirm against the RBAC data model.
TEMPLATE_ROLES = [
    {
        "roleLabel": "workspace-viewer",
        "description": {
            "en": "Workspace Viewer - View workspace (read-only)",
            "de": "Workspace Betrachter - Workspace ansehen (nur lesen)",
            "fr": "Visualiseur Workspace - Consulter le workspace (lecture seule)"
        },
        "accessRules": [
            {"context": "UI", "item": "ui.feature.workspace.dashboard", "view": True},
            {"context": "UI", "item": "ui.feature.workspace.settings", "view": True},
            {"context": "DATA", "item": None, "view": True, "read": "m", "create": "n", "update": "n", "delete": "n"},
        ]
    },
    {
        "roleLabel": "workspace-user",
        "description": {
            "en": "Workspace User - Use AI workspace and tools",
            "de": "Workspace Benutzer - AI Workspace und Tools nutzen",
            "fr": "Utilisateur Workspace - Utiliser l'espace de travail AI et les outils"
        },
        "accessRules": [
            {"context": "UI", "item": "ui.feature.workspace.dashboard", "view": True},
            {"context": "UI", "item": "ui.feature.workspace.settings", "view": True},
            {"context": "RESOURCE", "item": "resource.feature.workspace.start", "view": True},
            {"context": "RESOURCE", "item": "resource.feature.workspace.stop", "view": True},
            {"context": "RESOURCE", "item": "resource.feature.workspace.files", "view": True},
            {"context": "RESOURCE", "item": "resource.feature.workspace.folders", "view": True},
            {"context": "RESOURCE", "item": "resource.feature.workspace.datasources", "view": True},
            {"context": "RESOURCE", "item": "resource.feature.workspace.voice", "view": True},
            {"context": "DATA", "item": None, "view": True, "read": "m", "create": "m", "update": "m", "delete": "m"},
        ]
    },
    {
        "roleLabel": "workspace-admin",
        "description": {
            "en": "Workspace Admin - Full access to AI workspace",
            "de": "Workspace Admin - Vollzugriff auf AI Workspace",
            # French description now carries its diacritic ("Accès").
            "fr": "Administrateur Workspace - Accès complet au workspace AI"
        },
        "accessRules": [
            {"context": "UI", "item": None, "view": True},
            {"context": "RESOURCE", "item": None, "view": True},
            {"context": "DATA", "item": None, "view": True, "read": "a", "create": "a", "update": "a", "delete": "a"},
        ]
    },
]
def getFeatureDefinition() -> Dict[str, Any]:
    """Build the registration descriptor for this feature.

    Returns:
        A new dict holding the feature code, label and icon from the module
        constants, plus ``autoCreateInstance`` set to True.
    """
    definition: Dict[str, Any] = dict(
        code=FEATURE_CODE,
        label=FEATURE_LABEL,
        icon=FEATURE_ICON,
        autoCreateInstance=True,
    )
    return definition
def getUiObjects() -> List[Dict[str, Any]]:
    """Expose the feature's UI objects for RBAC catalog registration.

    Returns:
        The module-level UI_OBJECTS list itself (not a copy).
    """
    uiObjects = UI_OBJECTS
    return uiObjects
def getResourceObjects() -> List[Dict[str, Any]]:
    """Expose the feature's resource objects for RBAC catalog registration.

    Returns:
        The module-level RESOURCE_OBJECTS list itself (not a copy).
    """
    resourceObjects = RESOURCE_OBJECTS
    return resourceObjects
def getTemplateRoles() -> List[Dict[str, Any]]:
    """Expose the template roles defined for this feature.

    Returns:
        The module-level TEMPLATE_ROLES list itself (not a copy).
    """
    templateRoles = TEMPLATE_ROLES
    return templateRoles
def registerFeature(catalogService) -> bool:
    """Register the feature's UI and resource objects, then sync its template roles.

    Args:
        catalogService: Object providing ``registerUiObject`` and
            ``registerResourceObject``; both are called with the keyword
            arguments featureCode, objectKey, label and meta.

    Returns:
        True when every step completed, False when any exception was raised
        (the exception is logged and not propagated).
    """
    # UI objects go first, then resource objects; the method is looked up per
    # entry so nothing is resolved on the service for an empty list.
    registrationPlan = (
        ("registerUiObject", UI_OBJECTS),
        ("registerResourceObject", RESOURCE_OBJECTS),
    )
    try:
        for methodName, entries in registrationPlan:
            for entry in entries:
                register = getattr(catalogService, methodName)
                register(
                    featureCode=FEATURE_CODE,
                    objectKey=entry["objectKey"],
                    label=entry["label"],
                    meta=entry.get("meta"),
                )

        _syncTemplateRolesToDb()

        logger.info(f"Feature '{FEATURE_CODE}' registered {len(UI_OBJECTS)} UI objects and {len(RESOURCE_OBJECTS)} resource objects")
    except Exception as err:
        logger.error(f"Failed to register feature '{FEATURE_CODE}': {err}")
        return False
    return True
def _syncTemplateRolesToDb() -> int:
    """Sync template roles and their AccessRules to the database.

    Template roles are the roles of this feature whose ``mandateId`` is None.
    For each entry in TEMPLATE_ROLES the role is created if no template role
    with that label exists yet. In both cases missing access rules are added
    through _ensureAccessRulesForRole.

    Returns:
        Number of roles created by this call. 0 is returned both when nothing
        had to be created and when an error occurred (errors are logged with
        traceback and never raised).
    """
    try:
        # Function-scope imports, as before.
        # NOTE(review): presumably kept local to avoid import cycles at module load; confirm.
        from modules.interfaces.interfaceDbApp import getRootInterface
        from modules.datamodels.datamodelRbac import Role

        rootInterface = getRootInterface()

        # Map label -> role id for the template roles that already exist.
        existingRoles = rootInterface.getRolesByFeatureCode(FEATURE_CODE)
        existingRoleLabels = {
            r.roleLabel: str(r.id) for r in existingRoles if r.mandateId is None
        }

        createdCount = 0
        for roleTemplate in TEMPLATE_ROLES:
            roleLabel = roleTemplate["roleLabel"]
            accessRules = roleTemplate.get("accessRules", [])

            if roleLabel in existingRoleLabels:
                # Role exists: only top up any access rules it is missing.
                _ensureAccessRulesForRole(rootInterface, existingRoleLabels[roleLabel], accessRules)
                continue

            newRole = Role(
                roleLabel=roleLabel,
                description=roleTemplate.get("description", {}),
                featureCode=FEATURE_CODE,
                mandateId=None,
                featureInstanceId=None,
                isSystemRole=False
            )
            createdRole = rootInterface.db.recordCreate(Role, newRole.model_dump())
            roleId = createdRole.get("id")
            # Remember the new role so a repeated label in TEMPLATE_ROLES does
            # not create a second role within the same pass.
            existingRoleLabels[roleLabel] = roleId

            _ensureAccessRulesForRole(rootInterface, roleId, accessRules)
            logger.info(f"Created template role '{roleLabel}' with ID {roleId}")
            createdCount += 1

        if createdCount > 0:
            logger.info(f"Feature '{FEATURE_CODE}': Created {createdCount} template roles")
        return createdCount
    except Exception as e:
        # Best-effort sync: keep the traceback in the log so failures are diagnosable.
        logger.error(f"Error syncing template roles for feature '{FEATURE_CODE}': {e}", exc_info=True)
        return 0
def _ensureAccessRulesForRole(rootInterface, roleId: str, ruleTemplates: List[Dict[str, Any]]) -> int:
    """Ensure AccessRules exist for a role based on templates.

    A rule is identified by its (context, item) pair. Templates whose pair is
    already present for the role are skipped; existing rules are never updated.

    Args:
        rootInterface: Interface providing ``getAccessRulesByRole(roleId)`` and
            ``db.recordCreate(model, data)``.
        roleId: ID of the role the rules belong to.
        ruleTemplates: Rule dicts with ``context`` (defaults to "UI"), ``item``,
            ``view`` (defaults to False) and optional read/create/update/delete.

    Returns:
        Number of access rules created by this call.
    """
    from modules.datamodels.datamodelRbac import AccessRule, AccessRuleContext

    # Template context names mapped to enum members; unknown names are passed
    # through unchanged and left to the AccessRule model to accept or reject.
    contextByName = {
        "UI": AccessRuleContext.UI,
        "DATA": AccessRuleContext.DATA,
        "RESOURCE": AccessRuleContext.RESOURCE,
    }

    # Signatures already stored for this role.
    # NOTE(review): matching against template strings assumes the enum values
    # are "UI" / "DATA" / "RESOURCE"; confirm in datamodelRbac.
    knownSignatures = set()
    for rule in rootInterface.getAccessRulesByRole(roleId):
        # Accept both an enum member and an already-plain value for context.
        contextValue = getattr(rule.context, "value", rule.context) if rule.context else None
        knownSignatures.add((contextValue, rule.item))

    createdCount = 0
    for template in ruleTemplates:
        context = template.get("context", "UI")
        item = template.get("item")
        sig = (context, item)

        if sig in knownSignatures:
            continue

        newRule = AccessRule(
            roleId=roleId,
            context=contextByName.get(context, context),
            item=item,
            view=template.get("view", False),
            read=template.get("read"),
            create=template.get("create"),
            update=template.get("update"),
            delete=template.get("delete"),
        )
        rootInterface.db.recordCreate(AccessRule, newRule.model_dump())
        # Record the new signature so a duplicated template entry in the same
        # list does not create a second, identical rule.
        knownSignatures.add(sig)
        createdCount += 1

    if createdCount > 0:
        logger.debug(f"Created {createdCount} AccessRules for role {roleId}")

    return createdCount

File diff suppressed because it is too large Load diff

View file

@ -4,7 +4,7 @@ import logging
import asyncio
import uuid
import base64
from typing import Dict, Any, List, Union, Tuple, Optional, Callable
from typing import Dict, Any, List, Union, Tuple, Optional, Callable, AsyncGenerator
from dataclasses import dataclass, field
import time
@ -84,15 +84,16 @@ class AiObjects:
# AI for Extraction, Processing, Generation
async def callWithTextContext(self, request: AiCallRequest) -> AiCallResponse:
"""Call AI model for traditional text/context calls with fallback mechanism."""
"""Call AI model for traditional text/context calls with fallback mechanism.
Supports two modes:
- Legacy: prompt + context constructs messages internally
- Agent: request.messages provided passes through directly
"""
prompt = request.prompt
context = request.context or ""
options = request.options
# Input bytes will be calculated inside _callWithModel
# Generation parameters are handled inside _callWithModel
# Get failover models for this operation type
availableModels = modelRegistry.getAvailableModels()
@ -127,10 +128,12 @@ class AiObjects:
try:
logger.info(f"Attempting AI call with model: {model.name} (attempt {attempt + 1}/{len(failoverModelList)})")
# Call the model directly - no truncation or compression here
response = await self._callWithModel(model, prompt, context, options)
if request.messages:
response = await self._callWithMessages(model, request.messages, options, request.tools)
else:
response = await self._callWithModel(model, prompt, context, options)
logger.info(f"✅ AI call successful with model: {model.name}")
logger.info(f"AI call successful with model: {model.name}")
return response
except Exception as e:
@ -142,8 +145,7 @@ class AiObjects:
logger.info(f"Trying next failover model...")
continue
else:
# All models failed
logger.error(f"💥 All {len(failoverModelList)} models failed for operation {options.operationType}")
logger.error(f"All {len(failoverModelList)} models failed for operation {options.operationType}")
break
# All failover attempts failed - return error response
@ -254,6 +256,242 @@ class AiObjects:
return response
async def _callWithMessages(self, model: AiModel, messages: List[Dict[str, Any]],
                            options: AiCallOptions = None,
                            tools: List[Dict[str, Any]] = None) -> AiCallResponse:
    """Call a model with pre-built messages (agent mode). Supports tools for native function calling.

    Args:
        model: Model to call; its ``functionCall`` coroutine performs the request.
        messages: Pre-built chat messages; the ``content`` of each is used for
            the input byte count.
        options: Call options forwarded to the model call (``{}`` when omitted).
        tools: Optional tool definitions forwarded to the model call.

    Returns:
        AiCallResponse with content, pricing, timing, byte counts and - when the
        model response metadata carries them - ``toolCalls``.

    Raises:
        ValueError: If the model has no ``functionCall`` or the call reports failure.
    """
    inputBytes = sum(len(str(m.get("content", "")).encode("utf-8")) for m in messages)
    startTime = time.time()

    if not model.functionCall:
        raise ValueError(f"Model {model.name} has no function call defined")

    modelCall = AiModelCall(
        messages=messages,
        model=model,
        options=options or {},
        tools=tools
    )
    modelResponse = await model.functionCall(modelCall)
    if not modelResponse.success:
        raise ValueError(f"Model call failed: {modelResponse.error}")

    processingTime = time.time() - startTime

    # A response that consists only of tool calls may carry no text content;
    # treat that as an empty string instead of crashing on ``None.encode``.
    content = modelResponse.content or ""
    outputBytes = len(content.encode("utf-8"))
    priceCHF = model.calculatepriceCHF(processingTime, inputBytes, outputBytes)

    # Extract tool calls from metadata if present (native function calling)
    responseToolCalls = None
    if modelResponse.metadata:
        responseToolCalls = modelResponse.metadata.get("toolCalls")

    response = AiCallResponse(
        content=content,
        modelName=model.name,
        provider=model.connectorType,
        priceCHF=priceCHF,
        processingTime=processingTime,
        bytesSent=inputBytes,
        bytesReceived=outputBytes,
        errorCount=0,
        toolCalls=responseToolCalls
    )

    # Billing is best-effort: a failing callback must not fail the AI call.
    if self.billingCallback:
        try:
            self.billingCallback(response)
        except Exception as e:
            logger.error(f"BILLING: Failed to record billing for model {model.name}: {e}")

    return response
async def callWithTextContextStream(
    self, request: AiCallRequest
) -> AsyncGenerator[Union[str, AiCallResponse], None]:
    """Streaming variant of callWithTextContext. Yields str deltas, then final AiCallResponse.

    Only the messages mode is supported: ``request.messages`` is passed to the
    model as-is. Failover to the next model happens only while nothing has
    been yielded yet; once partial output reached the consumer, a failure ends
    the stream with an error response instead of replaying the answer from a
    different model into the same stream.

    Yields:
        ``str`` deltas followed by one final ``AiCallResponse``. On failure a
        single ``AiCallResponse`` with ``modelName="error"`` and
        ``errorCount=1`` is yielded last.
    """
    options = request.options
    # ``options`` may be None (see the allowedProviders lookup below), so do not
    # dereference it directly.
    operationType = getattr(options, "operationType", None)

    def _errorResponse(message: str) -> AiCallResponse:
        return AiCallResponse(
            content=message,
            modelName="error", priceCHF=0.0, processingTime=0.0,
            bytesSent=0, bytesReceived=0, errorCount=1,
        )

    if not request.messages:
        # Without messages every model would fail on a caller error and be
        # reported as failed to the selector; reject the request up front.
        yield _errorResponse("Streaming AI call requires request.messages")
        return

    availableModels = modelRegistry.getAvailableModels()
    allowedProviders = getattr(options, 'allowedProviders', None) if options else None
    if allowedProviders:
        filtered = [m for m in availableModels if m.connectorType in allowedProviders]
        if filtered:
            availableModels = filtered

    failoverModelList = modelSelector.getFailoverModelList(
        request.prompt, request.context or "", options, availableModels
    )
    if not failoverModelList:
        yield _errorResponse(f"No suitable models found for operation {operationType}")
        return

    lastError = None
    for attempt, model in enumerate(failoverModelList):
        outputEmitted = False
        try:
            logger.info(f"Streaming AI call with model: {model.name} (attempt {attempt + 1})")
            async for chunk in self._callWithMessagesStream(model, request.messages, options, request.tools):
                outputEmitted = True
                yield chunk
            return
        except Exception as e:
            lastError = e
            logger.warning(f"Streaming AI call failed with {model.name}: {e}")
            modelSelector.reportFailure(model.name)
            if outputEmitted:
                # The consumer already holds partial output from this model;
                # restarting with another model would duplicate/garble it.
                yield _errorResponse(
                    f"Stream aborted after partial output from {model.name}: {lastError}"
                )
                return

    yield _errorResponse(f"All models failed (stream). Last error: {lastError}")
async def _callWithMessagesStream(
    self, model: AiModel, messages: List[Dict[str, Any]],
    options: AiCallOptions = None, tools: List[Dict[str, Any]] = None,
) -> AsyncGenerator[Union[str, AiCallResponse], None]:
    """Stream a model call. Yields str deltas, then final AiCallResponse with billing.

    When the model has no ``functionCallStream``, the call falls back to the
    non-streaming ``_callWithMessages`` (which handles billing itself) and
    yields its content as a single delta followed by the response.

    Raises:
        ValueError: If the stream ends without a final ``AiModelResponse`` or
            that final response reports failure.
    """
    from modules.datamodels.datamodelAi import AiModelCall, AiModelResponse

    if not model.functionCallStream:
        response = await self._callWithMessages(model, messages, options, tools)
        if response.content:
            yield response.content
        yield response
        return

    inputBytes = sum(len(str(m.get("content", "")).encode("utf-8")) for m in messages)
    startTime = time.time()

    modelCall = AiModelCall(
        messages=messages, model=model,
        options=options or {}, tools=tools,
    )

    finalModelResponse = None
    async for item in model.functionCallStream(modelCall):
        # The connector signals the end of the stream with an AiModelResponse;
        # everything else is passed through as a delta.
        if isinstance(item, AiModelResponse):
            finalModelResponse = item
        else:
            yield item

    if not finalModelResponse:
        raise ValueError(f"Stream from {model.name} produced no final AiModelResponse")
    # Mirror the non-streaming path: a failed call must not be billed or
    # reported as a success.
    if getattr(finalModelResponse, "success", True) is False:
        raise ValueError(f"Model call failed: {getattr(finalModelResponse, 'error', None)}")

    processingTime = time.time() - startTime

    # Tool-call-only answers may carry no text content.
    content = finalModelResponse.content or ""
    outputBytes = len(content.encode("utf-8"))
    priceCHF = model.calculatepriceCHF(processingTime, inputBytes, outputBytes)

    responseToolCalls = None
    if finalModelResponse.metadata:
        responseToolCalls = finalModelResponse.metadata.get("toolCalls")

    response = AiCallResponse(
        content=content,
        modelName=model.name,
        provider=model.connectorType,
        priceCHF=priceCHF,
        processingTime=processingTime,
        bytesSent=inputBytes,
        bytesReceived=outputBytes,
        errorCount=0,
        toolCalls=responseToolCalls,
    )

    # Billing is best-effort: a failing callback must not fail the AI call.
    if self.billingCallback:
        try:
            self.billingCallback(response)
        except Exception as e:
            logger.error(f"BILLING: Failed to record stream billing for {model.name}: {e}")

    yield response
async def callEmbedding(self, texts: List[str], options: AiCallOptions = None) -> AiCallResponse:
    """Generate embeddings for a list of texts using the best available embedding model.

    Uses the standard model selector with OperationTypeEnum.EMBEDDING to pick the model.
    Failover across providers (e.g. OpenAI, Mistral) works the same way as for chat models.

    Args:
        texts: Texts to embed.
        options: Optional call options. The caller's object is not modified; a
            shallow copy with ``operationType`` forced to EMBEDDING is used.

    Returns:
        AiCallResponse with metadata["embeddings"] containing the vectors, or
        an error response (``modelName="error"``, ``errorCount=1``) when no
        model is available or all models fail.
    """
    import copy

    if options is None:
        options = AiCallOptions(operationType=OperationTypeEnum.EMBEDDING)
    else:
        # Work on a copy so the caller's options keep their operation type.
        options = copy.copy(options)
        options.operationType = OperationTypeEnum.EMBEDDING

    # Short sample of the input handed to the model selector.
    combinedText = " ".join(texts[:3])[:500]
    availableModels = modelRegistry.getAvailableModels()
    failoverModelList = modelSelector.getFailoverModelList(
        combinedText, "", options, availableModels
    )

    if not failoverModelList:
        errorMsg = "No suitable models found for embedding"
        logger.error(errorMsg)
        return AiCallResponse(
            content=errorMsg, modelName="error", priceCHF=0.0,
            processingTime=0.0, bytesSent=0, bytesReceived=0, errorCount=1
        )

    # Identical for every attempt, so computed once.
    inputBytes = sum(len(t.encode("utf-8")) for t in texts)

    lastError = None
    for attempt, model in enumerate(failoverModelList):
        try:
            logger.info(f"Embedding call with {model.name} (attempt {attempt + 1}/{len(failoverModelList)})")
            startTime = time.time()

            modelCall = AiModelCall(
                model=model, options=options, embeddingInput=texts
            )
            modelResponse = await model.functionCall(modelCall)
            if not modelResponse.success:
                raise ValueError(f"Embedding call failed: {modelResponse.error}")

            processingTime = time.time() - startTime
            priceCHF = model.calculatepriceCHF(processingTime, inputBytes, 0)
            embeddings = (modelResponse.metadata or {}).get("embeddings", [])

            response = AiCallResponse(
                content="", modelName=model.name, provider=model.connectorType,
                priceCHF=priceCHF, processingTime=processingTime,
                bytesSent=inputBytes, bytesReceived=0, errorCount=0,
                metadata={"embeddings": embeddings}
            )

            # Billing is best-effort: a failing callback must not fail the call.
            if self.billingCallback:
                try:
                    self.billingCallback(response)
                except Exception as e:
                    logger.error(f"BILLING: Failed to record billing for embedding {model.name}: {e}")

            return response
        except Exception as e:
            lastError = e
            logger.warning(f"Embedding call failed with {model.name}: {str(e)}")
            modelSelector.reportFailure(model.name)

    errorMsg = f"All embedding models failed. Last error: {str(lastError)}"
    logger.error(errorMsg)
    return AiCallResponse(
        content=errorMsg, modelName="error", priceCHF=0.0,
        processingTime=0.0, bytesSent=0, bytesReceived=0, errorCount=1
    )
# Utility methods
async def listAvailableModels(self, connectorType: str = None) -> List[Dict[str, Any]]:

View file

@ -764,7 +764,11 @@ class BillingObjects:
featureCode: str = None,
aicoreProvider: str = None,
aicoreModel: str = None,
description: str = "AI Usage"
description: str = "AI Usage",
processingTime: float = None,
bytesSent: int = None,
bytesReceived: int = None,
errorCount: int = None
) -> Optional[Dict[str, Any]]:
"""
Record usage cost as a billing transaction.
@ -774,20 +778,6 @@ class BillingObjects:
- PREPAY_USER: deduct from user's own balance
- PREPAY_MANDATE: deduct from mandate pool balance
- CREDIT_POSTPAY: deduct from mandate pool balance
Args:
mandateId: Mandate ID
userId: User ID
priceCHF: Cost in CHF
workflowId: Optional workflow ID
featureInstanceId: Optional feature instance ID
featureCode: Optional feature code
aicoreProvider: AICore provider name (e.g., 'anthropic', 'openai')
aicoreModel: AICore model name (e.g., 'claude-4-sonnet', 'gpt-4o')
description: Transaction description
Returns:
Created transaction dict or None
"""
if priceCHF <= 0:
return None
@ -816,7 +806,11 @@ class BillingObjects:
featureCode=featureCode,
aicoreProvider=aicoreProvider,
aicoreModel=aicoreModel,
createdByUserId=userId
createdByUserId=userId,
processingTime=processingTime,
bytesSent=bytesSent,
bytesReceived=bytesReceived,
errorCount=errorCount
)
# Determine where to deduct balance
@ -828,6 +822,20 @@ class BillingObjects:
poolAccount = self.getOrCreateMandateAccount(mandateId)
return self.createTransaction(transaction, balanceAccountId=poolAccount["id"])
# =========================================================================
# Workflow Cost Query
# =========================================================================
def getWorkflowCost(self, workflowId: str) -> float:
    """Sum of all transaction amounts for a workflow.

    Args:
        workflowId: Workflow whose BillingTransaction records are summed.

    Returns:
        Total of the ``amount`` fields as float; 0.0 when ``workflowId`` is
        empty or no transactions exist. Records without an amount are skipped,
        and amounts that arrive as numeric strings are converted.
    """
    if not workflowId:
        return 0.0

    transactions = self.db.getRecordset(
        BillingTransaction,
        recordFilter={"workflowId": workflowId}
    ) or []

    total = 0.0
    for transaction in transactions:
        amount = transaction.get("amount")
        if amount is None:
            # Missing/NULL amount contributes nothing instead of raising TypeError.
            continue
        total += float(amount)
    return total
# =========================================================================
# Billing Model Switch Operations
# =========================================================================

View file

@ -18,7 +18,6 @@ from modules.datamodels.datamodelUam import AccessLevel
from modules.datamodels.datamodelChat import (
ChatDocument,
ChatStat,
ChatLog,
ChatMessage,
ChatWorkflow,
@ -663,10 +662,8 @@ class ChatObjects:
workflow = workflows[0]
try:
# Load related data from normalized tables
logs = self.getLogs(workflowId)
messages = self.getMessages(workflowId)
stats = self.getStats(workflowId)
# Validate workflow data against ChatWorkflow model
# Explicit type coercion: DB may store numeric fields as TEXT on some platforms
@ -694,8 +691,7 @@ class ChatObjects:
lastActivity=_toFloat(workflow.get("lastActivity")),
startedAt=_toFloat(workflow.get("startedAt")),
logs=logs,
messages=messages,
stats=stats
messages=messages
)
except Exception as e:
logger.error(f"Error validating workflow data: {str(e)}")
@ -731,7 +727,7 @@ class ChatObjects:
except Exception as e:
logger.warning(f"Could not get Root mandate: {e}")
# Note: ChatWorkflow has featureInstanceId for multi-tenancy isolation.
# Child tables (ChatMessage, ChatLog, ChatStat, ChatDocument) are user-owned
# Child tables (ChatMessage, ChatLog, ChatDocument) are user-owned
# and do NOT store featureInstanceId - they inherit isolation from ChatWorkflow.
# Ensure featureInstanceId is set from context if not already in workflowData
if "featureInstanceId" not in workflowData or not workflowData.get("featureInstanceId"):
@ -760,7 +756,7 @@ class ChatObjects:
logs=[],
messages=[],
stats=[],
workflowMode=created["workflowMode"],
workflowMode=created.get("workflowMode", "Dynamic"),
maxSteps=created.get("maxSteps", 1)
)
@ -789,23 +785,20 @@ class ChatObjects:
# Load fresh data from normalized tables
logs = self.getLogs(workflowId)
messages = self.getMessages(workflowId)
stats = self.getStats(workflowId)
# Convert to ChatWorkflow model
return ChatWorkflow(
id=updated["id"],
status=updated.get("status", workflow.status),
name=updated.get("name", workflow.name),
currentRound=updated.get("currentRound", workflow.currentRound),
currentTask=updated.get("currentTask", workflow.currentTask),
currentAction=updated.get("currentAction", workflow.currentAction),
totalTasks=updated.get("totalTasks", workflow.totalTasks),
totalActions=updated.get("totalActions", workflow.totalActions),
currentRound=updated.get("currentRound") or getattr(workflow, "currentRound", 0) or 0,
currentTask=updated.get("currentTask") or getattr(workflow, "currentTask", 0) or 0,
currentAction=updated.get("currentAction") or getattr(workflow, "currentAction", 0) or 0,
totalTasks=updated.get("totalTasks") or getattr(workflow, "totalTasks", 0) or 0,
totalActions=updated.get("totalActions") or getattr(workflow, "totalActions", 0) or 0,
lastActivity=updated.get("lastActivity", workflow.lastActivity),
startedAt=updated.get("startedAt", workflow.startedAt),
logs=logs,
messages=messages,
stats=stats
messages=messages
)
def deleteWorkflow(self, workflowId: str) -> bool:
@ -827,7 +820,6 @@ class ChatObjects:
messageId = message.id
if messageId:
# Delete message documents (but NOT the files!)
# Note: ChatStat does NOT have messageId - stats are only at workflow level
try:
existing_docs = self._getRecordset(ChatDocument, recordFilter={"messageId": messageId})
for doc in existing_docs:
@ -839,11 +831,7 @@ class ChatObjects:
self.db.recordDelete(ChatMessage, messageId)
# 2. Delete workflow stats
existing_stats = self._getRecordset(ChatStat, recordFilter={"workflowId": workflowId})
for stat in existing_stats:
self.db.recordDelete(ChatStat, stat["id"])
# 3. Delete workflow logs
# 2. Delete workflow logs
existing_logs = self._getRecordset(ChatLog, recordFilter={"workflowId": workflowId})
for log in existing_logs:
self.db.recordDelete(ChatLog, log["id"])
@ -1270,7 +1258,6 @@ class ChatObjects:
self.db.recordDelete(ChatDocument, doc["id"])
# 2. Finally delete the message itself
# Note: ChatStat has no messageId field -- stats are workflow-level, not message-level
success = self.db.recordDelete(ChatMessage, messageId)
return success
@ -1517,74 +1504,10 @@ class ChatObjects:
# Return validated ChatLog instance
return ChatLog(**createdLog)
# Stats methods
def getStats(self, workflowId: str) -> List[ChatStat]:
    """Return the ChatStat records of a workflow, ordered by creation time.

    An empty list is returned when the workflow lookup yields nothing or when
    the workflow has no stats.
    """
    # Workflow lookup goes through _getRecordset directly (not getWorkflow)
    # to avoid a circular reference.
    if not self._getRecordset(ChatWorkflow, recordFilter={"id": workflowId}):
        return []

    rawStats = self._getRecordset(ChatStat, recordFilter={"workflowId": workflowId})
    if not rawStats:
        return []

    # parseTimestamp tolerates mixed DB types (float/string) for _createdAt.
    rawStats.sort(key=lambda record: parseTimestamp(record.get("_createdAt"), default=0))

    def _toModel(record):
        model = ChatStat(**record)
        # Explicitly carry over the raw _createdAt system field.
        if "_createdAt" in record:
            setattr(model, '_createdAt', record["_createdAt"])
        return model

    return [_toModel(record) for record in rawStats]
def createStat(self, statData: Dict[str, Any]) -> ChatStat:
    """Validate ``statData`` as a ChatStat, store it and return the stored record.

    Raises:
        ValueError: If ``statData`` has no ``workflowId`` key.
        Exception: Any validation or database error is logged and re-raised.
    """
    try:
        hasWorkflowId = "workflowId" in statData
        if not hasWorkflowId:
            raise ValueError("workflowId is required in statData")

        # Validation happens by constructing the model.
        validatedStat = ChatStat(**statData)

        debugMessage = (
            f"Creating stat for workflow {statData.get('workflowId')}: "
            f"process={statData.get('process')}, "
            f"priceCHF={statData.get('priceCHF', 0):.4f}, "
            f"processingTime={statData.get('processingTime', 0):.2f}s"
        )
        logger.debug(debugMessage)

        storedRecord = self.db.recordCreate(ChatStat, validatedStat)
        logger.info(f"Created stat {storedRecord.get('id')} for workflow {statData.get('workflowId')}")

        return ChatStat(**storedRecord)
    except Exception as err:
        logger.error(f"Error creating workflow stat: {str(err)}")
        raise
def getUnifiedChatData(self, workflowId: str, afterTimestamp: Optional[float] = None) -> Dict[str, Any]:
def getUnifiedChatData(self, workflowId: str, afterTimestamp: Optional[float] = None, workflowCost: float = 0.0) -> Dict[str, Any]:
"""
Returns unified chat data (messages, logs, stats) for a workflow in chronological order.
Uses timestamp-based selective data transfer for efficient polling.
Returns unified chat data (messages, logs) for a workflow in chronological order,
plus workflowCost from billing transactions (single source of truth).
"""
# Check workflow access first
# Use RBAC filtering
@ -1652,29 +1575,10 @@ class ChatObjects:
"item": chatLog
})
# Get stats - ChatStat model supports _createdAt via model_config extra="allow"
stats = self.getStats(workflowId)
for stat in stats:
# Apply timestamp filtering in Python
# Use _createdAt (system field from DB, preserved via model_config extra="allow")
stat_timestamp = getattr(stat, '_createdAt', None) or getUtcTimestamp()
if afterTimestamp is not None and stat_timestamp <= afterTimestamp:
continue
# Convert to dict and include _createdAt for frontend
stat_dict = stat.model_dump() if hasattr(stat, 'model_dump') else stat.dict()
stat_dict['_createdAt'] = stat_timestamp
items.append({
"type": "stat",
"createdAt": stat_timestamp,
"item": stat_dict
})
# Sort all items by createdAt timestamp for chronological order
items.sort(key=lambda x: parseTimestamp(x.get("createdAt"), default=0))
return {"items": items}
return {"items": items, "workflowCost": workflowCost}
def getInterface(currentUser: Optional[User] = None, mandateId: Optional[str] = None, featureInstanceId: Optional[str] = None) -> 'ChatObjects':

View file

@ -0,0 +1,234 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Interface to the Knowledge Store database (poweron_knowledge).
Provides CRUD for FileContentIndex, ContentChunk, WorkflowMemory
and semantic search via pgvector.
"""
import logging
from typing import Dict, Any, List, Optional
from modules.connectors.connectorDbPostgre import _get_cached_connector
from modules.datamodels.datamodelKnowledge import FileContentIndex, ContentChunk, WorkflowMemory
from modules.datamodels.datamodelUam import User
from modules.shared.configuration import APP_CONFIG
from modules.shared.timeUtils import getUtcTimestamp
logger = logging.getLogger(__name__)
_instances: Dict[str, "KnowledgeObjects"] = {}
class KnowledgeObjects:
    """Interface to the Knowledge Store database.

    Manages FileContentIndex, ContentChunk, and WorkflowMemory with semantic search.
    """

    def __init__(self):
        self.currentUser: Optional[User] = None
        self.userId: Optional[str] = None
        self._initializeDatabase()

    def _initializeDatabase(self):
        """Open the connector to the ``poweron_knowledge`` database using APP_CONFIG."""
        dbHost = APP_CONFIG.get("DB_HOST", "_no_config_default_data")
        dbDatabase = "poweron_knowledge"
        dbUser = APP_CONFIG.get("DB_USER")
        dbPassword = APP_CONFIG.get("DB_PASSWORD_SECRET")
        dbPort = int(APP_CONFIG.get("DB_PORT", 5432))

        self.db = _get_cached_connector(
            dbHost=dbHost,
            dbDatabase=dbDatabase,
            dbUser=dbUser,
            dbPassword=dbPassword,
            dbPort=dbPort,
            userId=self.userId,
        )
        logger.info("Knowledge Store database initialized")

    def setUserContext(self, user: User):
        """Store the user and, when it has an id, forward that id to the connector."""
        self.currentUser = user
        self.userId = user.id if user else None
        if self.userId:
            self.db.updateContext(self.userId)

    def _upsertRecord(self, modelClass, record) -> Dict[str, Any]:
        """Modify the record with ``record.id`` if it exists, otherwise create it."""
        data = record.model_dump()
        if self.db._loadRecord(modelClass, record.id):
            return self.db.recordModify(modelClass, record.id, data)
        return self.db.recordCreate(modelClass, data)

    # =========================================================================
    # FileContentIndex CRUD
    # =========================================================================

    def upsertFileContentIndex(self, index: FileContentIndex) -> Dict[str, Any]:
        """Create or update a FileContentIndex entry."""
        return self._upsertRecord(FileContentIndex, index)

    def getFileContentIndex(self, fileId: str) -> Optional[Dict[str, Any]]:
        """Get a FileContentIndex by file ID."""
        return self.db._loadRecord(FileContentIndex, fileId)

    def getFileContentIndexByUser(
        self, userId: str, featureInstanceId: str = None
    ) -> List[Dict[str, Any]]:
        """Get all FileContentIndex entries for a user, optionally limited to one feature instance."""
        recordFilter = {"userId": userId}
        if featureInstanceId:
            recordFilter["featureInstanceId"] = featureInstanceId
        return self.db.getRecordset(FileContentIndex, recordFilter=recordFilter)

    def updateFileStatus(self, fileId: str, status: str) -> bool:
        """Update the processing status of a FileContentIndex. Returns False if it does not exist."""
        if not self.db._loadRecord(FileContentIndex, fileId):
            return False
        self.db.recordModify(FileContentIndex, fileId, {"status": status})
        return True

    def deleteFileContentIndex(self, fileId: str) -> bool:
        """Delete a FileContentIndex and all associated ContentChunks."""
        # Chunks are removed first, then the index entry itself.
        self.deleteContentChunks(fileId)
        return self.db.recordDelete(FileContentIndex, fileId)

    # =========================================================================
    # ContentChunk CRUD
    # =========================================================================

    def upsertContentChunk(self, chunk: ContentChunk) -> Dict[str, Any]:
        """Create or update a ContentChunk."""
        return self._upsertRecord(ContentChunk, chunk)

    def upsertContentChunks(self, chunks: List[ContentChunk]) -> int:
        """Upsert multiple ContentChunks one by one. Returns count of upserted chunks."""
        count = 0
        for chunk in chunks:
            self.upsertContentChunk(chunk)
            count += 1
        return count

    def getContentChunks(self, fileId: str) -> List[Dict[str, Any]]:
        """Get all ContentChunks for a file."""
        return self.db.getRecordset(ContentChunk, recordFilter={"fileId": fileId})

    def deleteContentChunks(self, fileId: str) -> int:
        """Delete all ContentChunks for a file. Returns count of deleted chunks."""
        chunks = self.db.getRecordset(ContentChunk, recordFilter={"fileId": fileId})
        count = 0
        for chunk in chunks:
            if self.db.recordDelete(ContentChunk, chunk["id"]):
                count += 1
        return count

    # =========================================================================
    # WorkflowMemory CRUD
    # =========================================================================

    def upsertWorkflowMemory(self, memory: WorkflowMemory) -> Dict[str, Any]:
        """Create or update a WorkflowMemory entry."""
        return self._upsertRecord(WorkflowMemory, memory)

    def getWorkflowEntities(self, workflowId: str) -> List[Dict[str, Any]]:
        """Get all WorkflowMemory entries for a workflow."""
        return self.db.getRecordset(WorkflowMemory, recordFilter={"workflowId": workflowId})

    def getWorkflowEntity(self, workflowId: str, key: str) -> Optional[Dict[str, Any]]:
        """Get a specific WorkflowMemory entry by workflow and key (first match), or None."""
        results = self.db.getRecordset(
            WorkflowMemory, recordFilter={"workflowId": workflowId, "key": key}
        )
        return results[0] if results else None

    def deleteWorkflowMemory(self, workflowId: str) -> int:
        """Delete all WorkflowMemory entries for a workflow. Returns count."""
        entries = self.db.getRecordset(WorkflowMemory, recordFilter={"workflowId": workflowId})
        count = 0
        for entry in entries:
            if self.db.recordDelete(WorkflowMemory, entry["id"]):
                count += 1
        return count

    # =========================================================================
    # Semantic Search
    # =========================================================================

    def semanticSearch(
        self,
        queryVector: List[float],
        userId: str = None,
        featureInstanceId: str = None,
        mandateId: str = None,
        isShared: bool = None,
        limit: int = 10,
        minScore: float = None,
        contentType: str = None,
    ) -> List[Dict[str, Any]]:
        """Semantic search across ContentChunks via the connector's ``semanticSearch``.

        Args:
            queryVector: Query embedding vector.
            userId: Filter by user.
            featureInstanceId: Filter by feature instance.
            mandateId: Accepted for interface compatibility but NOT applied as
                a filter by this method; a warning is logged when it is passed.
            isShared: Accepted for interface compatibility but NOT applied by
                this method; a warning is logged when it is passed.
            limit: Max results.
            minScore: Minimum score threshold, forwarded to the connector.
            contentType: Filter by content type.

        Returns:
            Whatever the connector's ``semanticSearch`` returns for ContentChunk.
        """
        if mandateId is not None or isShared is not None:
            # Make the gap visible: callers relying on these arguments for
            # isolation would otherwise get results that are not restricted by them.
            logger.warning(
                "semanticSearch: mandateId/isShared were provided but are not applied as filters"
            )

        recordFilter = {}
        if userId:
            recordFilter["userId"] = userId
        if featureInstanceId:
            recordFilter["featureInstanceId"] = featureInstanceId
        if contentType:
            recordFilter["contentType"] = contentType

        return self.db.semanticSearch(
            modelClass=ContentChunk,
            vectorColumn="embedding",
            queryVector=queryVector,
            limit=limit,
            recordFilter=recordFilter or None,
            minScore=minScore,
        )

    def semanticSearchWorkflowMemory(
        self,
        queryVector: List[float],
        workflowId: str,
        limit: int = 5,
        minScore: float = None,
    ) -> List[Dict[str, Any]]:
        """Semantic search across the WorkflowMemory entries of one workflow."""
        return self.db.semanticSearch(
            modelClass=WorkflowMemory,
            vectorColumn="embedding",
            queryVector=queryVector,
            limit=limit,
            recordFilter={"workflowId": workflowId},
            minScore=minScore,
        )
def getInterface(currentUser: Optional[User] = None) -> KnowledgeObjects:
    """Return the shared KnowledgeObjects instance, creating it on first use.

    When ``currentUser`` is given, it is set as the user context of the shared
    instance before it is returned.
    """
    try:
        sharedInterface = _instances["default"]
    except KeyError:
        sharedInterface = KnowledgeObjects()
        _instances["default"] = sharedInterface

    if currentUser:
        sharedInterface.setUserContext(currentUser)
    return sharedInterface

View file

@ -58,7 +58,6 @@ TABLE_NAMESPACE = {
"ChatWorkflow": "chat",
"ChatMessage": "chat",
"ChatLog": "chat",
"ChatStat": "chat",
"ChatDocument": "chat",
"Prompt": "chat",
# Chatbot (poweron_chatbot) - per feature-instance isolation
@ -69,13 +68,20 @@ TABLE_NAMESPACE = {
# Files - user-owned
"FileItem": "files",
"FileData": "files",
"FileFolder": "files",
# Automation - user-owned
"AutomationDefinition": "automation",
"AutomationTemplate": "automation",
# Knowledge Store - user-owned
"FileContentIndex": "knowledge",
"ContentChunk": "knowledge",
"WorkflowMemory": "knowledge",
# Data Sources - user-owned
"DataSource": "datasource",
}
# Namespaces without mandate context - GROUP is mapped to MY
USER_OWNED_NAMESPACES = {"chat", "chatbot", "files", "automation"}
USER_OWNED_NAMESPACES = {"chat", "chatbot", "files", "automation", "knowledge", "datasource"}
def buildDataObjectKey(tableName: str, featureCode: Optional[str] = None) -> str:
@ -175,7 +181,7 @@ def getRecordsetWithRBAC(
whereValues = []
# CRITICAL: Only pass featureInstanceId to WHERE clause if the model actually has
# this column. Chat child tables (ChatMessage, ChatLog, ChatStat, ChatDocument)
# this column. Chat child tables (ChatMessage, ChatLog, ChatDocument)
# are user-owned and do NOT have featureInstanceId - only ChatWorkflow does.
# Without this check, the SQL query would reference a non-existent column,
# causing a silent error that returns empty results.

View file

@ -247,19 +247,13 @@ def _getInstancePermissions(rootInterface, userId: str, instanceId: str) -> Dict
# Get FeatureAccess for this user and instance (Pydantic model)
featureAccess = rootInterface.getFeatureAccess(userId, instanceId)
logger.debug(f"_getInstancePermissions: userId={userId}, instanceId={instanceId}, featureAccess={featureAccess is not None}")
if not featureAccess:
logger.debug(f"_getInstancePermissions: No FeatureAccess found for user {userId} and instance {instanceId}")
return permissions
# Get role IDs via interface method
roleIds = rootInterface.getRoleIdsForFeatureAccess(str(featureAccess.id))
logger.debug(f"_getInstancePermissions: featureAccessId={featureAccess.id}, roleIds={roleIds}")
if not roleIds:
logger.debug(f"_getInstancePermissions: No roles found for FeatureAccess {featureAccess.id}")
return permissions
# Check if user has admin role
@ -274,8 +268,6 @@ def _getInstancePermissions(rootInterface, userId: str, instanceId: str) -> Dict
# Get all rules for this role (returns Pydantic models)
accessRules = rootInterface.getAccessRules(roleId=roleId)
logger.debug(f"_getInstancePermissions: roleId={roleId}, accessRules={len(accessRules) if accessRules else 0}")
for rule in accessRules:
context = rule.context
item = rule.item or ""

View file

@ -21,7 +21,7 @@ from modules.auth import limiter, requireSysAdminRole, getRequestContext, Reques
# Import billing components
from modules.interfaces.interfaceDbBilling import getInterface as getBillingInterface, _getRootInterface
from modules.services.serviceBilling.mainServiceBilling import getService as getBillingService
from modules.serviceCenter.services.serviceBilling.mainServiceBilling import getService as getBillingService
from modules.datamodels.datamodelPagination import PaginationParams, PaginatedResponse, PaginationMetadata, normalize_pagination_dict
from modules.routes.routeDataUsers import _applyFiltersAndSort
from modules.datamodels.datamodelBilling import (
@ -162,6 +162,23 @@ def _isAdminOfMandate(ctx: RequestContext, targetMandateId: str) -> bool:
return False
def _isMemberOfMandate(ctx: RequestContext, targetMandateId: str) -> bool:
    """Tell whether the requesting user has an enabled membership in the given mandate.

    Memberships are read via the root interface's ``getUserMandates``. A
    membership counts when its ``mandateId`` matches (compared as strings) and
    its ``enabled`` attribute is truthy (treated as True when absent). Any
    exception during the lookup results in False.
    """
    try:
        from modules.interfaces.interfaceDbApp import getRootInterface

        memberships = getRootInterface().getUserMandates(str(ctx.user.id))
        return any(
            str(getattr(membership, 'mandateId', None)) == str(targetMandateId)
            and getattr(membership, 'enabled', True)
            for membership in memberships
        )
    except Exception:
        return False
def _filterTransactionsByScope(transactions: list, scope: BillingDataScope) -> list:
"""
Filter a list of transaction dicts based on the user's BillingDataScope.
@ -720,11 +737,11 @@ def createCheckoutSession(
targetMandateId: str = Path(..., description="Mandate ID"),
checkoutRequest: CheckoutCreateRequest = Body(...),
ctx: RequestContext = Depends(getRequestContext),
_admin = Depends(requireSysAdminRole)
):
"""
Create Stripe Checkout Session for credit top-up. Returns redirect URL.
SysAdmin only. Amount is validated server-side against allowed presets.
RBAC: PREPAY_USER requires mandate membership (user loads own account),
PREPAY_MANDATE requires mandate admin role.
"""
try:
billingInterface = getBillingInterface(ctx.user, targetMandateId)
@ -738,10 +755,17 @@ def createCheckoutSession(
if billingModel == BillingModelEnum.PREPAY_USER:
if not checkoutRequest.userId:
raise HTTPException(status_code=400, detail="userId is required for PREPAY_USER model")
elif billingModel not in [BillingModelEnum.PREPAY_MANDATE, BillingModelEnum.CREDIT_POSTPAY]:
if str(checkoutRequest.userId) != str(ctx.user.id):
raise HTTPException(status_code=403, detail="Users can only load credit to their own account")
if not _isMemberOfMandate(ctx, targetMandateId):
raise HTTPException(status_code=403, detail="User is not a member of this mandate")
elif billingModel in [BillingModelEnum.PREPAY_MANDATE, BillingModelEnum.CREDIT_POSTPAY]:
if not _isAdminOfMandate(ctx, targetMandateId):
raise HTTPException(status_code=403, detail="Mandate admin role required to load mandate credit")
else:
raise HTTPException(status_code=400, detail=f"Cannot add credit to {billingModel.value} billing model")
from modules.services.serviceBilling.stripeCheckout import create_checkout_session
from modules.serviceCenter.services.serviceBilling.stripeCheckout import create_checkout_session
redirect_url = create_checkout_session(
mandate_id=targetMandateId,
user_id=checkoutRequest.userId,
@ -768,7 +792,7 @@ async def stripeWebhook(
No JWT auth - Stripe authenticates via Stripe-Signature header.
"""
from modules.shared.configuration import APP_CONFIG
from modules.services.serviceBilling.stripeCheckout import ALLOWED_AMOUNTS_CHF
from modules.serviceCenter.services.serviceBilling.stripeCheckout import ALLOWED_AMOUNTS_CHF
webhook_secret = APP_CONFIG.get("STRIPE_WEBHOOK_SECRET")
if not webhook_secret:

View file

@ -19,6 +19,114 @@ from modules.datamodels.datamodelPagination import PaginationParams, PaginatedRe
# Configure logger
logger = logging.getLogger(__name__)
async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user):
    """Background task: pre-scan + extraction + knowledge indexing.

    Step 1: Structure Pre-Scan (AI-free) -> FileContentIndex (persisted)
    Step 2: Content extraction via runExtraction -> ContentParts
    Step 3: KnowledgeService.indexFile -> chunking + embedding -> Knowledge Store

    The file status is set to "processing" at the start and back to "active"
    on every exit path (no data, nothing extracted, success, failure).

    Args:
        fileId: ID of the stored file to index.
        fileName: File name; passed to the pipeline steps and used as containerPath.
        mimeType: MIME type handed to pre-scan, extraction and indexing.
        user: User object (its ``id`` is used) or any value whose ``str()`` is the user id.

    Returns:
        None. Errors are logged and never propagated to the caller.
    """
    userId = user.id if hasattr(user, "id") else str(user)
    try:
        mgmtInterface = interfaceDbManagement.getInterface(user)
        mgmtInterface.updateFile(fileId, {"status": "processing"})

        rawBytes = mgmtInterface.getFileData(fileId)
        if not rawBytes:
            logger.warning(f"Auto-index: no file data for {fileId}, skipping")
            mgmtInterface.updateFile(fileId, {"status": "active"})
            return

        logger.info(f"Auto-index starting for {fileName} ({len(rawBytes)} bytes, {mimeType})")

        # Step 1: Structure Pre-Scan (AI-free)
        from modules.serviceCenter.services.serviceKnowledge.subPreScan import preScanDocument
        contentIndex = await preScanDocument(
            fileData=rawBytes,
            mimeType=mimeType,
            fileId=fileId,
            fileName=fileName,
            userId=userId,
        )
        logger.info(
            f"Pre-scan complete for {fileName}: "
            f"{contentIndex.totalObjects} objects"
        )

        # Persist FileContentIndex immediately
        from modules.interfaces.interfaceDbKnowledge import getInterface as getKnowledgeInterface
        knowledgeDb = getKnowledgeInterface()
        knowledgeDb.upsertFileContentIndex(contentIndex)

        # Step 2: Content extraction (AI-free, produces ContentParts)
        from modules.serviceCenter.services.serviceExtraction.subRegistry import ExtractorRegistry, ChunkerRegistry
        from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
        from modules.datamodels.datamodelExtraction import ExtractionOptions

        extractorRegistry = ExtractorRegistry()
        chunkerRegistry = ChunkerRegistry()
        options = ExtractionOptions()
        extracted = runExtraction(
            extractorRegistry, chunkerRegistry,
            rawBytes, fileName, mimeType, options,
        )

        contentObjects = []
        for part in extracted.parts:
            contentType = "text"
            if part.typeGroup == "image":
                contentType = "image"
            elif part.typeGroup in ("binary", "container"):
                contentType = "other"
            # Parts without any non-whitespace payload are not indexed
            if not part.data or not part.data.strip():
                continue
            contentObjects.append({
                "contentObjectId": part.id,
                "contentType": contentType,
                "data": part.data,
                "contextRef": {
                    "containerPath": fileName,
                    "location": part.label or "file",
                    # Part metadata is merged last, so it overrides the two keys above on collision
                    **(part.metadata or {}),
                },
            })

        logger.info(f"Extracted {len(contentObjects)} content objects from {fileName}")

        if not contentObjects:
            # Nothing to embed: mark the index entry as done and release the file
            knowledgeDb.updateFileStatus(fileId, "indexed")
            mgmtInterface.updateFile(fileId, {"status": "active"})
            return

        # Step 3: Knowledge indexing (chunking + embedding)
        from modules.serviceCenter import getService
        from modules.serviceCenter.context import ServiceCenterContext
        ctx = ServiceCenterContext(user=user, mandate_id="", feature_instance_id="")
        knowledgeService = getService("knowledge", ctx)
        await knowledgeService.indexFile(
            fileId=fileId,
            fileName=fileName,
            mimeType=mimeType,
            userId=userId,
            contentObjects=contentObjects,
            structure=contentIndex.structure,
        )

        mgmtInterface.updateFile(fileId, {"status": "active"})
        logger.info(f"Auto-index complete for file {fileId} ({fileName})")
    except Exception as e:
        logger.error(f"Auto-index failed for file {fileId}: {e}", exc_info=True)
        # Best effort: release the file again so it does not stay in "processing".
        # A fresh interface is requested because the failure may have happened
        # before mgmtInterface was assigned.
        try:
            errMgmt = interfaceDbManagement.getInterface(user)
            errMgmt.updateFile(fileId, {"status": "active"})
        except Exception as resetErr:
            logger.warning(
                f"Auto-index: could not reset status of file {fileId} to 'active': {resetErr}"
            )
# Model attributes for FileItem
fileAttributes = getModelAttributeDefinitions(FileItem)
@ -111,6 +219,7 @@ async def upload_file(
request: Request,
file: UploadFile = File(...),
workflowId: Optional[str] = Form(None),
featureInstanceId: Optional[str] = Form(None),
currentUser: User = Depends(getCurrentUser)
) -> JSONResponse:
# Add fileName property to UploadFile for consistency with backend model
@ -133,6 +242,10 @@ async def upload_file(
# Save file via LucyDOM interface in the database
fileItem, duplicateType = managementInterface.saveUploadedFile(fileContent, file.filename)
if featureInstanceId and not fileItem.featureInstanceId:
managementInterface.updateFile(fileItem.id, {"featureInstanceId": featureInstanceId})
fileItem.featureInstanceId = featureInstanceId
# Determine response message based on duplicate type
if duplicateType == "exact_duplicate":
message = f"File '{file.filename}' already exists with identical content. Reusing existing file."
@ -148,6 +261,32 @@ async def upload_file(
if workflowId:
fileMeta["workflowId"] = workflowId
# Trigger background auto-index pipeline (non-blocking)
# Also runs for duplicates in case the original was never successfully indexed
shouldIndex = duplicateType == "new_file"
if not shouldIndex:
try:
from modules.interfaces.interfaceDbKnowledge import getInterface as _getKnowledgeInterface
_kDb = _getKnowledgeInterface()
_existingIndex = _kDb.getFileContentIndex(fileItem.id)
if not _existingIndex:
shouldIndex = True
logger.info(f"Re-triggering auto-index for duplicate {fileItem.id} (not yet indexed)")
except Exception:
shouldIndex = True
if shouldIndex:
try:
import asyncio
asyncio.ensure_future(_autoIndexFile(
fileId=fileItem.id,
fileName=fileItem.fileName,
mimeType=fileItem.mimeType,
user=currentUser,
))
except Exception as indexErr:
logger.warning(f"Auto-index trigger failed (non-blocking): {indexErr}")
# Response with duplicate information
return JSONResponse({
"message": message,

View file

@ -764,7 +764,7 @@ def send_password_link(
expiryHours = int(APP_CONFIG.get("Auth_RESET_TOKEN_EXPIRY_HOURS", "24"))
try:
from modules.services import Services
from modules.serviceHub import Services
services = Services(targetUser)
emailSubject = "PowerOn - Passwort setzen"

View file

@ -395,7 +395,7 @@ def trigger_subscription(
)
# Get messaging service from request app state
from modules.services import getInterface as getServicesInterface
from modules.serviceHub import getInterface as getServicesInterface
services = getServicesInterface(context.user, None, mandateId=str(context.mandateId))
# Konvertiere Dict zu Pydantic Model

View file

@ -87,9 +87,10 @@ CLIENT_SECRET = APP_CONFIG.get("Service_GOOGLE_CLIENT_SECRET")
REDIRECT_URI = APP_CONFIG.get("Service_GOOGLE_REDIRECT_URI")
SCOPES = [
"https://www.googleapis.com/auth/gmail.readonly",
"https://www.googleapis.com/auth/drive.readonly",
"https://www.googleapis.com/auth/userinfo.profile",
"https://www.googleapis.com/auth/userinfo.email",
"openid"
"openid",
]
@router.get("/config")
@ -488,7 +489,7 @@ async def auth_callback(code: str, state: str, request: Request, response: Respo
connection.externalUsername = user_info.get("email")
connection.externalEmail = user_info.get("email")
# Store actually granted scopes for this connection
granted_scopes_list = granted_scopes.split(" ") if granted_scopes else SCOPES
granted_scopes_list = granted_scopes if isinstance(granted_scopes, list) else (granted_scopes.split(" ") if granted_scopes else SCOPES)
connection.grantedScopes = granted_scopes_list
logger.info(f"Storing granted scopes for connection {connection_id}: {granted_scopes_list}")

View file

@ -59,6 +59,7 @@ SCOPES = [
"Mail.Send", # Send mail
"Files.ReadWrite.All", # Read and write files (SharePoint/OneDrive)
"Sites.ReadWrite.All", # Read and write SharePoint sites
"Team.ReadBasic.All", # List joined teams and channels
# Teams Bot: Meeting and chat access (requires admin consent)
"OnlineMeetings.Read", # Read user's Teams meeting details (delegated scope)
"Chat.ReadWrite", # Read and write Teams chat messages

View file

@ -12,7 +12,7 @@ from fastapi import APIRouter, HTTPException, Depends, Path, Query, Request, sta
from modules.auth import limiter, getCurrentUser
from modules.datamodels.datamodelUam import User, UserConnection
from modules.interfaces.interfaceDbApp import getInterface
from modules.services import getInterface as getServices
from modules.serviceHub import getInterface as getServices
logger = logging.getLogger(__name__)

View file

@ -123,6 +123,9 @@ def _getFeatureUiObjects(featureCode: str) -> List[Dict[str, Any]]:
elif featureCode == "commcoach":
from modules.features.commcoach.mainCommcoach import UI_OBJECTS
return UI_OBJECTS
elif featureCode == "workspace":
from modules.features.workspace.mainWorkspace import UI_OBJECTS
return UI_OBJECTS
else:
logger.warning(f"Unknown feature code: {featureCode}")
return []

View file

@ -26,7 +26,6 @@ logger = logging.getLogger(__name__)
def getService(
key: str,
context: ServiceCenterContext,
legacy_hub: Optional[Any] = None,
) -> Any:
"""
Get a service instance by key for the given context.
@ -34,14 +33,13 @@ def getService(
Args:
key: Service key (e.g., "web", "extraction", "utils")
context: ServiceCenterContext with user, mandate_id, feature_instance_id, workflow
legacy_hub: Optional legacy Services instance for fallback when service not yet migrated
Returns:
Service instance
"""
cache = get_resolution_cache()
resolving = set()
return resolve(key, context, cache, resolving, legacy_hub=legacy_hub)
return resolve(key, context, cache, resolving)
def preWarm(service_keys: Optional[List[str]] = None) -> None:

View file

@ -22,6 +22,8 @@ class EventManager:
"""Initialize the event manager."""
self._queues: Dict[str, asyncio.Queue] = {}
self._cleanup_tasks: Dict[str, asyncio.Task] = {}
self._agent_tasks: Dict[str, asyncio.Task] = {}
self._cancelled: Dict[str, bool] = {}
def create_queue(self, workflow_id: str) -> asyncio.Queue:
"""
@ -33,9 +35,22 @@ class EventManager:
Returns:
Async queue for events
"""
if workflow_id in self._cleanup_tasks:
self._cleanup_tasks[workflow_id].cancel()
del self._cleanup_tasks[workflow_id]
logger.debug(f"Cancelled pending cleanup for workflow {workflow_id}")
if workflow_id not in self._queues:
self._queues[workflow_id] = asyncio.Queue()
logger.debug(f"Created event queue for workflow {workflow_id}")
else:
old = self._queues[workflow_id]
while not old.empty():
try:
old.get_nowait()
except asyncio.QueueEmpty:
break
logger.debug(f"Reusing event queue for workflow {workflow_id} (drained stale events)")
return self._queues[workflow_id]
def get_queue(self, workflow_id: str) -> Optional[asyncio.Queue]:
@ -62,6 +77,31 @@ class EventManager:
"""
return workflow_id in self._queues
def register_agent_task(self, workflow_id: str, task: asyncio.Task) -> None:
    """Remember ``task`` as the agent task of ``workflow_id`` and drop any cancel flag stored for that id."""
    if workflow_id in self._cancelled:
        del self._cancelled[workflow_id]
    self._agent_tasks[workflow_id] = task
def is_cancelled(self, workflow_id: str) -> bool:
    """Return the cancel flag stored for ``workflow_id``; ids without a flag count as not cancelled."""
    flags = self._cancelled
    return flags[workflow_id] if workflow_id in flags else False
async def cancel_agent(self, workflow_id: str) -> bool:
    """Flag ``workflow_id`` as cancelled and cancel its registered agent task.

    The cancel flag is set in every case. The task reference is removed from
    the registry whether or not it was still running.

    Returns:
        True if a not-yet-finished task was found and ``cancel()`` was called
        on it, False if no task was registered or it had already finished.
    """
    self._cancelled[workflow_id] = True
    agent_task = self._agent_tasks.pop(workflow_id, None)
    if not agent_task or agent_task.done():
        logger.debug(f"No running agent task found for workflow {workflow_id}")
        return False
    agent_task.cancel()
    logger.info(f"Cancelled agent task for workflow {workflow_id}")
    return True
def _unregister_agent_task(self, workflow_id: str) -> None:
"""Remove the agent task reference after completion."""
self._agent_tasks.pop(workflow_id, None)
self._cancelled.pop(workflow_id, None)
async def emit_event(
self,
context_id: str,
@ -97,7 +137,8 @@ class EventManager:
try:
await queue.put(event)
logger.debug(f"Emitted {event_type} event for workflow {context_id}")
if event_type not in ("chunk",):
logger.debug(f"Emitted {event_type} event for workflow {context_id}")
except Exception as e:
logger.error(f"Error emitting event for workflow {context_id}: {e}", exc_info=True)

View file

@ -98,6 +98,20 @@ IMPORTABLE_SERVICES: Dict[str, Dict[str, Any]] = {
"objectKey": "service.neutralization",
"label": {"en": "Neutralization", "de": "Neutralisierung", "fr": "Neutralisation"},
},
"agent": {
"module": "modules.serviceCenter.services.serviceAgent.mainServiceAgent",
"class": "AgentService",
"dependencies": ["ai", "chat", "utils", "extraction", "billing", "streaming", "knowledge"],
"objectKey": "service.agent",
"label": {"en": "Agent", "de": "Agent", "fr": "Agent"},
},
"knowledge": {
"module": "modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge",
"class": "KnowledgeService",
"dependencies": ["ai"],
"objectKey": "service.knowledge",
"label": {"en": "Knowledge Store", "de": "Wissensspeicher", "fr": "Base de connaissances"},
},
}
# RBAC objects for service-level access control (for catalog registration)

View file

@ -2,7 +2,7 @@
# All rights reserved.
"""
Service Center Resolver.
Resolution logic, dependency injection, and optional legacy fallback.
Resolution logic and dependency injection for service instantiation.
"""
import importlib
@ -14,7 +14,6 @@ from modules.serviceCenter.registry import CORE_SERVICES, IMPORTABLE_SERVICES
logger = logging.getLogger(__name__)
# Type for get_service callable passed to services
GetServiceFunc = Callable[[str], Any]
@ -29,50 +28,15 @@ def _load_service_class(module_path: str, class_name: str):
return getattr(module, class_name)
def _create_legacy_hub(ctx: ServiceCenterContext) -> Any:
"""Create legacy Services instance for fallback when service not yet migrated."""
from modules.services import getInterface
return getInterface(
ctx.user,
workflow=ctx.workflow,
mandateId=ctx.mandate_id,
featureInstanceId=ctx.feature_instance_id,
)
def _get_from_legacy(legacy_hub: Any, key: str) -> Any:
"""Map service key to legacy hub attribute (for fallback when service center module fails)."""
key_to_attr = {
"utils": "utils",
"security": "security",
"streaming": "streaming",
"ticket": "ticket",
"messaging": "messaging",
"billing": "billing",
"sharepoint": "sharepoint",
"chat": "chat",
"extraction": "extraction",
"generation": "generation",
"ai": "ai",
"web": "web",
"neutralization": "neutralization",
}
attr = key_to_attr.get(key)
if attr and hasattr(legacy_hub, attr):
return getattr(legacy_hub, attr)
return None
def resolve(
key: str,
context: ServiceCenterContext,
cache: Dict[str, Any],
resolving: Set[str],
legacy_hub: Optional[Any] = None,
) -> Any:
"""
Resolve a service by key. Uses cache, resolves dependencies recursively.
Falls back to legacy_hub if service module cannot be loaded.
Raises KeyError if the service is not registered.
"""
cache_key = f"{_make_context_id(context)}_{key}"
if cache_key in cache:
@ -82,59 +46,20 @@ def resolve(
raise RuntimeError(f"Circular dependency detected for service: {key}")
def get_service(dep_key: str) -> Any:
return resolve(dep_key, context, cache, resolving, legacy_hub)
return resolve(dep_key, context, cache, resolving)
# Try core first
if key in CORE_SERVICES:
spec = CORE_SERVICES[key]
spec = CORE_SERVICES.get(key) or IMPORTABLE_SERVICES.get(key)
if spec:
cls = _load_service_class(spec["module"], spec["class"])
resolving.add(key)
try:
cls = _load_service_class(spec["module"], spec["class"])
resolving.add(key)
try:
for dep in spec.get("dependencies", []):
get_service(dep)
finally:
resolving.discard(key)
instance = cls(context, get_service)
cache[cache_key] = instance
return instance
except (ImportError, ModuleNotFoundError, AttributeError) as e:
logger.debug(f"Could not load core service '{key}' from service center: {e}")
if legacy_hub:
fallback = _get_from_legacy(legacy_hub, key)
if fallback is not None:
cache[cache_key] = fallback
return fallback
raise
# Try importable
if key in IMPORTABLE_SERVICES:
spec = IMPORTABLE_SERVICES[key]
try:
cls = _load_service_class(spec["module"], spec["class"])
resolving.add(key)
try:
for dep in spec.get("dependencies", []):
get_service(dep)
finally:
resolving.discard(key)
instance = cls(context, get_service)
cache[cache_key] = instance
return instance
except (ImportError, ModuleNotFoundError, AttributeError) as e:
logger.debug(f"Could not load importable service '{key}' from service center: {e}")
if legacy_hub:
fallback = _get_from_legacy(legacy_hub, key)
if fallback is not None:
cache[cache_key] = fallback
return fallback
raise
if legacy_hub:
fallback = _get_from_legacy(legacy_hub, key)
if fallback is not None:
cache[cache_key] = fallback
return fallback
for dep in spec.get("dependencies", []):
get_service(dep)
finally:
resolving.discard(key)
instance = cls(context, get_service)
cache[cache_key] = instance
return instance
raise KeyError(f"Unknown service: {key}")

View file

@ -0,0 +1,3 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""serviceAgent: AI Agent with ReAct loop and native function calling."""

View file

@ -0,0 +1,162 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""ActionToolAdapter: wraps existing workflow actions (dynamicMode=True) as agent tools."""
import logging
from typing import Dict, Any, List, Optional
from modules.serviceCenter.services.serviceAgent.datamodelAgent import (
ToolDefinition, ToolResult
)
from modules.serviceCenter.services.serviceAgent.toolRegistry import ToolRegistry
logger = logging.getLogger(__name__)
class ActionToolAdapter:
    """Wraps existing Workflow-Actions as Agent-Tools.

    Iterates over discovered methods, finds actions with dynamicMode=True,
    and registers them in the ToolRegistry with a compound name (method.action).
    """

    def __init__(self, actionExecutor):
        """Store the executor that registered tool handlers dispatch to."""
        self._actionExecutor = actionExecutor
        self._registeredTools: List[str] = []

    def registerAll(self, toolRegistry: ToolRegistry):
        """Discover and register all dynamicMode actions as agent tools.

        Args:
            toolRegistry: Registry receiving one tool per dynamicMode action via
                ``registerFromDefinition``.

        Calling this more than once registers the tools again in the given
        registry but keeps ``registeredTools`` free of duplicate names.
        """
        from modules.workflows.processing.shared.methodDiscovery import methods

        registered = 0
        for methodName, methodInfo in methods.items():
            # Only keys starting with an upper-case letter are processed
            # (presumably the dict also holds lower-case alias keys - confirm).
            # Empty keys are skipped instead of raising IndexError.
            if not methodName or not methodName[0].isupper():
                continue

            shortName = methodName.replace("Method", "").lower()
            methodInstance = methodInfo["instance"]

            for actionName, actionInfo in methodInfo["actions"].items():
                actionDef = methodInstance._actions.get(actionName)
                if not actionDef or not getattr(actionDef, "dynamicMode", False):
                    continue

                compoundName = f"{shortName}.{actionName}"
                toolDef = _buildToolDefinition(compoundName, actionDef, actionInfo)
                handler = _createDispatchHandler(self._actionExecutor, shortName, actionName)
                toolRegistry.registerFromDefinition(toolDef, handler)
                if compoundName not in self._registeredTools:
                    self._registeredTools.append(compoundName)
                registered += 1

        logger.info(f"ActionToolAdapter: registered {registered} tools from workflow actions")

    @property
    def registeredTools(self) -> List[str]:
        """Names of all tools registered by this adapter (returned as a copy)."""
        return list(self._registeredTools)
def _buildToolDefinition(compoundName: str, actionDef, actionInfo: Dict[str, Any]) -> ToolDefinition:
    """Create the ToolDefinition for one workflow action.

    The description comes from ``actionDef.description`` and falls back to the
    "description" entry of ``actionInfo``. The parameter schema is derived from
    ``actionInfo["parameters"]``. Every tool is declared as not read-only.
    """
    schema = _convertParameterSchema(actionInfo.get("parameters", {}))
    description = actionDef.description
    if not description:
        description = actionInfo.get("description", "")
    return ToolDefinition(
        name=compoundName,
        description=description,
        parameters=schema,
        readOnly=False,
    )
def _convertParameterSchema(actionParams: Dict[str, Any]) -> Dict[str, Any]:
"""Convert workflow action parameter schema to JSON Schema for tool definitions."""
properties = {}
required = []
for paramName, paramInfo in actionParams.items():
paramType = paramInfo.get("type", "str") if isinstance(paramInfo, dict) else "str"
paramDesc = paramInfo.get("description", "") if isinstance(paramInfo, dict) else ""
paramRequired = paramInfo.get("required", False) if isinstance(paramInfo, dict) else False
jsonType = _pythonTypeToJsonType(paramType)
properties[paramName] = {
"type": jsonType,
"description": paramDesc
}
if paramRequired:
required.append(paramName)
return {
"type": "object",
"properties": properties,
"required": required
}
def _pythonTypeToJsonType(pythonType: str) -> str:
"""Map Python type strings to JSON Schema types."""
mapping = {
"str": "string",
"int": "integer",
"float": "number",
"bool": "boolean",
"list": "array",
"dict": "object",
"List[str]": "array",
"List[int]": "array",
"List[dict]": "array",
"Dict[str, Any]": "object",
}
return mapping.get(pythonType, "string")
def _createDispatchHandler(actionExecutor, methodName: str, actionName: str):
    """Build the async tool handler for one workflow action.

    The returned coroutine function forwards the tool arguments to
    ``actionExecutor.executeAction`` and converts the outcome into a
    ToolResult. Exceptions are logged and reported as a failed ToolResult
    instead of being raised. The ``context`` argument is accepted but unused.
    """
    toolName = f"{methodName}.{actionName}"

    async def _handler(args: Dict[str, Any], context: Dict[str, Any]) -> ToolResult:
        try:
            actionResult = await actionExecutor.executeAction(methodName, actionName, args)
            summary = _formatActionResult(actionResult)
        except Exception as e:
            logger.error(f"ActionToolAdapter dispatch failed for {methodName}.{actionName}: {e}")
            return ToolResult(
                toolCallId="",
                toolName=toolName,
                success=False,
                error=str(e)
            )
        try:
            return ToolResult(
                toolCallId="",
                toolName=toolName,
                success=actionResult.success,
                data=summary,
                error=actionResult.error
            )
        except Exception as e:
            logger.error(f"ActionToolAdapter dispatch failed for {methodName}.{actionName}: {e}")
            return ToolResult(
                toolCallId="",
                toolName=toolName,
                success=False,
                error=str(e)
            )

    return _handler
def _formatActionResult(result) -> str:
"""Format an ActionResult into a text representation for the agent."""
parts = []
if result.resultLabel:
parts.append(f"Result: {result.resultLabel}")
if result.error:
parts.append(f"Error: {result.error}")
if result.documents:
parts.append(f"Documents ({len(result.documents)}):")
for doc in result.documents:
docName = getattr(doc, "documentName", "unnamed")
docType = getattr(doc, "mimeType", "unknown")
parts.append(f" - {docName} ({docType})")
docData = getattr(doc, "documentData", None)
if docData and isinstance(docData, str) and len(docData) < 2000:
parts.append(f" Content: {docData[:2000]}")
if not parts:
parts.append("Action completed successfully." if result.success else "Action failed.")
return "\n".join(parts)

View file

@ -0,0 +1,406 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Agent loop: ReAct pattern with native function calling, budget control, and error handling."""
import asyncio
import logging
import time
import json
import re
from typing import List, Dict, Any, Optional, AsyncGenerator, Callable, Awaitable
from modules.datamodels.datamodelAi import (
AiCallRequest, AiCallOptions, AiCallResponse, OperationTypeEnum
)
from modules.serviceCenter.services.serviceAgent.datamodelAgent import (
AgentState, AgentStatusEnum, AgentConfig, AgentEvent, AgentEventTypeEnum,
ToolCallRequest, ToolResult, ToolCallLog, AgentRoundLog, AgentTrace
)
from modules.serviceCenter.services.serviceAgent.toolRegistry import ToolRegistry
from modules.serviceCenter.services.serviceAgent.conversationManager import (
ConversationManager, buildSystemPrompt
)
from modules.shared.timeUtils import getUtcTimestamp
logger = logging.getLogger(__name__)
MAX_RETRIES_PER_TOOL = 3
RETRY_BASE_DELAY_S = 1.0
async def runAgentLoop(
    prompt: str,
    toolRegistry: ToolRegistry,
    config: AgentConfig,
    aiCallFn: Callable[[AiCallRequest], Awaitable[AiCallResponse]],
    getWorkflowCostFn: Callable[[], Awaitable[float]],
    workflowId: str,
    userId: str = "",
    featureInstanceId: str = "",
    buildRagContextFn: Optional[Callable[..., Awaitable[str]]] = None,
    mandateId: str = "",
    aiCallStreamFn: Optional[Callable] = None,
    userLanguage: str = "",
) -> AsyncGenerator[AgentEvent, None]:
    """Run the agent loop. Yields AgentEvent for each step (SSE-ready).

    Each round: optional RAG context injection, budget check, AGENT_PROGRESS
    event, optional conversation summarization, one AI call (streamed when
    aiCallStreamFn is given), then execution of the requested tool calls.
    The loop ends when the AI answers without tool calls (FINAL), on an AI
    error (ERROR), when the budget is exceeded (FINAL with progress summary)
    or when maxRounds is reached (FINAL with progress summary). An
    AGENT_SUMMARY event is always yielded last.

    Args:
        prompt: User prompt
        toolRegistry: Registry with available tools
        config: Agent configuration (maxRounds, maxCostCHF, etc.)
        aiCallFn: Function to call the AI (wraps serviceAi.callAi with billing)
        getWorkflowCostFn: Function to get current workflow cost
        workflowId: Workflow ID for tracking
        userId: User ID for tracing
        featureInstanceId: Feature instance ID for tracing
        buildRagContextFn: Optional async function to build RAG context before each round
        mandateId: Mandate ID for RAG scoping
        aiCallStreamFn: Optional streaming variant of aiCallFn. It is iterated with
            ``async for``; str items are forwarded as CHUNK events and the last
            non-str item is used as the AiCallResponse of the round.
        userLanguage: ISO 639-1 language code for agent responses
    """
    state = AgentState(workflowId=workflowId, maxRounds=config.maxRounds)
    # NOTE(review): trace is filled in below but is not returned, yielded or stored within this function.
    trace = AgentTrace(
        workflowId=workflowId, userId=userId,
        featureInstanceId=featureInstanceId
    )

    tools = toolRegistry.getTools()
    toolDefinitions = toolRegistry.formatToolsForFunctionCalling()
    toolsText = toolRegistry.formatToolsForPrompt()

    systemPrompt = buildSystemPrompt(tools, toolsText, userLanguage=userLanguage)
    conversation = ConversationManager(systemPrompt)
    conversation.addUserMessage(prompt)

    while state.status == AgentStatusEnum.RUNNING and state.currentRound < state.maxRounds:
        # Hand control back to the event loop once per round
        # (presumably so a pending task cancellation can take effect - confirm).
        await asyncio.sleep(0)
        state.currentRound += 1
        roundStartTime = time.time()
        roundLog = AgentRoundLog(roundNumber=state.currentRound)

        # RAG context injection (before each round for fresh relevance)
        if buildRagContextFn:
            try:
                # Use the most recent user message as query; fall back to the initial prompt
                latestUserMsg = ""
                for msg in reversed(conversation.messages):
                    if msg.get("role") == "user":
                        latestUserMsg = msg.get("content", "")
                        break
                ragContext = await buildRagContextFn(
                    currentPrompt=latestUserMsg or prompt,
                    workflowId=workflowId,
                    userId=userId,
                    featureInstanceId=featureInstanceId,
                    mandateId=mandateId,
                )
                if ragContext:
                    conversation.injectRagContext(ragContext)
            except Exception as ragErr:
                logger.warning(f"RAG context injection failed (non-blocking): {ragErr}")

        # Budget check
        budgetExceeded = await _checkBudget(config, getWorkflowCostFn)
        if budgetExceeded:
            state.status = AgentStatusEnum.BUDGET_EXCEEDED
            state.abortReason = "Workflow cost budget exceeded"
            yield AgentEvent(
                type=AgentEventTypeEnum.FINAL,
                content=_buildProgressSummary(state, "Budget exceeded. Here is the progress so far.")
            )
            break

        logger.info(f"Agent round {state.currentRound}/{state.maxRounds} for workflow {workflowId} (tools={state.totalToolCalls}, cost={state.totalCostCHF:.4f})")
        yield AgentEvent(
            type=AgentEventTypeEnum.AGENT_PROGRESS,
            data={
                "round": state.currentRound,
                "maxRounds": state.maxRounds,
                "totalAiCalls": state.totalAiCalls,
                "totalToolCalls": state.totalToolCalls,
                "costCHF": state.totalCostCHF
            }
        )

        # Progressive summarization
        if conversation.needsSummarization(state.currentRound):
            async def _summarizeCall(summaryPrompt: str) -> str:
                # Summarization goes through the same AI function, so its cost and call count are tracked too
                req = AiCallRequest(
                    prompt=summaryPrompt,
                    options=AiCallOptions(operationType=OperationTypeEnum.DATA_ANALYSE)
                )
                resp = await aiCallFn(req)
                state.totalCostCHF += resp.priceCHF
                state.totalAiCalls += 1
                return resp.content

            await conversation.summarize(state.currentRound, _summarizeCall)

        # AI call
        aiRequest = AiCallRequest(
            prompt="",
            options=AiCallOptions(
                operationType=OperationTypeEnum.AGENT,
                temperature=config.temperature
            ),
            messages=conversation.messages,
            tools=toolDefinitions
        )

        try:
            aiResponse = None
            streamedText = ""
            isFirstChunkOfRound = True

            if aiCallStreamFn:
                async for chunk in aiCallStreamFn(aiRequest):
                    if isinstance(chunk, str):
                        # From round 2 on, the first chunk is prefixed with a blank line
                        # to separate it from the text streamed in earlier rounds
                        if isFirstChunkOfRound and state.currentRound > 1:
                            chunk = "\n\n" + chunk
                            isFirstChunkOfRound = False
                        elif isFirstChunkOfRound:
                            isFirstChunkOfRound = False
                        streamedText += chunk
                        yield AgentEvent(type=AgentEventTypeEnum.CHUNK, content=chunk)
                    else:
                        # Any non-str item is taken as the final response object
                        aiResponse = chunk

                if aiResponse is None:
                    raise RuntimeError("Stream ended without final AiCallResponse")
            else:
                aiResponse = await aiCallFn(aiRequest)
        except Exception as e:
            logger.error(f"AI call failed in round {state.currentRound}: {e}", exc_info=True)
            state.status = AgentStatusEnum.ERROR
            state.abortReason = f"AI call error: {e}"
            yield AgentEvent(type=AgentEventTypeEnum.ERROR, content=str(e))
            break

        state.totalAiCalls += 1
        state.totalCostCHF += aiResponse.priceCHF
        state.totalProcessingTime += aiResponse.processingTime
        roundLog.aiModel = aiResponse.modelName
        roundLog.costCHF = aiResponse.priceCHF

        if aiResponse.errorCount > 0:
            state.status = AgentStatusEnum.ERROR
            state.abortReason = f"AI returned error: {aiResponse.content}"
            yield AgentEvent(type=AgentEventTypeEnum.ERROR, content=aiResponse.content)
            break

        # Parse response for tool calls
        toolCalls = _parseToolCalls(aiResponse)

        textContent = _extractTextContent(aiResponse)
        # Text already delivered as CHUNK events is not repeated as a MESSAGE
        if textContent and not streamedText:
            yield AgentEvent(type=AgentEventTypeEnum.MESSAGE, content=textContent)

        if not toolCalls:
            # No tool requested: the answer is final
            state.status = AgentStatusEnum.COMPLETED
            conversation.addAssistantMessage(aiResponse.content)
            roundLog.durationMs = int((time.time() - roundStartTime) * 1000)
            trace.rounds.append(roundLog)
            yield AgentEvent(type=AgentEventTypeEnum.FINAL, content=textContent or aiResponse.content)
            break

        # Add assistant message with tool calls to conversation
        assistantToolCalls = _formatAssistantToolCalls(toolCalls)
        conversation.addAssistantMessage(textContent or "", assistantToolCalls)

        # Execute tool calls
        for tc in toolCalls:
            yield AgentEvent(
                type=AgentEventTypeEnum.TOOL_CALL,
                data={"toolName": tc.name, "args": tc.args}
            )

        results = await _executeToolCalls(toolCalls, toolRegistry, {
            "workflowId": workflowId,
            "userId": userId,
            "featureInstanceId": featureInstanceId,
            "mandateId": mandateId,
        })

        state.totalToolCalls += len(results)

        for result in results:
            roundLog.toolCalls.append(ToolCallLog(
                toolName=result.toolName,
                # Arguments are looked up via the tool call id; {} when no call with that id exists
                args=next((tc.args for tc in toolCalls if tc.id == result.toolCallId), {}),
                success=result.success,
                durationMs=result.durationMs,
                error=result.error
            ))
            if not result.success:
                logger.warning(f"Tool '{result.toolName}' failed: {result.error}")
            yield AgentEvent(
                type=AgentEventTypeEnum.TOOL_RESULT,
                data={
                    "toolName": result.toolName,
                    "success": result.success,
                    # Only the first 500 characters of the tool output go into the event
                    "data": result.data[:500] if result.data else "",
                    "error": result.error
                }
            )
            if result.sideEvents:
                for sideEvt in result.sideEvents:
                    evtType = sideEvt.get("type", "")
                    try:
                        evtEnum = AgentEventTypeEnum(evtType)
                    except (ValueError, KeyError):
                        # Side events with an unknown type are dropped
                        continue
                    yield AgentEvent(
                        type=evtEnum,
                        data=sideEvt.get("data"),
                        content=sideEvt.get("content"),
                    )

        # Add tool results to conversation
        toolResultMessages = [
            {"toolCallId": r.toolCallId, "toolName": r.toolName,
             "content": r.data if r.success else f"Error: {r.error}"}
            for r in results
        ]
        conversation.addToolResults(toolResultMessages)

        roundLog.durationMs = int((time.time() - roundStartTime) * 1000)
        trace.rounds.append(roundLog)

    # maxRounds reached
    if state.currentRound >= state.maxRounds and state.status == AgentStatusEnum.RUNNING:
        state.status = AgentStatusEnum.MAX_ROUNDS_REACHED
        state.abortReason = f"Maximum rounds ({state.maxRounds}) reached"
        yield AgentEvent(
            type=AgentEventTypeEnum.FINAL,
            content=_buildProgressSummary(state, "Maximum rounds reached.")
        )

    # Agent summary
    trace.completedAt = getUtcTimestamp()
    trace.status = state.status
    trace.totalRounds = state.currentRound
    trace.totalToolCalls = state.totalToolCalls
    trace.totalCostCHF = state.totalCostCHF
    trace.abortReason = state.abortReason

    yield AgentEvent(
        type=AgentEventTypeEnum.AGENT_SUMMARY,
        data={
            "rounds": state.currentRound,
            "totalAiCalls": state.totalAiCalls,
            "totalToolCalls": state.totalToolCalls,
            "costCHF": round(state.totalCostCHF, 4),
            "processingTime": round(state.totalProcessingTime, 2),
            "status": state.status.value,
            "abortReason": state.abortReason
        }
    )
async def _checkBudget(config: AgentConfig,
getWorkflowCostFn: Callable[[], Awaitable[float]]) -> bool:
"""Check if workflow budget is exceeded. Returns True if exceeded."""
if config.maxCostCHF is None:
return False
try:
currentCost = await getWorkflowCostFn()
return currentCost > config.maxCostCHF
except Exception as e:
logger.warning(f"Could not check workflow cost: {e}")
return False
async def _executeToolCalls(toolCalls: List[ToolCallRequest],
toolRegistry: ToolRegistry,
context: Dict[str, Any]) -> List[ToolResult]:
"""Execute tool calls: readOnly tools in parallel, others sequentially."""
readOnlyCalls = [tc for tc in toolCalls if toolRegistry.isReadOnly(tc.name)]
writeCalls = [tc for tc in toolCalls if not toolRegistry.isReadOnly(tc.name)]
results: Dict[str, ToolResult] = {}
if readOnlyCalls:
readResults = await asyncio.gather(*[
toolRegistry.dispatch(tc, context) for tc in readOnlyCalls
])
for tc, result in zip(readOnlyCalls, readResults):
results[tc.id] = result
for tc in writeCalls:
results[tc.id] = await toolRegistry.dispatch(tc, context)
return [results[tc.id] for tc in toolCalls]
def _parseToolCalls(aiResponse: AiCallResponse) -> List[ToolCallRequest]:
"""Parse tool calls from AI response. Supports native function calling and text-based fallback."""
toolCalls = []
# Native function calling: check response metadata
if hasattr(aiResponse, 'toolCalls') and aiResponse.toolCalls:
for tc in aiResponse.toolCalls:
rawArgs = tc["function"]["arguments"]
if isinstance(rawArgs, str):
rawArgs = rawArgs.strip()
try:
parsedArgs = json.loads(rawArgs) if rawArgs else {}
except json.JSONDecodeError:
logger.warning(f"Failed to parse tool args for '{tc['function']['name']}': {rawArgs[:200]}")
parsedArgs = {}
else:
parsedArgs = rawArgs if rawArgs else {}
toolCalls.append(ToolCallRequest(
id=tc.get("id", str(len(toolCalls))),
name=tc["function"]["name"],
args=parsedArgs,
))
return toolCalls
# Text-based fallback: parse ```tool_call blocks
content = aiResponse.content or ""
pattern = r"```tool_call\s*\n\s*tool:\s*(\S+)\s*\n\s*args:\s*(\{.*?\})\s*\n\s*```"
matches = re.finditer(pattern, content, re.DOTALL)
for match in matches:
toolName = match.group(1).strip()
argsStr = match.group(2).strip()
try:
args = json.loads(argsStr)
except json.JSONDecodeError:
logger.warning(f"Failed to parse tool args for '{toolName}': {argsStr}")
args = {}
toolCalls.append(ToolCallRequest(name=toolName, args=args))
return toolCalls
def _extractTextContent(aiResponse: AiCallResponse) -> str:
"""Extract text content from AI response, removing tool_call blocks."""
content = aiResponse.content or ""
cleaned = re.sub(r"```tool_call\s*\n.*?\n\s*```", "", content, flags=re.DOTALL)
return cleaned.strip()
def _formatAssistantToolCalls(toolCalls: List[ToolCallRequest]) -> List[Dict[str, Any]]:
"""Format tool calls for the conversation history (OpenAI tool_calls format)."""
return [
{
"id": tc.id,
"type": "function",
"function": {
"name": tc.name,
"arguments": json.dumps(tc.args)
}
}
for tc in toolCalls
]
def _buildProgressSummary(state: AgentState, reason: str) -> str:
"""Build a human-readable summary of agent progress for graceful termination."""
return (
f"{reason}\n\n"
f"Progress after {state.currentRound} rounds:\n"
f"- AI calls: {state.totalAiCalls}\n"
f"- Tool calls: {state.totalToolCalls}\n"
f"- Cost: {state.totalCostCHF:.4f} CHF\n"
f"- Processing time: {state.totalProcessingTime:.1f}s"
)

View file

@ -0,0 +1,280 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Conversation manager for the Agent service.
Handles message history, context window management, and progressive summarization."""
import logging
from typing import List, Dict, Any, Optional
from modules.serviceCenter.services.serviceAgent.datamodelAgent import ToolDefinition
logger = logging.getLogger(__name__)
# First round at which ConversationManager produces a summary; earlier rounds keep
# the full history unless the token estimate is exceeded.
FIRST_SUMMARY_ROUND = 4
# From this round on, a summary is built as a "meta-summary" once at least two
# summaries already exist.
META_SUMMARY_ROUND = 7
# Number of trailing non-system messages kept verbatim when older ones are summarized.
KEEP_RECENT_MESSAGES = 4
# Threshold for ConversationManager.estimateTokenCount() (chars // 4) above which
# summarization is requested.
MAX_ESTIMATED_TOKENS = 60000
class ConversationManager:
    """Manages the conversation history and context window for agent runs.

    Progressive summarization strategy:
    - Rounds 1-3: full conversation retained
    - Round 4+: older messages compressed into a running summary
    - Round 7+: meta-summary replaces prior summaries

    Supports RAG context injection before each round via injectRagContext.

    Message dicts may carry internal bookkeeping keys that start with "_"
    (e.g. "_isRagContext"); the ``messages`` property strips them.
    """

    def __init__(self, systemPrompt: str):
        """Start a conversation whose first message is the given system prompt."""
        self._messages: List[Dict[str, Any]] = [
            {"role": "system", "content": systemPrompt}
        ]
        self._summaries: List[Dict[str, Any]] = []
        self._lastSummarizedRound: int = 0
        self._ragContextInjected: bool = False

    @property
    def messages(self) -> List[Dict[str, Any]]:
        """Current messages for the next AI call (internal markers stripped)."""
        return [
            {k: v for k, v in msg.items() if not k.startswith("_")}
            for msg in self._messages
        ]

    def addUserMessage(self, content: str):
        """Add a user message."""
        self._messages.append({"role": "user", "content": content})

    def addAssistantMessage(self, content: str, toolCalls: Optional[List[Dict[str, Any]]] = None):
        """Add an assistant message, optionally with tool calls."""
        msg: Dict[str, Any] = {"role": "assistant", "content": content}
        if toolCalls:
            msg["tool_calls"] = toolCalls
        self._messages.append(msg)

    def addToolResults(self, results: List[Dict[str, Any]]):
        """Add tool results to the conversation.

        Each result: {toolCallId, toolName, content}. Only ``toolCallId`` and
        ``content`` are stored on the resulting "tool" message.
        """
        for result in results:
            self._messages.append({
                "role": "tool",
                "tool_call_id": result["toolCallId"],
                "content": result["content"]
            })

    def addToolResultsAsText(self, resultText: str):
        """Add combined tool results as a user message (text-based fallback)."""
        self._messages.append({
            "role": "user",
            "content": f"Tool Results:\n{resultText}"
        })

    def injectRagContext(self, ragContext: str):
        """Inject RAG context as a system message right after the main system prompt.

        Called before each agent round by the agent loop if KnowledgeService is available.
        Replaces any previously injected RAG context to keep the context fresh.
        An empty ``ragContext`` leaves the conversation untouched.
        """
        if not ragContext:
            return

        ragMessage = {
            "role": "system",
            "content": f"Relevant Knowledge (from indexed documents and workflow context):\n{ragContext}",
            "_isRagContext": True,
        }

        # Replace existing RAG message if present, otherwise insert after system prompt
        for i, msg in enumerate(self._messages):
            if msg.get("_isRagContext"):
                self._messages[i] = ragMessage
                self._ragContextInjected = True
                return

        # Insert after the first system prompt
        self._messages.insert(1, ragMessage)
        self._ragContextInjected = True

    def getMessageCount(self) -> int:
        """Get the number of messages minus one (the initial system prompt).

        Injected RAG and summary system messages are included in the count.
        """
        return len(self._messages) - 1

    def estimateTokenCount(self) -> int:
        """Rough estimate of total tokens in the conversation (4 chars ≈ 1 token).

        Only each message's ``content`` is measured.
        """
        totalChars = sum(len(str(m.get("content", ""))) for m in self._messages)
        return totalChars // 4

    def needsSummarization(self, currentRound: int) -> bool:
        """Check if progressive summarization should be triggered.

        Triggers:
        - At round FIRST_SUMMARY_ROUND (4) if not yet summarized
        - At round META_SUMMARY_ROUND (7) for meta-summary
        - Every 5 rounds after that
        - When estimated token count exceeds MAX_ESTIMATED_TOKENS
        """
        if currentRound >= FIRST_SUMMARY_ROUND and self._lastSummarizedRound < currentRound:
            if currentRound == FIRST_SUMMARY_ROUND or currentRound == META_SUMMARY_ROUND:
                return True
            if (currentRound - META_SUMMARY_ROUND) % 5 == 0 and currentRound > META_SUMMARY_ROUND:
                return True

        if self.estimateTokenCount() > MAX_ESTIMATED_TOKENS:
            return True

        return False

    async def summarize(self, currentRound: int, aiCallFn) -> Optional[str]:
        """Perform progressive summarization of older messages.

        Rounds 1-3: full history retained, no summarization (unless the token
        estimate is exceeded).
        Round 4+: compress older messages into a running summary.
        Round 7+: meta-summary that consolidates prior summaries.

        Args:
            currentRound: Round number the agent loop is currently in.
            aiCallFn: Awaitable callable taking the summary prompt and returning
                the summary text.

        Returns:
            The summary text, or None when nothing was summarized (too few
            messages, the AI call failed, or it returned no text). In the None
            case the conversation is left unchanged.
        """
        if currentRound < FIRST_SUMMARY_ROUND and self.estimateTokenCount() <= MAX_ESTIMATED_TOKENS:
            return None

        systemMsgs = [m for m in self._messages if m.get("role") == "system"]
        nonSystemMessages = [m for m in self._messages if m.get("role") != "system"]

        keepRecent = min(KEEP_RECENT_MESSAGES, len(nonSystemMessages))
        if len(nonSystemMessages) <= keepRecent + 1:
            return None

        splitIdx = len(nonSystemMessages) - keepRecent

        # Ensure the split doesn't orphan tool messages from their assistant:
        # if the split lands inside a tool-result run, walk back so the run and
        # the message preceding it (the assistant that issued the calls) stay
        # in recentMessages.
        while splitIdx > 0 and nonSystemMessages[splitIdx].get("role") == "tool":
            splitIdx -= 1
        if splitIdx == 0:
            return None  # nothing to summarize

        messagesToSummarize = nonSystemMessages[:splitIdx]
        recentMessages = nonSystemMessages[splitIdx:]

        summaryInput = _formatMessagesForSummary(messagesToSummarize)
        previousSummary = self._summaries[-1]["content"] if self._summaries else ""
        isMetaSummary = currentRound >= META_SUMMARY_ROUND and len(self._summaries) >= 2

        summaryPrompt = _buildSummaryPrompt(summaryInput, previousSummary, isMetaSummary)

        try:
            summaryText = await aiCallFn(summaryPrompt)
        except Exception as e:
            logger.error(f"Progressive summarization failed: {e}")
            return None

        # Never trade real history for an empty summary.
        if not summaryText or (isinstance(summaryText, str) and not summaryText.strip()):
            logger.warning(
                f"Progressive summarization at round {currentRound} returned no text; "
                f"keeping full history"
            )
            return None

        self._summaries.append({
            "round": currentRound,
            "content": summaryText,
            "isMeta": isMetaSummary,
        })
        self._lastSummarizedRound = currentRound

        mainSystem = systemMsgs[0] if systemMsgs else {"role": "system", "content": ""}
        ragMessages = [m for m in systemMsgs if m.get("_isRagContext")]
        # Earlier summary system messages are dropped here; their content was
        # passed to the model as "Previous Summary" above.
        # The header states the round only: keepRecent counts messages, so it
        # cannot be subtracted from a round number to describe a round range.
        self._messages = [
            mainSystem,
            *ragMessages,
            {"role": "system", "content": f"Conversation Summary (earlier messages, as of round {currentRound}):\n{summaryText}"},
            *recentMessages,
        ]

        logger.info(
            f"Progressive summarization at round {currentRound}: "
            f"compressed {len(messagesToSummarize)} messages into "
            f"{'meta-' if isMetaSummary else ''}summary"
        )
        return summaryText
def _formatMessagesForSummary(messages: List[Dict[str, Any]]) -> str:
"""Format messages into a text block for summarization."""
parts = []
for msg in messages:
role = msg.get("role", "unknown")
content = msg.get("content", "")
if role == "tool":
toolName = msg.get("tool_call_id", "tool")
parts.append(f"[Tool Result ({toolName})]:\n{content}")
elif role == "assistant" and msg.get("tool_calls"):
calls = msg["tool_calls"]
callNames = [c.get("function", {}).get("name", "?") for c in calls]
parts.append(f"[Assistant → Tool Calls: {', '.join(callNames)}]")
if content:
parts.append(f"[Assistant]: {content}")
else:
parts.append(f"[{role.capitalize()}]: {content}")
return "\n\n".join(parts)
def _buildSummaryPrompt(messagesText: str, previousSummary: str, isMetaSummary: bool = False) -> str:
"""Build the prompt for progressive summarization."""
if isMetaSummary:
prompt = (
"Create a comprehensive meta-summary consolidating the previous summary "
"and the new messages. Preserve all key facts, decisions, entities (names, "
"numbers, dates), tool results, and action outcomes. Be concise but complete.\n\n"
)
else:
prompt = (
"Summarize the following conversation concisely. Preserve all key facts, "
"decisions, entities (names, numbers, dates), and tool results. "
"Do not lose any important information.\n\n"
)
if previousSummary:
prompt += f"Previous Summary:\n{previousSummary}\n\n"
prompt += f"New Messages to Summarize:\n{messagesText}\n\nProvide a concise, factual summary:"
return prompt
# Maps two-letter language codes to the English language name that
# buildSystemPrompt inserts into its "respond in <language>" instruction.
# Codes missing from this table produce no language instruction.
_LANGUAGE_NAMES = {
"de": "German", "en": "English", "fr": "French", "it": "Italian",
"es": "Spanish", "pt": "Portuguese", "nl": "Dutch", "ja": "Japanese",
"zh": "Chinese", "ko": "Korean", "ar": "Arabic", "ru": "Russian",
}
def buildSystemPrompt(
    tools: List[ToolDefinition],
    toolsFormatted: Optional[str] = None,
    userLanguage: str = "",
) -> str:
    """Build the system prompt for the agent.

    Args:
        tools: Available tool definitions. Not read by this function; tool text
            comes from ``toolsFormatted``.
        toolsFormatted: Pre-formatted tool descriptions for text-based fallback.
            When given, the tool list and the ```tool_call usage format are
            appended to the prompt.
        userLanguage: ISO 639-1 language code (e.g. "de", "en"). The agent will
            respond in this language. Case, surrounding whitespace and a region
            suffix are ignored ("DE", "de-CH" and "de_CH" all mean "de").
            Unknown or empty codes add no language instruction.

    Returns:
        The complete system prompt text.
    """
    # Reduce locale-style inputs to the bare lowercase language subtag so that
    # they still hit the lookup table instead of silently dropping the instruction.
    langCode = (userLanguage or "").strip().lower().replace("_", "-").split("-", 1)[0]
    langName = _LANGUAGE_NAMES.get(langCode, "")
    langInstruction = (
        f"IMPORTANT: Always respond in {langName} ({langCode}). "
        f"The user's language is {langName}. All your messages, explanations, "
        f"and summaries MUST be in {langName}. "
        f"Only use English for tool call arguments and technical identifiers.\n\n"
    ) if langName else ""

    prompt = (
        f"{langInstruction}"
        "You are an AI agent with access to tools. "
        "Use the provided tools to accomplish the user's task. "
        "Think step by step. Call tools when you need information or need to perform actions. "
        "When you have enough information to answer, respond directly without calling tools.\n\n"
    )

    if toolsFormatted:
        prompt += f"Available Tools:\n{toolsFormatted}\n\n"
        prompt += (
            "To call a tool, use this format:\n"
            "```tool_call\n"
            "tool: <tool_name>\n"
            'args: {"param": "value"}\n'
            "```\n\n"
        )

    return prompt

View file

@ -0,0 +1,132 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Data models for the Agent service."""
from typing import List, Dict, Any, Optional
from enum import Enum
from pydantic import BaseModel, Field
from modules.shared.timeUtils import getUtcTimestamp
import uuid
class AgentStatusEnum(str, Enum):
    """Lifecycle status of an agent run, as stored on AgentState and AgentTrace."""
    RUNNING = "running"  # default status of AgentState and AgentTrace
    COMPLETED = "completed"
    MAX_ROUNDS_REACHED = "maxRoundsReached"  # set when the round limit is hit while still RUNNING
    BUDGET_EXCEEDED = "budgetExceeded"  # presumably set when the cost budget check trips (TODO confirm in agent loop)
    ERROR = "error"
    STOPPED = "stopped"
class AgentEventTypeEnum(str, Enum):
    """Type tag carried by every AgentEvent.

    Tool side events are mapped onto this enum by their string value, so the
    values below are part of the wire format.
    """
    MESSAGE = "message"
    CHUNK = "chunk"
    TOOL_CALL = "toolCall"
    TOOL_RESULT = "toolResult"  # one per executed tool call (name, success, truncated data, error)
    AGENT_PROGRESS = "agentProgress"
    AGENT_SUMMARY = "agentSummary"  # emitted at the end of a run with the run totals
    FILE_CREATED = "fileCreated"
    DATA_SOURCE_ACCESS = "dataSourceAccess"
    VOICE_RESPONSE = "voiceResponse"
    FINAL = "final"
    ERROR = "error"
class ToolDefinition(BaseModel):
    """Schema for a tool available to the agent.

    Instances are created and stored by the tool registry, one per tool name.
    """
    name: str = Field(description="Unique tool name")
    description: str = Field(description="What this tool does")
    # An empty dict means "no declared parameters".
    parameters: Dict[str, Any] = Field(
        default_factory=dict,
        description="JSON Schema for tool parameters"
    )
    readOnly: bool = Field(
        default=False,
        description="If True, tool can run in parallel with other readOnly tools"
    )
    # None makes the tool visible regardless of the requested feature type.
    featureType: Optional[str] = Field(
        default=None,
        description="Feature scope for this tool (None = available to all)"
    )
class ToolCallRequest(BaseModel):
    """A tool call requested by the AI model."""
    # Defaults to a fresh UUID4 string when the caller supplies no id.
    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    name: str  # name of the tool to invoke
    args: Dict[str, Any] = Field(default_factory=dict)  # arguments passed to the tool handler
class ToolResult(BaseModel):
    """Result from executing a tool."""
    toolCallId: str  # id of the ToolCallRequest this result answers
    toolName: str
    success: bool = True
    data: str = ""  # textual tool output
    error: Optional[str] = None  # error text when success is False
    durationMs: int = 0  # execution time in milliseconds
    # Optional extra events as dicts with "type", "data" and "content" keys.
    sideEvents: Optional[List[Dict[str, Any]]] = None
class AgentEvent(BaseModel):
    """Event emitted during agent execution for SSE streaming."""
    type: AgentEventTypeEnum
    content: Optional[str] = None  # free-text payload, if any
    data: Optional[Dict[str, Any]] = None  # structured payload, if any
class AgentConfig(BaseModel):
    """Configuration for an agent run."""
    maxRounds: int = Field(default=25, ge=1, le=100)  # validated to the range 1..100
    # Cost budget in CHF (must be >= 0); None means no budget is configured.
    maxCostCHF: Optional[float] = Field(default=None, ge=0.0)
    entityCacheEnabled: bool = Field(default=False)
    toolSet: str = Field(default="core")
    temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0)  # validated to 0.0..2.0 when set
class AgentState(BaseModel):
    """Tracks state across an agent loop execution.

    Holds the running counters (rounds, AI calls, tool calls, cost, processing
    time) together with the current status and, if set, the abort reason.
    """
    workflowId: str
    currentRound: int = 0
    maxRounds: int = 25
    totalAiCalls: int = 0
    totalToolCalls: int = 0
    totalCostCHF: float = 0.0
    totalProcessingTime: float = 0.0  # NOTE(review): unit looks like seconds - confirm at the call site
    status: AgentStatusEnum = AgentStatusEnum.RUNNING
    abortReason: Optional[str] = None  # human-readable reason when the run did not complete normally
class ToolCallLog(BaseModel):
    """Log of a single tool call for observability.

    Entries are collected in AgentRoundLog.toolCalls.
    """
    toolName: str
    args: Dict[str, Any] = Field(default_factory=dict)  # arguments the tool was called with
    success: bool = True
    durationMs: int = 0
    error: Optional[str] = None
class AgentRoundLog(BaseModel):
    """Log of a single agent round for observability.

    Entries are collected in AgentTrace.rounds.
    """
    roundNumber: int
    aiModel: str = ""
    inputTokens: int = 0
    outputTokens: int = 0
    costCHF: float = 0.0
    toolCalls: List[ToolCallLog] = Field(default_factory=list)  # one entry per tool call in this round
    durationMs: int = 0  # duration of the round in milliseconds
class AgentTrace(BaseModel):
    """Full trace of an agent workflow for observability.

    Combines run-level totals and final status with the per-round logs.
    """
    workflowId: str
    userId: str = ""
    featureInstanceId: str = ""
    startedAt: float = Field(default_factory=getUtcTimestamp)  # taken from getUtcTimestamp() at creation
    completedAt: Optional[float] = None  # None until a completion timestamp is recorded
    status: AgentStatusEnum = AgentStatusEnum.RUNNING
    totalRounds: int = 0
    totalToolCalls: int = 0
    totalCostCHF: float = 0.0
    abortReason: Optional[str] = None
    rounds: List[AgentRoundLog] = Field(default_factory=list)

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,150 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Tool registry for the Agent service. Manages tool definitions and dispatch."""
import logging
import time
from typing import Dict, List, Any, Optional, Callable, Awaitable
from modules.serviceCenter.services.serviceAgent.datamodelAgent import (
ToolDefinition, ToolCallRequest, ToolResult
)
logger = logging.getLogger(__name__)
class ToolRegistry:
    """Registry for agent tools. Handles registration, lookup, and dispatch.

    Each tool name maps to one ToolDefinition and one async handler. Handlers
    are awaited as ``handler(args, context)``.
    """

    def __init__(self):
        """Create an empty registry."""
        self._tools: Dict[str, ToolDefinition] = {}
        self._handlers: Dict[str, Callable[..., Awaitable[ToolResult]]] = {}

    def register(self, name: str, handler: Callable[..., Awaitable[ToolResult]],
                 description: str = "", parameters: Optional[Dict[str, Any]] = None,
                 readOnly: bool = False, featureType: Optional[str] = None):
        """Register a tool with its handler function.

        Builds the ToolDefinition from the given fields. Re-registering an
        existing name overwrites it and logs a warning.
        """
        self.registerFromDefinition(
            ToolDefinition(
                name=name,
                description=description,
                parameters=parameters or {},
                readOnly=readOnly,
                featureType=featureType
            ),
            handler
        )

    def registerFromDefinition(self, definition: ToolDefinition,
                               handler: Callable[..., Awaitable[ToolResult]]):
        """Register a tool from a pre-built ToolDefinition.

        Re-registering an existing name overwrites it and logs a warning.
        """
        if definition.name in self._tools:
            logger.warning(f"Tool '{definition.name}' already registered, overwriting")
        self._tools[definition.name] = definition
        self._handlers[definition.name] = handler
        logger.debug(f"Registered tool: {definition.name} (readOnly={definition.readOnly})")

    def unregister(self, name: str):
        """Remove a tool from the registry. Unknown names are ignored."""
        self._tools.pop(name, None)
        self._handlers.pop(name, None)

    def getTools(self, toolSet: str = None, featureType: str = None) -> List[ToolDefinition]:
        """Get available tools, optionally filtered by featureType.

        ``toolSet`` is accepted for interface compatibility but is not applied
        as a filter here. With ``featureType`` set, tools without a feature
        type and tools with a matching one are returned.
        """
        tools = list(self._tools.values())
        if featureType:
            tools = [t for t in tools if t.featureType is None or t.featureType == featureType]
        return tools

    def getToolNames(self) -> List[str]:
        """Get names of all registered tools."""
        return list(self._tools.keys())

    def getTool(self, name: str) -> Optional[ToolDefinition]:
        """Get a single tool definition by name, or None if it is not registered."""
        return self._tools.get(name)

    def isReadOnly(self, name: str) -> bool:
        """Check if a tool is marked as readOnly. Unknown tools count as not readOnly."""
        tool = self._tools.get(name)
        return tool.readOnly if tool else False

    def isValidTool(self, name: str) -> bool:
        """Check if a tool name is valid (registered)."""
        return name in self._tools

    async def dispatch(self, toolCall: ToolCallRequest, context: Dict[str, Any] = None) -> ToolResult:
        """Execute a tool call and return the result.

        Never raises for handler errors: unknown tools and handler exceptions
        are reported as a ToolResult with ``success=False``. A ToolResult
        returned by the handler gets its ``toolCallId`` and ``durationMs``
        overwritten; any other return value is wrapped via ``str()``.

        Args:
            toolCall: The requested call (id, tool name, args).
            context: Passed to the handler; ``None`` is replaced by ``{}``.
        """
        # perf_counter is monotonic, so durations cannot go negative or jump
        # when the wall clock is adjusted.
        startTime = time.perf_counter()

        if not self.isValidTool(toolCall.name):
            return ToolResult(
                toolCallId=toolCall.id,
                toolName=toolCall.name,
                success=False,
                error=f"Unknown tool: '{toolCall.name}'. Available: {', '.join(self.getToolNames())}"
            )

        handler = self._handlers[toolCall.name]
        # Each argument value is cut to 80 chars to keep the log line bounded.
        argsSummary = ", ".join(f"{k}={str(v)[:80]}" for k, v in (toolCall.args or {}).items())
        logger.info(f"Tool dispatch: {toolCall.name}({argsSummary})")

        try:
            result = await handler(toolCall.args, context or {})
            durationMs = int((time.perf_counter() - startTime) * 1000)

            if isinstance(result, ToolResult):
                result.toolCallId = toolCall.id
                result.durationMs = durationMs
                dataSummary = (result.data[:200] + "...") if result.data and len(result.data) > 200 else (result.data or "")
                if result.success:
                    logger.info(f"Tool result: {toolCall.name} OK ({durationMs}ms) → {dataSummary}")
                else:
                    logger.warning(f"Tool result: {toolCall.name} FAILED ({durationMs}ms) → {result.error}")
                return result

            return ToolResult(
                toolCallId=toolCall.id,
                toolName=toolCall.name,
                success=True,
                data=str(result),
                durationMs=durationMs
            )

        except Exception as e:
            durationMs = int((time.perf_counter() - startTime) * 1000)
            logger.error(f"Tool '{toolCall.name}' failed: {e}", exc_info=True)
            return ToolResult(
                toolCallId=toolCall.id,
                toolName=toolCall.name,
                success=False,
                error=str(e),
                durationMs=durationMs
            )

    def formatToolsForPrompt(self) -> str:
        """Format all tools as text for system prompt (text-based fallback).

        One bullet per tool with its description and the top-level entries of
        its ``parameters`` dict ("none" when empty).
        """
        parts = []
        for tool in self._tools.values():
            paramStr = ", ".join(
                f"{k}: {v}" for k, v in tool.parameters.items()
            ) if tool.parameters else "none"
            parts.append(f"- **{tool.name}**: {tool.description}\n Parameters: {{{paramStr}}}")
        return "\n".join(parts)

    def formatToolsForFunctionCalling(self) -> List[Dict[str, Any]]:
        """Format all tools as OpenAI-compatible function definitions for native function calling.

        Tools without declared parameters get an empty object schema.
        """
        functions = []
        for tool in self._tools.values():
            functions.append({
                "type": "function",
                "function": {
                    "name": tool.name,
                    "description": tool.description,
                    "parameters": tool.parameters if tool.parameters else {
                        "type": "object",
                        "properties": {},
                        "required": []
                    }
                }
            })
        return functions

View file

@ -64,6 +64,10 @@ class _ServicesAdapter:
def interfaceDbChat(self):
return self._get_service("chat").interfaceDbChat
@property
def interfaceDbComponent(self):
    """Return the ``interfaceDbComponent`` attribute of the "chat" service (looked up via ``_get_service``)."""
    return self._get_service("chat").interfaceDbComponent
@property
def featureCode(self) -> Optional[str]:
w = self.workflow
@ -142,6 +146,8 @@ class AiService:
3. billingCallback on aiObjects: records one billing transaction per model call
with exact provider + model name (set before AI call, invoked by _callWithModel)
"""
await self.ensureAiObjectsInitialized()
# SPEECH_TEAMS: Dedicated pipeline, bypasses standard model selection
if request.options and request.options.operationType == OperationTypeEnum.SPEECH_TEAMS:
return await self._handleSpeechTeams(request)
@ -171,14 +177,27 @@ class AiService:
else:
response = await self.aiObjects.callWithTextContext(request)
finally:
# Clear callback after call completes
self.aiObjects.billingCallback = None
# Store workflow stats for analytics
self._storeAiCallStats(response, request)
return response
async def callAiStream(self, request: AiCallRequest):
    """Streaming variant of callAi. Yields str deltas during generation, then final AiCallResponse.

    Runs the billing pre-checks before any chunk is produced, restricts
    ``request.options`` to the effective providers when both are available,
    and installs the billing callback on ``aiObjects`` for the duration of
    the stream.
    """
    await self.ensureAiObjectsInitialized()
    self._preflightBillingCheck()
    await self._checkBillingBeforeAiCall()
    effectiveProviders = self._calculateEffectiveProviders()
    if effectiveProviders and request.options:
        # Replace the options with a copy carrying the provider allow-list.
        request.options = request.options.model_copy(update={'allowedProviders': effectiveProviders})
    self.aiObjects.billingCallback = self._createBillingCallback()
    try:
        async for chunk in self.aiObjects.callWithTextContextStream(request):
            yield chunk
    finally:
        # Always detach the callback, also when the stream raises or the
        # consumer stops iterating early.
        self.aiObjects.billingCallback = None
# =========================================================================
# SPEECH_TEAMS: Dedicated handler for Teams Meeting AI analysis
# Bypasses standard model selection. Uses a fixed fast model.
@ -295,9 +314,6 @@ class AiService:
except Exception as e:
logger.error(f"BILLING: Failed to record billing for SPEECH_TEAMS: {e}")
# Store stats
self._storeAiCallStats(response, request)
logger.info(f"SPEECH_TEAMS call completed: model={model.name}, time={processingTime:.2f}s, cost={priceCHF:.4f} CHF")
return response
@ -644,12 +660,12 @@ detectedIntent-Werte:
billingService = getBillingService(user, mandateId, featureInstanceId, featureCode)
def _billingCallback(response) -> None:
"""Record billing for a single AI model call."""
"""Record billing transaction with full AI call metadata."""
if not response or getattr(response, 'errorCount', 0) > 0:
return
priceCHF = getattr(response, 'priceCHF', 0.0)
if not priceCHF or priceCHF <= 0:
basePriceCHF = getattr(response, 'priceCHF', 0.0)
if not basePriceCHF or basePriceCHF <= 0:
return
provider = getattr(response, 'provider', None) or 'unknown'
@ -657,20 +673,24 @@ detectedIntent-Werte:
try:
billingService.recordUsage(
priceCHF=priceCHF,
priceCHF=basePriceCHF,
workflowId=workflowId,
aicoreProvider=provider,
aicoreModel=modelName,
description=f"AI: {modelName}"
description=f"AI: {modelName}",
processingTime=getattr(response, 'processingTime', None),
bytesSent=getattr(response, 'bytesSent', None),
bytesReceived=getattr(response, 'bytesReceived', None),
errorCount=getattr(response, 'errorCount', None)
)
logger.debug(
f"Billed model call: {priceCHF:.4f} CHF, "
f"Billed model call: {basePriceCHF:.4f} CHF, "
f"provider={provider}, model={modelName}, mandate={mandateId}"
)
except Exception as e:
logger.error(
f"BILLING: Failed to record transaction! "
f"Cost={priceCHF:.4f} CHF, user={user.id}, mandate={mandateId}, "
f"Cost={basePriceCHF:.4f} CHF, user={user.id}, mandate={mandateId}, "
f"provider={provider}, model={modelName}, error={e}"
)
@ -723,40 +743,6 @@ detectedIntent-Werte:
logger.warning(f"Error calculating effective providers: {e}")
return None
def _storeAiCallStats(self, response, request: AiCallRequest) -> None:
    """Store workflow stats after an AI call.

    This method stores the AI call statistics (cost, processing time, bytes)
    to the workflow stats collection for tracking and billing purposes.

    Nothing is stored when there is no workflow with an id, or when the
    response is missing or reports errors. Any exception is logged at debug
    level and swallowed, so this method never raises.

    Args:
        response: AiCallResponse with cost/timing data
        request: Original AiCallRequest for context
    """
    try:
        # Skip if no workflow context
        workflow = getattr(self.services, 'workflow', None)
        if not workflow or not hasattr(workflow, 'id') or not workflow.id:
            logger.debug("No workflow context - skipping stats storage")
            return
        # Skip if response is an error
        if not response or getattr(response, 'errorCount', 0) > 0:
            logger.debug("Error response - skipping stats storage")
            return
        # Determine process name from operation type ('unknown' when the request has no options)
        opType = getattr(request.options, 'operationType', 'unknown') if request.options else 'unknown'
        process = f"ai.call.{opType}"
        # Store the stat
        self.services.chat.storeWorkflowStat(workflow, response, process)
        logger.debug(f"Stored AI call stat: {process}, cost={getattr(response, 'priceCHF', 0):.4f} CHF")
    except Exception as e:
        # Log but don't fail - stats storage is not critical
        logger.debug(f"Could not store AI call stat: {str(e)}")
async def ensureAiObjectsInitialized(self):
"""Ensure aiObjects is initialized and submodules are ready."""
if self.aiObjects is None:
@ -766,17 +752,17 @@ detectedIntent-Werte:
self._initializeSubmodules()
@classmethod
async def create(cls, legacy_services) -> "AiService":
"""Create AiService from legacy Services hub. For backward compatibility with tests."""
async def create(cls, servicesHub) -> "AiService":
"""Create AiService from a ServiceHub instance."""
from modules.serviceCenter import getService
from modules.serviceCenter.context import ServiceCenterContext
ctx = ServiceCenterContext(
user=legacy_services.user,
mandate_id=legacy_services.mandateId,
feature_instance_id=legacy_services.featureInstanceId,
workflow=getattr(legacy_services, "workflow", None),
user=servicesHub.user,
mandate_id=servicesHub.mandateId,
feature_instance_id=servicesHub.featureInstanceId,
workflow=getattr(servicesHub, "workflow", None),
)
return getService("ai", ctx, legacy_hub=legacy_services)
return getService("ai", ctx)
# Helper methods

View file

@ -125,10 +125,11 @@ class AiCallLooper:
logger.error(errorMsg)
raise ValueError(errorMsg)
maxIterations = 50 # Prevent infinite loops
maxIterations = 10
iteration = 0
allSections = [] # Accumulate all sections across iterations
lastRawResponse = None # Store last raw JSON response for continuation
result = ""
allSections = []
lastRawResponse = None
# JSON Base Iteration System:
# - jsonBase: the merged JSON string (replaces accumulatedDirectJson array)

View file

@ -261,35 +261,34 @@ class ContentExtractor:
# Check if it's standardized JSON format (has "documents" or "sections")
if document.mimeType == "application/json":
try:
docBytes = self.services.interfaceDbComponent.getFileData(document.fileId)
if docBytes:
docBytes = self.services.interfaceDbComponent.getFileData(document.fileId)
if docBytes:
try:
docData = docBytes.decode('utf-8')
jsonData = json.loads(docData)
except (json.JSONDecodeError, UnicodeDecodeError) as e:
logger.warning(f"Could not parse JSON document {document.fileName}: {str(e)}")
jsonData = None
if isinstance(jsonData, dict) and ("documents" in jsonData or "sections" in jsonData):
logger.info(f"Document is already in standardized JSON format, using as reference")
# Create reference ContentPart for structured JSON
contentPart = ContentPart(
id=f"ref_{document.id}",
label=f"Reference: {document.fileName}",
typeGroup="structure",
mimeType="application/json",
data=docData,
metadata={
"contentFormat": "reference",
"documentId": document.id,
"documentReference": f"docItem:{document.id}:{document.fileName}",
"skipExtraction": True,
"intent": "reference"
}
)
allContentParts.append(contentPart)
logger.info(f"✅ Using JSON document directly without extraction")
continue # Skip normal extraction for this document
except Exception as e:
logger.warning(f"Could not parse JSON document {document.fileName}, will extract normally: {str(e)}")
# Continue with normal extraction
if isinstance(jsonData, dict) and ("documents" in jsonData or "sections" in jsonData):
logger.info(f"Document is already in standardized JSON format, using as reference")
contentPart = ContentPart(
id=f"ref_{document.id}",
label=f"Reference: {document.fileName}",
typeGroup="structure",
mimeType="application/json",
data=docData,
metadata={
"contentFormat": "reference",
"documentId": document.id,
"documentReference": f"docItem:{document.id}:{document.fileName}",
"skipExtraction": True,
"intent": "reference"
}
)
allContentParts.append(contentPart)
logger.info(f"✅ Using JSON document directly without extraction")
continue
# Normal extraction path
intent = getIntentForDocument(document.id, documentIntents)

View file

@ -230,9 +230,12 @@ class DocumentIntentAnalyzer:
else:
logger.debug(f"JSON document {document.id} has no documentData (actionType={actionType})")
return None
except (json.JSONDecodeError, UnicodeDecodeError) as e:
logger.debug(f"Error parsing document {document.fileName}: {str(e)}")
return None
except Exception as e:
logger.debug(f"Error resolving pre-extracted document {document.fileName}: {str(e)}")
logger.error(f"Error resolving pre-extracted document {document.fileName}: {str(e)}")
return None
def _buildIntentAnalysisPrompt(

View file

@ -330,17 +330,7 @@ class JsonMergeLogger:
except Exception as e:
logger.error(f"Failed to write merge log file: {e}")
else:
# No log file set - write individual file (fallback)
currentFileDir = os.path.dirname(os.path.abspath(__file__))
logDir = currentFileDir
os.makedirs(logDir, exist_ok=True)
logFilePath = os.path.join(logDir, f"{mergeId}.txt")
try:
with open(logFilePath, 'w', encoding='utf-8') as f:
f.write(logContent)
logger.info(f"JSON merge log written to: {logFilePath}")
except Exception as e:
logger.error(f"Failed to write merge log file: {e}")
logger.debug(f"JSON merge {mergeId} completed ({len(logContent)} chars log). Use initializeLogFile() to persist merge logs.")
# Clear buffer for next merge
JsonMergeLogger._logBuffer = []

View file

@ -25,7 +25,7 @@ class StructureFiller:
"""Handles filling document structure with content."""
# Default concurrency limit for parallel generation (chapters/sections)
DEFAULT_MAX_CONCURRENT_GENERATION = 16
DEFAULT_MAX_CONCURRENT_GENERATION = 5
def __init__(self, services, aiService):
"""Initialize StructureFiller with service center and AI service access."""
@ -568,11 +568,16 @@ class StructureFiller:
all_sections_list: List[Dict[str, Any]],
language: str,
outputFormat: str = "txt",
calculateOverallProgress: callable = None
calculateOverallProgress: callable = None,
preExtractedText: Optional[str] = None
) -> List[Dict[str, Any]]:
"""
Process a single section and return its elements.
Used for parallel processing of sections within a chapter.
When preExtractedText is provided, the section uses the pre-extracted
content directly in its prompt instead of sending raw content parts
through the heavy extraction pipeline (avoids chunking + N*M AI calls).
"""
sectionId = section.get("id")
sectionTitle = section.get("title", sectionId)
@ -600,6 +605,149 @@ class StructureFiller:
elements = []
# --- Fast path: use pre-extracted text instead of raw content parts ---
if preExtractedText and useAiCall and generationHint:
logger.info(
f"Section {sectionId}: Using pre-extracted text "
f"({len(preExtractedText):,} chars) - lightweight AI path"
)
for partId in contentPartIds:
part = self._findContentPartById(partId, contentParts)
if not part:
continue
cf = contentFormats.get(partId, part.metadata.get("contentFormat"))
if cf == "reference":
elements.append({
"type": "reference",
"documentReference": part.metadata.get("documentReference"),
"label": part.metadata.get("usageHint", part.label)
})
elif cf == "object":
if part.typeGroup == "image" and part.data:
caption = (
section.get("caption")
or section.get("metadata", {}).get("caption")
or part.metadata.get("caption", "")
)
elements.append({
"type": "image",
"content": {
"base64Data": part.data,
"altText": part.metadata.get("usageHint", part.label),
"caption": caption
},
"caption": caption
})
generationPrompt, templateStructure = self._buildSectionGenerationPrompt(
section=section,
contentParts=[],
userPrompt=userPrompt,
generationHint=generationHint,
allSections=all_sections_list,
sectionIndex=sectionIndex,
isAggregation=False,
language=language,
outputFormat=outputFormat,
preExtractedText=preExtractedText
)
sectionOperationId = f"{fillOperationId}_section_{sectionId}"
self.services.chat.progressLogStart(
sectionOperationId,
"Section Generation (Pre-extracted)",
f"Section {sectionIndex + 1}/{totalSections}",
f"{sectionTitle} (pre-extracted)",
parentOperationId=chapterOperationId
)
try:
self.services.chat.progressLogUpdate(sectionOperationId, 0.4, "Calling AI for content generation")
operationType = OperationTypeEnum.DATA_ANALYSE
options = AiCallOptions(
operationType=operationType,
priority=PriorityEnum.BALANCED,
processingMode=ProcessingModeEnum.DETAILED
)
checkWorkflowStopped(self.services)
aiResponseJson = await self.aiService.callAiWithLooping(
prompt=generationPrompt,
options=options,
debugPrefix=f"{chapterId}_section_{sectionId}",
promptBuilder=self.buildSectionPromptWithContinuation,
promptArgs={
"section": section,
"contentParts": [],
"userPrompt": userPrompt,
"generationHint": generationHint,
"allSections": all_sections_list,
"sectionIndex": sectionIndex,
"isAggregation": False,
"templateStructure": templateStructure,
"basePrompt": generationPrompt,
"language": language
},
operationId=sectionOperationId,
userPrompt=userPrompt,
contentParts=None,
useCaseId="section_content"
)
try:
from modules.shared.jsonUtils import tryParseJson, repairBrokenJson
if isinstance(aiResponseJson, str) and ("---" in aiResponseJson or aiResponseJson.count("```json") > 1):
generatedElements = self._extractAndMergeMultipleJsonBlocks(aiResponseJson, contentType, sectionId)
else:
parsedResponse, parseError, cleanedStr = tryParseJson(aiResponseJson)
if parsedResponse is None:
logger.warning(f"Section {sectionId}: tryParseJson failed, attempting repair")
repairedStr = repairBrokenJson(aiResponseJson)
parsedResponse, parseError2, _ = tryParseJson(repairedStr)
if parsedResponse and isinstance(parsedResponse, dict):
generatedElements = parsedResponse.get("elements", [])
elif parsedResponse and isinstance(parsedResponse, list):
generatedElements = parsedResponse
else:
generatedElements = []
except Exception as parseErr:
logger.error(f"Section {sectionId}: JSON parse error: {parseErr}")
generatedElements = []
self.services.chat.progressLogUpdate(sectionOperationId, 0.8, "Validating generated content")
class _AiResponse:
def __init__(self, content):
self.content = content
responseElements = await self._processAiResponseForSection(
aiResponse=_AiResponse(aiResponseJson),
contentType=contentType,
operationType=operationType,
sectionId=sectionId,
generationHint=generationHint,
generatedElements=generatedElements,
section=section
)
elements.extend(responseElements)
self.services.chat.progressLogFinish(sectionOperationId, True)
except Exception as e:
self.services.chat.progressLogFinish(sectionOperationId, False)
logger.error(f"Error in pre-extracted section {sectionId}: {e}")
elements.append({
"type": "error",
"message": f"Error processing section {sectionId}: {str(e)}",
"sectionId": sectionId
})
return elements
# --- Standard path: process content parts directly ---
# Check whether aggregation is needed
needsAggregation = self._needsAggregation(
contentType=contentType,
@ -1507,6 +1655,156 @@ class StructureFiller:
return elements
async def _preExtractSharedContent(
    self,
    contentParts: List[ContentPart],
    allSectionTasks: List[Dict[str, Any]],
    userPrompt: str,
    parentOperationId: str
) -> Dict[str, str]:
    """
    Pre-extract content from large/shared content parts ONCE before parallel
    section filling.

    For every content part referenced by the section tasks, one AI call
    produces a plain-text summary. Every section referencing that part gets
    the SAME summary; the section's own generationHint focuses the AI on the
    relevant aspect during generation. This avoids N sections each pushing
    the same part through the extraction pipeline independently.

    A part is pre-extracted only when all of the following hold:
    - its metadata "contentFormat" is "extracted";
    - its typeGroup is not "image"/"binary" and its MIME type is not
      image/*, video/* or audio/*;
    - it is referenced by at least MIN_SHARED_SECTIONS distinct sections
      OR len(part.data) is at least SIZE_THRESHOLD.

    When a section references several pre-extracted parts, the summaries are
    concatenated (blank-line separated, in processing order) so no part's
    content is dropped from that section.

    Args:
        contentParts: All content parts available for the document.
        allSectionTasks: Task dicts; each must carry a "section" dict.
        userPrompt: Original user request, embedded in the extraction prompt.
        parentOperationId: Progress-log operation that receives updates.

    Returns:
        Dict mapping sectionId -> pre-extracted text. Sections whose parts
        were skipped, failed, or returned a summary of 50 characters or
        fewer are absent, so callers fall back to direct extraction.
    """
    SIZE_THRESHOLD = 100_000
    MIN_SHARED_SECTIONS = 2

    # Map each part id to the distinct sections that reference it. Identity
    # de-duplication keeps a part id listed twice in one section from counting
    # as "shared" or duplicating that section's topic line.
    partToSections: Dict[str, List[Dict[str, Any]]] = {}
    for task in allSectionTasks:
        section = task["section"]
        for partId in section.get("contentPartIds", []):
            linkedSections = partToSections.setdefault(partId, [])
            if not any(existing is section for existing in linkedSections):
                linkedSections.append(section)

    if not partToSections:
        return {}

    preExtractedCache: Dict[str, str] = {}

    for partId, sections in partToSections.items():
        part = self._findContentPartById(partId, contentParts)
        if not part:
            continue

        contentFormat = part.metadata.get("contentFormat", "unknown")
        if contentFormat != "extracted":
            continue
        if part.typeGroup in ("image", "binary"):
            continue
        if part.mimeType and (
            part.mimeType.startswith("image/")
            or part.mimeType.startswith("video/")
            or part.mimeType.startswith("audio/")
        ):
            continue

        partSize = len(part.data) if part.data else 0
        numSections = len(sections)
        # Small parts used by a single section are not worth an extra AI call.
        if numSections < MIN_SHARED_SECTIONS and partSize < SIZE_THRESHOLD:
            continue

        fileName = part.metadata.get("originalFileName", partId)
        logger.info(
            f"Pre-extracting content part {partId} "
            f"({partSize:,} bytes, referenced by {numSections} sections)"
        )

        # One bullet per referencing section tells the extractor what the
        # downstream sections will need.
        topicLines = []
        for section in sections:
            hint = (
                section.get("generationHint")
                or section.get("generation_hint")
                or section.get("title", "")
            )
            topicLines.append(f"- {hint}")
        topicsText = "\n".join(topicLines)

        extractionPrompt = (
            "# TASK: Extract key information from this document\n\n"
            "Extract ALL relevant information from the provided content as "
            "plain text. The extracted content will be used to generate a report "
            "covering the topics listed below.\n\n"
            f"## User Request\n{userPrompt}\n\n"
            f"## Report topics that need data\n{topicsText}\n\n"
            "## Instructions\n"
            "- Extract key facts, data points, timestamps, error messages, "
            "statistics, and specific findings\n"
            "- Organize by theme but output as PLAIN TEXT (not JSON)\n"
            "- Be comprehensive but concise - include specific data, "
            "skip generic filler\n"
            "- Include concrete examples with exact values from the source\n"
            "- Do NOT add commentary or analysis - just extract the raw data\n"
        )

        try:
            self.services.chat.progressLogUpdate(
                parentOperationId, 0.05,
                f"Pre-extracting content from {fileName} ({partSize:,} bytes)..."
            )

            def _preExtractionProgress(chunkProgress, message):
                # Map the AI call's own progress into the 5%-10% band of the parent.
                mapped = 0.05 + chunkProgress * 0.05
                self.services.chat.progressLogUpdate(
                    parentOperationId, mapped,
                    f"Pre-extraction: {message}"
                )

            request = AiCallRequest(
                prompt=extractionPrompt,
                contentParts=[part],
                options=AiCallOptions(
                    operationType=OperationTypeEnum.DATA_ANALYSE,
                    priority=PriorityEnum.BALANCED,
                    processingMode=ProcessingModeEnum.DETAILED
                )
            )
            checkWorkflowStopped(self.services)
            response = await self.aiService.callAi(request, progressCallback=_preExtractionProgress)
            responseText = response.content if hasattr(response, "content") else str(response)

            if responseText and len(responseText.strip()) > 50:
                # Append rather than overwrite: a section may reference more
                # than one pre-extracted part. Guard by section id so two
                # section dicts sharing an id do not receive the text twice.
                assignedIds = set()
                for section in sections:
                    sId = section.get("id", "unknown")
                    if sId in assignedIds:
                        continue
                    assignedIds.add(sId)
                    previousText = preExtractedCache.get(sId)
                    preExtractedCache[sId] = (
                        f"{previousText}\n\n{responseText}" if previousText else responseText
                    )
                logger.info(
                    f"Pre-extraction of {partId} successful: "
                    f"{len(responseText):,} chars summary for {numSections} sections"
                )
                self.services.chat.progressLogUpdate(
                    parentOperationId, 0.10,
                    f"Pre-extraction complete ({len(responseText):,} chars). Starting section generation..."
                )
            else:
                logger.warning(
                    f"Pre-extraction of {partId} returned empty/short response "
                    f"({len(responseText) if responseText else 0} chars), "
                    "sections will fall back to direct extraction"
                )
        except Exception as e:
            # Best effort: a failed pre-extraction must not abort the fill run.
            logger.error(
                f"Pre-extraction of {partId} failed: {e}. "
                "Sections will fall back to direct extraction."
            )

    if preExtractedCache:
        logger.info(
            f"Pre-extraction complete: {len(preExtractedCache)} sections "
            "have pre-extracted content (will use lightweight AI path)"
        )
    return preExtractedCache
async def _fillChapterSections(
self,
chapterStructure: Dict[str, Any],
@ -1564,27 +1862,42 @@ class StructureFiller:
"docFormat": docFormat # Include output format
})
MAX_TOTAL_SECTIONS = 35
if totalSections > MAX_TOTAL_SECTIONS:
logger.warning(
f"Structure has {totalSections} sections (limit {MAX_TOTAL_SECTIONS}). "
"Truncating to stay within budget."
)
allSectionTasks = allSectionTasks[:MAX_TOTAL_SECTIONS]
totalSections = len(allSectionTasks)
preExtractedCache = await self._preExtractSharedContent(
contentParts, allSectionTasks, userPrompt, fillOperationId
)
logger.info(f"Starting FULLY PARALLEL section generation: {totalSections} sections across {totalChapters} chapters")
# Create task wrapper for each section with progress tracking
async def processSectionWithSemaphore(taskInfo):
checkWorkflowStopped(self.services)
sectionId = taskInfo["section"].get("id", "unknown")
async with sectionSemaphore:
result = await self._processSingleSection(
section=taskInfo["section"],
sectionIndex=taskInfo["sectionIndex"],
totalSections=taskInfo["chapterSectionCount"],
chapterIndex=0, # Not used for sequential logic anymore
chapterIndex=0,
totalChapters=totalChapters,
chapterId=taskInfo["chapterId"],
chapterOperationId=fillOperationId, # Use fillOperationId as parent (no chapter-level ops in parallel mode)
chapterOperationId=fillOperationId,
fillOperationId=fillOperationId,
contentParts=contentParts,
userPrompt=userPrompt,
all_sections_list=all_sections_list,
language=taskInfo["docLanguage"],
outputFormat=taskInfo.get("docFormat", "txt"), # Pass output format
calculateOverallProgress=lambda *args: completedSections[0] / totalSections if totalSections > 0 else 1.0
outputFormat=taskInfo.get("docFormat", "txt"),
calculateOverallProgress=lambda *args: completedSections[0] / totalSections if totalSections > 0 else 1.0,
preExtractedText=preExtractedCache.get(sectionId)
)
# Update progress after each section completes
@ -1810,6 +2123,7 @@ GENERATION HINT: {generationHint}
- Each section should serve a clear purpose with meaningful data
- If no relevant data exists for a topic, do NOT create a section for it
- Prefer ONE comprehensive section over multiple sparse sections
- HARD LIMIT: Maximum 5 sections per chapter. Combine related subtopics into single sections to stay within this limit.
**CRITICAL**: The chapter's generationHint above describes what content this chapter should generate. If the generationHint references documents/images/data, then EACH section that generates content for this chapter MUST assign the relevant ContentParts from AVAILABLE CONTENT PARTS below.
@ -1893,7 +2207,8 @@ Return only valid JSON. Do not include any explanatory text outside the JSON.
sectionIndex: Optional[int] = None,
isAggregation: bool = False,
language: str = "en",
outputFormat: str = "txt"
outputFormat: str = "txt",
preExtractedText: Optional[str] = None
) -> tuple[str, str]:
"""Build the section-generation prompt with full context."""
# Filter out None values
@ -2057,7 +2372,7 @@ LANGUAGE: Generate all content in {language.upper()} language. All text, titles,
5. For table: Extract all rows from the context. Return {{"headers": [...], "rows": []}} only if no data exists.
6. Format based on content_type ({effectiveContentType}).
7. No HTML/styling: Plain text only, no markup.
8. CONTINUE UNTIL COMPLETE: Extract ALL data from the provided context. Do NOT stop early because you think the response might be too long. Do NOT truncate or abbreviate. Do not impose artificial limits on yourself.
8. Focus on the MOST RELEVANT information for this section's topic. Extract key facts, data, and findings. Omit redundant, repetitive, or tangential content.
## OUTPUT FORMAT
@ -2083,6 +2398,62 @@ Output requirements:
{userPrompt}
```
## CONTEXT
{contextText if contextText else ""}
"""
elif preExtractedText:
prompt = f"""# TASK: Generate Section Content from Pre-Extracted Data
LANGUAGE: Generate all content in {language.upper()} language. All text, titles, headings, paragraphs, and content must be written in {language.upper()}.
## SECTION METADATA
- Section ID: {sectionId}
- Content Type: {effectiveContentType}
- Generation Hint: {generationHint}{formatNoteAggr}
## CONTENT EFFICIENCY PRINCIPLES
- Generate COMPACT content: Focus on essential facts only
- AVOID verbose text, filler phrases, or redundant explanations
- Be CONCISE and direct - every word should add value
- NO introductory phrases like "This section describes..." or "Here we present..."
- Minimize output size for efficient processing
## PRE-EXTRACTED CONTENT FOR THIS SECTION
```
{preExtractedText}
```
## INSTRUCTIONS
1. Use ONLY the pre-extracted content above. Never invent or generate data not present in it.
2. If the pre-extracted content is empty, return empty structures.
3. Format based on content_type ({effectiveContentType}).
4. Return only valid JSON with "elements" array.
5. No HTML/styling: Plain text only, no markup.
6. Focus on the MOST RELEVANT information. Be concise.
## OUTPUT FORMAT
Return a JSON object with this structure:
{{
"elements": [
{{
"type": "{effectiveContentType}",
"content": {contentStructureExample}
}}
]
}}
Output requirements:
- "content" must be an object (never a string)
- Return only valid JSON - no text before, no text after, no comments, no explanations, no markdown code fences
- Start with {{ and end with }} - return ONLY the JSON object itself
- No invented data: Return empty structures if pre-extracted content is empty
## USER REQUEST
```
{userPrompt}
```
## CONTEXT
{contextText if contextText else ""}
"""
@ -2117,7 +2488,7 @@ LANGUAGE: Generate all content in {language.upper()} language. All text, titles,
3. Format based on content_type ({effectiveContentType}).
4. Return only valid JSON with "elements" array.
5. No HTML/styling: Plain text only, no markup.
6. CONTINUE UNTIL COMPLETE: Extract ALL data from the provided context. Do NOT stop early because you think the response might be too long. Do NOT truncate or abbreviate. Do not impose artificial limits on yourself.
6. Focus on the MOST RELEVANT information for this section's topic. Extract key facts, data, and findings. Omit redundant, repetitive, or tangential content.
## OUTPUT FORMAT
Return a JSON object with this structure:

View file

@ -430,6 +430,7 @@ Then chapters that generate those generic content types MUST assign the relevant
## CHAPTER STRUCTURE REQUIREMENTS
- Generate chapters based on USER REQUEST - analyze what structure the user wants
- Create ONLY the minimum chapters needed to cover the user's request - avoid over-structuring
- HARD LIMIT: Maximum 7 chapters per document. If the topic can be covered in fewer, prefer fewer. Combine related topics into single chapters rather than creating many small ones.
- IMPORTANT: Each chapter MUST have ALL these fields:
- id: Unique identifier (e.g., "chapter_1")
- level: Heading level (1, 2, 3, etc.)

View file

@ -205,36 +205,20 @@ class BillingService:
workflowId: str = None,
aicoreProvider: str = None,
aicoreModel: str = None,
description: str = None
description: str = None,
processingTime: float = None,
bytesSent: int = None,
bytesReceived: int = None,
errorCount: int = None
) -> Optional[Dict[str, Any]]:
"""
Record AI usage cost as a billing transaction.
This method:
1. Applies the pricing markup
2. Creates a DEBIT transaction
3. Updates the account balance
Args:
priceCHF: Base price from AI model (before markup)
workflowId: Optional workflow ID
aicoreProvider: AICore provider name (e.g., 'anthropic', 'openai')
aicoreModel: AICore model name (e.g., 'claude-4-sonnet', 'gpt-4o')
description: Optional description
Returns:
Created transaction dict or None if not recorded
"""
"""Record AI usage cost as a billing transaction with markup applied."""
if priceCHF <= 0:
return None
# Apply markup
finalPrice = self.calculatePriceWithMarkup(priceCHF)
if finalPrice <= 0:
return None
# Build description
if not description:
description = f"AI Usage: {aicoreModel or aicoreProvider or 'unknown'}"
@ -247,9 +231,17 @@ class BillingService:
featureCode=self.featureCode,
aicoreProvider=aicoreProvider,
aicoreModel=aicoreModel,
description=description
description=description,
processingTime=processingTime,
bytesSent=bytesSent,
bytesReceived=bytesReceived,
errorCount=errorCount
)
def getWorkflowCost(self, workflowId: str) -> float:
"""Get total cost for a workflow from billing transactions."""
return self._billingInterface.getWorkflowCost(workflowId)
# =========================================================================
# Provider Permission Check (via RBAC)
# =========================================================================

View file

@ -4,7 +4,7 @@
import logging
from typing import Dict, Any, List, Optional, Callable
from modules.datamodels.datamodelUam import User, UserConnection
from modules.datamodels.datamodelChat import ChatDocument, ChatMessage, ChatStat, ChatLog
from modules.datamodels.datamodelChat import ChatDocument, ChatMessage, ChatLog
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum, PriorityEnum, ProcessingModeEnum
from modules.shared.progressLogger import ProgressLogger
@ -411,23 +411,159 @@ class ChatService:
return None
def getFileInfo(self, fileId: str) -> Dict[str, Any]:
"""Get file information"""
file_item = self.interfaceDbComponent.getFile(fileId)
if file_item:
"""Get file information including new fields (tags, folderId, description, status)."""
fileItem = self.interfaceDbComponent.getFile(fileId)
if fileItem:
return {
"id": file_item.id,
"fileName": file_item.fileName,
"size": file_item.fileSize,
"mimeType": file_item.mimeType,
"fileHash": file_item.fileHash,
"creationDate": file_item.creationDate
"id": fileItem.id,
"fileName": fileItem.fileName,
"size": fileItem.fileSize,
"mimeType": fileItem.mimeType,
"fileHash": fileItem.fileHash,
"creationDate": fileItem.creationDate,
"tags": getattr(fileItem, "tags", None),
"folderId": getattr(fileItem, "folderId", None),
"description": getattr(fileItem, "description", None),
"status": getattr(fileItem, "status", None),
}
return None
def getFileData(self, fileId: str) -> bytes:
"""Get file data by ID"""
"""Get file data by ID."""
return self.interfaceDbComponent.getFileData(fileId)
def getFileContent(self, fileId: str) -> Optional[Dict[str, Any]]:
"""Get file content as text or base64 via FilePreview."""
preview = self.interfaceDbComponent.getFileContent(fileId)
if preview:
return preview.toDictWithBase64Encoding()
return None
def listFiles(
self,
folderId: str = None,
tags: List[str] = None,
search: str = None,
) -> List[Dict[str, Any]]:
"""List files for the current user with optional filters.
Args:
folderId: Filter by folder (None = root / all).
tags: Filter by tags (any match).
search: Search in fileName and description.
Returns:
List of file info dicts.
"""
allFiles = self.interfaceDbComponent.getAllFiles()
results = []
for fileItem in allFiles:
if folderId is not None:
itemFolderId = getattr(fileItem, "folderId", None)
if itemFolderId != folderId:
continue
if tags:
itemTags = getattr(fileItem, "tags", None) or []
if not any(t in itemTags for t in tags):
continue
if search:
searchLower = search.lower()
nameMatch = searchLower in (fileItem.fileName or "").lower()
descMatch = searchLower in (getattr(fileItem, "description", None) or "").lower()
if not nameMatch and not descMatch:
continue
results.append({
"id": fileItem.id,
"fileName": fileItem.fileName,
"mimeType": fileItem.mimeType,
"fileSize": fileItem.fileSize,
"creationDate": fileItem.creationDate,
"tags": getattr(fileItem, "tags", None),
"folderId": getattr(fileItem, "folderId", None),
"description": getattr(fileItem, "description", None),
"status": getattr(fileItem, "status", None),
})
return results
def listFolders(self, parentId: str = None) -> List[Dict[str, Any]]:
    """List file folders created by the current user under one parent.

    Args:
        parentId: Parent folder ID. None selects folders whose parentId is None.

    Returns:
        Whatever interfaceDbComponent.db.getRecordset yields for FileFolder
        with the "_createdBy" / "parentId" filter.
    """
    from modules.datamodels.datamodelFileFolder import FileFolder

    # "_createdBy" falls back to "" when no user is attached to the service.
    ownerId = self.user.id if self.user else ""
    recordFilter = {"_createdBy": ownerId, "parentId": parentId}
    return self.interfaceDbComponent.db.getRecordset(FileFolder, recordFilter=recordFilter)
def createFolder(self, name: str, parentId: str = None) -> Dict[str, Any]:
    """Create a FileFolder record and return the result of db.recordCreate.

    Args:
        name: Folder name.
        parentId: Optional parent folder ID, passed through to FileFolder as given.
    """
    from modules.datamodels.datamodelFileFolder import FileFolder

    newFolder = FileFolder(name=name, parentId=parentId)
    database = self.interfaceDbComponent.db
    return database.recordCreate(FileFolder, newFolder)
# ---- DataSource CRUD ----
def createDataSource(
    self, connectionId: str, sourceType: str, path: str, label: str,
    featureInstanceId: str = None
) -> Dict[str, Any]:
    """Create a DataSource record describing an external data source.

    featureInstanceId falls back to the service context's
    feature_instance_id, then to "". mandateId and userId come from the
    context / current user and also fall back to "".

    Returns:
        The result of interfaceDbComponent.db.recordCreate.
    """
    from modules.datamodels.datamodelDataSource import DataSource

    resolvedInstanceId = featureInstanceId or self._context.feature_instance_id or ""
    resolvedMandateId = self._context.mandate_id or ""
    resolvedUserId = self.user.id if self.user else ""

    dataSource = DataSource(
        connectionId=connectionId,
        sourceType=sourceType,
        path=path,
        label=label,
        featureInstanceId=resolvedInstanceId,
        mandateId=resolvedMandateId,
        userId=resolvedUserId,
    )
    return self.interfaceDbComponent.db.recordCreate(DataSource, dataSource)
def listDataSources(self, featureInstanceId: str = None) -> List[Dict[str, Any]]:
    """List DataSource records; a truthy featureInstanceId narrows the result.

    With a falsy featureInstanceId the record filter is empty.
    """
    from modules.datamodels.datamodelDataSource import DataSource

    recordFilter = {"featureInstanceId": featureInstanceId} if featureInstanceId else {}
    return self.interfaceDbComponent.db.getRecordset(DataSource, recordFilter=recordFilter)
def getDataSource(self, dataSourceId: str) -> Optional[Dict[str, Any]]:
    """Return the first DataSource record whose id matches, or None if none match."""
    from modules.datamodels.datamodelDataSource import DataSource

    matches = self.interfaceDbComponent.db.getRecordset(
        DataSource, recordFilter={"id": dataSourceId}
    )
    if not matches:
        return None
    return matches[0]
def deleteDataSource(self, dataSourceId: str) -> bool:
    """Delete a DataSource record.

    Returns:
        True when db.recordDelete completed without raising; False when it
        raised (the error is logged, not propagated).
    """
    from modules.datamodels.datamodelDataSource import DataSource

    try:
        self.interfaceDbComponent.db.recordDelete(DataSource, dataSourceId)
    except Exception as e:
        logger.error(f"Failed to delete DataSource {dataSourceId}: {e}")
        return False
    return True
def getUserConnections(self) -> List[Dict[str, Any]]:
"""Get all UserConnections for the current user."""
try:
if self.interfaceDbApp and self.user:
connections = self.interfaceDbApp.getUserConnections(self.user.id)
return [c.model_dump() if hasattr(c, "model_dump") else c for c in (connections or [])]
except Exception as e:
logger.error(f"Error getting user connections: {e}")
return []
def _diagnoseDocumentAccess(self, document: ChatDocument) -> Dict[str, Any]:
"""
Diagnose document access issues and provide recovery information.
@ -688,35 +824,6 @@ class ChatService:
workflow.logs.append(chatLog)
return chatLog
def storeWorkflowStat(self, workflow: Any, aiResponse: Any, process: str) -> ChatStat:
"""Persist workflow-level ChatStat from AiCallResponse and append to workflow stats list.
Billing is handled at the AI call source (interfaceAiObjects._callWithModel)
via billingCallback - not here. This method only handles workflow stats.
"""
try:
statData = {
"workflowId": workflow.id,
"process": process,
"engine": aiResponse.modelName,
"priceCHF": aiResponse.priceCHF,
"processingTime": aiResponse.processingTime,
"bytesSent": aiResponse.bytesSent,
"bytesReceived": aiResponse.bytesReceived,
"errorCount": aiResponse.errorCount
}
stat = self.interfaceDbChat.createStat(statData)
if not hasattr(workflow, 'stats') or workflow.stats is None:
workflow.stats = []
workflow.stats.append(stat)
return stat
except Exception as e:
logger.error(f"Failed to store workflow stat: {e}")
raise
def updateMessage(self, messageId: str, messageData: Dict[str, Any]):
"""Update message by delegating to the chat interface"""
try:

View file

@ -2,90 +2,147 @@
# All rights reserved.
from typing import Any, Dict, List
import json
import logging
from modules.datamodels.datamodelExtraction import ContentPart
from ..subRegistry import Chunker
logger = logging.getLogger(__name__)
class StructureChunker(Chunker):
def chunk(self, part: ContentPart, options: Dict[str, Any]) -> list[Dict[str, Any]]:
maxBytes = int(options.get("structureChunkSize", 40000))
data = part.data or ""
# best-effort: try JSON list/object bucketing; else fallback to line-based
chunks: List[Dict[str, Any]] = []
try:
obj = json.loads(data)
def emit(bucket: Any):
text = json.dumps(bucket, ensure_ascii=False)
chunks.append({"data": text, "size": len(text.encode('utf-8')), "order": len(chunks)})
if isinstance(obj, list):
bucket: list[Any] = []
size = 0
for item in obj:
text = json.dumps(item, ensure_ascii=False)
s = len(text.encode('utf-8'))
if size + s > maxBytes and bucket:
emit(bucket)
bucket = [item]
size = s
else:
bucket.append(item)
size += s
if bucket:
emit(bucket)
else:
# JSON object (dict) - check if it fits
text = json.dumps(obj, ensure_ascii=False)
textSize = len(text.encode('utf-8'))
if textSize <= maxBytes:
emit(obj)
else:
# Object too large - try to split by keys if possible
# For large objects, we need to chunk by character boundaries
# since we can't split JSON objects arbitrarily
if isinstance(obj, dict) and len(obj) > 1:
# Try to split object into multiple chunks by keys
# This preserves JSON structure better than line-based chunking
currentChunk: Dict[str, Any] = {}
currentSize = 2 # Start with "{}" overhead
for key, value in obj.items():
itemText = json.dumps({key: value}, ensure_ascii=False)
itemSize = len(itemText.encode('utf-8'))
# Account for comma and spacing between items
if currentChunk:
itemSize += 2 # ", " separator
self._chunkValue(obj, maxBytes, chunks)
except (json.JSONDecodeError, ValueError):
self._chunkByLines(data, maxBytes, chunks)
if currentSize + itemSize > maxBytes and currentChunk:
# Current chunk is full, emit it
emit(currentChunk)
currentChunk = {key: value}
currentSize = len(itemText.encode('utf-8'))
else:
currentChunk[key] = value
currentSize += itemSize
# Emit remaining chunk
if currentChunk:
emit(currentChunk)
else:
# Single large value or can't split - fallback to line chunking
raise ValueError("too large")
except Exception:
current: List[str] = []
size = 0
for line in data.split('\n'):
s = len(line.encode('utf-8')) + 1
if size + s > maxBytes and current:
text = '\n'.join(current)
chunks.append({"data": text, "size": len(text.encode('utf-8')), "order": len(chunks)})
current = [line]
size = s
else:
current.append(line)
size += s
if current:
text = '\n'.join(current)
chunks.append({"data": text, "size": len(text.encode('utf-8')), "order": len(chunks)})
return chunks
def _chunkValue(self, obj: Any, maxBytes: int, chunks: List[Dict[str, Any]]):
"""Recursively chunk a JSON value (list or dict) into pieces <= maxBytes."""
text = json.dumps(obj, ensure_ascii=False)
if len(text.encode('utf-8')) <= maxBytes:
self._emit(obj, chunks)
return
if isinstance(obj, list):
self._chunkList(obj, maxBytes, chunks)
elif isinstance(obj, dict):
self._chunkDict(obj, maxBytes, chunks)
else:
self._chunkByLines(text, maxBytes, chunks)
def _chunkList(self, items: list, maxBytes: int, chunks: List[Dict[str, Any]]):
"""Split a JSON array into sub-arrays that each fit within maxBytes."""
bucket: list = []
bucketSize = 2 # "[]" overhead
for item in items:
itemText = json.dumps(item, ensure_ascii=False)
itemSize = len(itemText.encode('utf-8'))
separator = 2 if bucket else 0 # ", "
if bucketSize + itemSize + separator > maxBytes and bucket:
self._emit(bucket, chunks)
bucket = []
bucketSize = 2
separator = 0
if itemSize + 2 > maxBytes:
if bucket:
self._emit(bucket, chunks)
bucket = []
bucketSize = 2
self._chunkValue(item, maxBytes, chunks)
else:
bucket.append(item)
bucketSize += itemSize + separator
if bucket:
self._emit(bucket, chunks)
def _chunkDict(self, obj: dict, maxBytes: int, chunks: List[Dict[str, Any]]):
"""Split a JSON object by keys. If a single key's value exceeds maxBytes, recurse into it."""
if len(obj) <= 1:
key, value = next(iter(obj.items()))
if isinstance(value, (list, dict)):
self._chunkSingleKeyValue(key, value, maxBytes, chunks)
else:
text = json.dumps(obj, ensure_ascii=False)
self._chunkByLines(text, maxBytes, chunks)
return
currentChunk: Dict[str, Any] = {}
currentSize = 2 # "{}" overhead
for key, value in obj.items():
itemText = json.dumps({key: value}, ensure_ascii=False)
itemSize = len(itemText.encode('utf-8'))
separator = 2 if currentChunk else 0
if currentSize + itemSize + separator > maxBytes and currentChunk:
self._emit(currentChunk, chunks)
currentChunk = {}
currentSize = 2
separator = 0
if itemSize + 2 > maxBytes:
if currentChunk:
self._emit(currentChunk, chunks)
currentChunk = {}
currentSize = 2
if isinstance(value, (list, dict)):
self._chunkSingleKeyValue(key, value, maxBytes, chunks)
else:
self._chunkByLines(itemText, maxBytes, chunks)
else:
currentChunk[key] = value
currentSize += itemSize + separator
if currentChunk:
self._emit(currentChunk, chunks)
def _chunkSingleKeyValue(self, key: str, value: Any, maxBytes: int, chunks: List[Dict[str, Any]]):
"""Handle a single dict key whose value is too large. Wraps sub-chunks back in {key: subChunk}."""
subChunks: List[Dict[str, Any]] = []
self._chunkValue(value, maxBytes, subChunks)
for sub in subChunks:
subData = json.loads(sub["data"])
wrapped = {key: subData}
wrappedText = json.dumps(wrapped, ensure_ascii=False)
wrappedSize = len(wrappedText.encode('utf-8'))
if wrappedSize <= maxBytes:
self._emit(wrapped, chunks)
else:
self._chunkByLines(wrappedText, maxBytes, chunks)
def _emit(self, bucket: Any, chunks: List[Dict[str, Any]]):
text = json.dumps(bucket, ensure_ascii=False)
chunks.append({"data": text, "size": len(text.encode('utf-8')), "order": len(chunks)})
def _chunkByLines(self, data: str, maxBytes: int, chunks: List[Dict[str, Any]]):
"""Line-based fallback for content that cannot be split structurally."""
current: List[str] = []
size = 0
for line in data.split('\n'):
s = len(line.encode('utf-8')) + 1
if size + s > maxBytes and current:
text = '\n'.join(current)
chunks.append({"data": text, "size": len(text.encode('utf-8')), "order": len(chunks)})
current = [line]
size = s
else:
current.append(line)
size += s
if current:
text = '\n'.join(current)
chunks.append({"data": text, "size": len(text.encode('utf-8')), "order": len(chunks)})

View file

@ -0,0 +1,175 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Audio extractor for common audio formats.
Extracts metadata (duration, bitrate, sample rate, channels) and produces
an `audiostream` ContentPart. For files of at most 10 MB the base64 audio data
is included; larger files only get metadata.
Optional dependency: mutagen (for rich metadata).
"""
from typing import Any, Dict, List
import base64
import logging
import struct
from modules.datamodels.datamodelExtraction import ContentPart
from ..subUtils import makeId
from ..subRegistry import Extractor
logger = logging.getLogger(__name__)
_AUDIO_MIME_TYPES = [
"audio/mpeg",
"audio/mp3",
"audio/wav",
"audio/x-wav",
"audio/ogg",
"audio/flac",
"audio/x-flac",
"audio/mp4",
"audio/x-m4a",
"audio/aac",
"audio/webm",
]
_AUDIO_EXTENSIONS = [".mp3", ".wav", ".ogg", ".flac", ".m4a", ".aac", ".wma", ".webm"]
_MAX_INLINE_SIZE = 10 * 1024 * 1024 # 10 MB
class AudioExtractor(Extractor):
    """Extractor for audio files.

    Produces:
    - 1 text ContentPart with metadata summary
    - 1 audiostream ContentPart (base64 data included only up to 10 MB)
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True for a known audio MIME type or file extension."""
        if mimeType in _AUDIO_MIME_TYPES:
            return True
        lower = (fileName or "").lower()
        return any(lower.endswith(ext) for ext in _AUDIO_EXTENSIONS)

    def getSupportedExtensions(self) -> list[str]:
        """Return a copy of the supported file extensions."""
        return list(_AUDIO_EXTENSIONS)

    def getSupportedMimeTypes(self) -> list[str]:
        """Return a copy of the supported MIME types."""
        return list(_AUDIO_MIME_TYPES)

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """Build the metadata part and the audiostream part for one audio file.

        Args:
            fileBytes: Raw bytes of the audio file.
            context: Reads the optional keys ``fileName`` and ``mimeType``.

        Returns:
            Two ContentParts: a text summary (root) and an audiostream child
            whose ``data`` is base64 only when the file is at most
            ``_MAX_INLINE_SIZE`` bytes.
        """
        # `or` rather than a .get() default: an explicit None must also fall
        # back, because the metadata fallback calls fileName.lower().
        fileName = context.get("fileName") or "audio"
        mimeType = context.get("mimeType") or "audio/mpeg"
        fileSize = len(fileBytes)
        rootId = makeId()
        parts: List[ContentPart] = []

        meta = _extractMetadata(fileBytes, fileName)
        meta["size"] = fileSize
        meta["fileName"] = fileName
        meta["mimeType"] = mimeType

        metaLines = [f"Audio file: {fileName}"]
        if meta.get("duration"):
            mins = int(meta["duration"] // 60)
            secs = int(meta["duration"] % 60)
            metaLines.append(f"Duration: {mins}:{secs:02d}")
        if meta.get("bitrate"):
            metaLines.append(f"Bitrate: {meta['bitrate']} kbps")
        if meta.get("sampleRate"):
            metaLines.append(f"Sample rate: {meta['sampleRate']} Hz")
        if meta.get("channels"):
            metaLines.append(f"Channels: {meta['channels']}")
        if meta.get("title") or meta.get("artist") or meta.get("album"):
            # Show all three tag lines as soon as any one tag is present.
            metaLines.append(f"Title: {meta.get('title', 'N/A')}")
            metaLines.append(f"Artist: {meta.get('artist', 'N/A')}")
            metaLines.append(f"Album: {meta.get('album', 'N/A')}")
        metaLines.append(f"Size: {fileSize:,} bytes")

        parts.append(ContentPart(
            id=rootId, parentId=None, label="metadata",
            typeGroup="text", mimeType="text/plain",
            data="\n".join(metaLines), metadata=meta,
        ))

        inlined = fileSize <= _MAX_INLINE_SIZE
        audioData = base64.b64encode(fileBytes).decode("utf-8") if inlined else ""
        parts.append(ContentPart(
            id=makeId(), parentId=rootId, label="audiostream",
            typeGroup="audiostream", mimeType=mimeType,
            data=audioData, metadata={"size": fileSize, "inlined": inlined},
        ))
        return parts
def _extractMetadata(fileBytes: bytes, fileName: str) -> Dict[str, Any]:
    """Collect audio metadata, preferring mutagen and falling back to a WAV header parse.

    When mutagen is importable and recognizes the file, stream info and
    title/artist/album tags are returned directly. If mutagen is missing,
    fails, or does not recognize the file, ``.wav`` files get a minimal
    header parse. Entries whose value is None are dropped from the result.
    """
    def _withoutNone(values: Dict[str, Any]) -> Dict[str, Any]:
        return {name: val for name, val in values.items() if val is not None}

    meta: Dict[str, Any] = {}
    try:
        import mutagen
        import io
        audio = mutagen.File(io.BytesIO(fileBytes))
        if audio is not None:
            if audio.info:
                streamInfo = audio.info
                meta["duration"] = getattr(streamInfo, "length", None)
                meta["bitrate"] = getattr(streamInfo, "bitrate", None)
                if meta["bitrate"]:
                    # Divided by 1000 so the summary can label it as kbps.
                    meta["bitrate"] = meta["bitrate"] // 1000
                meta["sampleRate"] = getattr(streamInfo, "sample_rate", None)
                meta["channels"] = getattr(streamInfo, "channels", None)
            tags = audio.tags
            if tags:
                # Each tag is tried under three alternative key names.
                meta["title"] = _getTag(tags, ["TIT2", "title", "\xa9nam"])
                meta["artist"] = _getTag(tags, ["TPE1", "artist", "\xa9ART"])
                meta["album"] = _getTag(tags, ["TALB", "album", "\xa9alb"])
            return _withoutNone(meta)
    except ImportError:
        logger.debug("mutagen not installed -- using basic metadata extraction")
    except Exception as e:
        logger.debug(f"mutagen metadata extraction failed: {e}")

    if fileName.lower().endswith(".wav"):
        meta.update(_parseWavHeader(fileBytes))
    return _withoutNone(meta)
def _getTag(tags, keys: list) -> Any:
"""Try multiple tag keys and return the first found value."""
for key in keys:
val = tags.get(key)
if val is not None:
return str(val) if not isinstance(val, str) else val
return None
def _parseWavHeader(fileBytes: bytes) -> Dict[str, Any]:
"""Minimal WAV header parser for basic metadata."""
meta: Dict[str, Any] = {}
if len(fileBytes) < 44:
return meta
try:
if fileBytes[:4] != b"RIFF" or fileBytes[8:12] != b"WAVE":
return meta
channels = struct.unpack_from("<H", fileBytes, 22)[0]
sampleRate = struct.unpack_from("<I", fileBytes, 24)[0]
bitsPerSample = struct.unpack_from("<H", fileBytes, 34)[0]
dataSize = struct.unpack_from("<I", fileBytes, 40)[0]
meta["channels"] = channels
meta["sampleRate"] = sampleRate
meta["bitrate"] = (sampleRate * channels * bitsPerSample) // 1000
if sampleRate and channels and bitsPerSample:
meta["duration"] = dataSize / (sampleRate * channels * (bitsPerSample / 8))
except Exception:
pass
return meta

View file

@ -0,0 +1,339 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Container extractor for ZIP, TAR, GZ, and 7Z archives.
Recursively unpacks containers and delegates each contained file to the
appropriate type-specific extractor via the ExtractorRegistry.
Safety limits:
- MAX_TOTAL_EXTRACTED_SIZE: 500 MB
- MAX_FILE_COUNT: 10000
- maxDepth: 5
- Symlinks blocked
"""
from typing import Any, Dict, List, Optional
import io
import logging
import mimetypes
import zipfile
import tarfile
from ..subUtils import makeId
from modules.datamodels.datamodelExtraction import ContentPart
from modules.datamodels.datamodelContent import ContainerLimitError, ContentContextRef
from ..subRegistry import Extractor
logger = logging.getLogger(__name__)
MAX_TOTAL_EXTRACTED_SIZE = 500 * 1024 * 1024 # 500 MB
MAX_FILE_COUNT = 10000
MAX_DEPTH = 5
_CONTAINER_MIME_TYPES = [
"application/zip",
"application/x-zip-compressed",
"application/x-tar",
"application/gzip",
"application/x-gzip",
"application/x-7z-compressed",
]
_CONTAINER_EXTENSIONS = [".zip", ".tar", ".gz", ".tar.gz", ".tgz", ".7z"]
def _detectMimeType(fileName: str) -> str:
"""Detect MIME type from file name."""
guessed, _ = mimetypes.guess_type(fileName)
return guessed or "application/octet-stream"
def _isSymlink(info) -> bool:
"""Check if a tar member is a symlink."""
if hasattr(info, "issym") and callable(info.issym):
return info.issym() or info.islnk()
return False
class ContainerExtractor(Extractor):
    """Extractor for archive containers (ZIP, TAR, GZ, 7Z).

    Nested containers are resolved recursively. The result is a flat list of
    ContentPart entries carrying containerPath metadata, headed by one root
    part that represents the archive itself.
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Match on a known container MIME type or file-name extension."""
        name = (fileName or "").lower()
        return mimeType in _CONTAINER_MIME_TYPES or name.endswith(tuple(_CONTAINER_EXTENSIONS))

    def getSupportedExtensions(self) -> list[str]:
        """Return a copy of the supported extensions."""
        return [*_CONTAINER_EXTENSIONS]

    def getSupportedMimeTypes(self) -> list[str]:
        """Return a copy of the supported MIME types."""
        return [*_CONTAINER_MIME_TYPES]

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """Unpack the archive recursively and return the root part plus all children.

        If a safety limit is hit, the children gathered by the failed call are
        not added; a ``limit_exceeded`` text part is appended instead.
        """
        fileName = context.get("fileName", "archive")
        mimeType = context.get("mimeType", "application/octet-stream")
        rootId = makeId()
        rootPart = ContentPart(
            id=rootId,
            parentId=None,
            label=fileName,
            typeGroup="container",
            mimeType=mimeType,
            data="",
            metadata={"size": len(fileBytes), "containerType": "archive"},
        )
        parts: List[ContentPart] = [rootPart]
        # Running totals shared across every nesting level.
        state = {"totalSize": 0, "fileCount": 0}
        try:
            parts.extend(
                _resolveContainerRecursive(fileBytes, mimeType, fileName, rootId, "", 0, state)
            )
        except ContainerLimitError as e:
            logger.warning(f"Container limit reached for {fileName}: {e}")
            limitPart = ContentPart(
                id=makeId(),
                parentId=rootId,
                label="limit_exceeded",
                typeGroup="text",
                mimeType="text/plain",
                data=str(e),
                metadata={"warning": "Container extraction limit exceeded"},
            )
            parts.append(limitPart)
        return parts
def _resolveContainerRecursive(
    containerBytes: bytes,
    containerMime: str,
    containerName: str,
    parentId: str,
    containerPath: str,
    depth: int,
    state: Dict[str, int],
) -> List[ContentPart]:
    """Dispatch a container to the matching unpacker. No AI calls.

    Args:
        containerBytes: Raw bytes of the archive.
        containerMime: MIME type reported or guessed for the archive.
        containerName: File name of the archive, used for extension matching.
        parentId: Id of the ContentPart the children attach to.
        containerPath: Path of this container inside its parents ("" at top level).
        depth: Current nesting depth (0 for the outermost archive).
        state: Shared running totals (``totalSize``, ``fileCount``).

    Returns:
        ContentParts of the unpacked entries; empty for unknown formats.

    Raises:
        ContainerLimitError: When ``depth`` exceeds ``MAX_DEPTH``, or when an
            unpacker propagates a limit violation.
    """
    if depth > MAX_DEPTH:
        raise ContainerLimitError(f"Max nesting depth {MAX_DEPTH} exceeded")

    lowerName = (containerName or "").lower()
    # mimetypes.guess_type reports "x.tar.gz" / "x.tgz" as application/x-tar
    # with the gzip part in the *encoding* slot, so a gzip-looking name must
    # win over a plain-tar MIME type.
    gzipName = lowerName.endswith((".gz", ".tgz"))

    if containerMime in ("application/zip", "application/x-zip-compressed") or lowerName.endswith(".zip"):
        return _extractZip(containerBytes, parentId, containerPath, depth, state)
    if (containerMime == "application/x-tar" and not gzipName) or lowerName.endswith(".tar"):
        return _extractTar(containerBytes, parentId, containerPath, depth, state, compressed=False)
    if containerMime in ("application/gzip", "application/x-gzip") or gzipName:
        return _extractTar(containerBytes, parentId, containerPath, depth, state, compressed=True)
    # The 7z MIME type is advertised as supported, so honor it here as well.
    if containerMime == "application/x-7z-compressed" or lowerName.endswith(".7z"):
        return _extract7z(containerBytes, parentId, containerPath, depth, state)

    logger.warning(f"Unknown container format: {containerMime} ({containerName})")
    return []
def _addFilePart(
    data: bytes,
    fileName: str,
    parentId: str,
    containerPath: str,
    state: Dict[str, int],
) -> List[ContentPart]:
    """Turn one unpacked file into ContentParts and account for it in ``state``.

    The running totals are updated first and checked against the size and
    count limits. The file is then handed to the extractor registered for its
    guessed MIME type. If none applies, or it fails, a single base64
    ``binary`` part is returned.

    Raises:
        ContainerLimitError: When the total size or file count limit is exceeded.
    """
    state["totalSize"] += len(data)
    state["fileCount"] += 1
    if state["totalSize"] > MAX_TOTAL_EXTRACTED_SIZE:
        raise ContainerLimitError(f"Total extracted size exceeds {MAX_TOTAL_EXTRACTED_SIZE // (1024 * 1024)} MB")
    if state["fileCount"] > MAX_FILE_COUNT:
        raise ContainerLimitError(f"File count exceeds {MAX_FILE_COUNT}")

    memberPath = fileName if not containerPath else f"{containerPath}/{fileName}"
    memberMime = _detectMimeType(fileName)

    # Imported inside the function rather than at module level.
    from ..subRegistry import ExtractorRegistry
    handler = ExtractorRegistry().resolve(memberMime, fileName)
    if handler and not isinstance(handler, ContainerExtractor):
        try:
            extracted = handler.extract(data, {"fileName": fileName, "mimeType": memberMime})
            for item in extracted:
                item.parentId = parentId
                if not item.metadata:
                    item.metadata = {}
                item.metadata["containerPath"] = memberPath
            return extracted
        except Exception as e:
            # Best effort: fall through to the raw binary part below.
            logger.warning(f"Type-extractor failed for {fileName} in container: {e}")

    import base64
    payload = ""
    if data:
        payload = base64.b64encode(data).decode("utf-8")
    locationRef = ContentContextRef(
        containerPath=memberPath,
        location="file",
    ).model_dump()
    binaryPart = ContentPart(
        id=makeId(),
        parentId=parentId,
        label=fileName,
        typeGroup="binary",
        mimeType=memberMime,
        data=payload,
        metadata={
            "size": len(data),
            "containerPath": memberPath,
            "contextRef": locationRef,
        },
    )
    return [binaryPart]
def _isNestedContainer(fileName: str, mimeType: str) -> bool:
    """Return True when the name has a container extension or the MIME type is a container type."""
    lowered = fileName.lower()
    for ext in _CONTAINER_EXTENSIONS:
        if lowered.endswith(ext):
            return True
    return mimeType in _CONTAINER_MIME_TYPES
def _extractZip(
    data: bytes, parentId: str, containerPath: str, depth: int, state: Dict[str, int]
) -> List[ContentPart]:
    """Unpack a ZIP archive held in memory.

    Directories and zero-length entries are skipped. Nested containers get a
    ``container`` part and are resolved recursively; every other entry goes
    through ``_addFilePart``.

    Args:
        data: Raw bytes of the ZIP archive.
        parentId: Id of the ContentPart the entries attach to.
        containerPath: Path of this archive inside its parents ("" at top level).
        depth: Current nesting depth.
        state: Shared running totals (``totalSize``, ``fileCount``).

    Returns:
        ContentParts for the entries; an ``error`` text part for an invalid
        archive or an unreadable entry.

    Raises:
        ContainerLimitError: When an entry's declared size would exceed the
            total-size budget, or a nested call hits a limit.
    """
    parts: List[ContentPart] = []
    try:
        with zipfile.ZipFile(io.BytesIO(data)) as zf:
            for info in zf.infolist():
                if info.is_dir():
                    continue
                if info.file_size == 0:
                    continue
                # Check the declared uncompressed size BEFORE decompressing so
                # a highly compressed entry cannot balloon in memory first.
                if state["totalSize"] + info.file_size > MAX_TOTAL_EXTRACTED_SIZE:
                    raise ContainerLimitError(
                        f"Total extracted size exceeds {MAX_TOTAL_EXTRACTED_SIZE // (1024 * 1024)} MB"
                    )
                entryPath = f"{containerPath}/{info.filename}" if containerPath else info.filename
                entryMime = _detectMimeType(info.filename)
                try:
                    # Read by ZipInfo, not by name: with duplicate names a
                    # name lookup would always return the last member.
                    entryData = zf.read(info)
                except (RuntimeError, NotImplementedError) as e:
                    # RuntimeError: encrypted entry without a password.
                    # NotImplementedError: unsupported compression method.
                    logger.warning(f"Skipping unreadable ZIP entry {entryPath}: {e}")
                    parts.append(ContentPart(
                        id=makeId(), parentId=parentId, label="error",
                        typeGroup="text", mimeType="text/plain",
                        data=f"Could not read ZIP entry {entryPath}: {e}",
                        metadata={"error": True, "containerPath": entryPath},
                    ))
                    continue
                if _isNestedContainer(info.filename, entryMime):
                    nestedId = makeId()
                    parts.append(ContentPart(
                        id=nestedId,
                        parentId=parentId,
                        label=info.filename,
                        typeGroup="container",
                        mimeType=entryMime,
                        data="",
                        metadata={"size": len(entryData), "containerPath": entryPath},
                    ))
                    nested = _resolveContainerRecursive(
                        entryData, entryMime, info.filename, nestedId, entryPath, depth + 1, state
                    )
                    parts.extend(nested)
                else:
                    parts.extend(_addFilePart(entryData, info.filename, parentId, containerPath, state))
    except zipfile.BadZipFile as e:
        logger.error(f"Invalid ZIP file: {e}")
        parts.append(ContentPart(
            id=makeId(), parentId=parentId, label="error",
            typeGroup="text", mimeType="text/plain",
            data=f"Invalid ZIP archive: {e}", metadata={"error": True},
        ))
    return parts
def _extractTar(
    data: bytes, parentId: str, containerPath: str, depth: int, state: Dict[str, int],
    compressed: bool = False,
) -> List[ContentPart]:
    """Unpack a TAR archive (plain or compressed) held in memory.

    Directories, symbolic/hard links and zero-length members are skipped.
    Nested containers are resolved recursively; every other member goes
    through ``_addFilePart``.

    Args:
        data: Raw bytes of the archive.
        parentId: Id of the ContentPart the members attach to.
        containerPath: Path of this archive inside its parents ("" at top level).
        depth: Current nesting depth.
        state: Shared running totals (``totalSize``, ``fileCount``).
        compressed: Kept for interface compatibility. Compression is now
            auto-detected, so a wrong hint no longer makes a valid archive
            fail to open.

    Returns:
        ContentParts for the members; a single ``error`` text part when the
        archive cannot be read.

    Raises:
        ContainerLimitError: When a member's declared size would exceed the
            total-size budget, or a nested call hits a limit.
    """
    parts: List[ContentPart] = []
    # "r:*" lets tarfile detect the compression itself. The caller's guess
    # comes from MIME/extension sniffing, which reports ".tar.gz" as plain
    # application/x-tar.
    mode = "r:*"
    try:
        with tarfile.open(fileobj=io.BytesIO(data), mode=mode) as tf:
            for member in tf.getmembers():
                if member.isdir():
                    continue
                if _isSymlink(member):
                    logger.warning(f"Skipping symlink in TAR: {member.name}")
                    continue
                if member.size == 0:
                    continue
                # Reject on the declared size before reading the member.
                if state["totalSize"] + member.size > MAX_TOTAL_EXTRACTED_SIZE:
                    raise ContainerLimitError(
                        f"Total extracted size exceeds {MAX_TOTAL_EXTRACTED_SIZE // (1024 * 1024)} MB"
                    )
                entryPath = f"{containerPath}/{member.name}" if containerPath else member.name
                entryMime = _detectMimeType(member.name)
                fobj = tf.extractfile(member)
                if fobj is None:
                    continue
                entryData = fobj.read()
                if _isNestedContainer(member.name, entryMime):
                    nestedId = makeId()
                    parts.append(ContentPart(
                        id=nestedId, parentId=parentId, label=member.name,
                        typeGroup="container", mimeType=entryMime, data="",
                        metadata={"size": len(entryData), "containerPath": entryPath},
                    ))
                    nested = _resolveContainerRecursive(
                        entryData, entryMime, member.name, nestedId, entryPath, depth + 1, state
                    )
                    parts.extend(nested)
                else:
                    parts.extend(_addFilePart(entryData, member.name, parentId, containerPath, state))
    except ContainerLimitError:
        # Must reach the caller untouched, whatever its base class is.
        raise
    except (tarfile.TarError, EOFError, OSError) as e:
        # EOFError/OSError cover truncated or corrupt compressed streams that
        # only fail once member data is actually read.
        logger.error(f"Invalid TAR file: {e}")
        parts.append(ContentPart(
            id=makeId(), parentId=parentId, label="error",
            typeGroup="text", mimeType="text/plain",
            data=f"Invalid TAR archive: {e}", metadata={"error": True},
        ))
    return parts
def _extract7z(
    data: bytes, parentId: str, containerPath: str, depth: int, state: Dict[str, int]
) -> List[ContentPart]:
    """Extract a 7z archive. Requires py7zr (optional dependency).

    Args:
        data: Raw bytes of the 7z archive.
        parentId: Id of the ContentPart the entries attach to.
        containerPath: Path of this archive inside its parents ("" at top level).
        depth: Current nesting depth.
        state: Shared running totals (``totalSize``, ``fileCount``).

    Returns:
        ContentParts for the entries; an ``unsupported`` text part when py7zr
        is missing; an ``error`` text part when the archive cannot be read.

    Raises:
        ContainerLimitError: Propagated unchanged when a size, count or depth
            limit is hit while processing the entries.
    """
    parts: List[ContentPart] = []
    # Import separately so that only a missing py7zr (and no other
    # ImportError raised further down) is reported as "unsupported".
    try:
        import py7zr
    except ImportError:
        logger.warning("py7zr not installed -- 7z files will be treated as binary")
        parts.append(ContentPart(
            id=makeId(), parentId=parentId, label="unsupported",
            typeGroup="text", mimeType="text/plain",
            data="7z extraction requires py7zr package", metadata={"warning": True},
        ))
        return parts

    try:
        with py7zr.SevenZipFile(io.BytesIO(data), mode="r") as szf:
            # NOTE(review): readall() decompresses every entry before any size
            # limit is checked; consider a pre-check on the archive's declared
            # uncompressed size if the py7zr version in use exposes one.
            allFiles = szf.readall()
            for fileName, bio in allFiles.items():
                entryData = bio.read() if hasattr(bio, "read") else bytes(bio)
                if not entryData:
                    continue
                entryPath = f"{containerPath}/{fileName}" if containerPath else fileName
                entryMime = _detectMimeType(fileName)
                if _isNestedContainer(fileName, entryMime):
                    nestedId = makeId()
                    parts.append(ContentPart(
                        id=nestedId, parentId=parentId, label=fileName,
                        typeGroup="container", mimeType=entryMime, data="",
                        metadata={"size": len(entryData), "containerPath": entryPath},
                    ))
                    nested = _resolveContainerRecursive(
                        entryData, entryMime, fileName, nestedId, entryPath, depth + 1, state
                    )
                    parts.extend(nested)
                else:
                    parts.extend(_addFilePart(entryData, fileName, parentId, containerPath, state))
    except ContainerLimitError:
        # A safety limit is not an "invalid archive": let it reach the caller
        # so extraction stops and the limit is reported as such.
        raise
    except Exception as e:
        logger.error(f"Invalid 7z file: {e}")
        parts.append(ContentPart(
            id=makeId(), parentId=parentId, label="error",
            typeGroup="text", mimeType="text/plain",
            data=f"Invalid 7z archive: {e}", metadata={"error": True},
        ))
    return parts

View file

@ -74,19 +74,33 @@ class DocxExtractor(Extractor):
with io.BytesIO(fileBytes) as buf:
d = docx.Document(buf)
# paragraphs
fileName = context.get("fileName", "document.docx")
headingIndex = 0
currentSection = "body"
for i, para in enumerate(d.paragraphs):
text = para.text or ""
if text.strip():
parts.append(ContentPart(
id=makeId(),
parentId=rootId,
label=f"p_{i+1}",
typeGroup="text",
mimeType="text/plain",
data=text,
metadata={"size": len(text.encode('utf-8'))}
))
# tables → CSV rows
if not text.strip():
continue
styleName = (para.style.name or "").lower() if para.style else ""
if "heading" in styleName:
headingIndex += 1
currentSection = f"heading:{headingIndex}"
parts.append(ContentPart(
id=makeId(),
parentId=rootId,
label=f"p_{i+1}",
typeGroup="text",
mimeType="text/plain",
data=text,
metadata={
"size": len(text.encode('utf-8')),
"contextRef": {
"containerPath": fileName,
"location": f"paragraph:{i+1}",
"sectionId": currentSection,
},
}
))
for ti, table in enumerate(d.tables):
rows: list[str] = []
for row in table.rows:
@ -101,7 +115,14 @@ class DocxExtractor(Extractor):
typeGroup="table",
mimeType="text/csv",
data=csvData,
metadata={"size": len(csvData.encode('utf-8'))}
metadata={
"size": len(csvData.encode('utf-8')),
"contextRef": {
"containerPath": fileName,
"location": f"table:{ti+1}",
"sectionId": currentSection,
},
}
))
return parts

View file

@ -0,0 +1,230 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Email extractor for EML and MSG files.
Parses email headers, body (text/html), and attachments.
Attachments are delegated to the ExtractorRegistry for type-specific processing.
Optional dependency: extract-msg (for .msg files).
"""
from typing import Any, Dict, List
import email
import email.policy
import email.utils
import io
import logging
import mimetypes
from modules.datamodels.datamodelExtraction import ContentPart
from ..subUtils import makeId
from ..subRegistry import Extractor
logger = logging.getLogger(__name__)
_EMAIL_MIME_TYPES = [
"message/rfc822",
"application/vnd.ms-outlook",
]
_EMAIL_EXTENSIONS = [".eml", ".msg"]
class EmailExtractor(Extractor):
    """Extractor for email files (EML, MSG).

    Produces:
    - 1 text ContentPart with header metadata (From, To, Subject, Date)
    - 1 text ContentPart per body part (plain text / HTML)
    - Delegated ContentParts for each attachment via ExtractorRegistry
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Return True for a known email MIME type or file extension."""
        if mimeType in _EMAIL_MIME_TYPES:
            return True
        lower = (fileName or "").lower()
        return any(lower.endswith(ext) for ext in _EMAIL_EXTENSIONS)

    def getSupportedExtensions(self) -> list[str]:
        """Return a copy of the supported file extensions."""
        return list(_EMAIL_EXTENSIONS)

    def getSupportedMimeTypes(self) -> list[str]:
        """Return a copy of the supported MIME types."""
        return list(_EMAIL_MIME_TYPES)

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """Parse the email; ``.msg`` names use the MSG parser, everything else the EML parser."""
        # `or` rather than a .get() default so an explicit None also falls
        # back; the name is later used as a ContentPart label.
        fileName = context.get("fileName") or "email"
        if fileName.lower().endswith(".msg"):
            return self._extractMsg(fileBytes, fileName)
        return self._extractEml(fileBytes, fileName)

    @staticmethod
    def _readTextBody(part) -> str:
        """Return the decoded text of a text/* MIME part.

        Falls back to a lossy UTF-8 decode when the declared charset is
        unknown or the content cannot be decoded, instead of aborting the
        whole extraction.
        """
        try:
            body = part.get_content()
        except (LookupError, UnicodeError) as e:
            logger.warning(f"EmailExtractor: could not decode body part, using UTF-8 fallback: {e}")
            raw = part.get_payload(decode=True) or b""
            body = raw.decode("utf-8", errors="replace")
        return str(body) if body else ""

    def _extractEml(self, fileBytes: bytes, fileName: str) -> List[ContentPart]:
        """Parse standard EML (RFC 822) using stdlib email.

        Returns the header part (root), one part per text/plain or text/html
        body, and the delegated parts of every part whose Content-Disposition
        contains "attachment".
        """
        rootId = makeId()
        parts: List[ContentPart] = []
        try:
            msg = email.message_from_bytes(fileBytes, policy=email.policy.default)
        except Exception as e:
            logger.error(f"EmailExtractor: failed to parse EML: {e}")
            return [ContentPart(
                id=rootId, parentId=None, label=fileName,
                typeGroup="text", mimeType="text/plain",
                data=f"Failed to parse email: {e}", metadata={"error": True},
            )]

        headerText = _buildHeaderText(msg)
        parts.append(ContentPart(
            id=rootId, parentId=None, label="headers",
            typeGroup="text", mimeType="text/plain",
            data=headerText, metadata={"emailPart": "headers"},
        ))

        for part in msg.walk():
            # Multipart containers carry no payload of their own; walk()
            # visits their children separately.
            if part.is_multipart():
                continue
            contentType = part.get_content_type()
            disposition = str(part.get("Content-Disposition", ""))

            if "attachment" in disposition:
                attachName = part.get_filename() or "attachment"
                attachData = part.get_payload(decode=True)
                if attachData:
                    parts.extend(_delegateAttachment(attachData, attachName, rootId))
                continue

            if contentType == "text/plain":
                body = self._readTextBody(part)
                if body:
                    parts.append(ContentPart(
                        id=makeId(), parentId=rootId, label="body_text",
                        typeGroup="text", mimeType="text/plain",
                        data=body, metadata={"emailPart": "body"},
                    ))
            elif contentType == "text/html":
                body = self._readTextBody(part)
                if body:
                    parts.append(ContentPart(
                        id=makeId(), parentId=rootId, label="body_html",
                        typeGroup="text", mimeType="text/html",
                        data=body, metadata={"emailPart": "body_html"},
                    ))
        return parts

    def _extractMsg(self, fileBytes: bytes, fileName: str) -> List[ContentPart]:
        """Parse Outlook MSG files using extract-msg (optional).

        Returns a single warning part when extract-msg is not installed and a
        single error part when the file cannot be opened.
        """
        rootId = makeId()
        parts: List[ContentPart] = []
        try:
            import extract_msg
        except ImportError:
            logger.warning("extract-msg not installed -- MSG files will be treated as binary")
            return [ContentPart(
                id=rootId, parentId=None, label=fileName,
                typeGroup="text", mimeType="text/plain",
                data="MSG extraction requires the extract-msg package.",
                metadata={"warning": True},
            )]

        try:
            msgFile = extract_msg.Message(io.BytesIO(fileBytes))
        except Exception as e:
            logger.error(f"EmailExtractor: failed to parse MSG: {e}")
            return [ContentPart(
                id=rootId, parentId=None, label=fileName,
                typeGroup="text", mimeType="text/plain",
                data=f"Failed to parse MSG: {e}", metadata={"error": True},
            )]

        # try/finally so the handle is released even when reading a field or
        # an attachment raises part-way through.
        try:
            headerLines = []
            if msgFile.sender:
                headerLines.append(f"From: {msgFile.sender}")
            if msgFile.to:
                headerLines.append(f"To: {msgFile.to}")
            if getattr(msgFile, "cc", None):
                headerLines.append(f"Cc: {msgFile.cc}")
            if msgFile.subject:
                headerLines.append(f"Subject: {msgFile.subject}")
            if msgFile.date:
                headerLines.append(f"Date: {msgFile.date}")
            parts.append(ContentPart(
                id=rootId, parentId=None, label="headers",
                typeGroup="text", mimeType="text/plain",
                data="\n".join(headerLines), metadata={"emailPart": "headers"},
            ))

            body = msgFile.body
            if body:
                parts.append(ContentPart(
                    id=makeId(), parentId=rootId, label="body_text",
                    typeGroup="text", mimeType="text/plain",
                    data=body, metadata={"emailPart": "body"},
                ))

            htmlBody = getattr(msgFile, "htmlBody", None)
            if htmlBody:
                if isinstance(htmlBody, bytes):
                    htmlBody = htmlBody.decode("utf-8", errors="replace")
                parts.append(ContentPart(
                    id=makeId(), parentId=rootId, label="body_html",
                    typeGroup="text", mimeType="text/html",
                    data=htmlBody, metadata={"emailPart": "body_html"},
                ))

            for attachment in (msgFile.attachments or []):
                attachName = (
                    getattr(attachment, "longFilename", None)
                    or getattr(attachment, "shortFilename", None)
                    or "attachment"
                )
                attachData = getattr(attachment, "data", None)
                if not attachData:
                    continue
                if not isinstance(attachData, (bytes, bytearray)):
                    # NOTE(review): embedded messages appear to expose a parsed
                    # object here rather than bytes; it cannot be delegated or
                    # base64-encoded, so skip it instead of crashing.
                    logger.debug(f"Skipping non-binary MSG attachment: {attachName}")
                    continue
                parts.extend(_delegateAttachment(bytes(attachData), attachName, rootId))
        finally:
            try:
                msgFile.close()
            except Exception as e:
                # Best effort: failing to close must not discard the parts.
                logger.debug(f"EmailExtractor: closing MSG file failed: {e}")
        return parts
def _buildHeaderText(msg) -> str:
"""Build a readable text summary of key email headers."""
lines = []
for header in ("From", "To", "Cc", "Subject", "Date", "Message-ID"):
value = msg.get(header)
if value:
lines.append(f"{header}: {value}")
return "\n".join(lines)
def _delegateAttachment(attachData: bytes, attachName: str, parentId: str) -> List[ContentPart]:
    """Run an attachment through the extractor registered for its type.

    The MIME type is guessed from the attachment name. Parts produced by the
    chosen extractor are re-parented to ``parentId`` and tagged with the
    attachment name. If no extractor applies, or it fails, the attachment is
    returned as one base64 ``binary`` part.
    """
    attachMime = mimetypes.guess_type(attachName)[0] or "application/octet-stream"

    # Imported inside the function rather than at module level.
    from ..subRegistry import ExtractorRegistry
    handler = ExtractorRegistry().resolve(attachMime, attachName)
    if handler and not isinstance(handler, EmailExtractor):
        try:
            extracted = handler.extract(attachData, {"fileName": attachName, "mimeType": attachMime})
            for item in extracted:
                item.parentId = parentId
                if not item.metadata:
                    item.metadata = {}
                item.metadata["emailAttachment"] = attachName
            return extracted
        except Exception as e:
            # Best effort: fall through to the raw binary part below.
            logger.warning(f"Extractor failed for email attachment {attachName}: {e}")

    import base64
    payload = ""
    if attachData:
        payload = base64.b64encode(attachData).decode("utf-8")
    binaryPart = ContentPart(
        id=makeId(), parentId=parentId, label=attachName,
        typeGroup="binary", mimeType=attachMime,
        data=payload,
        metadata={"size": len(attachData), "emailAttachment": attachName},
    )
    return [binaryPart]

View file

@ -0,0 +1,184 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Folder extractor -- treats a local folder reference as a container.
Not registered in the MIME-based ExtractorRegistry (folders have no MIME type).
Instead, called directly by agent tools (browseContainer) when handling folder references.
Applies the same safety limits as ContainerExtractor.
"""
from typing import Any, Dict, List
import logging
import mimetypes
from pathlib import Path
from ..subUtils import makeId
from modules.datamodels.datamodelExtraction import ContentPart
from modules.datamodels.datamodelContent import ContainerLimitError, ContentContextRef
from ..subRegistry import Extractor
logger = logging.getLogger(__name__)
MAX_TOTAL_EXTRACTED_SIZE = 500 * 1024 * 1024
MAX_FILE_COUNT = 10000
MAX_DEPTH = 5
class FolderExtractor(Extractor):
    """Extracts contents from a local folder path.

    This extractor ignores ``fileBytes``; the folder to walk is taken from
    ``context["folderPath"]``. Its ``detect`` always returns False.
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Always False."""
        return False

    def getSupportedExtensions(self) -> list[str]:
        """No extensions are advertised."""
        return []

    def getSupportedMimeTypes(self) -> list[str]:
        """No MIME types are advertised."""
        return []

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """Walk the folder named by ``context["folderPath"]``.

        Returns an empty list when the path is missing or is not a directory.
        Otherwise returns a root ``container`` part followed by everything the
        walk produced, plus a ``limit_exceeded`` text part if a safety limit
        stopped the walk early.
        """
        folderPath = context.get("folderPath", "")
        if not folderPath:
            return []

        folder = Path(folderPath)
        if not folder.is_dir():
            logger.error(f"FolderExtractor: not a directory: {folderPath}")
            return []

        rootId = makeId()
        rootPart = ContentPart(
            id=rootId,
            parentId=None,
            label=folder.name or "folder",
            typeGroup="container",
            mimeType="inode/directory",
            data="",
            metadata={"folderPath": str(folder), "containerType": "folder"},
        )
        parts: List[ContentPart] = [rootPart]
        # Running totals shared across the whole walk.
        state = {"totalSize": 0, "fileCount": 0}
        try:
            # _walkFolder appends into `parts`, so entries collected before a
            # limit error are kept.
            _walkFolder(folder, rootId, "", 0, state, parts)
        except ContainerLimitError as e:
            logger.warning(f"Folder extraction limit reached: {e}")
            limitPart = ContentPart(
                id=makeId(),
                parentId=rootId,
                label="limit_exceeded",
                typeGroup="text",
                mimeType="text/plain",
                data=str(e),
                metadata={"warning": "Folder extraction limit exceeded"},
            )
            parts.append(limitPart)
        return parts
def _walkFolder(
    folder: Path,
    parentId: str,
    containerPath: str,
    depth: int,
    state: Dict[str, int],
    parts: List[ContentPart],
) -> None:
    """Recursively append ContentParts for everything below ``folder`` to ``parts``.

    Entries are visited directories first, then by lower-cased name. Symlinks
    are skipped. Sub-folders get a ``container`` part and are walked
    recursively. Files are counted against the size and count limits, then
    handed to the extractor registered for their guessed MIME type, falling
    back to one base64 ``binary`` part.

    Raises:
        ContainerLimitError: When the depth, total size or file count limit
            is exceeded.
    """
    if depth > MAX_DEPTH:
        raise ContainerLimitError(f"Max folder depth {MAX_DEPTH} exceeded")

    def _dirsFirst(path: Path):
        return (not path.is_dir(), path.name.lower())

    try:
        children = sorted(folder.iterdir(), key=_dirsFirst)
    except PermissionError:
        logger.warning(f"Permission denied: {folder}")
        return

    for child in children:
        if child.is_symlink():
            logger.debug(f"Skipping symlink: {child}")
            continue

        childPath = child.name if not containerPath else f"{containerPath}/{child.name}"

        if child.is_dir():
            folderId = makeId()
            parts.append(ContentPart(
                id=folderId,
                parentId=parentId,
                label=child.name,
                typeGroup="container",
                mimeType="inode/directory",
                data="",
                metadata={"containerPath": childPath, "containerType": "folder"},
            ))
            _walkFolder(child, folderId, childPath, depth + 1, state, parts)
            continue

        if not child.is_file():
            continue

        try:
            fileSize = child.stat().st_size
        except OSError:
            continue

        # Limits are checked on the stat size, before the file is read.
        state["totalSize"] += fileSize
        state["fileCount"] += 1
        if state["totalSize"] > MAX_TOTAL_EXTRACTED_SIZE:
            raise ContainerLimitError(f"Total extracted size exceeds {MAX_TOTAL_EXTRACTED_SIZE // (1024 * 1024)} MB")
        if state["fileCount"] > MAX_FILE_COUNT:
            raise ContainerLimitError(f"File count exceeds {MAX_FILE_COUNT}")

        detectedMime = mimetypes.guess_type(child.name)[0] or "application/octet-stream"

        from ..subRegistry import ExtractorRegistry
        handler = ExtractorRegistry().resolve(detectedMime, child.name)
        if handler and not isinstance(handler, FolderExtractor):
            try:
                fileData = child.read_bytes()
                extracted = handler.extract(fileData, {"fileName": child.name, "mimeType": detectedMime})
                for item in extracted:
                    item.parentId = parentId
                    if not item.metadata:
                        item.metadata = {}
                    item.metadata["containerPath"] = childPath
                parts.extend(extracted)
                continue
            except Exception as e:
                # Best effort: fall through to the raw binary part below.
                logger.warning(f"Type-extractor failed for {child.name}: {e}")

        import base64
        try:
            fileData = child.read_bytes()
            encodedData = base64.b64encode(fileData).decode("utf-8")
        except Exception:
            # Unreadable file: keep the entry, but with empty data.
            encodedData = ""
        locationRef = ContentContextRef(
            containerPath=childPath,
            location="file",
        ).model_dump()
        parts.append(ContentPart(
            id=makeId(),
            parentId=parentId,
            label=child.name,
            typeGroup="binary",
            mimeType=detectedMime,
            data=encodedData,
            metadata={
                "size": fileSize,
                "containerPath": childPath,
                "contextRef": locationRef,
            },
        ))

View file

@ -89,7 +89,15 @@ class PdfExtractor(Extractor):
typeGroup="text",
mimeType="text/plain",
data=text,
metadata={"pages": 1, "pageIndex": i, "size": len(text.encode('utf-8'))}
metadata={
"pages": 1, "pageIndex": i,
"size": len(text.encode('utf-8')),
"contextRef": {
"containerPath": context.get("fileName", "document.pdf"),
"location": f"page:{i+1}",
"pageIndex": i,
},
}
))
except Exception:
continue
@ -114,7 +122,15 @@ class PdfExtractor(Extractor):
typeGroup="text",
mimeType="text/plain",
data=text,
metadata={"pages": 1, "pageIndex": i, "size": len(text.encode('utf-8'))}
metadata={
"pages": 1, "pageIndex": i,
"size": len(text.encode('utf-8')),
"contextRef": {
"containerPath": context.get("fileName", "document.pdf"),
"location": f"page:{i+1}",
"pageIndex": i,
},
}
))
except Exception:
continue
@ -143,7 +159,14 @@ class PdfExtractor(Extractor):
typeGroup="image",
mimeType=f"image/{ext}",
data=base64.b64encode(imgBytes).decode("utf-8"),
metadata={"pageIndex": i, "size": len(imgBytes)}
metadata={
"pageIndex": i, "size": len(imgBytes),
"contextRef": {
"containerPath": context.get("fileName", "document.pdf"),
"location": f"page:{i+1}/image:{j}",
"pageIndex": i,
},
}
))
except Exception:
continue

View file

@ -119,17 +119,22 @@ class PptxExtractor(Extractor):
image_bytes = image.blob
image_b64 = base64.b64encode(image_bytes).decode('utf-8')
# Create image part
fileName = context.get("fileName", "presentation.pptx")
image_part = ContentPart(
id=f"slide_{slide_index}_image_{len(parts)}",
label=f"Slide {slide_index} Image",
typeGroup="image",
mimeType="image/png", # Default to PNG
mimeType="image/png",
data=image_b64,
metadata={
"slide_number": slide_index,
"shape_type": "image",
"extracted_from": "powerpoint"
"extracted_from": "powerpoint",
"contextRef": {
"containerPath": fileName,
"location": f"slide:{slide_index}/image",
"slideIndex": slide_index - 1,
},
}
)
parts.append(image_part)
@ -140,6 +145,7 @@ class PptxExtractor(Extractor):
if slide_content:
slide_text = f"# Slide {slide_index}\n\n" + "\n\n".join(slide_content)
fileName = context.get("fileName", "presentation.pptx")
slide_part = ContentPart(
id=f"slide_{slide_index}",
label=f"Slide {slide_index} Content",
@ -150,7 +156,12 @@ class PptxExtractor(Extractor):
"slide_number": slide_index,
"content_type": "slide",
"extracted_from": "powerpoint",
"text_length": len(slide_text)
"text_length": len(slide_text),
"contextRef": {
"containerPath": fileName,
"location": f"slide:{slide_index}",
"slideIndex": slide_index - 1,
},
}
)
parts.append(slide_part)

View file

@ -0,0 +1,208 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Video extractor for common video formats.
Extracts metadata (duration, resolution, codec, bitrate) and produces
a `videostream` ContentPart. Video data is never base64-encoded due to size.
Optional dependency: mutagen (for rich metadata from MP4/WebM containers).
"""
from typing import Any, Dict, List
import logging
import struct
from modules.datamodels.datamodelExtraction import ContentPart
from ..subUtils import makeId
from ..subRegistry import Extractor
logger = logging.getLogger(__name__)
_VIDEO_MIME_TYPES = [
"video/mp4",
"video/webm",
"video/x-msvideo",
"video/avi",
"video/quicktime",
"video/x-matroska",
"video/x-ms-wmv",
"video/mpeg",
"video/ogg",
]
_VIDEO_EXTENSIONS = [".mp4", ".webm", ".avi", ".mov", ".mkv", ".wmv", ".mpeg", ".mpg", ".ogv"]
class VideoExtractor(Extractor):
    """Extractor for video files.

    Emits a text ContentPart summarizing the metadata and a ``videostream``
    ContentPart whose data is always empty (video bytes are never inlined).
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Match on a known video MIME type or file-name extension."""
        name = (fileName or "").lower()
        return mimeType in _VIDEO_MIME_TYPES or name.endswith(tuple(_VIDEO_EXTENSIONS))

    def getSupportedExtensions(self) -> list[str]:
        """Return a copy of the supported extensions."""
        return [*_VIDEO_EXTENSIONS]

    def getSupportedMimeTypes(self) -> list[str]:
        """Return a copy of the supported MIME types."""
        return [*_VIDEO_MIME_TYPES]

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """Build the metadata summary part and the empty videostream part."""
        fileName = context.get("fileName", "video")
        mimeType = context.get("mimeType") or "video/mp4"
        fileSize = len(fileBytes)
        rootId = makeId()

        meta = _extractMetadata(fileBytes, fileName)
        meta.update({"size": fileSize, "fileName": fileName, "mimeType": mimeType})

        summary = [f"Video file: {fileName}"]
        if meta.get("duration"):
            wholeMinutes, remainder = divmod(meta["duration"], 60)
            mins = int(wholeMinutes)
            secs = int(remainder)
            summary.append(f"Duration: {mins}:{secs:02d}")
        if meta.get("width") and meta.get("height"):
            summary.append(f"Resolution: {meta['width']}x{meta['height']}")
        if meta.get("codec"):
            summary.append(f"Codec: {meta['codec']}")
        if meta.get("bitrate"):
            summary.append(f"Bitrate: {meta['bitrate']} kbps")
        if meta.get("fps"):
            summary.append(f"FPS: {meta['fps']}")
        summary.append(f"Size: {fileSize:,} bytes")

        metadataPart = ContentPart(
            id=rootId, parentId=None, label="metadata",
            typeGroup="text", mimeType="text/plain",
            data="\n".join(summary), metadata=meta,
        )
        streamPart = ContentPart(
            id=makeId(), parentId=rootId, label="videostream",
            typeGroup="videostream", mimeType=mimeType,
            data="", metadata={"size": fileSize, "inlined": False},
        )
        return [metadataPart, streamPart]
def _extractMetadata(fileBytes: bytes, fileName: str) -> Dict[str, Any]:
    """Extract video metadata using mutagen (optional) with a basic header-parser fallback.

    Args:
        fileBytes: Raw bytes of the video file.
        fileName: File name; only its extension is used, to pick the fallback parser.
            None is tolerated (treated as an empty name), matching VideoExtractor.detect().

    Returns:
        Dict with any of "duration" (seconds), "bitrate" (kbps), "width", "height",
        "fps", "codec". Keys whose value is None are omitted. Never raises for
        mutagen problems: they are logged at debug level and the fallback is used.
    """
    meta: Dict[str, Any] = {}
    try:
        import mutagen
        import io

        mediaFile = mutagen.File(io.BytesIO(fileBytes))
        if mediaFile is not None and mediaFile.info:
            info = mediaFile.info
            meta["duration"] = getattr(info, "length", None)
            meta["bitrate"] = getattr(info, "bitrate", None)
            if meta["bitrate"]:
                # mutagen reports bits per second; the summary shows kbps.
                meta["bitrate"] = meta["bitrate"] // 1000

            if hasattr(info, "video"):
                videoStreams = info.video if isinstance(info.video, list) else [info.video]
                for stream in videoStreams:
                    if hasattr(stream, "width"):
                        meta["width"] = stream.width
                    if hasattr(stream, "height"):
                        meta["height"] = stream.height
                    if hasattr(stream, "codec"):
                        meta["codec"] = stream.codec

            width = getattr(info, "width", None)
            height = getattr(info, "height", None)
            if width and height:
                meta["width"] = width
                meta["height"] = height

            fps = getattr(info, "fps", None)
            if fps:
                meta["fps"] = round(fps, 2)

            codec = getattr(info, "codec", None)
            if codec:
                meta["codec"] = codec
    except ImportError:
        logger.debug("mutagen not installed -- using basic video metadata extraction")
    except Exception as e:
        logger.debug(f"mutagen video metadata extraction failed: {e}")

    meta = {k: v for k, v in meta.items() if v is not None}
    if meta:
        return meta

    # mutagen was unavailable, failed, or recognised nothing: fall back to the
    # minimal container-header parsers, chosen by file extension.
    lower = (fileName or "").lower()
    if lower.endswith(".mp4"):
        meta.update(_parseMp4Header(fileBytes))
    elif lower.endswith(".avi"):
        meta.update(_parseAviHeader(fileBytes))
    return {k: v for k, v in meta.items() if v is not None}
def _parseMp4Header(fileBytes: bytes) -> Dict[str, Any]:
"""Minimal MP4 moov/mvhd parser for duration and timescale."""
meta: Dict[str, Any] = {}
try:
pos = 0
while pos < len(fileBytes) - 8:
boxSize = struct.unpack_from(">I", fileBytes, pos)[0]
boxType = fileBytes[pos + 4:pos + 8]
if boxSize < 8:
break
if boxType == b"moov":
meta.update(_parseMoovBox(fileBytes[pos + 8:pos + boxSize]))
break
pos += boxSize
except Exception:
pass
return meta
def _parseMoovBox(data: bytes) -> Dict[str, Any]:
"""Parse moov box to find mvhd with duration."""
meta: Dict[str, Any] = {}
pos = 0
while pos < len(data) - 8:
try:
boxSize = struct.unpack_from(">I", data, pos)[0]
boxType = data[pos + 4:pos + 8]
if boxSize < 8:
break
if boxType == b"mvhd":
version = data[pos + 8]
if version == 0 and pos + 28 < len(data):
timeScale = struct.unpack_from(">I", data, pos + 20)[0]
duration = struct.unpack_from(">I", data, pos + 24)[0]
if timeScale > 0:
meta["duration"] = duration / timeScale
break
pos += boxSize
except Exception:
break
return meta
def _parseAviHeader(fileBytes: bytes) -> Dict[str, Any]:
"""Minimal AVI header parser for resolution."""
meta: Dict[str, Any] = {}
if len(fileBytes) < 72:
return meta
try:
if fileBytes[:4] != b"RIFF" or fileBytes[8:12] != b"AVI ":
return meta
width = struct.unpack_from("<I", fileBytes, 64)[0]
height = struct.unpack_from("<I", fileBytes, 68)[0]
if 0 < width < 100000 and 0 < height < 100000:
meta["width"] = width
meta["height"] = height
except Exception:
pass
return meta

View file

@ -99,6 +99,7 @@ class XlsxExtractor(Extractor):
cells.append(f'"{escaped_value}"')
lines.append(",".join(cells))
csvData = "\n".join(lines)
fileName = context.get("fileName", "spreadsheet.xlsx")
parts.append(ContentPart(
id=makeId(),
parentId=rootId,
@ -106,7 +107,15 @@ class XlsxExtractor(Extractor):
typeGroup="table",
mimeType="text/csv",
data=csvData,
metadata={"sheet": sheetName, "size": len(csvData.encode('utf-8'))}
metadata={
"sheet": sheetName,
"size": len(csvData.encode('utf-8')),
"contextRef": {
"containerPath": fileName,
"location": f"sheet:{sheetName}",
"sheetName": sheetName,
},
}
))
return parts

View file

@ -243,11 +243,7 @@ class ExtractionService:
errorCount=0
)
self._get_service("chat").storeWorkflowStat(
self._context.workflow,
aiResponse,
f"extraction.process.{doc.mimeType}"
)
# Cost is recorded via billingCallback in _callWithModel
# Write extraction results to debug file
try:
@ -1230,15 +1226,52 @@ class ExtractionService:
logger.info(f"Chunking {contentPart.typeGroup} part: contentSize={contentSize} bytes, textChunkSize={textChunkSize} bytes, structureChunkSize={structureChunkSize} bytes")
chunks = chunker.chunk(contentPart, chunkingOptions)
logger.info(f"Created {len(chunks)} chunks for {contentPart.typeGroup} part (contentSize={contentSize} bytes)")
if chunks:
for i, chunk in enumerate(chunks):
chunkSize = len(chunk.get('data', '').encode('utf-8')) if chunk.get('data') else 0
logger.info(f" Chunk {i+1}/{len(chunks)}: {chunkSize} bytes")
return chunks
# Post-chunking validation: force line-based split on any chunk still exceeding target
validatedChunks = []
for i, chunk in enumerate(chunks):
chunkData = chunk.get('data', '')
chunkSize = len(chunkData.encode('utf-8')) if chunkData else 0
if chunkSize > availableContentBytes and chunkData:
logger.warning(f" Chunk {i+1}/{len(chunks)}: {chunkSize} bytes exceeds target {availableContentBytes} bytes, force-splitting by lines")
subChunks = self._forceLineSplit(chunkData, availableContentBytes, len(validatedChunks))
validatedChunks.extend(subChunks)
else:
chunk["order"] = len(validatedChunks)
validatedChunks.append(chunk)
if len(validatedChunks) != len(chunks):
logger.info(f"Post-chunking validation: {len(chunks)} -> {len(validatedChunks)} chunks after force-splitting oversized chunks")
for i, chunk in enumerate(validatedChunks):
chunkSize = len(chunk.get('data', '').encode('utf-8')) if chunk.get('data') else 0
logger.info(f" Chunk {i+1}/{len(validatedChunks)}: {chunkSize} bytes")
return validatedChunks
except Exception as e:
logger.error(f"Chunking failed for {contentPart.typeGroup}: {str(e)}")
return []
def _forceLineSplit(self, data: str, maxBytes: int, startOrder: int) -> List[Dict[str, Any]]:
"""Line-based safety-net split for chunks that still exceed maxBytes after structured chunking."""
chunks: List[Dict[str, Any]] = []
current: List[str] = []
size = 0
for line in data.split('\n'):
s = len(line.encode('utf-8')) + 1
if size + s > maxBytes and current:
text = '\n'.join(current)
chunks.append({"data": text, "size": len(text.encode('utf-8')), "order": startOrder + len(chunks)})
current = [line]
size = s
else:
current.append(line)
size += s
if current:
text = '\n'.join(current)
chunks.append({"data": text, "size": len(text.encode('utf-8')), "order": startOrder + len(chunks)})
return chunks
async def processContentPartWithFallback(self, contentPart, prompt: str, options, failoverModelList, aiObjects, progressCallback=None) -> AiCallResponse:
"""Process a single content part with model-aware chunking and fallback.
@ -1386,73 +1419,210 @@ class ExtractionService:
logger.warning(f"⚠️ Content part ({contentTokens:.0f} tokens est.) exceeds available space ({availableContentBytes/TOKEN_SAFETY_FACTOR:.0f} tokens est.), chunking required")
# If either condition fails, chunk the content
# CRITICAL: IMAGE_GENERATE operations should NOT use chunking - they generate images from prompts, not process content chunks
# CRITICAL: IMAGE_GENERATE operations should NOT use chunking
if (totalTokens > maxTotalTokens or partSize > availableContentBytes) and options.operationType != OperationTypeEnum.IMAGE_GENERATE:
# Part too large or total exceeds limit - chunk it (but not for image generation)
chunks = await self.chunkContentPartForAi(contentPart, model, options, prompt)
if not chunks:
raise ValueError(f"Failed to chunk content part for model {model.name}")
logger.info(f"Starting to process {len(chunks)} chunks with model {model.name}")
# Parallel chunk processing with per-chunk failover
remainingModels = failoverModelList[attempt:]
allChunkResults, allResponses = await self._processChunksParallel(
chunks, prompt, options, remainingModels, aiObjects, progressCallback
)
if progressCallback:
progressCallback(0.0, f"Starting to process {len(chunks)} chunks")
if not allResponses:
raise ValueError("All chunks failed for content part")
chunkResults = []
for idx, chunk in enumerate(chunks):
chunkNum = idx + 1
chunkData = chunk.get('data', '')
logger.info(f"Processing chunk {chunkNum}/{len(chunks)} with model {model.name}")
mergedContent = self.mergePartResults(allResponses, options, [contentPart])
if progressCallback:
progressCallback(chunkNum / len(chunks), f"Processing chunk {chunkNum}/{len(chunks)}")
# Stitch pass: reconcile cross-chunk artifacts when multiple chunks were processed
if len(allResponses) > 1:
mergedContent = await self._stitchChunkResults(
mergedContent, len(allResponses), prompt, options, aiObjects
)
try:
chunkResponse = await aiObjects._callWithModel(model, prompt, chunkData, options)
chunkResults.append(chunkResponse)
except Exception as chunkError:
logger.error(f"Error processing chunk {chunkNum}/{len(chunks)}: {str(chunkError)}")
# Continue with other chunks even if one fails
continue
# Merge chunk results
if not chunkResults:
raise ValueError(f"All chunks failed for content part")
# Pass original contentPart to preserve typeGroup for all chunks (one-to-many: 1 part -> N chunks)
mergedContent = self.mergePartResults(chunkResults, options, [contentPart])
return AiCallResponse(
content=mergedContent,
modelName=model.name,
provider=model.connectorType,
priceCHF=sum(r.priceCHF for r in chunkResults),
processingTime=sum(r.processingTime for r in chunkResults),
bytesSent=sum(r.bytesSent for r in chunkResults),
bytesReceived=sum(r.bytesReceived for r in chunkResults),
errorCount=sum(r.errorCount for r in chunkResults)
priceCHF=sum(r.priceCHF for r in allResponses),
processingTime=sum(r.processingTime for r in allResponses),
bytesSent=sum(r.bytesSent for r in allResponses),
bytesReceived=sum(r.bytesReceived for r in allResponses),
errorCount=sum(r.errorCount for r in allResponses)
)
else:
# Part fits - call AI directly via aiObjects interface
logger.info(f"Content part fits within model limits, processing directly")
# Part fits - call AI directly
logger.info(f"Content part fits within model limits, processing directly")
response = await aiObjects._callWithModel(model, prompt, contentPart.data, options)
logger.info(f"Content part processed successfully with model: {model.name}")
logger.info(f"Content part processed successfully with model: {model.name}")
return response
except Exception as e:
lastError = e
error_msg = str(e) if str(e) else f"{type(e).__name__}"
logger.warning(f"Model {model.name} failed for content part: {error_msg}", exc_info=True)
logger.warning(f"Model {model.name} failed for content part: {error_msg}", exc_info=True)
if attempt < len(failoverModelList) - 1:
logger.info(f"🔄 Trying next failover model...")
logger.info(f"Trying next failover model...")
continue
else:
logger.error(f"💥 All {len(failoverModelList)} models failed for content part")
logger.error(f"All {len(failoverModelList)} models failed for content part")
break
# All models failed
return self._createErrorResponse(f"All models failed: {str(lastError)}", 0, 0)
async def _processChunksParallel(
    self,
    chunks: List[Dict[str, Any]],
    prompt: str,
    options,
    failoverModels: list,
    aiObjects,
    progressCallback=None,
    maxRetries: int = 3
) -> tuple:
    """Process chunks in parallel. On failure, re-chunk only the failed chunks for the next model.

    Each round sends all pending chunks to one model (at most 3 calls in flight).
    Chunks whose call raised are re-chunked for the next failover model and
    retried in the following round, until everything succeeded, the models are
    exhausted, or maxRetries rounds of failover have been used.

    Args:
        chunks: Chunk dicts; "data" is sent to the model, "order" (default: list index)
            defines document order.
        prompt: Prompt passed with every chunk.
        options: Call options forwarded to aiObjects._callWithModel.
        failoverModels: Models to try, in order; the first is used for round 0.
        aiObjects: Object providing the async _callWithModel(model, prompt, data, options).
        progressCallback: Optional callable(progress: float, message: str).
        maxRetries: Maximum number of failover rounds after the first.

    Returns:
        (orderedResponses, allResponses): both are lists of the successful responses.
        orderedResponses is sorted in document order (sub-chunks of a re-chunked
        chunk stay at the position of their parent); allResponses is in completion
        order by round. Both are empty lists when failoverModels is empty.
    """
    if not failoverModels:
        return [], []

    # Order keys are tuples so that re-chunking can nest without collisions:
    # (order, index) for the initial chunks, extended by one sub-index per
    # re-chunk. Tuples sort lexicographically, so children sort directly after
    # their parent's position and before the next sibling. The list index in
    # the key keeps keys unique even if two chunks carry the same "order".
    pendingChunks = [((chunk.get("order", i), i), chunk) for i, chunk in enumerate(chunks)]
    completedResults: Dict[tuple, Any] = {}
    allResponses: List[Any] = []
    retryCount = 0
    modelIdx = 0
    currentModel = failoverModels[modelIdx]

    maxConcurrent = 3
    semaphore = asyncio.Semaphore(maxConcurrent)

    logger.info(f"Starting parallel chunk processing: {len(pendingChunks)} chunks with model {currentModel.name}")

    while pendingChunks and retryCount <= maxRetries and currentModel:
        totalInRound = len(pendingChunks)
        completedInRound = [0]

        # Round-specific values are bound as defaults so tasks never observe a later round's state.
        async def _processOneChunk(chunkData: str, model=currentModel, total=totalInRound, done=completedInRound):
            async with semaphore:
                result = await aiObjects._callWithModel(model, prompt, chunkData, options)
                done[0] += 1
                if progressCallback:
                    progressCallback(done[0] / total, f"Chunk {done[0]}/{total} completed")
                return result

        # A list (not a dict keyed by order) keeps tasks positionally aligned with pendingChunks.
        tasks = [
            asyncio.create_task(_processOneChunk(chunk.get('data', '')))
            for _, chunk in pendingChunks
        ]

        if progressCallback:
            progressCallback(0.0, f"Processing {len(tasks)} chunks in parallel with {currentModel.name}")

        results = await asyncio.gather(*tasks, return_exceptions=True)

        failedChunks = []
        for (order, chunk), result in zip(pendingChunks, results):
            if isinstance(result, BaseException):
                orderLabel = ".".join(str(part) for part in order)
                logger.warning(f"Chunk order={orderLabel} failed with {currentModel.name}: {result}")
                failedChunks.append((order, chunk))
            else:
                completedResults[order] = result
                allResponses.append(result)

        logger.info(f"Round {retryCount}: {len(pendingChunks) - len(failedChunks)}/{len(pendingChunks)} chunks succeeded with {currentModel.name}")

        if not failedChunks:
            break

        retryCount += 1
        modelIdx += 1
        if modelIdx >= len(failoverModels):
            logger.error(f"No more failover models available, {len(failedChunks)} chunks remain failed")
            break

        currentModel = failoverModels[modelIdx]
        logger.info(f"Failover: re-chunking {len(failedChunks)} failed chunks for model {currentModel.name}")

        newPending = []
        for order, failedChunk in failedChunks:
            reChunked = await self._reChunkForModel(failedChunk, currentModel, prompt, options)
            for i, subChunk in enumerate(reChunked):
                newPending.append((order + (i,), subChunk))
        pendingChunks = newPending

    orderedResponses = [completedResults[k] for k in sorted(completedResults.keys())]
    return orderedResponses, allResponses
async def _reChunkForModel(self, chunk: Dict[str, Any], model, prompt: str, options) -> List[Dict[str, Any]]:
    """Split one failed chunk again, this time sized for the given model.

    The chunk's data is wrapped in a temporary ContentPart -- typed as JSON
    structure when the stripped text starts with "{" or "[", as plain text
    otherwise -- and handed to chunkContentPartForAi.

    Returns:
        The new sub-chunks, or [chunk] unchanged when re-chunking yields nothing.
    """
    payload = chunk.get('data', '')
    if payload.strip().startswith(('{', '[')):
        typeGroup, mimeType = "structure", "application/json"
    else:
        typeGroup, mimeType = "text", "text/plain"

    surrogatePart = ContentPart(
        id=f"rechunk_{uuid.uuid4().hex[:8]}",
        label="re-chunk",
        typeGroup=typeGroup,
        mimeType=mimeType,
        data=payload
    )
    pieces = await self.chunkContentPartForAi(surrogatePart, model, options, prompt)
    return pieces if pieces else [chunk]
async def _stitchChunkResults(
    self,
    mergedContent: str,
    chunkCount: int,
    originalPrompt: str,
    options,
    aiObjects
) -> str:
    """Reconcile cross-chunk artifacts in merged content.

    Sends the merged content with a fixed "stitch" instruction through
    aiObjects.callWithTextContext. The pass is best-effort: on an error
    response, an empty response, or any exception the unstitched
    mergedContent is returned unchanged.

    Args:
        mergedContent: Concatenated per-chunk results.
        chunkCount: Number of chunks that were merged (mentioned in the prompt).
        originalPrompt: Original instruction; its first 500 characters are
            appended to the stitch prompt. None is treated as empty.
        options: Accepted for signature compatibility; not used by this method.
        aiObjects: Object providing the async callWithTextContext(request).

    Returns:
        The stitched content, or mergedContent when stitching was not possible.
    """
    if not mergedContent:
        # Nothing to reconcile; avoid a pointless model call.
        return mergedContent

    mergedSize = len(mergedContent.encode('utf-8'))
    promptExcerpt = (originalPrompt or "")[:500]

    stitchPrompt = (
        "The following content was assembled from multiple independently processed "
        f"chunks ({chunkCount} chunks) of the same document. "
        "Review and fix ONLY these issues, preserving all content:\n"
        "1. Cross-references that point to content from other chunks\n"
        "2. Duplicate text at chunk boundaries (remove duplicates)\n"
        "3. Sentences or paragraphs split mid-thought (reconnect them)\n"
        "4. Inconsistent terminology for the same entity\n\n"
        "Do NOT add, remove, or rephrase content beyond these fixes. "
        "Return the corrected content in the same format.\n\n"
        f"Original processing instruction (truncated): {promptExcerpt}"
    )

    try:
        logger.info(f"Running stitch pass on {mergedSize} bytes")
        request = AiCallRequest(
            prompt=stitchPrompt,
            context=mergedContent,
            options=AiCallOptions(operationType=OperationTypeEnum.DATA_ANALYSE)
        )
        response = await aiObjects.callWithTextContext(request)

        if hasattr(response, 'errorCount') and response.errorCount > 0:
            logger.warning(f"Stitch pass returned error: {response.content[:200] if response.content else 'empty'}")
            return mergedContent

        if not response.content:
            # An empty answer must never replace real merged content.
            logger.warning("Stitch pass returned empty content, returning unstitched content")
            return mergedContent

        resultSize = len(response.content.encode('utf-8'))
        logger.info(f"Stitch pass completed: {mergedSize} -> {resultSize} bytes")
        return response.content

    except Exception as e:
        logger.warning(f"Stitch pass failed (non-fatal), returning unstitched content: {e}")
        return mergedContent
def _createErrorResponse(self, errorMsg: str, inputBytes: int, outputBytes: int) -> AiCallResponse:
"""Create an error response."""
return AiCallResponse(
@ -1521,9 +1691,18 @@ class ExtractionService:
progressCallback(0.1 + (partIndex / totalParts) * 0.8, f"Processing {partLabel} ({partType}) - {partIndex+1}/{totalParts}")
try:
# Process the part
partProgressCb = None
if progressCallback:
partStart = 0.1 + (partIndex / totalParts) * 0.8
partRange = 0.8 / totalParts
def _makePartProgressCb(start, rangeSize):
def _cb(chunkProgress, message):
progressCallback(start + chunkProgress * rangeSize, message)
return _cb
partProgressCb = _makePartProgressCb(partStart, partRange)
partResult = await self.processContentPartWithFallback(
contentPart, prompt, options, failoverModelList, aiObjects, None # Don't pass progressCallback to avoid double logging
contentPart, prompt, options, failoverModelList, aiObjects, partProgressCb
)
# Write debug files for generation phase (section content generation)

View file

@ -191,9 +191,11 @@ class ChunkerRegistry:
self.register("table", TableChunker())
self.register("structure", StructureChunker())
self.register("image", ImageChunker())
# Use text chunker for container and binary content
# Use text chunker for container, binary, and media stream content
self.register("container", TextChunker())
self.register("binary", TextChunker())
self.register("audiostream", TextChunker())
self.register("videostream", TextChunker())
except Exception as e:
logger.error(f"ChunkerRegistry: Failed to register chunkers: {str(e)}")
import traceback

View file

@ -0,0 +1,3 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""serviceKnowledge: 3-tier RAG Knowledge Store with semantic search."""

View file

@ -0,0 +1,531 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Knowledge service: 3-tier RAG with indexing, semantic search, and context building."""
import logging
from typing import Any, Callable, Dict, List, Optional
from modules.datamodels.datamodelKnowledge import (
FileContentIndex, ContentChunk, WorkflowMemory,
)
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum
from modules.interfaces.interfaceDbKnowledge import getInterface as getKnowledgeInterface
from modules.shared.timeUtils import getUtcTimestamp
logger = logging.getLogger(__name__)
# Chunk size passed to _chunkForEmbedding by indexFile; compared against len(text), i.e. characters.
DEFAULT_CHUNK_SIZE = 512
# Default contextBudget of buildAgentContext (documented there as a maximum number of characters).
DEFAULT_CONTEXT_BUDGET = 8000
class KnowledgeService:
"""Service for Knowledge Store operations: indexing, retrieval, and context building."""
def __init__(self, context, get_service: Callable[[str], Any]):
    """Bind the service to a request context.

    Args:
        context: Service context; context.user is used to obtain the knowledge DB interface.
        get_service: Resolver that returns another service by name (e.g. "ai", "chat").
    """
    self._context = context
    self._getService = get_service
    currentUser = context.user
    self._knowledgeDb = getKnowledgeInterface(currentUser)
# =========================================================================
# Embedding helper
# =========================================================================
async def _embed(self, texts: List[str]) -> List[List[float]]:
"""Embed texts via the AI interface's generic embedding method."""
aiService = self._getService("ai")
await aiService.ensureAiObjectsInitialized()
aiObjects = aiService.aiObjects
if aiObjects is None:
logger.warning("Embedding skipped: aiObjects not available")
return []
response = await aiObjects.callEmbedding(texts)
if response.errorCount > 0:
logger.error(f"Embedding failed: {response.content}")
return []
return (response.metadata or {}).get("embeddings", [])
async def _embedSingle(self, text: str) -> List[float]:
"""Embed a single text. Returns empty list on failure."""
results = await self._embed([text])
return results[0] if results else []
# =========================================================================
# File Indexing (called after extraction, before embedding)
# =========================================================================
async def indexFile(
    self,
    fileId: str,
    fileName: str,
    mimeType: str,
    userId: str,
    featureInstanceId: str = "",
    mandateId: str = "",
    contentObjects: Optional[List[Dict[str, Any]]] = None,
    structure: Optional[Dict[str, Any]] = None,
    containerPath: Optional[str] = None,
) -> FileContentIndex:
    """Index a file's content objects and create embeddings for text chunks.

    This is the main entry point after non-AI extraction has produced content objects.
    The file status is written as "extracted", then "embedding" (only when text
    objects exist), and finally "indexed".

    Args:
        fileId: The file ID.
        fileName: Original file name.
        mimeType: MIME type.
        userId: Owner user.
        featureInstanceId: Feature instance scope.
        mandateId: Mandate scope.
        contentObjects: List of extracted content objects, each with keys:
            contentType (str), data (str), contextRef (dict), contentObjectId (str).
            A missing or None "data" is treated as an empty string.
        structure: Structural overview of the file.
        containerPath: Path within container if applicable.

    Returns:
        The created FileContentIndex (its status attribute set to "indexed").
    """
    contentObjects = contentObjects or []

    def _dataOf(obj: Dict[str, Any]) -> str:
        # Tolerate an explicit None so size computation and chunking cannot crash.
        return obj.get("data") or ""

    # 1. Create FileContentIndex
    index = FileContentIndex(
        id=fileId,
        userId=userId,
        featureInstanceId=featureInstanceId,
        mandateId=mandateId,
        fileName=fileName,
        mimeType=mimeType,
        containerPath=containerPath,
        totalObjects=len(contentObjects),
        totalSize=sum(len(_dataOf(obj).encode("utf-8")) for obj in contentObjects),
        structure=structure or {},
        objectSummary=[
            {
                "id": obj.get("contentObjectId", ""),
                "type": obj.get("contentType", "other"),
                "size": len(_dataOf(obj).encode("utf-8")),
                "ref": obj.get("contextRef", {}),
            }
            for obj in contentObjects
        ],
        status="extracted",
    )
    self._knowledgeDb.upsertFileContentIndex(index)

    # 2. Chunk text content objects and create embeddings
    textObjects = [
        {**o, "data": _dataOf(o)}
        for o in contentObjects
        if o.get("contentType") == "text"
    ]
    chunks: List[Dict[str, Any]] = []
    if textObjects:
        self._knowledgeDb.updateFileStatus(fileId, "embedding")
        chunks = _chunkForEmbedding(textObjects, chunkSize=DEFAULT_CHUNK_SIZE)
        texts = [c["data"] for c in chunks]
        embeddings = await self._embed(texts) if texts else []

        for i, chunk in enumerate(chunks):
            # When embedding failed or returned fewer vectors, the chunk is stored without one.
            embedding = embeddings[i] if i < len(embeddings) else None
            contentChunk = ContentChunk(
                contentObjectId=chunk["contentObjectId"],
                fileId=fileId,
                userId=userId,
                featureInstanceId=featureInstanceId,
                contentType="text",
                data=chunk["data"],
                contextRef=chunk["contextRef"],
                embedding=embedding,
            )
            self._knowledgeDb.upsertContentChunk(contentChunk)

    # 3. Store non-text content objects (images, etc.) without embedding
    nonTextObjects = [o for o in contentObjects if o.get("contentType") != "text"]
    for obj in nonTextObjects:
        contentChunk = ContentChunk(
            contentObjectId=obj.get("contentObjectId", ""),
            fileId=fileId,
            userId=userId,
            featureInstanceId=featureInstanceId,
            contentType=obj.get("contentType", "other"),
            data=_dataOf(obj),
            contextRef=obj.get("contextRef", {}),
            embedding=None,
        )
        self._knowledgeDb.upsertContentChunk(contentChunk)

    self._knowledgeDb.updateFileStatus(fileId, "indexed")
    index.status = "indexed"
    logger.info(
        f"Indexed file {fileId} ({fileName}): {len(contentObjects)} objects, "
        f"{len(textObjects)} text objects, {len(chunks)} text chunks"
    )
    return index
# =========================================================================
# RAG Context Building (3-tier search)
# =========================================================================
async def buildAgentContext(
    self,
    currentPrompt: str,
    workflowId: str,
    userId: str,
    featureInstanceId: str = "",
    mandateId: str = "",
    contextBudget: int = DEFAULT_CONTEXT_BUDGET,
) -> str:
    """Assemble the RAG context for one agent round from all three layers.

    The prompt is embedded once; the resulting vector drives both semantic
    searches. Sections are handed to a _ContextBuilder with ascending priority
    numbers: instance documents (1), workflow entities (2), shared documents (3).

    Args:
        currentPrompt: The current user prompt to find relevant context for.
        workflowId: Current workflow ID.
        userId: Current user.
        featureInstanceId: Feature instance scope.
        mandateId: Mandate scope.
        contextBudget: Maximum characters for the context string.

    Returns:
        The built context string, or "" when the prompt could not be embedded.
    """
    promptVector = await self._embedSingle(currentPrompt)
    if not promptVector:
        return ""

    contextBuilder = _ContextBuilder(budget=contextBudget)
    store = self._knowledgeDb

    # Layer 1: Instance Layer (user's own documents, highest priority)
    ownHits = store.semanticSearch(
        queryVector=promptVector,
        userId=userId,
        featureInstanceId=featureInstanceId,
        limit=15,
        minScore=0.65,
    )
    if ownHits:
        contextBuilder.add(priority=1, label="Relevant Documents", items=ownHits)

    # Layer 2: Workflow Layer (current workflow entities & memory)
    workflowEntities = store.getWorkflowEntities(workflowId)
    if workflowEntities:
        contextBuilder.add(priority=2, label="Workflow Context", items=workflowEntities, isKeyValue=True)

    # Layer 3: Shared Layer (mandate-wide shared documents)
    sharedHits = store.semanticSearch(
        queryVector=promptVector,
        mandateId=mandateId,
        isShared=True,
        limit=10,
        minScore=0.7,
    )
    if sharedHits:
        contextBuilder.add(priority=3, label="Shared Knowledge", items=sharedHits)

    return contextBuilder.build()
# =========================================================================
# Workflow Memory
# =========================================================================
async def storeEntity(
    self,
    workflowId: str,
    userId: str,
    featureInstanceId: str,
    key: str,
    value: str,
    source: str = "extraction",
) -> WorkflowMemory:
    """Persist a key/value entity in workflow memory.

    The text "<key>: <value>" is embedded; when no vector comes back the
    entity is stored with embedding=None.

    Returns:
        The WorkflowMemory record that was upserted.
    """
    vector = await self._embedSingle(f"{key}: {value}")
    record = WorkflowMemory(
        workflowId=workflowId,
        userId=userId,
        featureInstanceId=featureInstanceId,
        key=key,
        value=value,
        source=source,
        embedding=vector or None,
    )
    self._knowledgeDb.upsertWorkflowMemory(record)
    return record
def getEntities(self, workflowId: str) -> List[Dict[str, Any]]:
    """Return whatever the knowledge DB holds as workflow entities for workflowId."""
    store = self._knowledgeDb
    return store.getWorkflowEntities(workflowId)
# =========================================================================
# File Status
# =========================================================================
def getFileStatus(self, fileId: str) -> Optional[str]:
    """Get the indexing status of a file.

    The index returned by the knowledge DB may be a dict or an object; both
    are supported, consistent with readSection and extractContainerItem.

    Returns:
        The "status" value, or None when the file has no index (or no status).
    """
    index = self._knowledgeDb.getFileContentIndex(fileId)
    if not index:
        return None
    if isinstance(index, dict):
        return index.get("status")
    return getattr(index, "status", None)
def isFileIndexed(self, fileId: str) -> bool:
    """Return True only when the file's status is exactly "indexed"."""
    status = self.getFileStatus(fileId)
    return status == "indexed"
# =========================================================================
# On-Demand Extraction (Smart Document Handling)
# =========================================================================
async def readSection(self, fileId: str, sectionId: str) -> List[Dict[str, Any]]:
    """Read content objects for a specific section. Uses cache if available.

    Stored chunks whose contextRef carries the requested sectionId are returned
    directly. Otherwise the section is looked up in the file index structure
    and its page range is extracted on demand.

    Args:
        fileId: Source file ID.
        sectionId: Section identifier from the FileContentIndex structure.

    Returns:
        List of content object dicts with data and contextRef; [] when the
        file has no index or the section is unknown.
    """
    cached = self._knowledgeDb.getContentChunks(fileId) or []
    # "or {}" guards against chunks stored with an explicit None contextRef.
    sectionChunks = [
        c for c in cached
        if (c.get("contextRef") or {}).get("sectionId") == sectionId
    ]
    if sectionChunks:
        return sectionChunks

    index = self._knowledgeDb.getFileContentIndex(fileId)
    if not index:
        return []

    structure = index.get("structure") if isinstance(index, dict) else getattr(index, "structure", None)
    sections = (structure or {}).get("sections") or []
    section = next((s for s in sections if s.get("id") == sectionId), None)
    if not section:
        return []

    startPage = section.get("startPage", 0)
    endPage = section.get("endPage", startPage)
    return await self._extractPagesOnDemand(fileId, startPage, endPage, sectionId)
async def readContentObjects(
    self, fileId: str, filter: Dict[str, Any] = None
) -> List[Dict[str, Any]]:
    """Read content objects with optional filters (pageIndex, contentType, sectionId).

    Filters are combined with AND; a filter is applied only when its key is
    present in the filter dict.

    Args:
        fileId: Source file ID.
        filter: Optional dict with keys pageIndex (int or list[int]),
            contentType (str), sectionId (str).

    Returns:
        Filtered list of content chunk dicts.
    """
    filter = filter or {}
    chunks = self._knowledgeDb.getContentChunks(fileId) or []

    def _ref(chunk: Dict[str, Any]) -> Dict[str, Any]:
        # Chunks may have been stored with an explicit None contextRef.
        return chunk.get("contextRef") or {}

    if "pageIndex" in filter:
        targetPages = filter["pageIndex"]
        if isinstance(targetPages, int):
            targetPages = [targetPages]
        chunks = [c for c in chunks if _ref(c).get("pageIndex") in targetPages]

    if "contentType" in filter:
        chunks = [c for c in chunks if c.get("contentType") == filter["contentType"]]

    if "sectionId" in filter:
        chunks = [c for c in chunks if _ref(c).get("sectionId") == filter["sectionId"]]

    return chunks
async def extractContainerItem(
    self, fileId: str, containerPath: str
) -> Optional[Dict[str, Any]]:
    """Look up an item inside a container file.

    When the file's index exists and its containerPath equals the requested
    path, that index is returned. In every other case the request is only
    logged and None is returned.

    Args:
        fileId: The container file ID.
        containerPath: Path within the container (e.g. "folder/report.pdf").

    Returns:
        The existing index for the item, or None.
    """
    indexed = self._knowledgeDb.getFileContentIndex(fileId)
    if indexed:
        if isinstance(indexed, dict):
            knownPath = indexed.get("containerPath")
        else:
            knownPath = getattr(indexed, "containerPath", None)
        if knownPath == containerPath:
            return indexed

    logger.info(f"On-demand extraction for {containerPath} in file {fileId}")
    return None
async def _extractPagesOnDemand(
    self, fileId: str, startPage: int, endPage: int, sectionId: str
) -> List[Dict[str, Any]]:
    """Extract specific pages from a file and cache in knowledge store.

    Only PDFs are handled, and only when PyMuPDF (fitz) is importable. Each
    page with non-blank text becomes a ContentChunk (embedded from its first
    2000 characters when an embedding is available) that is upserted into the
    knowledge DB.

    Args:
        fileId: File whose content is fetched through the "chat" service.
        startPage: First page index (0-based) to extract.
        endPage: Last page index (inclusive); clamped to the document length.
        sectionId: Section ID recorded in each chunk's contextRef.

    Returns:
        The stored chunks as dicts (model_dump), or [] for non-PDF input,
        missing content, missing fitz, or any error (which is logged).
    """
    try:
        chatService = self._getService("chat")
        fileContent = chatService.getFileContent(fileId)
        if not fileContent:
            return []

        fileData = fileContent.get("data", b"")
        mimeType = fileContent.get("mimeType", "")
        fileName = fileContent.get("fileName", "")

        if isinstance(fileData, str):
            # String payloads are treated as base64-encoded bytes.
            import base64
            fileData = base64.b64decode(fileData)

        if mimeType != "application/pdf":
            return []

        try:
            import fitz
        except ImportError:
            return []

        doc = fitz.open(stream=fileData, filetype="pdf")
        try:
            results = []
            for pageIdx in range(startPage, min(endPage + 1, len(doc))):
                page = doc[pageIdx]
                text = page.get_text() or ""
                if not text.strip():
                    continue

                chunk = ContentChunk(
                    contentObjectId=f"page-{pageIdx}",
                    fileId=fileId,
                    userId=self._context.user.id if self._context.user else "",
                    featureInstanceId=self._context.feature_instance_id or "",
                    contentType="text",
                    data=text,
                    contextRef={
                        "containerPath": fileName,
                        "location": f"page:{pageIdx+1}",
                        "pageIndex": pageIdx,
                        "sectionId": sectionId,
                    },
                )

                embedding = await self._embedSingle(text[:2000])
                if embedding:
                    chunk.embedding = embedding

                self._knowledgeDb.upsertContentChunk(chunk)
                results.append(chunk.model_dump())
            return results
        finally:
            # Release the document even when embedding or the DB write fails mid-loop.
            doc.close()

    except Exception as e:
        logger.error(f"On-demand page extraction failed: {e}")
        return []
def getFileContentIndex(self, fileId: str) -> Optional[Dict[str, Any]]:
    """Return the knowledge DB's FileContentIndex entry for fileId (pass-through)."""
    store = self._knowledgeDb
    return store.getFileContentIndex(fileId)
# =============================================================================
# Internal helpers
# =============================================================================
def _chunkForEmbedding(
textObjects: List[Dict[str, Any]], chunkSize: int = 512
) -> List[Dict[str, Any]]:
"""Split text content objects into chunks suitable for embedding.
Each chunk preserves the contextRef from its source object.
Long texts are split at sentence boundaries where possible.
"""
chunks = []
for obj in textObjects:
text = obj.get("data", "")
contentObjectId = obj.get("contentObjectId", "")
contextRef = obj.get("contextRef", {})
if len(text) <= chunkSize:
chunks.append({
"data": text,
"contentObjectId": contentObjectId,
"contextRef": contextRef,
})
continue
# Split at sentence boundaries
sentences = text.replace("\n", " ").split(". ")
currentChunk = ""
for sentence in sentences:
candidate = f"{currentChunk}. {sentence}" if currentChunk else sentence
if len(candidate) > chunkSize and currentChunk:
chunks.append({
"data": currentChunk.strip(),
"contentObjectId": contentObjectId,
"contextRef": contextRef,
})
currentChunk = sentence
else:
currentChunk = candidate
if currentChunk.strip():
chunks.append({
"data": currentChunk.strip(),
"contentObjectId": contentObjectId,
"contextRef": contextRef,
})
return chunks
class _ContextBuilder:
"""Assembles RAG context from multiple sources respecting a character budget."""
def __init__(self, budget: int):
self._budget = budget
self._sections: List[Dict[str, Any]] = []
def add(
self,
priority: int,
label: str,
items: List[Dict[str, Any]],
isKeyValue: bool = False,
):
self._sections.append({
"priority": priority,
"label": label,
"items": items,
"isKeyValue": isKeyValue,
})
def build(self) -> str:
self._sections.sort(key=lambda s: s["priority"])
parts = []
remaining = self._budget
for section in self._sections:
if remaining <= 0:
break
header = f"### {section['label']}\n"
sectionText = header
remaining -= len(header)
for item in section["items"]:
if remaining <= 0:
break
if section["isKeyValue"]:
line = f"- {item.get('key', '')}: {item.get('value', '')}\n"
else:
data = item.get("data", "")
ref = item.get("contextRef", {})
score = item.get("_score", "")
refStr = f" [{ref}]" if ref else ""
line = f"{data}{refStr}\n"
if len(line) <= remaining:
sectionText += line
remaining -= len(line)
parts.append(sectionText)
return "\n".join(parts).strip()

View file

@ -0,0 +1,427 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Structure Pre-Scan: fast, AI-free document analysis.
Extracts TOC, headings, page map, image positions, and structural metadata
from documents. Used as the first step in the auto-index pipeline.
Supported formats:
- PDF: TOC, heading detection (font-size heuristic), page map, image positions
- DOCX: heading styles, paragraph map
- PPTX: slide titles, slide map
- XLSX: sheet names, row/column counts
- Other: minimal index (single content object = the file itself)
"""
import io
import logging
from typing import Dict, Any, List, Optional
from modules.datamodels.datamodelKnowledge import FileContentIndex
from modules.datamodels.datamodelContent import ContentObjectSummary, ContentContextRef
logger = logging.getLogger(__name__)
async def preScanDocument(
    fileData: bytes,
    mimeType: str,
    fileId: str,
    fileName: str = "",
    userId: str = "",
    featureInstanceId: str = "",
    mandateId: str = "",
) -> FileContentIndex:
    """Build a structural FileContentIndex for a document without any AI call.

    The scanner is chosen by MIME type first, then by the lower-cased file
    extension, and finally falls back to the minimal scanner. If the scanner
    raises, the error is logged and an index with ``structure={"error": ...}``,
    no object summaries and ``totalSize=len(fileData)`` is returned instead.
    """
    handler = _SCANNER_MAP.get(mimeType)
    if handler is None:
        # Unknown MIME type: decide by the text after the last dot, if any.
        _, dot, suffix = fileName.rpartition(".")
        extension = suffix.lower() if dot else ""
        handler = _EXTENSION_SCANNER_MAP.get(extension, _scanMinimal)

    try:
        scanResult = await handler(fileData, fileName)
        structure, objectSummary, totalObjects, totalSize = scanResult
    except Exception as e:
        logger.error(f"Pre-scan failed for {fileName} ({mimeType}): {e}")
        structure, objectSummary = {"error": str(e)}, []
        totalObjects, totalSize = 0, len(fileData)

    serializedSummaries = [entry.model_dump() for entry in objectSummary]
    return FileContentIndex(
        id=fileId,
        userId=userId,
        featureInstanceId=featureInstanceId,
        mandateId=mandateId,
        fileName=fileName,
        mimeType=mimeType,
        totalObjects=totalObjects,
        totalSize=totalSize,
        structure=structure,
        objectSummary=serializedSummaries,
        status="extracted",
    )
# ---------------------------------------------------------------------------
# PDF scanner
# ---------------------------------------------------------------------------
async def _scanPdf(fileData: bytes, fileName: str):
    """Scan a PDF with PyMuPDF and describe its structure.

    For every page this records detected heading texts, image count, text
    length and the table heuristic result, and emits one text summary per page
    with text plus one image summary per image on the page.

    Returns:
        Tuple ``(structure, summaries, len(summaries), totalSize)`` where
        ``totalSize`` is the sum of the per-page text lengths. Falls back to
        ``_fallbackStructure`` when PyMuPDF is not installed.
    """
    try:
        import fitz
    except ImportError:
        logger.warning("PyMuPDF not installed -- PDF pre-scan unavailable")
        return _fallbackStructure(fileData, fileName)

    doc = fitz.open(stream=fileData, filetype="pdf")
    try:
        toc = doc.get_toc()
        pageMap: List[Dict[str, Any]] = []
        summaries: List[ContentObjectSummary] = []
        totalSize = 0
        objIndex = 0

        for i in range(len(doc)):
            page = doc[i]
            textLen = len(page.get_text())
            blocks = page.get_text("dict", flags=0).get("blocks", [])

            headings = []
            for b in blocks:
                # Only blocks of type 0 are inspected for heading spans.
                if b.get("type") != 0:
                    continue
                for line in b.get("lines", []):
                    for span in line.get("spans", []):
                        if _isHeading(span):
                            headings.append(span.get("text", "").strip())

            images = page.get_images(full=True)
            hasTable = _detectTableHeuristic(page)

            pageMap.append({
                "pageIndex": i,
                "headings": headings,
                "hasImages": len(images) > 0,
                "imageCount": len(images),
                "textLength": textLen,
                "hasTable": hasTable,
            })

            if textLen > 0:
                summaries.append(ContentObjectSummary(
                    id=f"co-{objIndex}",
                    contentType="text",
                    contextRef=ContentContextRef(
                        containerPath=fileName,
                        location=f"page:{i+1}",
                        pageIndex=i,
                    ),
                    charCount=textLen,
                ))
                totalSize += textLen
                objIndex += 1

            for j in range(len(images)):
                summaries.append(ContentObjectSummary(
                    id=f"co-{objIndex}",
                    contentType="image",
                    contextRef=ContentContextRef(
                        containerPath=fileName,
                        location=f"page:{i+1}/image:{j}",
                        pageIndex=i,
                    ),
                ))
                objIndex += 1
    finally:
        # Always release the document, including when a page fails to parse.
        doc.close()

    sections = _buildSectionsFromTocOrHeadings(toc, pageMap)
    structure = {
        "pages": len(pageMap),
        "toc": toc,
        "sections": sections,
        "pageMap": pageMap,
        "imageCount": sum(p.get("imageCount", 0) for p in pageMap),
        "tableCount": sum(1 for p in pageMap if p.get("hasTable")),
    }
    return structure, summaries, len(summaries), totalSize
def _isHeading(span: Dict) -> bool:
"""Heuristic: heading if font size >= 14 or bold + size >= 12."""
size = span.get("size", 0)
flags = span.get("flags", 0)
isBold = bool(flags & (1 << 4))
return size >= 14 or (isBold and size >= 12)
def _detectTableHeuristic(page) -> bool:
"""Detect tables by looking for grid-like line patterns."""
try:
drawings = page.get_drawings()
lineCount = sum(1 for d in drawings if d.get("type") == "l")
return lineCount >= 6
except Exception:
return False
def _buildSectionsFromTocOrHeadings(
toc: list, pageMap: List[Dict]
) -> List[Dict[str, Any]]:
"""Build section boundaries from TOC or heading data."""
sections: List[Dict[str, Any]] = []
if toc:
for i, entry in enumerate(toc):
level, title, pageNum = entry[0], entry[1], entry[2]
endPage = toc[i + 1][2] - 1 if i + 1 < len(toc) else len(pageMap) - 1
sections.append({
"id": f"section-{i}",
"title": title,
"level": level,
"startPage": pageNum - 1,
"endPage": endPage,
})
else:
currentSection = None
for pm in pageMap:
headings = pm.get("headings", [])
if headings:
if currentSection:
currentSection["endPage"] = pm["pageIndex"] - 1
sections.append(currentSection)
currentSection = {
"id": f"section-{len(sections)}",
"title": headings[0],
"level": 1,
"startPage": pm["pageIndex"],
"endPage": pm["pageIndex"],
}
elif currentSection:
currentSection["endPage"] = pm["pageIndex"]
if currentSection:
sections.append(currentSection)
return sections
# ---------------------------------------------------------------------------
# DOCX scanner
# ---------------------------------------------------------------------------
async def _scanDocx(fileData: bytes, fileName: str):
    """Scan a DOCX file: heading-based sections, non-empty paragraphs and tables.

    Returns the tuple ``(structure, summaries, len(summaries), totalSize)``;
    ``totalSize`` sums the character counts of the non-empty paragraphs.
    Falls back to ``_fallbackStructure`` when python-docx is not importable.
    """
    try:
        import docx
    except ImportError:
        return _fallbackStructure(fileData, fileName)

    document = docx.Document(io.BytesIO(fileData))
    summaries: List[ContentObjectSummary] = []
    sections: List[Dict[str, Any]] = []
    totalSize = 0
    openSection = None

    for paraIndex, paragraph in enumerate(document.paragraphs):
        paraText = paragraph.text or ""
        hasContent = bool(paraText.strip())
        styleLabel = (paragraph.style.name or "").lower() if paragraph.style else ""

        if "heading" in styleLabel and hasContent:
            # A non-empty heading paragraph closes the running section.
            if openSection:
                sections.append(openSection)
            # The first digit found in the style name is the level; default 1.
            headingLevel = next((int(ch) for ch in styleLabel if ch.isdigit()), 1)
            openSection = {
                "id": f"section-{len(sections)}",
                "title": paraText.strip(),
                "level": headingLevel,
                "startParagraph": paraIndex,
                "endParagraph": paraIndex,
            }
        elif openSection:
            openSection["endParagraph"] = paraIndex

        if not hasContent:
            continue
        # One summary is added per object, so the next id is the list length.
        summaries.append(ContentObjectSummary(
            id=f"co-{len(summaries)}",
            contentType="text",
            contextRef=ContentContextRef(
                containerPath=fileName,
                location=f"paragraph:{paraIndex+1}",
                sectionId=openSection["id"] if openSection else "body",
            ),
            charCount=len(paraText),
        ))
        totalSize += len(paraText)

    if openSection:
        sections.append(openSection)

    # Tables are listed after all paragraphs, numbered from 1.
    for tableNumber, _table in enumerate(document.tables, start=1):
        summaries.append(ContentObjectSummary(
            id=f"co-{len(summaries)}",
            contentType="text",
            contextRef=ContentContextRef(
                containerPath=fileName,
                location=f"table:{tableNumber}",
            ),
        ))

    structure = {
        "paragraphs": len(document.paragraphs),
        "tables": len(document.tables),
        "sections": sections,
    }
    return structure, summaries, len(summaries), totalSize
# ---------------------------------------------------------------------------
# PPTX scanner
# ---------------------------------------------------------------------------
async def _scanPptx(fileData: bytes, fileName: str):
    """Scan a presentation: per-slide title, text length and picture count.

    Returns the tuple ``(structure, summaries, len(summaries), totalSize)``;
    one text summary is emitted for each slide that contains text. Falls back
    to ``_fallbackStructure`` when python-pptx is not importable.
    """
    try:
        from pptx import Presentation
    except ImportError:
        return _fallbackStructure(fileData, fileName)

    presentation = Presentation(io.BytesIO(fileData))
    summaries: List[ContentObjectSummary] = []
    slideMap: List[Dict[str, Any]] = []
    totalSize = 0

    for slideIndex, slide in enumerate(presentation.slides):
        slideTitle = ""
        charTotal = 0
        pictureCount = 0
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                charTotal += len(shape.text)
                # The first text-frame shape with non-blank text supplies the
                # title, truncated to 80 characters.
                if shape.has_text_frame and not slideTitle:
                    slideTitle = shape.text.strip()[:80]
            # Shape type 13 is counted as an image.
            if shape.shape_type == 13:
                pictureCount += 1

        slideMap.append({
            "slideIndex": slideIndex,
            "title": slideTitle,
            "textLength": charTotal,
            "imageCount": pictureCount,
        })

        if charTotal <= 0:
            continue
        # One summary is added per object, so the next id is the list length.
        summaries.append(ContentObjectSummary(
            id=f"co-{len(summaries)}",
            contentType="text",
            contextRef=ContentContextRef(
                containerPath=fileName,
                location=f"slide:{slideIndex+1}",
                slideIndex=slideIndex,
            ),
            charCount=charTotal,
        ))
        totalSize += charTotal

    structure = {
        "slides": len(presentation.slides),
        "slideMap": slideMap,
    }
    return structure, summaries, len(summaries), totalSize
# ---------------------------------------------------------------------------
# XLSX scanner
# ---------------------------------------------------------------------------
async def _scanXlsx(fileData: bytes, fileName: str):
    """Scan a workbook: sheet names with row and column counts.

    The workbook is opened in openpyxl's read-only mode, which must be closed
    explicitly; closing happens in a ``finally`` so a failure while reading a
    sheet does not leave it open.

    Returns:
        Tuple ``(structure, summaries, len(summaries), totalSize)``. Each
        sheet's ``charCount`` is the estimate ``rows * columns * 10`` and
        ``totalSize`` is the sum of those estimates. Falls back to
        ``_fallbackStructure`` when openpyxl is not importable.
    """
    try:
        import openpyxl
    except ImportError:
        return _fallbackStructure(fileData, fileName)

    wb = openpyxl.load_workbook(io.BytesIO(fileData), data_only=True, read_only=True)
    summaries: List[ContentObjectSummary] = []
    sheetMap: List[Dict[str, Any]] = []
    totalSize = 0
    objIndex = 0
    try:
        for sheetName in wb.sheetnames:
            ws = wb[sheetName]
            # Dimensions may be missing; treat them as 0 in that case.
            rowCount = ws.max_row or 0
            colCount = ws.max_column or 0
            sheetMap.append({
                "sheetName": sheetName,
                "rows": rowCount,
                "columns": colCount,
            })
            estimatedChars = rowCount * colCount * 10
            summaries.append(ContentObjectSummary(
                id=f"co-{objIndex}",
                contentType="text",
                contextRef=ContentContextRef(
                    containerPath=fileName,
                    location=f"sheet:{sheetName}",
                    sheetName=sheetName,
                ),
                charCount=estimatedChars,
            ))
            totalSize += estimatedChars
            objIndex += 1
        sheetCount = len(wb.sheetnames)
    finally:
        wb.close()

    structure = {"sheets": sheetCount, "sheetMap": sheetMap}
    return structure, summaries, len(summaries), totalSize
# ---------------------------------------------------------------------------
# Minimal / fallback scanner
# ---------------------------------------------------------------------------
async def _scanMinimal(fileData: bytes, fileName: str):
    """Scanner of last resort: describe the file as a single opaque object."""
    fallbackResult = _fallbackStructure(fileData, fileName)
    return fallbackResult
def _fallbackStructure(fileData: bytes, fileName: str):
    """Return a one-object result that treats the whole file as a single content object.

    The tuple has the same shape the scanners return:
    ``(structure, [summary], 1, len(fileData))``.
    """
    byteCount = len(fileData)
    wholeFileRef = ContentContextRef(containerPath=fileName, location="file")
    wholeFileSummary = ContentObjectSummary(
        id="co-0",
        contentType="other",
        contextRef=wholeFileRef,
        charCount=byteCount,
    )
    return {"type": "single", "size": byteCount}, [wholeFileSummary], 1, byteCount
# ---------------------------------------------------------------------------
# Scanner map
# ---------------------------------------------------------------------------
# MIME type -> scanner coroutine. preScanDocument consults this map first.
# NOTE(review): "application/vnd.ms-powerpoint" (legacy .ppt) is routed to the
# python-pptx based scanner -- confirm that library can open that format.
_SCANNER_MAP: Dict[str, Any] = {
"application/pdf": _scanPdf,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": _scanDocx,
"application/vnd.openxmlformats-officedocument.presentationml.presentation": _scanPptx,
"application/vnd.ms-powerpoint": _scanPptx,
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": _scanXlsx,
}
# Lower-cased file extension (without the dot) -> scanner coroutine.
# preScanDocument uses this only when the MIME type is not in _SCANNER_MAP;
# extensions missing here fall through to _scanMinimal.
_EXTENSION_SCANNER_MAP: Dict[str, Any] = {
"pdf": _scanPdf,
"docx": _scanDocx,
"pptx": _scanPptx,
"ppt": _scanPptx,
"xlsx": _scanXlsx,
"xlsm": _scanXlsx,
}

View file

@ -375,7 +375,7 @@ USER PROVIDED:
- Language: {language or "Not specified"}
Extract and provide a JSON response with:
1. instruction: Formulate directly, WHAT you want to find on the web. Do not include URLs in the instruction. Good example: "What is the company Xyz doing?". Bad example: "Conduct web research on the company Xyz"
1. instruction: Formulate a concise search query (MAXIMUM 400 characters) stating WHAT you want to find on the web. Do not include URLs in the instruction. Keep it focused on the core question. Good example: "What is the company Xyz doing?". Bad example: "Conduct web research on the company Xyz and find all information about..."
2. urls: Put list of URLs found in the prompt text, and URL's you know, that are relevant to the research
3. needsSearch: true if web search is needed to identify url's to crawl, false if only crawling of provided URLs is wanted
4. maxNumberPages: Recommended number of URLs to crawl (based on research scope, typical: 2-20)

View file

@ -1,13 +1,18 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Services Module.
Central service registry that provides access to shared services.
Service Hub.
Consumer-facing aggregation layer for services, DB interfaces, and runtime state.
IMPORTANT: Import-Regelwerk
- Zentrale Module (wie dieses) dürfen KEINE Feature-Container importieren
Architecture:
- serviceHub delegates service resolution to serviceCenter (DI container)
- serviceHub owns DB interface initialization and runtime state
- serviceCenter knows nothing about serviceHub (one-way dependency)
Import-Regelwerk:
- Zentrale Module (wie dieses) duerfen KEINE Feature-Container importieren
- Feature-spezifische Services werden dynamisch geladen
- Nur Shared Services werden direkt geladen
- Shared Services werden via serviceCenter resolved
"""
import os
@ -23,7 +28,6 @@ if TYPE_CHECKING:
logger = logging.getLogger(__name__)
# Path to feature containers
_FEATURES_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "features")
@ -54,15 +58,19 @@ class PublicService:
])
class Services:
class ServiceHub:
"""
Central Services class providing access to all services.
Consumer-facing aggregation of services, DB interfaces, and runtime state.
Import-Regelwerk:
- Shared Services are loaded directly (from modules/services/)
- Feature-specific Services are loaded dynamically via filename discovery
Services are lazy-resolved via serviceCenter on first access.
DB interfaces and runtime state are initialized eagerly.
Feature services/interfaces are discovered dynamically from features/.
"""
_SERVICE_CENTER_WRAPPING = {
"ai": {"functionsOnly": False},
}
def __init__(self, user: User, workflow: "ChatWorkflow" = None, mandateId: Optional[str] = None, featureInstanceId: Optional[str] = None):
self.user: User = user
self.workflow = workflow
@ -71,7 +79,14 @@ class Services:
self.currentUserPrompt: str = ""
self.rawUserPrompt: str = ""
# Initialize central interfaces
from modules.serviceCenter.context import ServiceCenterContext
self._serviceCenterContext = ServiceCenterContext(
user=user,
workflow=workflow,
mandate_id=mandateId,
feature_instance_id=featureInstanceId,
)
from modules.interfaces.interfaceDbApp import getInterface as getAppInterface
self.interfaceDbApp = getAppInterface(user, mandateId=mandateId)
@ -80,75 +95,40 @@ class Services:
self.rbac = self.interfaceDbApp.rbac if self.interfaceDbApp else None
# ============================================================
# CENTRAL INTERFACE (Chat/Workflow)
# ============================================================
from modules.interfaces.interfaceDbChat import getInterface as getChatInterface
self.interfaceDbChat = getChatInterface(user, mandateId=mandateId, featureInstanceId=featureInstanceId)
# ============================================================
# SHARED SERVICES (from modules/services/)
# ============================================================
from .serviceSharepoint.mainServiceSharepoint import SharepointService
self.sharepoint = PublicService(SharepointService(self))
from .serviceTicket.mainServiceTicket import TicketService
self.ticket = PublicService(TicketService(self))
from .serviceChat.mainServiceChat import ChatService
self.chat = PublicService(ChatService(self))
from .serviceUtils.mainServiceUtils import UtilsService
self.utils = PublicService(UtilsService(self))
from .serviceSecurity.mainServiceSecurity import SecurityService
self.security = PublicService(SecurityService(self))
from .serviceMessaging.mainServiceMessaging import MessagingService
self.messaging = PublicService(MessagingService(self))
from .serviceStreaming.mainServiceStreaming import StreamingService
self.streaming = PublicService(StreamingService(self))
# ============================================================
# AI SERVICES (from modules/services/)
# ============================================================
from .serviceAi.mainServiceAi import AiService
self.ai = PublicService(AiService(self), functionsOnly=False)
from .serviceExtraction.mainServiceExtraction import ExtractionService
self.extraction = PublicService(ExtractionService(self))
from .serviceGeneration.mainServiceGeneration import GenerationService
self.generation = PublicService(GenerationService(self))
from .serviceWeb.mainServiceWeb import WebService
self.web = PublicService(WebService(self))
# ============================================================
# FEATURE INTERFACES (dynamically loaded)
# ============================================================
self._loadFeatureInterfaces()
self._loadFeatureServices()
def __getattr__(self, name: str):
    """Resolve an unknown attribute as a service from serviceCenter and cache it.

    Names starting with an underscore are rejected immediately. A resolved
    service is wrapped in ``PublicService`` (``functionsOnly`` defaults to True
    unless ``_SERVICE_CENTER_WRAPPING`` overrides it) and stored on the
    instance via ``setattr``. A ``KeyError`` raised during resolution is
    reported as ``AttributeError``.
    """
    if name[:1] == "_":
        raise AttributeError(name)
    try:
        from modules.serviceCenter import getService
        resolved = getService(name, self._serviceCenterContext)
        exposeFunctionsOnly = self._SERVICE_CENTER_WRAPPING.get(name, {}).get("functionsOnly", True)
        publicFacade = PublicService(resolved, functionsOnly=exposeFunctionsOnly)
        # Store the wrapper as a regular instance attribute.
        setattr(self, name, publicFacade)
        return publicFacade
    except KeyError:
        raise AttributeError(f"'{type(self).__name__}' has no attribute '{name}'")
def _loadFeatureInterfaces(self):
"""Dynamically load interfaces from feature containers by filename pattern."""
# Find all interfaceFeature*.py files
pattern = os.path.join(_FEATURES_DIR, "*", "interfaceFeature*.py")
for filepath in glob.glob(pattern):
try:
# Extract feature name and interface name
featureDir = os.path.basename(os.path.dirname(filepath))
filename = os.path.basename(filepath)[:-3] # Remove .py
filename = os.path.basename(filepath)[:-3]
# Build module path: modules.features.<feature>.<filename>
modulePath = f"modules.features.{featureDir}.{filename}"
module = importlib.import_module(modulePath)
# Get interface via getInterface()
if hasattr(module, "getInterface"):
interface = module.getInterface(self.user, mandateId=self.mandateId, featureInstanceId=self.featureInstanceId)
# Derive attribute name: interfaceFeatureAiChat -> interfaceDbChat
attrName = filename.replace("interfaceFeature", "interfaceDb")
setattr(self, attrName, interface)
logger.debug(f"Loaded interface: {attrName} from {modulePath}")
@ -157,35 +137,29 @@ class Services:
def _loadFeatureServices(self):
"""Dynamically load services from feature containers by filename pattern."""
# Find all service*/mainService*.py files in feature containers
pattern = os.path.join(_FEATURES_DIR, "*", "service*", "mainService*.py")
for filepath in glob.glob(pattern):
try:
# Extract paths
serviceDir = os.path.basename(os.path.dirname(filepath))
featureDir = os.path.basename(os.path.dirname(os.path.dirname(filepath)))
filename = os.path.basename(filepath)[:-3] # Remove .py
filename = os.path.basename(filepath)[:-3]
# Build module path: modules.features.<feature>.<serviceDir>.<filename>
modulePath = f"modules.features.{featureDir}.{serviceDir}.{filename}"
module = importlib.import_module(modulePath)
# Find service class (ends with "Service")
serviceClass = None
for name in dir(module):
if name.endswith("Service") and not name.startswith("_"):
cls = getattr(module, name)
for attrName in dir(module):
if attrName.endswith("Service") and not attrName.startswith("_"):
cls = getattr(module, attrName)
if isinstance(cls, type):
serviceClass = cls
break
if serviceClass:
# Derive attribute name: serviceAi -> ai, serviceExtraction -> extraction
attrName = serviceDir.replace("service", "").lower()
if not attrName:
attrName = serviceDir.lower()
# Check if it needs functionsOnly=False (for AI service)
functionsOnly = attrName != "ai"
serviceInstance = serviceClass(self)
@ -195,6 +169,10 @@ class Services:
logger.debug(f"Could not load service from {filepath}: {e}")
def getInterface(user: User, workflow: "ChatWorkflow" = None, mandateId: Optional[str] = None, featureInstanceId: Optional[str] = None) -> Services:
"""Get Services instance for the given user, mandate, and feature instance context."""
return Services(user, workflow, mandateId=mandateId, featureInstanceId=featureInstanceId)
# Backward-compatible alias
Services = ServiceHub
def getInterface(user: User, workflow: "ChatWorkflow" = None, mandateId: Optional[str] = None, featureInstanceId: Optional[str] = None) -> ServiceHub:
"""Get ServiceHub instance for the given user, mandate, and feature instance context."""
return ServiceHub(user, workflow, mandateId=mandateId, featureInstanceId=featureInstanceId)

View file

@ -1,166 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
AIChat Feature Container - Main Module.
Handles feature initialization and RBAC catalog registration.
AIChat is the dynamic chat workflow feature that handles:
- AI-powered document processing
- Dynamic workflow execution
- Automation definitions
"""
import logging
from typing import Dict, List, Any
logger = logging.getLogger(__name__)
# Feature metadata: code, localized label (en/de/fr) and icon name.
# These three values are what getFeatureDefinition() returns.
FEATURE_CODE = "chatworkflow"
FEATURE_LABEL = {"en": "Chat Workflow", "de": "Chat-Workflow", "fr": "Workflow de Chat"}
FEATURE_ICON = "mdi-message-cog"
# UI objects for the RBAC catalog. Each entry has an objectKey, a localized
# label (en/de/fr) and a meta dict naming the UI area.
# NOTE(review): the object keys use the "aichat" namespace while FEATURE_CODE
# is "chatworkflow" -- confirm this mismatch is intentional.
UI_OBJECTS = [
{
"objectKey": "ui.feature.aichat.workflows",
"label": {"en": "Workflows", "de": "Workflows", "fr": "Workflows"},
"meta": {"area": "workflows"}
},
{
"objectKey": "ui.feature.aichat.automations",
"label": {"en": "Automations", "de": "Automatisierungen", "fr": "Automatisations"},
"meta": {"area": "automations"}
},
{
"objectKey": "ui.feature.aichat.logs",
"label": {"en": "Logs", "de": "Logs", "fr": "Journaux"},
"meta": {"area": "logs"}
},
]
# Resource objects for the RBAC catalog. Each entry has an objectKey, a
# localized label (en/de/fr) and a meta dict with the HTTP endpoint and method.
RESOURCE_OBJECTS = [
{
"objectKey": "resource.feature.aichat.workflow.start",
"label": {"en": "Start Workflow", "de": "Workflow starten", "fr": "Démarrer workflow"},
"meta": {"endpoint": "/api/chat/playground/start", "method": "POST"}
},
{
"objectKey": "resource.feature.aichat.workflow.stop",
"label": {"en": "Stop Workflow", "de": "Workflow stoppen", "fr": "Arrêter workflow"},
"meta": {"endpoint": "/api/chat/playground/stop/{workflowId}", "method": "POST"}
},
{
"objectKey": "resource.feature.aichat.workflow.delete",
"label": {"en": "Delete Workflow", "de": "Workflow löschen", "fr": "Supprimer workflow"},
"meta": {"endpoint": "/api/chat/playground/workflow/{workflowId}", "method": "DELETE"}
},
]
# Template roles for this feature: admin, editor and viewer, each with a
# roleLabel and a localized description (en/de/fr).
TEMPLATE_ROLES = [
{
"roleLabel": "workflow-admin",
"description": {
"en": "Workflow Administrator - Full access to workflow configuration and execution",
"de": "Workflow-Administrator - Vollzugriff auf Workflow-Konfiguration und Ausführung",
"fr": "Administrateur workflow - Accès complet à la configuration et exécution"
}
},
{
"roleLabel": "workflow-editor",
"description": {
"en": "Workflow Editor - Create and modify workflows",
"de": "Workflow-Editor - Workflows erstellen und bearbeiten",
"fr": "Éditeur workflow - Créer et modifier les workflows"
}
},
{
"roleLabel": "workflow-viewer",
"description": {
"en": "Workflow Viewer - View workflows and execution results",
"de": "Workflow-Betrachter - Workflows und Ausführungsergebnisse einsehen",
"fr": "Visualiseur workflow - Consulter les workflows et résultats"
}
},
]
def getFeatureDefinition() -> Dict[str, Any]:
    """Build the registration record for this feature: its code, label and icon."""
    definition: Dict[str, Any] = {}
    definition["code"] = FEATURE_CODE
    definition["label"] = FEATURE_LABEL
    definition["icon"] = FEATURE_ICON
    return definition
def getUiObjects() -> List[Dict[str, Any]]:
    """Expose the module-level UI_OBJECTS list (the list itself, not a copy)."""
    uiCatalog = UI_OBJECTS
    return uiCatalog
def getResourceObjects() -> List[Dict[str, Any]]:
    """Expose the module-level RESOURCE_OBJECTS list (the list itself, not a copy)."""
    resourceCatalog = RESOURCE_OBJECTS
    return resourceCatalog
def getTemplateRoles() -> List[Dict[str, Any]]:
    """Expose the module-level TEMPLATE_ROLES list (the list itself, not a copy)."""
    roleTemplates = TEMPLATE_ROLES
    return roleTemplates
def registerFeature(catalogService) -> bool:
    """
    Push this feature's UI and resource objects into the RBAC catalog.

    Args:
        catalogService: Object providing ``registerUiObject`` and
            ``registerResourceObject``; both are called with the keyword
            arguments ``featureCode``, ``objectKey``, ``label`` and ``meta``.

    Returns:
        True when every object was registered; False when any call raised
        (the error is logged and registration stops at that point).
    """
    try:
        # UI objects first, then resource objects.
        for entry in UI_OBJECTS:
            catalogService.registerUiObject(
                featureCode=FEATURE_CODE,
                objectKey=entry["objectKey"],
                label=entry["label"],
                meta=entry.get("meta"),
            )
        for entry in RESOURCE_OBJECTS:
            catalogService.registerResourceObject(
                featureCode=FEATURE_CODE,
                objectKey=entry["objectKey"],
                label=entry["label"],
                meta=entry.get("meta"),
            )
        logger.info(f"Feature '{FEATURE_CODE}' registered {len(UI_OBJECTS)} UI objects and {len(RESOURCE_OBJECTS)} resource objects")
    except Exception as e:
        logger.error(f"Failed to register feature '{FEATURE_CODE}': {e}")
        return False
    else:
        return True
async def onStart(eventUser) -> None:
    """
    Startup hook of the feature container.

    Asks the model registry to make sure its AI connectors are registered.
    Any failure is logged and swallowed, so this hook itself does not raise.
    ``eventUser`` is accepted but not used here.
    """
    try:
        # Imported inside the function, so the registry module is loaded here.
        from modules.aicore.aicoreModelRegistry import modelRegistry as registry
        registry.ensureConnectorsRegistered()
        logger.info(f"Feature '{FEATURE_CODE}' started - AI connectors initialized")
    except Exception as e:
        logger.error(f"Feature '{FEATURE_CODE}' failed to initialize AI connectors: {e}")
async def onStop(eventUser) -> None:
    """Shutdown hook of the feature container; it only logs that the feature stopped."""
    stopMessage = f"Feature '{FEATURE_CODE}' stopped"
    logger.info(stopMessage)

File diff suppressed because it is too large Load diff

View file

@ -1,513 +0,0 @@
================================================================================
JSON MERGE OPERATION #1
================================================================================
Timestamp: 2026-01-06T22:24:33.405726
INPUT:
Accumulated length: 40250 chars
New Fragment length: 2471 chars
Accumulated: 373 lines (showing first 5 and last 5)
{
"elements": [
{
"type": "table",
"content": {
... (363 lines omitted) ...
["04.12.25", "05.12.25", "TICKETCORNER CH, RUEMLANG", "CH", "285.70", "", "**** **** **** 1234"],
["05.12.25", "05.12.25", "NOII.CH DATING, WINTERTHUR", "CH", "74.10", "", "**** **** **** 1234"],
["06.12.25", "08.12.25", "TEMU.COM, BASEL", "CH", "72.50", "", "**** **** **** 1234"],
["06.12.25", "08.12.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "326.85", "", "**** **** **** 1234"],
["06.12.25", "08.12.25", "Decathlon, Hin
New Fragment: 33 lines (showing first 5 and last 5)
```json
["06.12.25", "08.12.25", "Decathlon, Hinwil", "CH", "150.00", "", "**** **** **** 1234"],
["07.12.25", "09.12.25", "Migros-Genossenschafts-Bund, Zürich", "CH", "200.00", "", "**** **** **** 1234"],
["08.12.25", "10.12.25", "Zürich HB, Zürich", "CH", "45.00", "", "**** **** **** 1234"],
["09.12.25", "11.12.25", "Amazon Marketplace, amazon.de", "DE", "120.00", "", "**** **** **** 1234"],
... (23 lines omitted) ...
}
}
]
}
```
Normalized Accumulated (40250 chars)
(showing first 5 and last 5 of 373 lines)
{
"elements": [
{
"type": "table",
"content": {
... (363 lines omitted) ...
["04.12.25", "05.12.25", "TICKETCORNER CH, RUEMLANG", "CH", "285.70", "", "**** **** **** 1234"],
["05.12.25", "05.12.25", "NOII.CH DATING, WINTERTHUR", "CH", "74.10", "", "**** **** **** 1234"],
["06.12.25", "08.12.25", "TEMU.COM, BASEL", "CH", "72.50", "", "**** **** **** 1234"],
["06.12.25", "08.12.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "326.85", "", "**** **** **** 1234"],
["06.12.25", "08.12.25", "Decathlon, Hin
Normalized New Fragment (2459 chars)
(showing first 5 and last 5 of 31 lines)
["06.12.25", "08.12.25", "Decathlon, Hinwil", "CH", "150.00", "", "**** **** **** 1234"],
["07.12.25", "09.12.25", "Migros-Genossenschafts-Bund, Zürich", "CH", "200.00", "", "**** **** **** 1234"],
["08.12.25", "10.12.25", "Zürich HB, Zürich", "CH", "45.00", "", "**** **** **** 1234"],
["09.12.25", "11.12.25", "Amazon Marketplace, amazon.de", "DE", "120.00", "", "**** **** **** 1234"],
["10.12.25", "12.12.25", "IKEA, Dietlikon", "CH", "350.00", "", "**** **** **** 1234"],
... (21 lines omitted) ...
]
}
}
]
}
STEP: PHASE 1
Description: Finding overlap between JSON strings
⏳ In progress...
Overlap Detection (string (exact)):
Overlap length: 40
✅ Found overlap of 40 chars
Accumulated suffix (COMPLETE, 40 chars):
============================================================================
["06.12.25", "08.12.25", "Decathlon, Hin
============================================================================
Fragment prefix (40 chars, 1 lines)
["06.12.25", "08.12.25", "Decathlon, Hin
Overlap found (40 chars):
Accumulated suffix: ["06.12.25", "08.12.25", "Decathlon, Hin
Fragment prefix: ["06.12.25", "08.12.25", "Decathlon, Hin
STEP: PHASE 2
Description: Merging strings (overlap: 40 chars)
⏳ In progress...
Merged String (42669 chars)
(showing first 5 and last 5 of 403 lines)
{
"elements": [
{
"type": "table",
"content": {
... (393 lines omitted) ...
]
}
}
]
}
STEP: PHASE 3
Description: Returning merged string (may be unclosed)
⏳ In progress...
Returning merged string (preserving incomplete element at end for next iteration)
================================================================================
MERGE RESULT: ✅ SUCCESS
================================================================================
Final result length: 42669 chars
Final result (COMPLETE):
================================================================================
{
"elements": [
{
"type": "table",
"content": {
"headers": [
"Date",
"Valuta",
"Details",
"Currency",
"Amount",
"Amount in CHF",
"Maskierte Kreditkarte"
],
"rows": [
["12.09.25", "15.09.25", "Coop-1911 Ruti, Ruti ZH", "CH", "102.05", "", "**** **** **** 1234"],
["13.09.25", "15.09.25", "food & drive GmbH, Durnten", "CH", "26.20", "", "**** **** **** 1234"],
["13.09.25", "15.09.25", "food & drive GmbH, Durnten", "CH", "4.50", "", "**** **** **** 1234"],
["13.09.25", "15.09.25", "Gartencenter Meier, Durnten", "CH", "88.40", "", "**** **** **** 1234"],
["13.09.25", "15.09.25", "Google PixVerse AI Vi, 650-2530000", "US", "5.20", "", "**** **** **** 1234"],
["14.09.25", "15.09.25", "THE ARTISAN ZUERICH, ZUERICH", "CH", "15.00", "", "**** **** **** 1234"],
["14.09.25", "15.09.25", "THE ARTISAN ZUERICH, ZUERICH", "CH", "15.00", "", "**** **** **** 1234"],
["14.09.25", "15.09.25", "THE ARTISAN ZUERICH, ZUERICH", "CH", "18.00", "", "**** **** **** 1234"],
["14.09.25", "15.09.25", "KONDITOREI VOLAND WALD, WALD ZH", "CH", "16.50", "", "**** **** **** 1234"],
["14.09.25", "15.09.25", "GITHUB, INC., GITHUB.COM", "US", "USD 0.02", "0.00", "**** **** **** 1234"],
["15.09.25", "16.09.25", "Coop-1252 Wald, Wald ZH", "CH", "50.80", "", "**** **** **** 1234"],
["15.09.25", "16.09.25", "CLAUDE.AI SUBSCRIPTION, ANTHROPIC.COM", "US", "USD 108.10", "88.60", "**** **** **** 1234"],
["16.09.25", "17.09.25", "Shell Waldhof, Wald ZH", "CH", "113.35", "", "**** **** **** 1234"],
["16.09.25", "17.09.25", "Shell Waldhof, Wald ZH", "CH", "3.60", "", "**** **** **** 1234"],
["18.09.25", "19.09.25", "Coop-4991 Fallanden, Fallanden", "CH", "116.00", "", "**** **** **** 1234"],
["18.09.25", "19.09.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "5.95", "", "**** **** **** 1234"],
["18.09.25", "19.09.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "7.00", "", "**** **** **** 1234"],
["19.09.25", "22.09.25", "Migros M Dubendorf Stettb, Dubendorf", "CH", "32.10", "", "**** **** **** 1234"],
["19.09.25", "22.09.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "14.80", "", "**** **** **** 1234"],
["19.09.25", "22.09.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "370.65", "", "**** **** **** 1234"],
["19.09.25", "22.09.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "11.50", "", "**** **** **** 1234"],
["20.09.25", "22.09.25", "Google PixVerse AI Vi, 650-2530000", "US", "5.20", "", "**** **** **** 1234"],
["21.09.25", "22.09.25", "Kreuzwirt, Weissensee", "AT", "EUR 278.00", "266.50", "**** **** **** 1234"],
["23.09.25", "24.09.25", "FILIALE, WALD ZH", "CH", "EUR 500.00", "492.15", "**** **** **** 1234"],
["24.09.25", "25.09.25", "P2 Parkhaus Ein- & Ausfah, Zurich", "CH", "5.00", "", "**** **** **** 1234"],
["24.09.25", "25.09.25", "A.I.R. Bakery, Zurich", "CH", "18.60", "", "**** **** **** 1234"],
["24.09.25", "25.09.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "23.35", "", "**** **** **** 1234"],
["25.09.25", "26.09.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "203.20", "", "**** **** **** 1234"],
["25.09.25", "26.09.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "44.10", "", "**** **** **** 1234"],
["26.09.25", "29.09.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "95.25", "", "**** **** **** 1234"],
["26.09.25", "29.09.25", "Puls Apotheke & Drogerie, Hinwil", "CH", "140.60", "", "**** **** **** 1234"],
["26.09.25", "29.09.25", "FILIALE, WALD ZH", "CH", "CHF 280.00", "287.00", "**** **** **** 1234"],
["27.09.25", "29.09.25", "NYX*LullySA, Lully", "CH", "1.00", "", "**** **** **** 1234"],
["27.09.25", "29.09.25", "Kisoque de Lully, Lully", "CH", "5.70", "", "**** **** **** 1234"],
["27.09.25", "29.09.25", "TOTAL MKT FR, NANTERRE", "FR", "EUR 79.95", "76.90", "**** **** **** 1234"],
["27.09.25", "29.09.25", "AREA NFC 4261525, 69BRON CEDEX", "FR", "EUR 33.50", "32.20", "**** **** **** 1234"],
["27.09.25", "29.09.25", "HOLIDAY APARTMENTS, PORT SAPLAYA", "ES", "EUR 1'118.15", "1'075.45", "**** **** **** 1234"],
["27.09.25", "29.09.25", "LE BISTROT DEL M, MEZE", "FR", "EUR 210.20", "202.15", "**** **** **** 1234"],
["27.09.25", "29.09.25", "Google PixVerse AI Vi, 650-2530000", "US", "5.20", "", "**** **** **** 1234"],
["28.09.25", "29.09.25", "SELVA HOSTELERIA, MACANET DE LA", "ES", "EUR 2.40", "2.30", "**** **** **** 1234"],
["28.09.25", "29.09.25", "E.S. LA SELVA, COMAJULIANA", "ES", "EUR 90.09", "86.65", "**** **** **** 1234"],
["28.09.25", "29.09.25", "E.S. LA SELVA, COMAJULIANA", "ES", "EUR 4.70", "4.50", "**** **** **** 1234"],
["28.09.25", "29.09.25", "E.S. LA SELVA, COMAJULIANA", "ES", "EUR 8.40", "8.10", "**** **** **** 1234"],
["28.09.25", "29.09.25", "AUTOROUTES ASF, VEDENE CEDEX", "FR", "EUR 15.60", "15.00", "**** **** **** 1234"],
["27.09.25", "29.09.25", "AUTOROUTES ASF, VEDENE CEDEX", "FR", "EUR 24.40", "23.45", "**** **** **** 1234"],
["29.09.25", "30.09.25", "OROMARKET SUPERMERCADOS, OROPESA", "ES", "EUR 17.32", "16.65", "**** **** **** 1234"],
["29.09.25", "30.09.25", "QUESADA CENTER SUPERMERCA, OROPESA", "ES", "EUR 40.40", "38.85", "**** **** **** 1234"],
["29.09.25", "30.09.25", "QUESADA CENTER SUPERMERCA, OROPESA", "ES", "EUR 22.55", "21.70", "**** **** **** 1234"],
["29.09.25", "30.09.25", "ALDI OROPESA, OROPESA", "ES", "EUR 129.39", "124.40", "**** **** **** 1234"],
["30.09.25", "01.10.25", "QUESADA CENTER, OROPESA DEL M", "ES", "EUR 84.05", "80.95", "**** **** **** 1234"],
["30.09.25", "01.10.25", "PASSION CREPES, OROPESA", "ES", "EUR 10.30", "9.90", "**** **** **** 1234"],
["30.09.25", "01.10.25", "MERCADONA MARINA DOR, ORPESA DEL MA", "ES", "EUR 17.53", "16.90", "**** **** **** 1234"],
["30.09.25", "01.10.25", "Restaurante DRAGON, OROPESA", "ES", "EUR 75.00", "72.25", "**** **** **** 1234"],
["30.09.25", "01.10.25", "OPENAI *CHATGPT SUBSCR, OPENAI.COM", "US", "USD 216.20", "177.55", "**** **** **** 1234"],
["01.10.25", "02.10.25", "GOOGLE *ADS5192965135, cc§google.com", "IE", "29.60", "", "**** **** **** 1234"],
["01.10.25", "02.10.25", "RTE PUERTA DEL SOL, OROPESA DEL M", "ES", "EUR 169.20", "163.10", "**** **** **** 1234"],
["01.10.25", "02.10.25", "TREN TURISTICO OROPESA, OROPESA DEL M", "ES", "EUR 15.00", "14.45", "**** **** **** 1234"],
["01.10.25", "02.10.25", "LANGDOCK GMBH, BERLIN", "DE", "EUR 25.00", "24.10", "**** **** **** 1234"],
["01.10.25", "02.10.25", "WWW.PERPLEXITY.AI, WWW.PERPLEXIT", "US", "USD 10.81", "8.90", "**** **** **** 1234"],
["02.10.25", "06.10.25", "GOOGLE *YouTubePremium, g.co/helppay#", "GB", "33.90", "", "**** **** **** 1234"],
["02.10.25", "06.10.25", "WILLY LA CONCHA, OROPESA DEL M", "ES", "EUR 98.93", "95.40", "**** **** **** 1234"],
["03.10.25", "06.10.25", "Netflix.com, Los Gatos", "NL", "20.90", "", "**** **** **** 1234"],
["03.10.25", "06.10.25", "COALIMENT LA CONCHA, OROPESA DEL M", "ES", "EUR 11.74", "11.30", "**** **** **** 1234"],
["03.10.25", "06.10.25", "DONA RESU, OROPESA", "ES", "EUR 7.30", "7.05", "**** **** **** 1234"],
["04.10.25", "06.10.25", "MERCADONA MARINA DOR, ORPESA DEL MA", "ES", "EUR 89.50", "86.30", "**** **** **** 1234"],
["04.10.25", "06.10.25", "QUESADA CENTER SUPERMERCA, OROPESA", "ES", "EUR 8.45", "8.15", "**** **** **** 1234"],
["04.10.25", "06.10.25", "HELADERIA LAS DELICIAS, OROPESA DEL M", "ES", "EUR 10.80", "10.40", "**** **** **** 1234"],
["04.10.25", "06.10.25", "REST. BISTROT, OROPESA DEL M", "ES", "EUR 117.90", "113.70", "**** **** **** 1234"],
["04.10.25", "06.10.25", "Google PixVerse AI Vi, 650-2530000", "US", "5.20", "", "**** **** **** 1234"],
["04.10.25", "06.10.25", "Google Duolingo Langu, 650-2530000", "US", "10.00", "", "**** **** **** 1234"],
["05.10.25", "06.10.25", "HABANA, OROPESA", "ES", "EUR 3.00", "2.90", "**** **** **** 1234"],
["05.10.25", "06.10.25", "HABANA, OROPESA", "ES", "EUR 9.00", "8.70", "**** **** **** 1234"],
["05.10.25", "06.10.25", "RESTAURANTE, ORPESA", "ES", "EUR 87.75", "84.60", "**** **** **** 1234"],
["05.10.25", "06.10.25", "HABANA, OROPESA", "ES", "EUR 15.50", "14.95", "**** **** **** 1234"],
["06.10.25", "07.10.25", "HABANA, OROPESA", "ES", "EUR 25.00", "24.05", "**** **** **** 1234"],
["06.10.25", "07.10.25", "QUESADA CENTER SUPERMERCA, OROPESA", "ES", "EUR 3.95", "3.80", "**** **** **** 1234"],
["06.10.25", "07.10.25", "QUESADA CENTER SUPERMERCA, OROPESA", "ES", "EUR 47.75", "45.95", "**** **** **** 1234"],
["07.10.25", "08.10.25", "MAGIC SPORT HALL OLYMPICS, OROPESA DEL M", "ES", "EUR 183.75", "176.70", "**** **** **** 1234"],
["07.10.25", "08.10.25", "BKG*HOTEL AT BOOKING.C, (888)850-3958", "NL", "EUR 172.55", "165.90", "**** **** **** 1234"],
["07.10.25", "08.10.25", "Wondershare, Hong Kong", "HK", "25.95", "", "**** **** **** 1234"],
["07.10.25", "08.10.25", "MERCADONA MARINA DOR, ORPESA DEL MA", "ES", "EUR 99.13", "95.30", "**** **** **** 1234"],
["07.10.25", "08.10.25", "RECEP HOTEL MAGIC SPORTS, OROPESA DEL M", "ES", "EUR 10.00", "9.60", "**** **** **** 1234"],
["07.10.25", "08.10.25", "Google One, 650-2530000", "US", "10.00", "", "**** **** **** 1234"],
["08.10.25", "09.10.25", "E.S.LA CLARIANA, MADRID", "ES", "EUR 98.07", "94.00", "**** **** **** 1234"],
["08.10.25", "09.10.25", "AUTOROUTES ASF, VEDENE CEDEX", "FR", "EUR 44.20", "42.35", "**** **** **** 1234"],
["08.10.25", "09.10.25", "A.R.E.A., 69671", "FR", "EUR 11.20", "10.75", "**** **** **** 1234"],
["09.10.25", "10.10.25", "SOCAR station-service, Bursins", "CH", "113.10", "", "**** **** **** 1234"],
["09.10.25", "10.10.25", "SOCAR station-service, Bursins", "CH", "6.80", "", "**** **** **** 1234"],
["09.10.25", "10.10.25", "A.R.E.A., 69671", "FR", "EUR 15.00", "14.40", "**** **** **** 1234"],
["08.10.25", "10.10.25", "DOMAINE DE ROZAN, LA TRONCHE", "FR", "EUR 110.00", "105.45", "**** **** **** 1234"],
["09.10.25", "10.10.25", "DOMAINE DE ROZAN, LA TRONCHE", "FR", "EUR 40.00", "38.35", "**** **** **** 1234"],
["10.10.25", "13.10.25", "Coop-1252 Wald, Wald ZH", "CH", "164.85", "", "**** **** **** 1234"],
["10.10.25", "13.10.25", "CURSOR, AI POWERED IDE, CURSOR.COM", "US", "USD 20.00", "16.60", "**** **** **** 1234"],
["11.10.25", "13.10.25", "Google PixVerse AI Vi, 650-2530000", "US", "5.20", "", "**** **** **** 1234"],
["12.10.25", "13.10.25", "Cafe Konditorei Voland, Laupen ZH", "CH", "37.70", "", "**** **** **** 1234"],
["12.10.25", "13.10.25", "KONDITOREI VOLAND LAUP, LAUPEN ZH", "CH", "17.35", "", "**** **** **** 1234"],
["12.10.25", "13.10.25", "KONDITOREI VOLAND LAUP, LAUPEN ZH", "CH", "5.40", "", "**** **** **** 1234"],
["12.09.25", "15.09.25", "SBB Bahnhof Ruti ZH, Ruti ZH", "CH", "54.00", "", "**** **** **** 1234"],
["12.09.25", "15.09.25", "Rest Volkshaus, Zurich", "CH", "18.00", "", "**** **** **** 1234"],
["12.09.25", "15.09.25", "Sora Sushi - HB Zurich, Zurich", "CH", "74.00", "", "**** **** **** 1234"],
["12.09.25", "15.09.25", "176.32 avec, Ruti ZH", "CH", "2.45", "", "**** **** **** 1234"],
["12.09.25", "15.09.25", "Baradox AG, Zurich", "CH", "15.00", "", "**** **** **** 1234"],
["12.09.25", "15.09.25", "Volkshausstiftung Zurich, Zurich", "CH", "3.00", "", "**** **** **** 1234"],
["13.09.25", "15.09.25", "SBB Bahnhof Ruti ZH, Ruti ZH", "CH", "9.20", "", "**** **** **** 1234"],
["13.09.25", "15.09.25", "SBB Bahnhof Wald, Wald ZH", "CH", "27.00", "", "**** **** **** 1234"],
["13.09.25", "15.09.25", "EXIL GMBH, ZUERICH", "CH", "14.00", "", "**** **** **** 1234"],
["13.09.25", "15.09.25", "EXIL GMBH, ZUERICH", "CH", "14.00", "", "**** **** **** 1234"],
["13.09.25", "15.09.25", "URBAN FOOD CLUTURE GMB, ZURICH", "CH", "135.00", "", "**** **** **** 1234"],
["14.09.25", "15.09.25", "Google One, 650-2530000", "US", "100.00", "", "**** **** **** 1234"],
["15.09.25", "16.09.25", "Ex Libris AG, Dietikon", "CH", "13.00", "", "**** **** **** 1234"],
["15.09.25", "16.09.25", "Coop-1252 Wald, Wald ZH", "CH", "51.45", "", "**** **** **** 1234"],
["16.09.25", "17.09.25", "Shell Waldhof, Wald ZH", "CH", "5.80", "", "**** **** **** 1234"],
["19.09.25", "22.09.25", "LS Pirates AG, Hinwil", "CH", "16.05", "", "**** **** **** 1234"],
["20.09.25", "22.09.25", "LS Pirates AG, Hinwil", "CH", "14.60", "", "**** **** **** 1234"],
["20.09.25", "22.09.25", "LS Pirates AG, Hinwil", "CH", "13.55", "", "**** **** **** 1234"],
["20.09.25", "22.09.25", "LS Pirates AG, Hinwil", "CH", "13.90", "", "**** **** **** 1234"],
["20.09.25", "22.09.25", "Coop-1252 Wald, Wald ZH", "CH", "60.75", "", "**** **** **** 1234"],
["20.09.25", "22.09.25", "MORE BAR GMBH, BUBIKON", "CH", "70.00", "", "**** **** **** 1234"],
["21.09.25", "22.09.25", "LS Pirates AG, Hinwil", "CH", "6.40", "", "**** **** **** 1234"],
["21.09.25", "22.09.25", "LS Pirates AG, Hinwil", "CH", "4.20", "", "**** **** **** 1234"],
["21.09.25", "22.09.25", "LS Pirates AG, Hinwil", "CH", "13.45", "", "**** **** **** 1234"],
["22.09.25", "23.09.25", "Migros M Wald, Wald ZH", "CH", "16.80", "", "**** **** **** 1234"],
["22.09.25", "23.09.25", "BLEICHI + HOTEL, WALD", "CH", "43.00", "", "**** **** **** 1234"],
["23.09.25", "24.09.25", "Coop-1252 Wald, Wald ZH", "CH", "155.75", "", "**** **** **** 1234"],
["24.09.25", "25.09.25", "BKG*HOTEL AT BOOKING.C, (888)850-3958", "NL", "EUR 177.35", "170.35", "**** **** **** 1234"],
["27.09.25", "29.09.25", "APODRO APOTHEKE WALD, WALD ZH", "CH", "21.50", "", "**** **** **** 1234"],
["28.09.25", "29.09.25", "SELVA HOSTELERIA, MACANET DE LA", "ES", "15.75", "", "**** **** **** 1234"],
["28.09.25", "29.09.25", "AREAS LA SELVA, BARCELONA", "ES", "EUR 19.11", "18.40", "**** **** **** 1234"],
["02.10.25", "06.10.25", "GOOGLE *YouTube Member, g.co/helppay#", "GB", "15.00", "", "**** **** **** 1234"],
["01.10.25", "06.10.25", "Eventfrog.c 737909203525, Olten", "CH", "114.95", "", "**** **** **** 1234"],
["06.10.25", "07.10.25", "digitec Galaxus (Online), Zurich", "CH", "23.80", "", "**** **** **** 1234"],
["08.10.25", "09.10.25", "E.S.LA CLARIANA, MADRID", "ES", "EUR 29.58", "28.35", "**** **** **** 1234"],
["10.10.25", "13.10.25", "B2BAND.COM, BRATISLAVA", "SK", "72.45", "", "**** **** **** 1234"],
["10.10.25", "13.10.25", "Ticketcorner*89987227, 410900800800", "CH", "199.80", "", "**** **** **** 1234"],
["10.10.25", "13.10.25", "SP NORAYA, RUMISBERG", "CH", "79.90", "", "**** **** **** 1234"],
["11.10.25", "13.10.25", "B2BAND.COM, BRATISLAVA", "SK", "139.95", "", "**** **** **** 1234"],
["11.10.25", "13.10.25", "TEMU.COM, BASEL", "CH", "81.20", "", "**** **** **** 1234"],
["11.10.25", "13.10.25", "NOII.CH DATING, WINTERTHUR", "CH", "74.10", "", "**** **** **** 1234"],
["12.10.25", "13.10.25", "Rest Volkshaus, Zurich", "CH", "9.00", "", "**** **** **** 1234"],
["12.10.25", "13.10.25", "Shell Heuberg, Forch", "CH", "100.10", "", "**** **** **** 1234"],
["12.10.25", "13.10.25", "Parkhaus Helvetiaplatz, Zurich", "CH", "8.00", "", "**** **** **** 1234"],
["14.10.25", "15.10.25", "P2 Parkhaus Ein- & Ausfah, Zurich CH", "CHF", "5.00", "", "**** **** **** 1234"],
["14.10.25", "15.10.25", "Migros Zurich Airport, Zurich CH", "CHF", "16.35", "", "**** **** **** 1234"],
["14.10.25", "15.10.25", "GITHUB, INC., GITHUB.COM US", "USD", "0.30", "0.25", "**** **** **** 1234"],
["15.10.25", "16.10.25", "Dosenbach Schuhe & Sport, Hinwil CH", "CHF", "50.00", "", "**** **** **** 1234"],
["15.10.25", "16.10.25", "Coop-4253 Hinwil Wasseri, Hinwil CH", "CHF", "257.20", "", "**** **** **** 1234"],
["15.10.25", "16.10.25", "Landi, Wald CH", "CHF", "67.85", "", "**** **** **** 1234"],
["15.10.25", "16.10.25", "Puls Apotheke & Drogerie, Hinwil CH", "CHF", "9.20", "", "**** **** **** 1234"],
["15.10.25", "16.10.25", "CLAUDE.AI SUBSCRIPTION, ANTHROPIC.COM US", "USD", "108.10", "89.50", "**** **** **** 1234"],
["17.10.25", "20.10.25", "SV (Schweiz) AG, 27960, Zurich ETH-Ze CH", "CHF", "7.80", "", "**** **** **** 1234"],
["17.10.25", "20.10.25", "SV (Schweiz) AG, 27960, Zurich ETH-Ze CH", "CHF", "14.50", "", "**** **** **** 1234"],
["17.10.25", "20.10.25", "SV (Schweiz) AG, 27960, Zurich ETH-Ze CH", "CHF", "4.20", "", "**** **** **** 1234"],
["17.10.25", "20.10.25", "Universitatsspital Zurich, Zurich CH", "CHF", "30.00", "", "**** **** **** 1234"],
["18.10.25", "20.10.25", "HubSpot Germany GmbH, Berlin DE", "EUR", "267.55", "256.05", "**** **** **** 1234"],
["18.10.25", "20.10.25", "Google PixVerse AI Vi, 650-2530000 US", "USD", "5.20", "", "**** **** **** 1234"],
["19.10.25", "20.10.25", "Shell Waldhof, Wald ZH CH", "CHF", "7.20", "", "**** **** **** 1234"],
["19.10.25", "20.10.25", "KONDITOREI VOLAND WALD, WALD ZH CH", "CHF", "20.30", "", "**** **** **** 1234"],
["19.10.25", "20.10.25", "KONDITOREI VOLAND WALD, WALD ZH CH", "CHF", "11.10", "", "**** **** **** 1234"],
["18.10.25", "20.10.25", "ANTHROPIC, ANTHROPIC.COM US", "USD", "108.10", "88.75", "**** **** **** 1234"],
["20.10.25", "21.10.25", "APCOA, Dubendorf CH", "CHF", "20.00", "", "**** **** **** 1234"],
["20.10.25", "21.10.25", "STWEG Ambassador House, Glattbrugg CH", "CHF", "5.00", "", "**** **** **** 1234"],
["23.10.25", "24.10.25", "Coop-1252 Wald, Wald ZH CH", "CHF", "199.85", "", "**** **** **** 1234"],
["24.10.25", "24.10.25", "Ticketcorner*90004263, 410900800800 CH", "CHF", "159.75", "", "**** **** **** 1234"],
["25.10.25", "27.10.25", "Google Duolingo Langu, 650-2530000 US", "USD", "10.00", "", "**** **** **** 1234"],
["25.10.25", "27.10.25", "Hornbach Baumarkt Galgene, Galgenen CH", "CHF", "1.50", "", "**** **** **** 1234"],
["25.10.25", "27.10.25", "Hornbach Baumarkt Galgene, Galgenen CH", "CHF", "814.10", "", "**** **** **** 1234"],
["25.10.25", "27.10.25", "REMO WUEST BACK. KOND., GALGENEN CH", "CHF", "20.00", "", "**** **** **** 1234"],
["25.10.25", "27.10.25", "Google PixVerse AI Vi, 650-2530000 US", "USD", "5.20", "", "**** **** **** 1234"],
["27.10.25", "28.10.25", "Coop-1252 Wald, Wald ZH CH", "CHF", "12.90", "", "**** **** **** 1234"],
["29.10.25", "30.10.25", "WAL*WESTHIVE, DUEBENDORF CH", "CHF", "15.30", "", "**** **** **** 1234"],
["29.10.25", "30.10.25", "WAL*WESTHIVE, DUEBENDORF CH", "CHF", "6.50", "", "**** **** **** 1234"],
["30.10.25", "31.10.25", "Coop-4253 Hinwil Wasseri, Hinwil CH", "CHF", "139.85", "", "**** **** **** 1234"],
["30.10.25", "31.10.25", "Coop-4054 Hinwil Restaura, Hinwil CH", "CHF", "34.95", "", "**** **** **** 1234"],
["31.10.25", "03.11.25", "Coop-1911 Ruti, Ruti ZH CH", "CHF", "66.50", "", "**** **** **** 1234"],
["31.10.25", "03.11.25", "OPENAI *CHATGPT SUBSCR, OPENAI.COM US", "USD", "216.20", "178.70", "**** **** **** 1234"],
["01.11.25", "03.11.25", "GOOGLE *ADS5192965135, cc§google.com IE", "", "79.15", "", "**** **** **** 1234"],
["01.11.25", "03.11.25", "KONDITOREI VOLAND WALD, WALD ZH CH", "CHF", "99.60", "", "**** **** **** 1234"],
["01.11.25", "03.11.25", "LANGDOCK GMBH, BERLIN DE", "EUR", "25.00", "23.90", "**** **** **** 1234"],
["01.11.25", "03.11.25", "Google PixVerse AI Vi, 650-2530000 US", "USD", "5.20", "", "**** **** **** 1234"],
["02.11.25", "03.11.25", "GOOGLE *YouTubePremium, g.co/helppay# GB", "", "33.90", "", "**** **** **** 1234"],
["02.11.25", "03.11.25", "Agrola TopShop Wald, Wald ZH CH", "CHF", "119.45", "", "**** **** **** 1234"],
["03.11.25", "03.11.25", "Netflix.com, Los Gatos NL", "", "20.90", "", "**** **** **** 1234"],
["03.11.25", "04.11.25", "www.fust.ch, Oberburen CH", "CHF", "1'560.90", "", "**** **** **** 1234"],
["06.11.25", "07.11.25", "Grand Casino Luzern AG, Luzern CH", "CHF", "100.00", "108.00", "**** **** **** 1234"],
["07.11.25", "10.11.25", "Migros M Mutschellen, Berikon CH", "CHF", "0.40", "", "**** **** **** 1234"],
["07.11.25", "10.11.25", "Migros M Mutschellen, Berikon CH", "CHF", "15.90", "", "**** **** **** 1234"],
["08.11.25", "10.11.25", "wondershare.com, Hong Kong HK", "", "25.95", "", "**** **** **** 1234"],
["07.11.25", "10.11.25", "WAL*WESTHIVE, DUEBENDORF CH", "CHF", "9.85", "", "**** **** **** 1234"],
["07.11.25", "10.11.25", "Google One, 650-2530000 US", "USD", "10.00", "", "**** **** **** 1234"],
["08.11.25", "10.11.25", "Steiner-Beck AG, Wald ZH CH", "CHF", "32.20", "", "**** **** **** 1234"],
["08.11.25", "10.11.25", "Google PixVerse AI Vi, 650-2530000 US", "USD", "5.20", "", "**** **** **** 1234"],
["09.11.25", "10.11.25", "KONDITOREI VOLAND WALD, WALD ZH CH", "CHF", "25.80", "", "**** **** **** 1234"],
["09.11.25", "11.11.25", "Starfood AG (Schweiz), Rothenburg CH", "CHF", "16.00", "", "**** **** **** 1234"],
["09.11.25", "11.11.25", "Starfood AG (Schweiz), Rothenburg CH", "CHF", "16.00", "", "**** **** **** 1234"],
["10.11.25", "11.11.25", "Coop-2253 Jona Eisenhof, Jona CH", "CHF", "161.25", "", "**** **** **** 1234"],
["12.11.25", "13.11.25", "Hess AG Erdbau + Recy, Laupen ZH CH", "CHF", "39.20", "", "**** **** **** 1234"],
["12.11.25", "13.11.25", "Jumbo-6017 Hinwil, Hinwil CH", "CHF", "173.70", "", "**** **** **** 1234"],
["13.11.25", "14.11.25", "Migros MM Ruti, Ruti ZH CH", "CHF", "57.90", "", "**** **** **** 1234"],
["13.11.25", "14.11.25", "Migros MM Ruti, Ruti ZH CH", "CHF", "140.10", "", "**** **** **** 1234"],
["13.11.25", "14.11.25", "WAL*WESTHIVE, DUEBENDORF CH", "CHF", "22.30", "", "**** **** **** 1234"],
["13.11.25", "14.11.25", "UDIO.COM, UDIO.COM US", "EUR", "36.00", "34.35", "**** **** **** 1234"],
["15.10.25", "16.10.25", "Coop-4958 Rapperswil, Rapperswil SG CH", "CHF", "4.95", "", "**** **** **** 1234"],
["16.10.25", "17.10.25", "TEMU.COM, BASEL CH", "CHF", "61.50", "", "**** **** **** 1234"],
["16.10.25", "17.10.25", "TEMU.COM, BASEL CH", "CHF", "12.95", "", "**** **** **** 1234"],
["16.10.25", "17.10.25", "TEMU.COM, BASEL CH", "CHF", "32.30", "", "**** **** **** 1234"],
["16.10.25", "17.10.25", "TEMU.COM, BASEL CH", "CHF", "17.95", "", "**** **** **** 1234"],
["17.10.25", "20.10.25", "SBB Bahnhof Ruti ZH, Ruti ZH CH", "CHF", "54.00", "", "**** **** **** 1234"],
["17.10.25", "20.10.25", "Candrian Catering AG 2, Zurich CH", "CHF", "15.50", "", "**** **** **** 1234"],
["20.10.25", "21.10.25", "Coop-1252 Wald, Wald ZH CH", "CHF", "178.95", "", "**** **** **** 1234"],
["21.10.25", "22.10.25", "Denner Ruti ZH, Ruti ZH CH", "CHF", "50.15", "", "**** **** **** 1234"],
["24.10.25", "27.10.25", "TEMU.COM, BASEL CH", "CHF", "100.65", "", "**** **** **** 1234"],
["24.10.25", "27.10.25", "WAL*HAAR SHOP CH AG, UETENDORF CH", "CHF", "70.35", "", "**** **** **** 1234"],
["25.10.25", "27.10.25", "Coop-1252 Wald, Wald ZH CH", "CHF", "47.00", "", "**** **** **** 1234"],
["25.10.25", "27.10.25", "Shell Waldhof, Wald ZH CH", "CHF", "3.20", "", "**** **** **** 1234"],
["26.10.25", "27.10.25", "TEMU.COM, BASEL CH", "CHF", "63.10", "", "**** **** **** 1234"],
["27.10.25", "28.10.25", "ONLY, Hinwil CH", "CHF", "222.60", "", "**** **** **** 1234"],
["27.10.25", "28.10.25", "Coop-4253 Hinwil Wasseri, Hinwil CH", "CHF", "104.10", "", "**** **** **** 1234"],
["27.10.25", "28.10.25", "Coop-4253 Hinwil Wasseri, Hinwil CH", "CHF", "24.95", "", "**** **** **** 1234"],
["27.10.25", "28.10.25", "Manor AG, Hinwil CH", "CHF", "177.25", "", "**** **** **** 1234"],
["27.10.25", "28.10.25", "H & M, Hinwil CH", "CHF", "43.85", "", "**** **** **** 1234"],
["27.10.25", "28.10.25", "Coop-4253 Hinwil Wasseri, Hinwil CH", "CHF", "52.30", "", "**** **** **** 1234"],
["27.10.25", "28.10.25", "Manor AG, Hinwil CH", "CHF", "59.05", "", "**** **** **** 1234"],
["28.10.25", "29.10.25", "Migros MM Rapperswil, Rapperswil SG CH", "CHF", "23.35", "", "**** **** **** 1234"],
["29.10.25", "30.10.25", "ROSSMANN Schweiz AG, Wallisellen CH", "CHF", "13.95", "", "**** **** **** 1234"],
["29.10.25", "30.10.25", "Migros MR Glattzentrum, Glattzentrum CH", "CHF", "42.20", "", "**** **** **** 1234"],
["29.10.25", "30.10.25", "Calzedonia, Wallisellen CH", "CHF", "178.25", "", "**** **** **** 1234"],
["29.10.25", "30.10.25", "Intimissimi, Wallisellen CH", "CHF", "90.20", "", "**** **** **** 1234"],
["29.10.25", "30.10.25", "New Yorker (Schweiz) GmbH, Wetzikon ZH CH", "CHF", "76.80", "", "**** **** **** 1234"],
["29.10.25", "30.10.25", "New Yorker (Schweiz) GmbH, Wetzikon ZH CH", "CHF", "7.95", "", "**** **** **** 1234"],
["29.10.25", "30.10.25", "Golden Bar GmbH, Wald ZH CH", "CHF", "40.00", "", "**** **** **** 1234"],
["30.10.25", "31.10.25", "Frischer Max, Zurich CH", "CHF", "12.60", "", "**** **** **** 1234"],
["30.10.25", "31.10.25", "Frischer Max, Zurich CH", "CHF", "4.20", "", "**** **** **** 1234"],
["30.10.25", "31.10.25", "Halle 622, Zurich CH", "CHF", "15.75", "", "**** **** **** 1234"],
["31.10.25", "03.11.25", "SBB Bahnhof Ruti ZH, Ruti ZH CH", "CHF", "27.00", "", "**** **** **** 1234"],
["31.10.25", "03.11.25", "Eventfrog.c 739003945141, Olten CH", "CHF", "67.85", "", "**** **** **** 1234"],
["01.11.25", "03.11.25", "SBB Bahnhof Ruti ZH, Ruti ZH CH", "CHF", "27.00", "", "**** **** **** 1234"],
["01.11.25", "03.11.25", "AMERON ZUERICH, ZUERICH CH", "CHF", "30.00", "", "**** **** **** 1234"],
["31.10.25", "03.11.25", "SKYLINE EVENTS, ZUERICH CH", "CHF", "13.50", "", "**** **** **** 1234"],
["02.11.25", "03.11.25", "AURA Event Saal, Zuerich CH", "CHF", "15.75", "", "**** **** **** 1234"],
["02.11.25", "03.11.25", "GOOGLE *YouTube Member, g.co/helppay# GB", "", "15.00", "", "**** **** **** 1234"],
["01.11.25", "03.11.25", "VBZ Bellevue, Zurich CH", "CHF", "2.80", "", "**** **** **** 1234"],
["01.11.25", "03.11.25", "WAL*CLUB BELLEVUE, HOERI CH", "CHF", "16.50", "", "**** **** **** 1234"],
["02.11.25", "03.11.25", "MCDONALDS ZUERICH 2016, ZUERICH CH", "CHF", "10.50", "", "**** **** **** 1234"],
["03.11.25", "04.11.25", "Coop-1252 Wald, Wald ZH CH", "CHF", "191.15", "", "**** **** **** 1234"],
["05.11.25", "06.11.25", "Coop-1252 Wald, Wald ZH CH", "CHF", "51.35", "", "**** **** **** 1234"],
["06.11.25", "07.11.25", "Ticketcorner*90024523, 410900800800 CH", "CHF", "158.75", "", "**** **** **** 1234"],
["07.11.25", "10.11.25", "SUMUP *JW BROW&LASH, LACHEN CH", "CHF", "290.00", "", "**** **** **** 1234"],
["07.11.25", "10.11.25", "Agrola TopShop Wald, Wald ZH CH", "CHF", "104.50", "", "**** **** **** 1234"],
["07.11.25", "10.11.25", "Agrola TopShop Wald, Wald ZH CH", "CHF", "10.30", "", "**** **** **** 1234"],
["08.11.25", "10.11.25", "Pizza Thal GmbH, Murgenthal CH", "CHF", "19.50", "", "**** **** **** 1234"],
["09.11.25", "10.11.25", "TEMU.COM, BASEL CH", "CHF", "190.85", "", "**** **** **** 1234"],
["10.11.25", "11.11.25", "Sinora GmbH, Bonstetten CH", "CHF", "115.20", "", "**** **** **** 1234"],
["10.11.25", "11.11.25", "WAL*HAAR SHOP CH AG, UETENDORF CH", "CHF", "33.85", "", "**** **** **** 1234"],
["11.11.25", "12.11.25", "Bleiche Fitness, Wald ZH CH", "CHF", "90.00", "", "**** **** **** 1234"],
["11.11.25", "12.11.25", "Parkhaus Urania, Zurich CH", "CHF", "14.00", "", "**** **** **** 1234"],
["12.11.25", "13.11.25", "Coop-4958 Rapperswil, Rapperswil SG CH", "CHF", "24.80", "", "**** **** **** 1234"],
["13.11.25", "14.11.25", "TEMU.COM, BASEL CH", "CHF", "56.00", "", "**** **** **** 1234"],
["13.11.25", "14.11.25", "TEMU.COM, BASEL CH", "CHF", "5.95", "", "**** **** **** 1234"],
["13.11.25", "14.11.25", "TEMU.COM, BASEL CH", "CHF", "15.25", "", "**** **** **** 1234"],
["14.11.25", "17.11.25", "Santa Lucia Altstetten, Zurich", "CH", "38.00", "", "**** **** **** 1234"],
["14.11.25", "17.11.25", "Agrola TopShop Wald, Wald ZH", "CH", "126.80", "", "**** **** **** 1234"],
["14.11.25", "17.11.25", "GITHUB, INC., GITHUB.COM", "US", "USD 0.70", "0.60", "**** **** **** 1234"],
["15.11.25", "17.11.25", "Jumbo-6017 Hinwil, Hinwil", "CH", "53.85", "", "**** **** **** 1234"],
["15.11.25", "17.11.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "57.00", "", "**** **** **** 1234"],
["15.11.25", "17.11.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "13.95", "", "**** **** **** 1234"],
["14.11.25", "17.11.25", "NEGISHI ALTSTETTEN BAH, ZUERICH", "CH", "31.90", "", "**** **** **** 1234"],
["15.11.25", "17.11.25", "Google PixVerse AI Vi, 650-2530000", "US", "5.20", "", "**** **** **** 1234"],
["15.11.25", "17.11.25", "CANVA* I04701-26464248, CANVA.COM", "US", "12.00", "", "**** **** **** 1234"],
["17.11.25", "18.11.25", "ANTHROPIC, ANTHROPIC.COM", "US", "USD 270.25", "220.65", "**** **** **** 1234"],
["18.11.25", "19.11.25", "Coop-1252 Wald, Wald ZH", "CH", "7.80", "", "**** **** **** 1234"],
["18.11.25", "19.11.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "19.30", "", "**** **** **** 1234"],
["18.11.25", "19.11.25", "TIERARZTPRAXIS BACHTEL, WALD ZH", "CH", "343.30", "", "**** **** **** 1234"],
["18.11.25", "19.11.25", "ANTHROPIC, ANTHROPIC.COM", "US", "USD 5.41", "4.45", "**** **** **** 1234"],
["18.11.25", "20.11.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "19.35", "", "**** **** **** 1234"],
["19.11.25", "20.11.25", "Wuest Partner, Zurich", "CH", "324.30", "", "**** **** **** 1234"],
["19.11.25", "21.11.25", "SHOP.ASFINAG.AT, WIEN", "AT", "EUR 12.40", "11.80", "**** **** **** 1234"],
["20.11.25", "21.11.25", "Coop-1252 Wald, Wald ZH", "CH", "85.35", "", "**** **** **** 1234"],
["20.11.25", "21.11.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "17.95", "", "**** **** **** 1234"],
["20.11.25", "21.11.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "6.30", "", "**** **** **** 1234"],
["21.11.25", "24.11.25", "STWEG Ambassador House, Glattbrugg", "CH", "7.50", "", "**** **** **** 1234"],
["21.11.25", "24.11.25", "Migros M Dubendorf Stettb, Dubendorf", "CH", "16.95", "", "**** **** **** 1234"],
["21.11.25", "24.11.25", "MCDONALDS RESTAURANT G, WALLISELLEN", "CH", "13.00", "", "**** **** **** 1234"],
["22.11.25", "24.11.25", "Ski- und Snowboard-Center, Neuhaus SG", "CH", "128.00", "", "**** **** **** 1234"],
["21.11.25", "24.11.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "408.25", "", "**** **** **** 1234"],
["22.11.25", "24.11.25", "GOOGLE *Duolingo Langu, g.co/HelpPay#", "US", "9.20", "", "**** **** **** 1234"],
["23.11.25", "24.11.25", "Migros Santispark Bad, Abtwil SG", "CH", "48.60", "", "**** **** **** 1234"],
["23.11.25", "24.11.25", "Migros Santispark Bad, Abtwil SG", "CH", "8.50", "", "**** **** **** 1234"],
["23.11.25", "24.11.25", "Migros ELS Santispark PH, Abtwil SG", "CH", "3.00", "", "**** **** **** 1234"],
["23.11.25", "24.11.25", "AVIA Service Autogrill, St. Margrethe", "CH", "121.80", "", "**** **** **** 1234"],
["23.11.25", "24.11.25", "AVIA Service Autogrill, St. Margrethe", "CH", "10.50", "", "**** **** **** 1234"],
["23.11.25", "24.11.25", "KONDITOREI VOLAND LAUP, LAUPEN ZH", "CH", "62.80", "", "**** **** **** 1234"],
["23.11.25", "25.11.25", "SHOP.ASFINAG.AT, WIEN", "AT", "EUR 9.30", "8.90", "**** **** **** 1234"],
["24.11.25", "25.11.25", "Landi, Wald", "CH", "27.15", "", "**** **** **** 1234"],
["24.11.25", "26.11.25", "SHOP.ASFINAG.AT, WIEN", "AT", "EUR 12.50", "11.95", "**** **** **** 1234"],
["26.11.25", "27.11.25", "MyPlace, Affoltern am", "CH", "10.30", "", "**** **** **** 1234"],
["27.11.25", "28.11.25", "Coop-1911 Ruti, Ruti ZH", "CH", "57.20", "", "**** **** **** 1234"],
["28.11.25", "01.12.25", "Manor AG, Hinwil", "CH", "10.10", "", "**** **** **** 1234"],
["28.11.25", "01.12.25", "Manor AG, Hinwil", "CH", "136.25", "", "**** **** **** 1234"],
["28.11.25", "01.12.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "205.35", "", "**** **** **** 1234"],
["01.12.25", "02.12.25", "GOOGLE *ADS5192965135, cc§google.com", "IE", "59.00", "", "**** **** **** 1234"],
["02.12.25", "03.12.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "112.50", "", "**** **** **** 1234"],
["02.12.25", "03.12.25", "Coop-1252 Wald, Wald ZH", "CH", "117.70", "", "**** **** **** 1234"],
["03.12.25", "03.12.25", "Autodesk ADY, Dublin 2", "IE", "1'989.05", "", "**** **** **** 1234"],
["03.12.25", "03.12.25", "NETFLIX.COM, Amsterdam", "NL", "22.90", "", "**** **** **** 1234"],
["02.12.25", "03.12.25", "TAVILY AI, WWW.TAVILY.CO", "US", "USD 17.48", "14.50", "**** **** **** 1234"],
["02.12.25", "03.12.25", "GOOGLE *YouTubePremium, g.co/HelpPay#", "US", "33.90", "", "**** **** **** 1234"],
["04.12.25", "05.12.25", "Migros M Dubendorf Stettb, Dubendorf", "CH", "103.20", "", "**** **** **** 1234"],
["04.12.25", "05.12.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "19.80", "", "**** **** **** 1234"],
["05.12.25", "08.12.25", "MICROSOFT#G127221615, MSBILL.INFO", "CH", "55.20", "", "**** **** **** 1234"],
["04.12.25", "08.12.25", "Ristorante Amalfi AG, Zurich", "CH", "67.00", "", "**** **** **** 1234"],
["05.12.25", "08.12.25", "Landi, Wald", "CH", "11.90", "", "**** **** **** 1234"],
["05.12.25", "08.12.25", "Notariat Wald, Wald ZH", "CH", "40.00", "", "**** **** **** 1234"],
["05.12.25", "08.12.25", "Coop-1252 Wald, Wald ZH", "CH", "149.75", "", "**** **** **** 1234"],
["06.12.25", "08.12.25", "TIERARZTPRAXIS BACHTEL, WALD ZH", "CH", "80.30", "", "**** **** **** 1234"],
["07.12.25", "08.12.25", "HERAHELP.COM, 0044330027088", "CY", "EUR 19.95", "19.25", "**** **** **** 1234"],
["07.12.25", "08.12.25", "Google One, 650-2530000", "US", "10.00", "", "**** **** **** 1234"],
["10.12.25", "11.12.25", "TAVILY AI, WWW.TAVILY.CO", "US", "USD 43.26", "35.95", "**** **** **** 1234"],
["11.12.25", "12.12.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "247.40", "", "**** **** **** 1234"],
["14.11.25", "17.11.25", "ONLY, Zurich", "CH", "101.75", "", "**** **** **** 1234"],
["14.11.25", "17.11.25", "SUMUP *MARYS COSMETICS, USTER", "CH", "419.00", "", "**** **** **** 1234"],
["14.11.25", "17.11.25", "S2P*Calzedonia, 0447554090", "IT", "86.75", "", "**** **** **** 1234"],
["14.11.25", "17.11.25", "Parkhaus Urania, Zurich", "CH", "12.00", "", "**** **** **** 1234"],
["15.11.25", "17.11.25", "JustEat, Zurich", "CH", "193.70", "", "**** **** **** 1234"],
["15.11.25", "17.11.25", "ONLY, Hinwil", "CH", "126.10", "", "**** **** **** 1234"],
["15.11.25", "17.11.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "242.70", "", "**** **** **** 1234"],
["15.11.25", "17.11.25", "Manor AG, Hinwil", "CH", "35.35", "", "**** **** **** 1234"],
["15.11.25", "17.11.25", "Valentyna Nails, R?ti", "CH", "160.00", "", "**** **** **** 1234"],
["13.11.25", "17.11.25", "redcare-apotheke, Sevenum", "NL", "79.90", "", "**** **** **** 1234"],
["16.11.25", "17.11.25", "NORDSTERN, Basel", "CH", "64.20", "", "**** **** **** 1234"],
["18.11.25", "19.11.25", "La Makeup Sp. z. o.o., Warsaw", "PL", "104.85", "", "**** **** **** 1234"],
["18.11.25", "19.11.25", "Bleiche Fitness, Wald ZH", "CH", "90.00", "", "**** **** **** 1234"],
["20.11.25", "21.11.25", "Bleiche Fitness, Wald ZH", "CH", "90.00", "", "**** **** **** 1234"],
["22.11.25", "24.11.25", "SHELL PETTNAU 5538, PETTNAU", "AT", "94.60", "", "**** **** **** 1234"],
["22.11.25", "24.11.25", "SHELL PETTNAU 5538, PETTNAU", "AT", "EUR 7.39", "7.05", "**** **** **** 1234"],
["22.11.25", "24.11.25", "SHELL PETTNAU 5538, PETTNAU", "AT", "EUR 4.39", "4.20", "**** **** **** 1234"],
["22.11.25", "24.11.25", "Coop-1252 Wald, Wald ZH", "CH", "57.85", "", "**** **** **** 1234"],
["22.11.25", "24.11.25", "ASFINAG S16 HMS ST JAKOB, ST.ANTON/ARLB", "AT", "EUR 12.50", "11.95", "**** **** **** 1234"],
["24.11.25", "25.11.25", "Posthotel Achenkirc, Achenkirch", "AT", "EUR 1'211.80", "1'160.25", "**** **** **** 1234"],
["24.11.25", "25.11.25", "MS* CITYPOP2NIGHTBASE, ZURICH", "CH", "15.00", "", "**** **** **** 1234"],
["24.11.25", "25.11.25", "MS* CITYPOP2NIGHTBASE, ZURICH", "CH", "8.40", "", "**** **** **** 1234"],
["24.11.25", "25.11.25", "BKG*BOOKING.COM HOTEL, (888)850-3958", "NL", "187.95", "", "**** **** **** 1234"],
["25.11.25", "26.11.25", "Coop-1252 Wald, Wald ZH", "CH", "63.00", "", "**** **** **** 1234"],
["25.11.25", "26.11.25", "Bleiche Fitness, Wald ZH", "CH", "90.00", "", "**** **** **** 1234"],
["26.11.25", "27.11.25", "Hallenbad Wald, Wald ZH", "CH", "54.00", "", "**** **** **** 1234"],
["27.11.25", "28.11.25", "Bestseller AS, Amsterdam", "NL", "35.90", "", "**** **** **** 1234"],
["29.11.25", "01.12.25", "CLUB NORDSTERN, BASEL", "CH", "6.00", "", "**** **** **** 1234"],
["29.11.25", "01.12.25", "CLUB NORDSTERN, BASEL", "CH", "6.00", "", "**** **** **** 1234"],
["02.12.25", "03.12.25", "TICKETCORNER CH, RUEMLANG", "CH", "84.90", "", "**** **** **** 1234"],
["02.12.25", "03.12.25", "Shell Waldhof, Wald ZH", "CH", "126.15", "", "**** **** **** 1234"],
["02.12.25", "03.12.25", "Shell Waldhof, Wald ZH", "CH", "3.70", "", "**** **** **** 1234"],
["02.12.25", "03.12.25", "Bleiche Fitness, Wald ZH", "CH", "90.00", "", "**** **** **** 1234"],
["02.12.25", "03.12.25", "GOOGLE *YouTube Member, g.co/HelpPay#", "US", "15.00", "", "**** **** **** 1234"],
["03.12.25", "04.12.25", "APODRO APOTHEKE WALD, WALD ZH", "CH", "54.90", "", "**** **** **** 1234"],
["04.12.25", "05.12.25", "TICKETCORNER CH, RUEMLANG", "CH", "285.70", "", "**** **** **** 1234"],
["05.12.25", "05.12.25", "NOII.CH DATING, WINTERTHUR", "CH", "74.10", "", "**** **** **** 1234"],
["06.12.25", "08.12.25", "TEMU.COM, BASEL", "CH", "72.50", "", "**** **** **** 1234"],
["06.12.25", "08.12.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "326.85", "", "**** **** **** 1234"],
["06.12.25", "08.12.25", "Decathlon, Hinwil", "CH", "150.00", "", "**** **** **** 1234"],
["07.12.25", "09.12.25", "Migros-Genossenschafts-Bund, Zürich", "CH", "200.00", "", "**** **** **** 1234"],
["08.12.25", "10.12.25", "Zürich HB, Zürich", "CH", "45.00", "", "**** **** **** 1234"],
["09.12.25", "11.12.25", "Amazon Marketplace, amazon.de", "DE", "120.00", "", "**** **** **** 1234"],
["10.12.25", "12.12.25", "IKEA, Dietlikon", "CH", "350.00", "", "**** **** **** 1234"],
["11.12.25", "13.12.25", "Manor, Zürich", "CH", "75.00", "", "**** **** **** 1234"],
["12.12.25", "14.12.25", "Zalando, zalando.ch", "CH", "90.00", "", "**** **** **** 1234"],
["13.12.25", "15.12.25", "SBB CFF FFS, Bern", "CH", "60.00", "", "**** **** **** 1234"],
["14.12.25", "16.12.25", "Apple Store, Zürich", "CH", "999.00", "", "**** **** **** 1234"],
["15.12.25", "17.12.25", "Migros-Genossenschafts-Bund, Zürich", "CH", "150.00", "", "**** **** **** 1234"],
["16.12.25", "18.12.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "250.00", "", "**** **** **** 1234"],
["17.12.25", "19.12.25", "Shell Waldhof, Wald ZH", "CH", "60.00", "", "**** **** **** 1234"],
["18.12.25", "20.12.25", "Zürich HB, Zürich", "CH", "30.00", "", "**** **** **** 1234"],
["19.12.25", "21.12.25", "Amazon Marketplace, amazon.de", "DE", "80.00", "", "**** **** **** 1234"],
["20.12.25", "22.12.25", "IKEA, Dietlikon", "CH", "400.00", "", "**** **** **** 1234"],
["21.12.25", "23.12.25", "Manor, Zürich", "CH", "100.00", "", "**** **** **** 1234"],
["22.12.25", "24.12.25", "Zalando, zalando.ch", "CH", "110.00", "", "**** **** **** 1234"],
["23.12.25", "25.12.25", "SBB CFF FFS, Bern", "CH", "70.00", "", "**** **** **** 1234"],
["24.12.25", "26.12.25", "Apple Store, Zürich", "CH", "1200.00", "", "**** **** **** 1234"],
["25.12.25", "27.12.25", "Migros-Genossenschafts-Bund, Zürich", "CH", "180.00", "", "**** **** **** 1234"],
["26.12.25", "28.12.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "300.00", "", "**** **** **** 1234"],
["27.12.25", "29.12.25", "Shell Waldhof, Wald ZH", "CH", "70.00", "", "**** **** **** 1234"],
["28.12.25", "30.12.25", "Zürich HB, Zürich", "CH", "40.00", "", "**** **** **** 1234"],
["29.12.25", "31.12.25", "Amazon Marketplace, amazon.de", "DE", "100.00", "", "**** **** **** 1234"],
["30.12.25", "01.01.26", "IKEA, Dietlikon", "CH", "450.00", "", "**** **** **** 1234"],
["31.12.25", "02.01.26", "Manor, Zürich", "CH", "125.00", "", "**** **** **** 1234"]
]
}
}
]
}
================================================================================

View file

@ -1,239 +0,0 @@
# AI Call Iteration Flow - JSON Merging System
This document describes the iteration flow for handling large JSON responses from AI that may be truncated and need to be merged across multiple iterations.
## Overview
When an AI response is too large, it may be truncated (cut) at an arbitrary point. The iteration system:
1. Detects incomplete JSON
2. Requests continuation from the AI
3. Merges the continuation with the existing JSON
4. Repeats until complete or max failures reached
---
## Key Variables
| Variable | Type | Purpose |
|----------|------|---------|
| `jsonBase` | `str \| None` | The merged JSON string (CUT version for overlap matching) |
| `candidateJson` | `str` | Temporary holder for merged result until validated |
| `lastValidCompletePart` | `str \| None` | Fallback - last successfully parsed CLOSED JSON |
| `lastOverlapContext` | `str` | Context for retry/continuation prompts |
| `lastHierarchyContextForPrompt` | `str` | Context for retry/continuation prompts |
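| `mergeFailCount` | `int` | Consecutive-failure counter shared by merge and parse failures (reset to 0 on a successful continuation; fallback is returned when it reaches 3) |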
---
## Key Distinction: hierarchyContext vs completePart
| Field | Description | Use Case |
|-------|-------------|----------|
| `hierarchyContext` | **CUT JSON** - truncated at cut point | Used as `jsonBase` for merging with next AI fragment |
| `completePart` | **CLOSED JSON** - all structures properly closed | Used for validation, parsing, and fallback |

**Why this matters:**
- The next AI fragment starts with an **overlap** that matches the CUT point
- If we used `completePart` (closed), the overlap detection would FAIL
- We must use `hierarchyContext` (cut) so overlap matching works correctly
---
## Flow Steps
### Step 1: BUILD PROMPT
**Location:** `subAiCallLooping.py` lines 163-212
**Function:** `buildContinuationContext()` from `modules/shared/jsonUtils.py`
- **First iteration:** Use original prompt
- **Continuation:** `buildContinuationContext(allSections, lastRawResponse, ...)`
  - Internally calls `getContexts(lastRawResponse)` to get overlap/hierarchy
  - Builds continuation prompt with `overlapContext` + `hierarchyContextForPrompt`
### Step 2: CALL AI
**Location:** `subAiCallLooping.py` lines 214-299
**Function:** `self.aiService.callAi(request)`
- Returns `response.content` as `result`
- NOTE: Do NOT update `lastRawResponse` yet! (only after successful merge)
### Step 4: MERGE
**Location:** `subAiCallLooping.py` lines 338-396
**Function:** `JsonResponseHandler.mergeJsonStringsWithOverlap()` from `modules/services/serviceAi/subJsonResponseHandling.py`
```
IF first iteration (jsonBase is None):
    → candidateJson = result
ELSE:
    → mergedJsonString, hasOverlap = mergeJsonStringsWithOverlap(jsonBase, result)
    IF hasOverlap = False (MERGE FAILED):
        → mergeFailCount++
        → If mergeFailCount >= 3: return lastValidCompletePart (fallback)
        → Else: continue (retry with unchanged jsonBase AND lastRawResponse!)
    ELSE:
        → candidateJson = mergedJsonString (don't update jsonBase yet!)
→ lastRawResponse = candidateJson (ONLY after first iteration or successful merge!)
TRY DIRECT PARSE of candidateJson:
    IF parse succeeds:
        → jsonBase = candidateJson (commit)
        → FINISHED! Return normalized result
    ELSE:
        → Proceed to Step 5
```
### Step 5: GET CONTEXTS
**Location:** `subAiCallLooping.py` lines 420-427
**Function:** `getContexts()` from `modules/shared/jsonContinuation.py`
```python
contexts = getContexts(candidateJson)
```
Returns `JsonContinuationContexts`:
- `overlapContext`: `""` if JSON is complete (no cut point)
- `hierarchyContext`: CUT JSON (for merging with next fragment)
- `hierarchyContextForPrompt`: CUT JSON with budget limits (for prompts)
- `completePart`: CLOSED JSON (repaired if needed)
- `jsonParsingSuccess`: `True` if `completePart` is valid JSON

**Enhancement:** If the original JSON is already complete, `getContexts()` returns `overlapContext = ""`. This signals "JSON is complete, no more continuation needed".
### Step 6: DECIDE
**Location:** `subAiCallLooping.py` lines 429-528
#### Case A: `jsonParsingSuccess=true` AND `overlapContext=""`
**→ FINISHED**
- JSON is complete (no cut point)
- `jsonBase = contexts.completePart` (use CLOSED version for final result)
- Return `completePart` as result
#### Case B: `jsonParsingSuccess=true` AND `overlapContext!=""`
**→ CONTINUE to next iteration**
- JSON parseable but has cut point
- `jsonBase = contexts.hierarchyContext` ← **CUT version for next merge!**
- `lastValidCompletePart = contexts.completePart` ← **CLOSED version for fallback**
- Store contexts for next prompt
- `mergeFailCount = 0` (reset on success)
- `lastRawResponse = jsonBase`
- Continue to next iteration
#### Case C: `jsonParsingSuccess=false`
**→ RETRY with same prompt**
- Do NOT update `jsonBase` (keep previous valid state)
- `mergeFailCount++`
- If `mergeFailCount >= 3`: return `lastValidCompletePart` (fallback)
- Else: continue (retry with unchanged jsonBase/lastRawResponse)
---
## Flow Diagram
```
┌───────────────────────────────────────────────────────────────┐
│                        ITERATION START                        │
└───────────────────────────┬───────────────────────────────────┘
┌───────────────────────────▼───────────────────────────────────┐
│ STEP 1: BUILD PROMPT                                          │
│   - First: original prompt                                    │
│   - Next: buildContinuationContext(lastRawResponse)           │
└───────────────────────────┬───────────────────────────────────┘
┌───────────────────────────▼───────────────────────────────────┐
│ STEP 2: CALL AI → result                                      │
└───────────────────────────┬───────────────────────────────────┘
┌───────────────────────────▼───────────────────────────────────┐
│ STEP 4: MERGE jsonBase + result → candidateJson               │
└───────────────────────────┬───────────────────────────────────┘
               ┌────────────▼────────────┐
               │        Merge OK?        │
               └────────────┬────────────┘
      ┌─────────────────────┼─────────────────────┐
      │ NO                  │ YES                 │
      ▼                     ▼                     │
┌──────────────┐   ┌──────────────────┐           │
│ fails++      │   │ TRY DIRECT PARSE │           │
│ if >=3:      │   │ of candidateJson │           │
│   RETURN     │   └────────┬─────────┘           │
│   fallback   │            │                     │
│ else: RETRY  │   ┌────────▼─────────┐           │
│ (continue)   │   │    Parse OK?     │           │
└──────────────┘   └────────┬─────────┘           │
                            │                     │
      ┌─────────────────────┼─────────────────────┐
      │ YES                 │ NO                  │
      ▼                     ▼                     │
┌──────────────┐   ┌──────────────────────────────┐
│ FINISHED ✓   │   │ STEP 5: getContexts()        │
│ Return       │   │   → jsonParsingSuccess       │
│ normalized   │   │   → overlapContext           │
│ result       │   └────────────┬─────────────────┘
└──────────────┘                │
                   ┌────────────▼────────────────────┐
                   │ STEP 6: DECIDE                  │
                   └────────────┬────────────────────┘
   ┌────────────────────────────┼────────────────────────────┐
   │                            │                            │
   ▼                            ▼                            ▼
┌───────────────────┐  ┌───────────────────────┐  ┌───────────────────┐
│ success=true      │  │ success=true          │  │ success=false     │
│ overlap=""        │  │ overlap!=""           │  │                   │
│ ─────────────     │  │ ─────────────────     │  │ ─────────────     │
│ FINISHED ✓        │  │ CONTINUE              │  │ RETRY             │
│                   │  │                       │  │                   │
│ jsonBase =        │  │ jsonBase =            │  │ jsonBase unchanged│
│   completePart    │  │   hierarchyContext    │  │ fails++           │
│   (CLOSED)        │  │   (CUT for merge!)    │  │                   │
│                   │  │                       │  │ if >=3: fallback  │
│ Return result     │  │ fallback =            │  │ else: retry       │
│                   │  │   completePart        │  │                   │
│                   │  │   (CLOSED)            │  │                   │
│                   │  │                       │  │                   │
│                   │  │ Next iteration →      │  │                   │
└───────────────────┘  └───────────────────────┘  └───────────────────┘
```
---
## Files Involved
| File | Purpose |
|------|---------|
| `modules/services/serviceAi/subAiCallLooping.py` | Main iteration loop |
| `modules/shared/jsonContinuation.py` | `getContexts()` - context extraction & repair |
| `modules/shared/jsonUtils.py` | `buildContinuationContext()` - prompt building |
| `modules/services/serviceAi/subJsonResponseHandling.py` | `mergeJsonStringsWithOverlap()` |
| `modules/services/serviceAi/subJsonMerger.py` | `ModularJsonMerger` - actual merge logic |
| `modules/datamodels/datamodelAi.py` | `JsonContinuationContexts` model |
---
## Error Handling
### Merge Failures
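- Failures are counted in `mergeFailCount` (shared with parse failures, reset to 0 after a successful continuation); the 3rd consecutive failure (`mergeFailCount >= 3`) triggers the fallback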
- On failure: retry with unchanged `jsonBase` (previous valid state)
- After 3 failures: return `lastValidCompletePart` as fallback
### Parse Failures
- If `getContexts()` cannot produce valid JSON: increment fail counter
- Retry with same prompt (don't update jsonBase)
- After 3 failures: return `lastValidCompletePart` as fallback
### Fallback Strategy
- `lastValidCompletePart` stores the last successfully parsed CLOSED JSON
- Available as fallback once at least one iteration has produced parseable JSON; before that it is `None` (its type is `str | None`)
- Once set, ensures we return valid JSON even after multiple failures

Some files were not shown because too many files have changed in this diff Show more