Merge pull request #161 from valueonag/feat/demo-system-readieness

Feat/demo system readieness
This commit is contained in:
Patrick Motsch 2026-05-12 23:34:19 +02:00 committed by GitHub
commit ab43b42aa9
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
56 changed files with 2180 additions and 523 deletions

3
app.py
View file

@ -604,6 +604,9 @@ app.include_router(promptRouter)
from modules.routes.routeDataConnections import router as connectionsRouter from modules.routes.routeDataConnections import router as connectionsRouter
app.include_router(connectionsRouter) app.include_router(connectionsRouter)
from modules.routes.routeRagInventory import router as ragInventoryRouter
app.include_router(ragInventoryRouter)
from modules.routes.routeTableViews import router as tableViewsRouter from modules.routes.routeTableViews import router as tableViewsRouter
app.include_router(tableViewsRouter) app.include_router(tableViewsRouter)

View file

@ -19,7 +19,7 @@ APP_JWT_KEY_SECRET = DEV_ENC:Z0FBQUFBQm8xSUpERjlrSktmZHVuQnJ1VVJDdndLaUcxZGJsT2Z
APP_TOKEN_EXPIRY=300 APP_TOKEN_EXPIRY=300
# CORS Configuration # CORS Configuration
APP_ALLOWED_ORIGINS=http://localhost:8080,http://localhost:5176,https://playground.poweron.swiss APP_ALLOWED_ORIGINS=http://localhost:8080,http://localhost:5176,https://nyla.poweron.swiss,https://nyla-int.poweron.swiss,https://nyla.poweron-center.net,https://nyla-int.poweron-center.net
# Logging configuration # Logging configuration
APP_LOGGING_LOG_LEVEL = DEBUG APP_LOGGING_LOG_LEVEL = DEBUG
@ -32,18 +32,18 @@ APP_LOGGING_ROTATION_SIZE = 10485760
APP_LOGGING_BACKUP_COUNT = 5 APP_LOGGING_BACKUP_COUNT = 5
# OAuth: Auth app (login/JWT) vs Data app (Microsoft Graph / Google APIs). Same IDs until you split apps in Azure / GCP. # OAuth: Auth app (login/JWT) vs Data app (Microsoft Graph / Google APIs). Same IDs until you split apps in Azure / GCP.
Service_MSFT_AUTH_CLIENT_ID = c7e7112d-61dc-4f3a-8cd3-08cc4cd7504c Service_MSFT_AUTH_CLIENT_ID = 840b759a-4d79-4a7a-9598-f3ed204d99d8
Service_MSFT_AUTH_CLIENT_SECRET = DEV_ENC:Z0FBQUFBQm83T29rV1pQelMtc1p1MXR4NTFpa19CTEhHQ0xfNmdPUmZqcWp5UHBMS0hYTGl4c1pPdmhTNTJVWUl5WnlnUUZhV0VTRzVCb0d5YjR1NnZPZk5CZ0dGazNGdUJVbjkxeVdrYlNiVjJUYzF2aVFtQnVxTHFqTTJqZlF0RTFGNmE1OGN1TEk= Service_MSFT_AUTH_CLIENT_SECRET = DEV_ENC:Z0FBQUFBQnFBa1kxaG9WY1FJaWdCbVFVaTllUlJfU3Y3MmJkRmkzMDVDWUNtZEhlNVhISzJPcy00ZUVZcklYLXFMV0dIODV3NXNSSFBKQ0ZsZllES3diTEgySDF0T1ZCbFZHREZtcXFGSWNZN1NJbzJzczRRQWxoeVNsNzlsa0VzMHJPWHUydjBBclo=
Service_MSFT_AUTH_REDIRECT_URI = http://localhost:8000/api/msft/auth/login/callback Service_MSFT_AUTH_REDIRECT_URI = http://localhost:8000/api/msft/auth/login/callback
Service_MSFT_DATA_CLIENT_ID = c7e7112d-61dc-4f3a-8cd3-08cc4cd7504c Service_MSFT_DATA_CLIENT_ID = 840b759a-4d79-4a7a-9598-f3ed204d99d8
Service_MSFT_DATA_CLIENT_SECRET = DEV_ENC:Z0FBQUFBQm83T29rV1pQelMtc1p1MXR4NTFpa19CTEhHQ0xfNmdPUmZqcWp5UHBMS0hYTGl4c1pPdmhTNTJVWUl5WnlnUUZhV0VTRzVCb0d5YjR1NnZPZk5CZ0dGazNGdUJVbjkxeVdrYlNiVjJUYzF2aVFtQnVxTHFqTTJqZlF0RTFGNmE1OGN1TEk= Service_MSFT_DATA_CLIENT_SECRET = DEV_ENC:Z0FBQUFBQnFBa1kyUW96aXFVOVJlLUdyRlVvT1hVU09ILWtMZnV2M19mVUxGMnFPV3FzNTdQa3dTbHVGTDBHTk01ZThLcjh6QUR5VldVZUpfcDlZNTh5YldtLWtjTll6VzJNQ3JCQ3ZubHdmd2JvaExDOXdvQ1pjWDVQTUtFWVAtUHhwS1lFQnJXWk4=
Service_MSFT_DATA_REDIRECT_URI = http://localhost:8000/api/msft/auth/connect/callback Service_MSFT_DATA_REDIRECT_URI = http://localhost:8000/api/msft/auth/connect/callback
Service_GOOGLE_AUTH_CLIENT_ID = 354925410565-aqs2b2qaiqmm73qpjnel6al8eid78uvg.apps.googleusercontent.com Service_GOOGLE_AUTH_CLIENT_ID = 813678306829-3f23dnf1cs4aaftubjfickt46tlmkgjm.apps.googleusercontent.com
Service_GOOGLE_AUTH_CLIENT_SECRET = DEV_ENC:Z0FBQUFBQm8xSUpETDJhbGVQMHlFQzNPVFI1ZzBMa3pNMGlQUHhaQm10eVl1bFlSeTBybzlTOWE2MURXQ0hkRlo0NlNGbHQxWEl1OVkxQnVKYlhhOXR1cUF4T3k0WDdscktkY1oyYllRTmdDTWpfbUdwWGtSd1JvNlYxeTBJdEtaaS1vYnItcW0yaFM= Service_GOOGLE_AUTH_CLIENT_SECRET = DEV_ENC:Z0FBQUFBQnFBa1kyd1hPd09vcVFtbVg0Sm5Nd1VYVEEtWjZMZkFndmFVS0ZlcTU0dzJnYVYzRkZWbjh0QldyZkhseDV2cUgxYkNHTzF6MXhqQlZ2N0UtbmhPeWRKUHBVdzV0Q1ROaWNuN2xjMmVzMjNZQ2ZYZ3dOTHgxaU5sTGRjVHpfakhYeWF0ZGU=
Service_GOOGLE_AUTH_REDIRECT_URI = http://localhost:8000/api/google/auth/login/callback Service_GOOGLE_AUTH_REDIRECT_URI = http://localhost:8000/api/google/auth/login/callback
Service_GOOGLE_DATA_CLIENT_ID = 354925410565-aqs2b2qaiqmm73qpjnel6al8eid78uvg.apps.googleusercontent.com Service_GOOGLE_DATA_CLIENT_ID = 813678306829-3f23dnf1cs4aaftubjfickt46tlmkgjm.apps.googleusercontent.com
Service_GOOGLE_DATA_CLIENT_SECRET = DEV_ENC:Z0FBQUFBQm8xSUpETDJhbGVQMHlFQzNPVFI1ZzBMa3pNMGlQUHhaQm10eVl1bFlSeTBybzlTOWE2MURXQ0hkRlo0NlNGbHQxWEl1OVkxQnVKYlhhOXR1cUF4T3k0WDdscktkY1oyYllRTmdDTWpfbUdwWGtSd1JvNlYxeTBJdEtaaS1vYnItcW0yaFM= Service_GOOGLE_DATA_CLIENT_SECRET = DEV_ENC:Z0FBQUFBQnFBa1kySXoyd1BmTnhOd1owTUJOWm53WlZMMjFHNGJhSUwyd2NDUW9BanlRWVJPLU5jYzRlcm5QeW96d0JYUkVWVWd2dGNBVEpJbElZY2lWb0o5S0gyNnhoV1pnNXhpSFEyaklZZjcwX2lVU0ktMEJGN01DMDhXQ3k4R1BXc1Q3ejFjOEg=
Service_GOOGLE_DATA_REDIRECT_URI = http://localhost:8000/api/google/auth/connect/callback Service_GOOGLE_DATA_REDIRECT_URI = http://localhost:8000/api/google/auth/connect/callback
# ClickUp OAuth (Verbindungen / automation). Create an app in ClickUp: Settings → Apps → API; set redirect URL to Service_CLICKUP_OAUTH_REDIRECT_URI exactly. # ClickUp OAuth (Verbindungen / automation). Create an app in ClickUp: Settings → Apps → API; set redirect URL to Service_CLICKUP_OAUTH_REDIRECT_URI exactly.
@ -87,13 +87,6 @@ APP_DEBUG_CHAT_WORKFLOW_DIR = D:/Athi/Local/Web/poweron/local/debug
APP_DEBUG_ACCOUNTING_SYNC_ENABLED = True APP_DEBUG_ACCOUNTING_SYNC_ENABLED = True
APP_DEBUG_ACCOUNTING_SYNC_DIR = D:/Athi/Local/Web/poweron/local/debug/sync APP_DEBUG_ACCOUNTING_SYNC_DIR = D:/Athi/Local/Web/poweron/local/debug/sync
# Mandate Pre-Processing Servers
PREPROCESS_ALTHAUS_CHAT_SECRET = DEV_ENC:Z0FBQUFBQnBudkpGbEphQ3ZUMlFMQ2EwSGpoSE9NNzRJNTJtaGk1N0RGakdIYnVVeVFHZmF5OXB3QTVWLVNaZk9wNkhfQkZWRnVwRGRxem9iRzJIWXdpX1NIN2FwSExfT3c9PQ==
# Preprocessor API Configuration
PP_QUERY_API_KEY=ouho02j0rj2oijroi3rj2oijro23jr0990
PP_QUERY_BASE_URL=https://poweron-althaus-preprocess-prod-e3fegaatc7faency.switzerlandnorth-01.azurewebsites.net/api/v1/dataquery/query
# Azure Communication Services Email Configuration # Azure Communication Services Email Configuration
MESSAGING_ACS_CONNECTION_STRING = endpoint=https://mailing-poweron-prod.switzerland.communication.azure.com/;accesskey=4UizRfBKBgMhDgQ92IYINM6dJsO1HIeL6W1DvIX9S0GtaS1PjIXqJQQJ99CAACULyCpHwxUcAAAAAZCSuSCt MESSAGING_ACS_CONNECTION_STRING = endpoint=https://mailing-poweron-prod.switzerland.communication.azure.com/;accesskey=4UizRfBKBgMhDgQ92IYINM6dJsO1HIeL6W1DvIX9S0GtaS1PjIXqJQQJ99CAACULyCpHwxUcAAAAAZCSuSCt
MESSAGING_ACS_SENDER_EMAIL = DoNotReply@poweron.swiss MESSAGING_ACS_SENDER_EMAIL = DoNotReply@poweron.swiss

View file

@ -21,7 +21,7 @@ APP_JWT_KEY_SECRET = INT_ENC:Z0FBQUFBQm8xSVRjNUctb2RwU25iR3ZnanBOdHZhWUtIajZ1RnZ
APP_TOKEN_EXPIRY=300 APP_TOKEN_EXPIRY=300
# CORS Configuration # CORS Configuration
APP_ALLOWED_ORIGINS=http://localhost:8080,http://localhost:5176,https://playground.poweron.swiss,https://playground-int.poweron.swiss,https://nyla.poweron.swiss,https://nyla-int.poweron.swiss,https://nyla.poweron-center.net,https://nyla-int.poweron-center.net APP_ALLOWED_ORIGINS=http://localhost:8080,http://localhost:5176,https://nyla.poweron.swiss,https://nyla-int.poweron.swiss,https://nyla.poweron-center.net,https://nyla-int.poweron-center.net
# Logging configuration # Logging configuration
APP_LOGGING_LOG_LEVEL = DEBUG APP_LOGGING_LOG_LEVEL = DEBUG
@ -34,18 +34,18 @@ APP_LOGGING_ROTATION_SIZE = 10485760
APP_LOGGING_BACKUP_COUNT = 5 APP_LOGGING_BACKUP_COUNT = 5
# OAuth: Auth app (login/JWT) vs Data app (Graph / Google APIs) # OAuth: Auth app (login/JWT) vs Data app (Graph / Google APIs)
Service_MSFT_AUTH_CLIENT_ID = c7e7112d-61dc-4f3a-8cd3-08cc4cd7504c Service_MSFT_AUTH_CLIENT_ID = 840b759a-4d79-4a7a-9598-f3ed204d99d8
Service_MSFT_AUTH_CLIENT_SECRET = INT_ENC:Z0FBQUFBQm83T29rMDZvcV9qTG5xb1FzUkdqS1llbzRxSEJXbmpONFFtcUtfZXdtZjQybmJSMjBjMEpnRVhiOGRuczZvVFBFdVVTQV80SG9PSnRQTEpLdVViNm5wc2E5aGRLWjZ4TGF1QjVkNmdRSzBpNWNkYXVublFYclVEdEM5TVBBZWVVMW5RVWk= Service_MSFT_AUTH_CLIENT_SECRET = INT_ENC:Z0FBQUFBQnFBa1kydlVubld1d1h6SUNSWW1aZ3p4X3Zod1NDTjhZVnVYS2lqOERGTFp2OXJ4TGRiNlRLVFpzLUVDTUhkZGhGUWdxa1djdEV5UWkyblN1UHZoaFBjaExNTEpGMG1PRGJEbDdHVll0Ungwcl9JemZ4ZXFzZUNFQmFlZi1DZFlCekU1S3E=
Service_MSFT_AUTH_REDIRECT_URI = https://gateway-int.poweron.swiss/api/msft/auth/login/callback Service_MSFT_AUTH_REDIRECT_URI = https://gateway-int.poweron.swiss/api/msft/auth/login/callback
Service_MSFT_DATA_CLIENT_ID = c7e7112d-61dc-4f3a-8cd3-08cc4cd7504c Service_MSFT_DATA_CLIENT_ID = 840b759a-4d79-4a7a-9598-f3ed204d99d8
Service_MSFT_DATA_CLIENT_SECRET = INT_ENC:Z0FBQUFBQm83T29rMDZvcV9qTG5xb1FzUkdqS1llbzRxSEJXbmpONFFtcUtfZXdtZjQybmJSMjBjMEpnRVhiOGRuczZvVFBFdVVTQV80SG9PSnRQTEpLdVViNm5wc2E5aGRLWjZ4TGF1QjVkNmdRSzBpNWNkYXVublFYclVEdEM5TVBBZWVVMW5RVWk= Service_MSFT_DATA_CLIENT_SECRET = INT_ENC:Z0FBQUFBQnFBa1kyS1hWZXEzUzZTTE5MUlJncVowMU95Y0hmV1hveDBZOWdLU1RIUWt3SGlXNGxVTXVKc2QyQmtmWTlJRU43ZnRDdnlDTGxQY0hTU25CWWFFdDhUem9HU0VYcTFJTVFEbVk0dUhmVzJNVlEzNTNWdjdmaW9WeUVDVW5PRmNFZEQzNTY=
Service_MSFT_DATA_REDIRECT_URI = https://gateway-int.poweron.swiss/api/msft/auth/connect/callback Service_MSFT_DATA_REDIRECT_URI = https://gateway-int.poweron.swiss/api/msft/auth/connect/callback
Service_GOOGLE_AUTH_CLIENT_ID = 354925410565-aqs2b2qaiqmm73qpjnel6al8eid78uvg.apps.googleusercontent.com Service_GOOGLE_AUTH_CLIENT_ID = 813678306829-3f23dnf1cs4aaftubjfickt46tlmkgjm.apps.googleusercontent.com
Service_GOOGLE_AUTH_CLIENT_SECRET = INT_ENC:Z0FBQUFBQm8xSVRjNThGeVRNd3hacThtRnE0bzlDa0JPUWQyaEd6QjlFckdsMGZjRlRfUks2bXV3aDdVRTF3LVRlZVY5WjVzSXV4ZGNnX002RDl3dkNYdGFzZkxVUW01My1wTHRCanVCLUozZEx4TlduQlB5MnpvNTR2SGlvbFl1YkhzTEtsSi1SOEo= Service_GOOGLE_AUTH_CLIENT_SECRET = INT_ENC:Z0FBQUFBQnFBa1kyUTUwNXNGaHRNaGxxbF9sdWJ3Q0xLYU5yOHB4Yk8zMDZvQ29yaEhWOE5JMENXRk5jb2ZBdzRKQ2ZTTld6ZlIxemhOYzN1VE10TjBDRWZEMXlLVWRNYjZ0VG5RZ3I3NWt0SEJzMzdsUmRzcVNmbktRNHZqTUF6a2EyUkVUSFJnZFE=
Service_GOOGLE_AUTH_REDIRECT_URI = https://gateway-int.poweron.swiss/api/google/auth/login/callback Service_GOOGLE_AUTH_REDIRECT_URI = https://gateway-int.poweron.swiss/api/google/auth/login/callback
Service_GOOGLE_DATA_CLIENT_ID = 354925410565-aqs2b2qaiqmm73qpjnel6al8eid78uvg.apps.googleusercontent.com Service_GOOGLE_DATA_CLIENT_ID = 813678306829-3f23dnf1cs4aaftubjfickt46tlmkgjm.apps.googleusercontent.com
Service_GOOGLE_DATA_CLIENT_SECRET = INT_ENC:Z0FBQUFBQm8xSVRjNThGeVRNd3hacThtRnE0bzlDa0JPUWQyaEd6QjlFckdsMGZjRlRfUks2bXV3aDdVRTF3LVRlZVY5WjVzSXV4ZGNnX002RDl3dkNYdGFzZkxVUW01My1wTHRCanVCLUozZEx4TlduQlB5MnpvNTR2SGlvbFl1YkhzTEtsSi1SOEo= Service_GOOGLE_DATA_CLIENT_SECRET = INT_ENC:Z0FBQUFBQnFBa1kyV1FRVjF0c0d3d0dyWU1TdW9HdXVkdHdsVWZKYTJjbGZPRDhMRjA2M0FkaUZIVmhIUmFKNjg2ekFodHd6NG80VTI3TC1icW1LZ01jWVZuQ1pKRm5nMW5UREJEaGp2Wl9oRDRCSmZVT0JpTnkwXzgwY0pkV29yczQ5akF2d1ZGcVY=
Service_GOOGLE_DATA_REDIRECT_URI = https://gateway-int.poweron.swiss/api/google/auth/connect/callback Service_GOOGLE_DATA_REDIRECT_URI = https://gateway-int.poweron.swiss/api/google/auth/connect/callback
# ClickUp OAuth (Verbindungen / automation). Create an app in ClickUp: Settings → Apps → API; set redirect URL to Service_CLICKUP_OAUTH_REDIRECT_URI exactly. # ClickUp OAuth (Verbindungen / automation). Create an app in ClickUp: Settings → Apps → API; set redirect URL to Service_CLICKUP_OAUTH_REDIRECT_URI exactly.
@ -87,13 +87,6 @@ APP_DEBUG_CHAT_WORKFLOW_DIR = ./test-chat
APP_DEBUG_ACCOUNTING_SYNC_ENABLED = FALSE APP_DEBUG_ACCOUNTING_SYNC_ENABLED = FALSE
APP_DEBUG_ACCOUNTING_SYNC_DIR = ./debug/sync APP_DEBUG_ACCOUNTING_SYNC_DIR = ./debug/sync
# Mandate Pre-Processing Servers
PREPROCESS_ALTHAUS_CHAT_SECRET = INT_ENC:Z0FBQUFBQnBaSnM4UkNBelhvckxCQUVjZm94N3BZUDcxaEMyckE2dm1lRVhqODhrWU1SUjNXZ3dQZlVJOWhveXFkZXpobW5xT0NneGZ2SkNUblFmYXd0WTBYNTl3UmRnSWc9PQ==
# Preprocessor API Configuration
PP_QUERY_API_KEY=ouho02j0rj2oijroi3rj2oijro23jr0990
PP_QUERY_BASE_URL=https://poweron-althaus-preprocess-prod-e3fegaatc7faency.switzerlandnorth-01.azurewebsites.net/api/v1/dataquery/query
# Azure Communication Services Email Configuration # Azure Communication Services Email Configuration
MESSAGING_ACS_CONNECTION_STRING = endpoint=https://mailing-poweron-prod.switzerland.communication.azure.com/;accesskey=4UizRfBKBgMhDgQ92IYINM6dJsO1HIeL6W1DvIX9S0GtaS1PjIXqJQQJ99CAACULyCpHwxUcAAAAAZCSuSCt MESSAGING_ACS_CONNECTION_STRING = endpoint=https://mailing-poweron-prod.switzerland.communication.azure.com/;accesskey=4UizRfBKBgMhDgQ92IYINM6dJsO1HIeL6W1DvIX9S0GtaS1PjIXqJQQJ99CAACULyCpHwxUcAAAAAZCSuSCt
MESSAGING_ACS_SENDER_EMAIL = DoNotReply@poweron.swiss MESSAGING_ACS_SENDER_EMAIL = DoNotReply@poweron.swiss

View file

@ -32,19 +32,19 @@ APP_LOGGING_ROTATION_SIZE = 10485760
APP_LOGGING_BACKUP_COUNT = 5 APP_LOGGING_BACKUP_COUNT = 5
# OAuth: Auth app (login/JWT) vs Data app (Graph / Google APIs) # OAuth: Auth app (login/JWT) vs Data app (Graph / Google APIs)
Service_MSFT_AUTH_CLIENT_ID = c7e7112d-61dc-4f3a-8cd3-08cc4cd7504c Service_MSFT_AUTH_CLIENT_ID = 840b759a-4d79-4a7a-9598-f3ed204d99d8
Service_MSFT_AUTH_CLIENT_SECRET = PROD_ENC:Z0FBQUFBQnBESkk2T25scFU1T1pNd2FENTFRM3kzcEpSXy1HT0trQkR2Wnl3U3RYbExzRy1YUTkxd3lPZE84U2lhX3FZanp5TjhYRGluLXVjU3hjaWRBUnZLbVhtRDItZ3FxNXJ3MUxicUZTXzJWZVNrR0VKN3ZlNEtET1ppOFk0MzNmbkwyRmROUk4= Service_MSFT_AUTH_CLIENT_SECRET = PROD_ENC:Z0FBQUFBQnFBa1kyeUZORDYxOFdlNHk1N25kV3pSQVJMUVFwLUFlMzlzQjQ1eVljOTlzX184RndsTmtTV1FjdWkyQlBiUkdCbGt5S2ltZjJxa2I2dHBMdnJqZnhFSnBCampHYjB3RG5URDM1YzZSLVd6TGdaRXRVcEdadE5zM2thNV9SZy1KZDdLSHY=
Service_MSFT_AUTH_REDIRECT_URI=https://api.poweron.swiss/api/msft/auth/login/callback Service_MSFT_AUTH_REDIRECT_URI=https://api.poweron.swiss/api/msft/auth/login/callback
Service_MSFT_DATA_CLIENT_ID = c7e7112d-61dc-4f3a-8cd3-08cc4cd7504c Service_MSFT_DATA_CLIENT_ID = 840b759a-4d79-4a7a-9598-f3ed204d99d8
Service_MSFT_DATA_CLIENT_SECRET = PROD_ENC:Z0FBQUFBQnBESkk2T25scFU1T1pNd2FENTFRM3kzcEpSXy1HT0trQkR2Wnl3U3RYbExzRy1YUTkxd3lPZE84U2lhX3FZanp5TjhYRGluLXVjU3hjaWRBUnZLbVhtRDItZ3FxNXJ3MUxicUZTXzJWZVNrR0VKN3ZlNEtET1ppOFk0MzNmbkwyRmROUk4= Service_MSFT_DATA_CLIENT_SECRET = PROD_ENC:Z0FBQUFBQnFBa1kySk5uMmlWczBWTE00MHBIcWlBbVJmVmc3MlBWbDA1YTFaS3psZjVLd3d1X2FvRHV0X0c5blpLV0FpY05aMTJMMzUtcG8wakF2TlM3SGQ2VjFZM3JLT1MwTlZ0bm9BRlpkbHVPQTFNaXJvazlQRzN4M2ZZNEVhV1JHV190dWluSUk=
Service_MSFT_DATA_REDIRECT_URI = https://api.poweron.swiss/api/msft/auth/connect/callback Service_MSFT_DATA_REDIRECT_URI = https://api.poweron.swiss/api/msft/auth/connect/callback
Service_GOOGLE_AUTH_CLIENT_ID = 354925410565-aqs2b2qaiqmm73qpjnel6al8eid78uvg.apps.googleusercontent.com Service_GOOGLE_AUTH_CLIENT_ID = 813678306829-3f23dnf1cs4aaftubjfickt46tlmkgjm.apps.googleusercontent.com
Service_GOOGLE_AUTH_CLIENT_SECRET = PROD_ENC:Z0FBQUFBQnBDM1Z3eWFwSEZ4YnRJcjU1OW5kcXZKdkt1Z3gzWDFhVW5Eelh3VnpnNlppcWxweHY5UUQzeDIyVk83cW1XNVE4bllVWnR2MjlSQzFrV1UyUVV6OUt5b3Vqa3QzMUIwNFBqc2FVSXRxTlQ1OHVJZVFibnhBQ2puXzBwSXp5NUZhZjM1d1o= Service_GOOGLE_AUTH_CLIENT_SECRET = PROD_ENC:Z0FBQUFBQnFBa1kybjVVZ0FldUE1NTJiY2U1N0I0aVU0Z2hfeWlYc2tTdmlxTS1NdGxsRnFHdjZVcW5RRHZkUFhzUTVyX2RaZHlrQThRdTdCRmVBelBOcDlsbFQyd19SZExuWEM5aTcwQ0FvY3ctMUlWU1pndDE0MkdzeTZZRHkwLWU3aW56LW1jS20=
Service_GOOGLE_AUTH_REDIRECT_URI = Service_GOOGLE_AUTH_REDIRECT_URI = https://api.poweron.swiss/api/google/auth/login/callback
Service_GOOGLE_DATA_CLIENT_ID = 354925410565-aqs2b2qaiqmm73qpjnel6al8eid78uvg.apps.googleusercontent.com Service_GOOGLE_DATA_CLIENT_ID = 813678306829-3f23dnf1cs4aaftubjfickt46tlmkgjm.apps.googleusercontent.com
Service_GOOGLE_DATA_CLIENT_SECRET = PROD_ENC:Z0FBQUFBQnBDM1Z3eWFwSEZ4YnRJcjU1OW5kcXZKdkt1Z3gzWDFhVW5Eelh3VnpnNlppcWxweHY5UUQzeDIyVk83cW1XNVE4bllVWnR2MjlSQzFrV1UyUVV6OUt5b3Vqa3QzMUIwNFBqc2FVSXRxTlQ1OHVJZVFibnhBQ2puXzBwSXp5NUZhZjM1d1o= Service_GOOGLE_DATA_CLIENT_SECRET = PROD_ENC:Z0FBQUFBQnFBa1kyMnFma3VPOVJtTFFrNDRLN0NkWHY2dUZDWlJzdDVMd3p3N19IY0tWdURRRzExOGZCMjJOYmpKT1E0cTVwYlgtcVJINTY0anZPc1VoTW00cHl6NVh3ZHVTek1oT1RqWUhtamRkZ1dENWlwNTlZSU1oNWczeGdEOC1Gbk5XU2RBcmI=
Service_GOOGLE_DATA_REDIRECT_URI = Service_GOOGLE_DATA_REDIRECT_URI = https://api.poweron.swiss/api/google/auth/connect/callback
# ClickUp OAuth (Verbindungen / automation). Create an app in ClickUp: Settings → Apps → API; set redirect URL to Service_CLICKUP_OAUTH_REDIRECT_URI exactly. # ClickUp OAuth (Verbindungen / automation). Create an app in ClickUp: Settings → Apps → API; set redirect URL to Service_CLICKUP_OAUTH_REDIRECT_URI exactly.
Service_CLICKUP_CLIENT_ID = O3FX3H602A30MQN4I4SBNGJLIDBD5SL4 Service_CLICKUP_CLIENT_ID = O3FX3H602A30MQN4I4SBNGJLIDBD5SL4
@ -86,13 +86,6 @@ APP_DEBUG_CHAT_WORKFLOW_DIR = ./test-chat
APP_DEBUG_ACCOUNTING_SYNC_ENABLED = FALSE APP_DEBUG_ACCOUNTING_SYNC_ENABLED = FALSE
APP_DEBUG_ACCOUNTING_SYNC_DIR = ./debug/sync APP_DEBUG_ACCOUNTING_SYNC_DIR = ./debug/sync
# Mandate Pre-Processing Servers
PREPROCESS_ALTHAUS_CHAT_SECRET = PROD_ENC:Z0FBQUFBQnBaSnM4RVRmYW5IelNIbklTUDZIMEoycEN4ZFF0YUJoWWlUTUh2M0dhSXpYRXcwVkRGd1VieDNsYkdCRlpxMUR5Rjk1RDhPRkE5bmVtc2VDMURfLW9QNkxMVHN0M1JhbU9sa3JHWmdDZnlHS3BQRVBGTERVMHhXOVdDOWVqNkhfSUQyOHo=
# Preprocessor API Configuration
PP_QUERY_API_KEY=ouho02j0rj2oijroi3rj2oijro23jr0990
PP_QUERY_BASE_URL=https://poweron-althaus-preprocess-prod-e3fegaatc7faency.switzerlandnorth-01.azurewebsites.net/api/v1/dataquery/query
# Azure Communication Services Email Configuration # Azure Communication Services Email Configuration
MESSAGING_ACS_CONNECTION_STRING = endpoint=https://mailing-poweron-prod.switzerland.communication.azure.com/;accesskey=4UizRfBKBgMhDgQ92IYINM6dJsO1HIeL6W1DvIX9S0GtaS1PjIXqJQQJ99CAACULyCpHwxUcAAAAAZCSuSCt MESSAGING_ACS_CONNECTION_STRING = endpoint=https://mailing-poweron-prod.switzerland.communication.azure.com/;accesskey=4UizRfBKBgMhDgQ92IYINM6dJsO1HIeL6W1DvIX9S0GtaS1PjIXqJQQJ99CAACULyCpHwxUcAAAAAZCSuSCt
MESSAGING_ACS_SENDER_EMAIL = DoNotReply@poweron.swiss MESSAGING_ACS_SENDER_EMAIL = DoNotReply@poweron.swiss

View file

@ -20,7 +20,7 @@ APP_JWT_KEY_SECRET = PROD_ENC:Z0FBQUFBQnBDM1Z3elhfV0Rnd2pQRjlMdkVwX1FnSmRhSzNZUl
APP_TOKEN_EXPIRY=300 APP_TOKEN_EXPIRY=300
# CORS Configuration # CORS Configuration
APP_ALLOWED_ORIGINS=http://localhost:8080,http://localhost:5176,https://playground.poweron.swiss,https://playground-int.poweron.swiss,https://nyla.poweron.swiss,https://nyla-int.poweron.swiss,https://nyla.poweron-center.net,https://nyla-int.poweron-center.net APP_ALLOWED_ORIGINS=http://localhost:8080,http://localhost:5176,https://nyla.poweron.swiss,https://nyla-int.poweron.swiss,https://nyla.poweron-center.net,https://nyla-int.poweron-center.net
# Logging configuration # Logging configuration
APP_LOGGING_LOG_LEVEL = DEBUG APP_LOGGING_LOG_LEVEL = DEBUG
@ -33,18 +33,18 @@ APP_LOGGING_ROTATION_SIZE = 10485760
APP_LOGGING_BACKUP_COUNT = 5 APP_LOGGING_BACKUP_COUNT = 5
# OAuth: Auth app (login/JWT) vs Data app (Graph / Google APIs) # OAuth: Auth app (login/JWT) vs Data app (Graph / Google APIs)
Service_MSFT_AUTH_CLIENT_ID = c7e7112d-61dc-4f3a-8cd3-08cc4cd7504c Service_MSFT_AUTH_CLIENT_ID = 840b759a-4d79-4a7a-9598-f3ed204d99d8
Service_MSFT_AUTH_CLIENT_SECRET = PROD_ENC:Z0FBQUFBQnBESkk2T25scFU1T1pNd2FENTFRM3kzcEpSXy1HT0trQkR2Wnl3U3RYbExzRy1YUTkxd3lPZE84U2lhX3FZanp5TjhYRGluLXVjU3hjaWRBUnZLbVhtRDItZ3FxNXJ3MUxicUZTXzJWZVNrR0VKN3ZlNEtET1ppOFk0MzNmbkwyRmROUk4= Service_MSFT_AUTH_CLIENT_SECRET = PROD_ENC:Z0FBQUFBQnFBa1kySFR2NjBKM084QTNpeUlyUmM4R0N0SU1BZ2x4MmVTZTVHQkVzRE9GdmFkV041MzhudFhobjU0RWNnd3lqeXpKUXA5aGtNZkhtYU12QjBtX0NjemVmdEZBdC1TbXVBSXJTcF9vMlJXd0ZNRTRKRFBMUXNjTF85eTBxakR4RVNfYmU=
Service_MSFT_AUTH_REDIRECT_URI = https://gateway-prod.poweron.swiss/api/msft/auth/login/callback Service_MSFT_AUTH_REDIRECT_URI = https://gateway-prod.poweron.swiss/api/msft/auth/login/callback
Service_MSFT_DATA_CLIENT_ID = c7e7112d-61dc-4f3a-8cd3-08cc4cd7504c Service_MSFT_DATA_CLIENT_ID = 840b759a-4d79-4a7a-9598-f3ed204d99d8
Service_MSFT_DATA_CLIENT_SECRET = PROD_ENC:Z0FBQUFBQnBESkk2T25scFU1T1pNd2FENTFRM3kzcEpSXy1HT0trQkR2Wnl3U3RYbExzRy1YUTkxd3lPZE84U2lhX3FZanp5TjhYRGluLXVjU3hjaWRBUnZLbVhtRDItZ3FxNXJ3MUxicUZTXzJWZVNrR0VKN3ZlNEtET1ppOFk0MzNmbkwyRmROUk4= Service_MSFT_DATA_CLIENT_SECRET = PROD_ENC:Z0FBQUFBQnFBa1kyNVU4cVRIZFdjS3l2S1RJVTVlc1ozQ1liZXZDX1VwdFZQUzFtS0N6UWYyeGxkNGNmY1hoaWxEUDBXVU5QR2t3Vi1ZV1A2QkxqbnpobzJwOXdzYTBZaFZYdnNkeDE1VVl0bm4weHFiLXdON2gtZzAwMTkxNWRoZldFM2djSkNHVS0=
Service_MSFT_DATA_REDIRECT_URI = https://gateway-prod.poweron.swiss/api/msft/auth/connect/callback Service_MSFT_DATA_REDIRECT_URI = https://gateway-prod.poweron.swiss/api/msft/auth/connect/callback
Service_GOOGLE_AUTH_CLIENT_ID = 354925410565-aqs2b2qaiqmm73qpjnel6al8eid78uvg.apps.googleusercontent.com Service_GOOGLE_AUTH_CLIENT_ID = 813678306829-3f23dnf1cs4aaftubjfickt46tlmkgjm.apps.googleusercontent.com
Service_GOOGLE_AUTH_CLIENT_SECRET = PROD_ENC:Z0FBQUFBQnBDM1Z3eWFwSEZ4YnRJcjU1OW5kcXZKdkt1Z3gzWDFhVW5Eelh3VnpnNlppcWxweHY5UUQzeDIyVk83cW1XNVE4bllVWnR2MjlSQzFrV1UyUVV6OUt5b3Vqa3QzMUIwNFBqc2FVSXRxTlQ1OHVJZVFibnhBQ2puXzBwSXp5NUZhZjM1d1o= Service_GOOGLE_AUTH_CLIENT_SECRET = PROD_ENC:Z0FBQUFBQnFBa1kyUmJleVpTOF9OaFV3NGVfcWVBX2oxSjUwMWRGOFZRWFRIN1FZRzZ6U3VQMlg5a21RY1drTHh3U254LW4zM1A1cXQ1TTFWYlNoek9hSHJIeE4tbm1wU1lKRXlKNU5HVWI4VGZwTVE0VnJGaV8wZmNvdkVrMjJGeXdmZ3UyNmVXN1E=
Service_GOOGLE_AUTH_REDIRECT_URI = https://gateway-prod.poweron.swiss/api/google/auth/login/callback Service_GOOGLE_AUTH_REDIRECT_URI = https://gateway-prod.poweron.swiss/api/google/auth/login/callback
Service_GOOGLE_DATA_CLIENT_ID = 354925410565-aqs2b2qaiqmm73qpjnel6al8eid78uvg.apps.googleusercontent.com Service_GOOGLE_DATA_CLIENT_ID = 813678306829-3f23dnf1cs4aaftubjfickt46tlmkgjm.apps.googleusercontent.com
Service_GOOGLE_DATA_CLIENT_SECRET = PROD_ENC:Z0FBQUFBQnBDM1Z3eWFwSEZ4YnRJcjU1OW5kcXZKdkt1Z3gzWDFhVW5Eelh3VnpnNlppcWxweHY5UUQzeDIyVk83cW1XNVE4bllVWnR2MjlSQzFrV1UyUVV6OUt5b3Vqa3QzMUIwNFBqc2FVSXRxTlQ1OHVJZVFibnhBQ2puXzBwSXp5NUZhZjM1d1o= Service_GOOGLE_DATA_CLIENT_SECRET = PROD_ENC:Z0FBQUFBQnFBa1kyY2pxMDh0U0RqWERianBMTTNtSUZPSzhKUzh4S0RTenR2MmxnRDlvQzJjbDVTczRWLUJtVnhxWTE2MmUxQjJia2xJcVUzVlFlUnpma040NFdHRzVNRUt0OXR0c2JkTkRmQ1RIYllXbXFFaExIQWNycFVHbUxHbmtYOVhOVUV2MFY=
Service_GOOGLE_DATA_REDIRECT_URI = https://gateway-prod.poweron.swiss/api/google/auth/connect/callback Service_GOOGLE_DATA_REDIRECT_URI = https://gateway-prod.poweron.swiss/api/google/auth/connect/callback
# ClickUp OAuth (Verbindungen / automation). Create an app in ClickUp: Settings → Apps → API; set redirect URL to Service_CLICKUP_OAUTH_REDIRECT_URI exactly. # ClickUp OAuth (Verbindungen / automation). Create an app in ClickUp: Settings → Apps → API; set redirect URL to Service_CLICKUP_OAUTH_REDIRECT_URI exactly.
@ -87,13 +87,6 @@ APP_DEBUG_CHAT_WORKFLOW_DIR = ./test-chat
APP_DEBUG_ACCOUNTING_SYNC_ENABLED = FALSE APP_DEBUG_ACCOUNTING_SYNC_ENABLED = FALSE
APP_DEBUG_ACCOUNTING_SYNC_DIR = ./debug/sync APP_DEBUG_ACCOUNTING_SYNC_DIR = ./debug/sync
# Mandate Pre-Processing Servers
PREPROCESS_ALTHAUS_CHAT_SECRET = PROD_ENC:Z0FBQUFBQnBaSnM4RVRmYW5IelNIbklTUDZIMEoycEN4ZFF0YUJoWWlUTUh2M0dhSXpYRXcwVkRGd1VieDNsYkdCRlpxMUR5Rjk1RDhPRkE5bmVtc2VDMURfLW9QNkxMVHN0M1JhbU9sa3JHWmdDZnlHS3BQRVBGTERVMHhXOVdDOWVqNkhfSUQyOHo=
# Preprocessor API Configuration
PP_QUERY_API_KEY=ouho02j0rj2oijroi3rj2oijro23jr0990
PP_QUERY_BASE_URL=https://poweron-althaus-preprocess-prod-e3fegaatc7faency.switzerlandnorth-01.azurewebsites.net/api/v1/dataquery/query
# Azure Communication Services Email Configuration # Azure Communication Services Email Configuration
MESSAGING_ACS_CONNECTION_STRING = endpoint=https://mailing-poweron-prod.switzerland.communication.azure.com/;accesskey=4UizRfBKBgMhDgQ92IYINM6dJsO1HIeL6W1DvIX9S0GtaS1PjIXqJQQJ99CAACULyCpHwxUcAAAAAZCSuSCt MESSAGING_ACS_CONNECTION_STRING = endpoint=https://mailing-poweron-prod.switzerland.communication.azure.com/;accesskey=4UizRfBKBgMhDgQ92IYINM6dJsO1HIeL6W1DvIX9S0GtaS1PjIXqJQQJ99CAACULyCpHwxUcAAAAAZCSuSCt
MESSAGING_ACS_SENDER_EMAIL = DoNotReply@poweron.swiss MESSAGING_ACS_SENDER_EMAIL = DoNotReply@poweron.swiss

View file

@ -19,6 +19,30 @@ from modules.shared.voiceCatalog import getDefaultVoice as _catalogDefaultVoice
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def _buildPrimarySttRecognitionFields(
*,
model: str,
lightweight: bool,
) -> Dict[str, Any]:
"""Shared fields for batch + streaming primary RecognitionConfig."""
base: Dict[str, Any] = {
"enable_automatic_punctuation": True,
"model": model,
}
if lightweight:
base["enable_word_time_offsets"] = False
base["enable_word_confidence"] = False
base["max_alternatives"] = 1
base["use_enhanced"] = False
else:
base["enable_word_time_offsets"] = True
base["enable_word_confidence"] = True
base["max_alternatives"] = 3
base["use_enhanced"] = True
return base
# Gemini-TTS speaker IDs from voices.list use short names (e.g. "Kore") and require # Gemini-TTS speaker IDs from voices.list use short names (e.g. "Kore") and require
# SynthesisInput.prompt + VoiceSelectionParams.model_name (google-cloud-texttospeech >= 2.24.0). # SynthesisInput.prompt + VoiceSelectionParams.model_name (google-cloud-texttospeech >= 2.24.0).
_GEMINI_TTS_DEFAULT_MODEL = "gemini-2.5-flash-tts" _GEMINI_TTS_DEFAULT_MODEL = "gemini-2.5-flash-tts"
@ -73,7 +97,10 @@ class ConnectorGoogleSpeech:
sampleRate: int = None, channels: int = None, sampleRate: int = None, channels: int = None,
skipFallbacks: bool = False, skipFallbacks: bool = False,
phraseHints: Optional[list] = None, phraseHints: Optional[list] = None,
alternativeLanguages: Optional[list] = None) -> Dict: alternativeLanguages: Optional[list] = None,
model: str = "latest_long",
lightweight: bool = False,
audioFormat: Optional[str] = None) -> Dict:
""" """
Convert speech to text using Google Cloud Speech-to-Text API. Convert speech to text using Google Cloud Speech-to-Text API.
@ -82,6 +109,9 @@ class ConnectorGoogleSpeech:
language: Language code (e.g., 'de-DE', 'en-US') language: Language code (e.g., 'de-DE', 'en-US')
sample_rate: Audio sample rate (auto-detected if None) sample_rate: Audio sample rate (auto-detected if None)
channels: Number of audio channels (auto-detected if None) channels: Number of audio channels (auto-detected if None)
model: Google recognition model (e.g. latest_long, latest_short)
lightweight: If True, omit word timings/confidence, single alternative, no enhanced model
audioFormat: If set (webm_opus, linear16, mp3, flac, wav), skip auto-detection
Returns: Returns:
Dict containing transcribed text, confidence, and metadata Dict containing transcribed text, confidence, and metadata
@ -92,8 +122,24 @@ class ConnectorGoogleSpeech:
logger.warning(f"Invalid sampleRate={sampleRate}, treating as unknown for auto-detection") logger.warning(f"Invalid sampleRate={sampleRate}, treating as unknown for auto-detection")
sampleRate = None sampleRate = None
# Auto-detect audio format if not provided explicitFormat = (audioFormat or "").strip().lower() or None
if sampleRate is None or channels is None: if explicitFormat:
if channels is None:
channels = 1
if sampleRate is None:
if explicitFormat == "webm_opus":
sampleRate = 48000
elif explicitFormat == "linear16":
sampleRate = 16000
elif explicitFormat in ("mp3", "flac"):
sampleRate = 44100
elif explicitFormat == "wav":
sampleRate = 16000
else:
sampleRate = 16000
audioFormat = explicitFormat
logger.info(f"STT explicit format: {audioFormat}, {sampleRate}Hz, {channels}ch")
elif sampleRate is None or channels is None:
validation = self.validateAudioFormat(audioContent) validation = self.validateAudioFormat(audioContent)
if not validation["valid"]: if not validation["valid"]:
return { return {
@ -156,12 +202,7 @@ class ConnectorGoogleSpeech:
"encoding": encoding, "encoding": encoding,
"audio_channel_count": channels, "audio_channel_count": channels,
"language_code": language, "language_code": language,
"enable_automatic_punctuation": True, **_buildPrimarySttRecognitionFields(model=model, lightweight=lightweight),
"model": "latest_long",
"enable_word_time_offsets": True,
"enable_word_confidence": True,
"max_alternatives": 3,
"use_enhanced": True,
} }
if phraseHints: if phraseHints:
@ -205,8 +246,7 @@ class ConnectorGoogleSpeech:
sample_rate_hertz=16000, sample_rate_hertz=16000,
audio_channel_count=1, audio_channel_count=1,
language_code=language, language_code=language,
enable_automatic_punctuation=True, **_buildPrimarySttRecognitionFields(model=model, lightweight=lightweight),
model="latest_long"
) )
try: try:
response = await asyncio.to_thread( response = await asyncio.to_thread(
@ -343,7 +383,7 @@ class ConnectorGoogleSpeech:
"error": "No recognition results (silence or unclear audio)" "error": "No recognition results (silence or unclear audio)"
} }
models = ["latest_long", "phone_call", "latest_short"] models = list(dict.fromkeys([model, "latest_long", "phone_call", "latest_short"]))
for fallback_config in fallback_configs: for fallback_config in fallback_configs:
for model in models: for model in models:
@ -419,6 +459,9 @@ class ConnectorGoogleSpeech:
audioQueue: asyncio.Queue, audioQueue: asyncio.Queue,
language: str = "de-DE", language: str = "de-DE",
phraseHints: Optional[list] = None, phraseHints: Optional[list] = None,
model: str = "latest_long",
lightweight: bool = False,
singleUtterance: bool = False,
) -> AsyncGenerator[Dict[str, Any], None]: ) -> AsyncGenerator[Dict[str, Any], None]:
""" """
Stream audio chunks to Google Cloud Speech-to-Text Streaming API. Stream audio chunks to Google Cloud Speech-to-Text Streaming API.
@ -429,9 +472,13 @@ class ConnectorGoogleSpeech:
Send (b"", True) to signal end of stream. Send (b"", True) to signal end of stream.
language: Language code language: Language code
phraseHints: Optional boost phrases phraseHints: Optional boost phrases
model: Google recognition model (e.g. latest_long, latest_short)
lightweight: If True, use non-enhanced primary config (lower latency)
singleUtterance: If True, end stream after first utterance (client should reconnect)
Yields: Yields:
Dicts with keys: isFinal, transcript, confidence, stabilityScore, audioDurationSec Dicts with keys: isFinal, transcript, confidence, stabilityScore, audioDurationSec;
optionally endOfSingleUtterance, reconnectRequired
""" """
STREAM_LIMIT_SEC = 290 STREAM_LIMIT_SEC = 290
streamStartTs = time.time() streamStartTs = time.time()
@ -442,9 +489,7 @@ class ConnectorGoogleSpeech:
"sample_rate_hertz": 48000, "sample_rate_hertz": 48000,
"audio_channel_count": 1, "audio_channel_count": 1,
"language_code": language, "language_code": language,
"enable_automatic_punctuation": True, **_buildPrimarySttRecognitionFields(model=model, lightweight=lightweight),
"model": "latest_long",
"use_enhanced": True,
} }
if phraseHints: if phraseHints:
configParams["speech_contexts"] = [speech.SpeechContext(phrases=phraseHints, boost=15.0)] configParams["speech_contexts"] = [speech.SpeechContext(phrases=phraseHints, boost=15.0)]
@ -453,7 +498,7 @@ class ConnectorGoogleSpeech:
streamingConfig = speech.StreamingRecognitionConfig( streamingConfig = speech.StreamingRecognitionConfig(
config=recognitionConfig, config=recognitionConfig,
interim_results=True, interim_results=True,
single_utterance=False, single_utterance=singleUtterance,
) )
import queue as threadQueue import queue as threadQueue
@ -490,7 +535,22 @@ class ConnectorGoogleSpeech:
) )
for response in responseStream: for response in responseStream:
elapsed = time.time() - streamStartTs elapsed = time.time() - streamStartTs
estimatedDurationSec = totalAudioBytes / (48000 * 1 * 2) if totalAudioBytes else 0
durationFromResults = 0.0
for result in response.results:
rt = getattr(result, "result_end_time", None)
if rt is None:
continue
if hasattr(rt, "total_seconds"):
durationFromResults = max(durationFromResults, float(rt.total_seconds()))
else:
durationFromResults = max(
durationFromResults,
float(getattr(rt, "seconds", 0)) + float(getattr(rt, "nanos", 0)) * 1e-9,
)
estimatedDurationSec = durationFromResults if durationFromResults > 0 else (
totalAudioBytes / (48000 * 1 * 2) if totalAudioBytes else 0.0
)
finalTexts = [] finalTexts = []
interimTexts = [] interimTexts = []
@ -524,6 +584,13 @@ class ConnectorGoogleSpeech:
"stabilityScore": 0.0, "stabilityScore": 0.0,
"audioDurationSec": estimatedDurationSec, "audioDurationSec": estimatedDurationSec,
}), loop) }), loop)
speechEvt = getattr(response, "speech_event_type", None)
if speechEvt and "END_OF_SINGLE_UTTERANCE" in str(speechEvt):
asyncio.run_coroutine_threadsafe(resultOutQ.put({
"endOfSingleUtterance": True,
"audioDurationSec": estimatedDurationSec,
}), loop)
if elapsed >= STREAM_LIMIT_SEC: if elapsed >= STREAM_LIMIT_SEC:
logger.info("Streaming STT approaching 5-min limit, client should reconnect") logger.info("Streaming STT approaching 5-min limit, client should reconnect")
asyncio.run_coroutine_threadsafe(resultOutQ.put({ asyncio.run_coroutine_threadsafe(resultOutQ.put({

View file

@ -62,15 +62,15 @@ class DataSource(PowerOnModel):
description="Owner user ID", description="Owner user ID",
json_schema_extra={"label": "Benutzer-ID", "fk_target": {"db": "poweron_app", "table": "UserInDB", "labelField": "username"}}, json_schema_extra={"label": "Benutzer-ID", "fk_target": {"db": "poweron_app", "table": "UserInDB", "labelField": "username"}},
) )
autoSync: bool = Field( ragIndexEnabled: bool = Field(
default=False, default=False,
description="Automatically sync on schedule", description="When true this tree element is indexed into the RAG knowledge store",
json_schema_extra={"label": "Auto-Sync"}, json_schema_extra={"label": "Im RAG indexieren", "frontend_type": "checkbox", "frontend_readonly": False, "frontend_required": False},
) )
lastSynced: Optional[float] = Field( lastIndexed: Optional[float] = Field(
default=None, default=None,
description="Last sync timestamp", description="Timestamp of last successful RAG indexing run",
json_schema_extra={"label": "Letzter Sync", "frontend_type": "timestamp"}, json_schema_extra={"label": "Letzte Indexierung", "frontend_type": "timestamp"},
) )
scope: str = Field( scope: str = Field(
default="personal", default="personal",

View file

@ -484,10 +484,10 @@ class UserConnection(PowerOnModel):
default=None, default=None,
description=( description=(
"Per-connection knowledge ingestion preferences. schemaVersion=1 keys: " "Per-connection knowledge ingestion preferences. schemaVersion=1 keys: "
"neutralizeBeforeEmbed (bool), mailContentDepth (metadata|snippet|full), " "mailContentDepth (metadata|snippet|full), mailIndexAttachments (bool), "
"mailIndexAttachments (bool), filesIndexBinaries (bool), mimeAllowlist (list[str]), " "filesIndexBinaries (bool), clickupScope (titles|title_description|with_comments), "
"clickupScope (titles|title_description|with_comments), " "clickupIndexAttachments (bool), maxAgeDays (int). "
"surfaceToggles (dict per authority), maxAgeDays (int)." "Neutralization is controlled per DataSource.neutralize (not here)."
), ),
json_schema_extra={"frontend_type": "json", "frontend_readonly": False, "frontend_required": False, "label": "Wissenspräferenzen"}, json_schema_extra={"frontend_type": "json", "frontend_readonly": False, "frontend_required": False, "label": "Wissenspräferenzen"},
) )

View file

@ -1080,6 +1080,8 @@ class CommcoachService:
audioContent=audioContent, audioContent=audioContent,
language=language, language=language,
skipFallbacks=True, skipFallbacks=True,
model="latest_short",
lightweight=True,
) )
transcribedText = "" transcribedText = ""

View file

@ -40,6 +40,8 @@ class BrowserBotConnector:
botAccountPassword: Optional[str] = None, botAccountPassword: Optional[str] = None,
transferMode: str = "auto", transferMode: str = "auto",
debugMode: bool = False, debugMode: bool = False,
avatarMediaData: Optional[str] = None,
avatarMediaType: Optional[str] = None,
) -> Dict[str, Any]: ) -> Dict[str, Any]:
""" """
Send join command to the Browser Bot service. Send join command to the Browser Bot service.
@ -79,12 +81,16 @@ class BrowserBotConnector:
"debugMode": debugMode, "debugMode": debugMode,
} }
# Add authenticated join credentials if configured
if botAccountEmail and botAccountPassword: if botAccountEmail and botAccountPassword:
payload["botAccountEmail"] = botAccountEmail payload["botAccountEmail"] = botAccountEmail
payload["botAccountPassword"] = botAccountPassword payload["botAccountPassword"] = botAccountPassword
logger.info(f"Bot will join authenticated as {botAccountEmail}") logger.info(f"Bot will join authenticated as {botAccountEmail}")
if avatarMediaData and avatarMediaType:
payload["avatarMediaData"] = avatarMediaData
payload["avatarMediaType"] = avatarMediaType
logger.info(f"Avatar media attached: {avatarMediaType}, {len(avatarMediaData)} chars")
try: try:
async with aiohttp.ClientSession(timeout=_BOT_TIMEOUT) as session: async with aiohttp.ClientSession(timeout=_BOT_TIMEOUT) as session:
async with session.post(f"{self.botUrl}/api/bot", json=payload) as resp: async with session.post(f"{self.botUrl}/api/bot", json=payload) as resp:

View file

@ -111,6 +111,18 @@ class TeamsbotMeetingModule(PowerOnModel):
defaultDirectorPrompts: Optional[str] = Field(default=None, description="JSON list of default director prompts") defaultDirectorPrompts: Optional[str] = Field(default=None, description="JSON list of default director prompts")
goals: Optional[str] = Field(default=None, description="Free-text goals") goals: Optional[str] = Field(default=None, description="Free-text goals")
kpiTargets: Optional[str] = Field(default=None, description="JSON object with structured KPI targets") kpiTargets: Optional[str] = Field(default=None, description="JSON object with structured KPI targets")
defaultMeetingLink: Optional[str] = Field(
default=None,
description="Default Teams meeting URL for new sessions in this module (user can override)",
)
defaultBotName: Optional[str] = Field(
default=None,
description="Default display name for the bot when starting a session from this module",
)
defaultAvatarFileId: Optional[str] = Field(
default=None,
description="FileItem ID for the default avatar image/video shown in the meeting",
)
status: TeamsbotModuleStatus = Field(default=TeamsbotModuleStatus.ACTIVE) status: TeamsbotModuleStatus = Field(default=TeamsbotModuleStatus.ACTIVE)
@ -217,6 +229,7 @@ class TeamsbotUserSettings(PowerOnModel):
triggerCooldownSeconds: Optional[int] = Field(default=None, description="Trigger cooldown override") triggerCooldownSeconds: Optional[int] = Field(default=None, description="Trigger cooldown override")
contextWindowSegments: Optional[int] = Field(default=None, description="Context window override") contextWindowSegments: Optional[int] = Field(default=None, description="Context window override")
debugMode: Optional[bool] = Field(default=None, description="Debug mode override") debugMode: Optional[bool] = Field(default=None, description="Debug mode override")
avatarFileId: Optional[str] = Field(default=None, description="FileItem ID for bot avatar image/video override")
# ============================================================================ # ============================================================================
@ -240,6 +253,7 @@ class TeamsbotConfig(BaseModel):
triggerCooldownSeconds: int = Field(default=3, ge=1, le=30, description="Minimum seconds between AI calls") triggerCooldownSeconds: int = Field(default=3, ge=1, le=30, description="Minimum seconds between AI calls")
contextWindowSegments: int = Field(default=20, ge=5, le=100, description="Number of transcript segments to include in AI context") contextWindowSegments: int = Field(default=20, ge=5, le=100, description="Number of transcript segments to include in AI context")
debugMode: bool = Field(default=False, description="Enable debug mode: screenshots at every join step for diagnostics") debugMode: bool = Field(default=False, description="Enable debug mode: screenshots at every join step for diagnostics")
avatarFileId: Optional[str] = Field(default=None, description="FileItem ID for bot avatar image/video shown in the meeting")
def _getEffectiveBrowserBotUrl(self) -> Optional[str]: def _getEffectiveBrowserBotUrl(self) -> Optional[str]:
"""Resolve the effective browser bot URL: per-instance config takes priority, then env variable.""" """Resolve the effective browser bot URL: per-instance config takes priority, then env variable."""
@ -257,6 +271,7 @@ class TeamsbotStartSessionRequest(BaseModel):
"""Request to start a new Teams Bot session.""" """Request to start a new Teams Bot session."""
meetingLink: str = Field(description="Teams meeting join link (e.g., https://teams.microsoft.com/l/meetup-join/...)") meetingLink: str = Field(description="Teams meeting join link (e.g., https://teams.microsoft.com/l/meetup-join/...)")
botName: Optional[str] = Field(default=None, description="Override bot name for this session") botName: Optional[str] = Field(default=None, description="Override bot name for this session")
moduleId: Optional[str] = Field(default=None, description="Optional MeetingModule to attach this session to")
connectionId: Optional[str] = Field(default=None, description="Microsoft connection ID for Graph API access") connectionId: Optional[str] = Field(default=None, description="Microsoft connection ID for Graph API access")
joinMode: Optional[TeamsbotJoinMode] = Field(default=None, description="How the bot joins: systemBot, anonymous, or userAccount. Defaults to systemBot if credentials configured, else anonymous.") joinMode: Optional[TeamsbotJoinMode] = Field(default=None, description="How the bot joins: systemBot, anonymous, or userAccount. Defaults to systemBot if credentials configured, else anonymous.")
sessionContext: Optional[str] = Field(default=None, description="Custom context/knowledge to provide to the bot for this session (e.g. meeting agenda, documents, background info)") sessionContext: Optional[str] = Field(default=None, description="Custom context/knowledge to provide to the bot for this session (e.g. meeting agenda, documents, background info)")
@ -277,6 +292,9 @@ class CreateMeetingModuleRequest(BaseModel):
defaultDirectorPrompts: Optional[str] = None defaultDirectorPrompts: Optional[str] = None
goals: Optional[str] = None goals: Optional[str] = None
kpiTargets: Optional[str] = None kpiTargets: Optional[str] = None
defaultMeetingLink: Optional[str] = None
defaultBotName: Optional[str] = None
defaultAvatarFileId: Optional[str] = None
class UpdateMeetingModuleRequest(BaseModel): class UpdateMeetingModuleRequest(BaseModel):
@ -287,6 +305,9 @@ class UpdateMeetingModuleRequest(BaseModel):
defaultDirectorPrompts: Optional[str] = None defaultDirectorPrompts: Optional[str] = None
goals: Optional[str] = None goals: Optional[str] = None
kpiTargets: Optional[str] = None kpiTargets: Optional[str] = None
defaultMeetingLink: Optional[str] = None
defaultBotName: Optional[str] = None
defaultAvatarFileId: Optional[str] = None
status: Optional[TeamsbotModuleStatus] = None status: Optional[TeamsbotModuleStatus] = None
@ -304,6 +325,7 @@ class TeamsbotConfigUpdateRequest(BaseModel):
triggerCooldownSeconds: Optional[int] = None triggerCooldownSeconds: Optional[int] = None
contextWindowSegments: Optional[int] = None contextWindowSegments: Optional[int] = None
debugMode: Optional[bool] = None debugMode: Optional[bool] = None
avatarFileId: Optional[str] = None
# ============================================================================ # ============================================================================

View file

@ -25,6 +25,7 @@ from .datamodelTeamsbot import (
TeamsbotDirectorPromptStatus, TeamsbotDirectorPromptStatus,
TeamsbotDirectorPromptMode, TeamsbotDirectorPromptMode,
TeamsbotMeetingModule, TeamsbotMeetingModule,
TeamsbotModuleStatus,
) )
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -338,6 +339,8 @@ class TeamsbotObjects:
def getModules(self, instanceId: str) -> List[Dict[str, Any]]: def getModules(self, instanceId: str) -> List[Dict[str, Any]]:
"""Get all meeting modules for a feature instance.""" """Get all meeting modules for a feature instance."""
records = self.db.getRecordset(TeamsbotMeetingModule, recordFilter={"instanceId": instanceId}) records = self.db.getRecordset(TeamsbotMeetingModule, recordFilter={"instanceId": instanceId})
for r in records:
r.setdefault("status", TeamsbotModuleStatus.ACTIVE.value)
records.sort(key=lambda r: r.get("sysCreatedAt") or "", reverse=True) records.sort(key=lambda r: r.get("sysCreatedAt") or "", reverse=True)
return records return records

View file

@ -290,6 +290,19 @@ def _runMigrations():
migrated = False migrated = False
# M2: MeetingModule default meeting link / bot name (additive columns)
if _tableExists("TeamsbotMeetingModule"):
for col, sqlType in (
("defaultMeetingLink", "TEXT"),
("defaultBotName", "TEXT"),
):
if not _columnExists("TeamsbotMeetingModule", col):
cur.execute(
f'ALTER TABLE "TeamsbotMeetingModule" ADD COLUMN "{col}" {sqlType} NULL',
)
logger.info(f"Migration M2: Added TeamsbotMeetingModule.{col}")
migrated = True
# M1: Create default Adhoc modules for orphaned sessions # M1: Create default Adhoc modules for orphaned sessions
# (only runs if TeamsbotSession table exists with moduleId column # (only runs if TeamsbotSession table exists with moduleId column
# and there are sessions without a moduleId) # and there are sessions without a moduleId)

View file

@ -40,6 +40,7 @@ from .datamodelTeamsbot import (
TeamsbotDirectorPromptMode, TeamsbotDirectorPromptMode,
TeamsbotDirectorPromptStatus, TeamsbotDirectorPromptStatus,
TeamsbotMeetingModule, TeamsbotMeetingModule,
TeamsbotModuleStatus,
CreateMeetingModuleRequest, CreateMeetingModuleRequest,
UpdateMeetingModuleRequest, UpdateMeetingModuleRequest,
DIRECTOR_PROMPT_FILE_LIMIT, DIRECTOR_PROMPT_FILE_LIMIT,
@ -203,6 +204,7 @@ async def createModule(
data["instanceId"] = instanceId data["instanceId"] = instanceId
data["mandateId"] = mandateId data["mandateId"] = mandateId
data["ownerUserId"] = str(context.user.id) data["ownerUserId"] = str(context.user.id)
data.setdefault("status", TeamsbotModuleStatus.ACTIVE.value)
module = interface.createModule(data) module = interface.createModule(data)
return {"module": module} return {"module": module}
@ -281,6 +283,11 @@ async def startSession(
interface = _getInterface(context, instanceId) interface = _getInterface(context, instanceId)
config = _getInstanceConfig(instanceId) config = _getInstanceConfig(instanceId)
if body.moduleId:
mod = interface.getModule(body.moduleId)
if not mod or str(mod.get("instanceId") or "") != str(instanceId):
raise HTTPException(status_code=400, detail="Invalid moduleId for this instance")
# Extract and validate meeting URL from user input (handles SafeLinks, invitation text, etc.) # Extract and validate meeting URL from user input (handles SafeLinks, invitation text, etc.)
cleanMeetingUrl = _extractTeamsMeetingUrl(body.meetingLink) cleanMeetingUrl = _extractTeamsMeetingUrl(body.meetingLink)
@ -288,6 +295,7 @@ async def startSession(
sessionData = TeamsbotSession( sessionData = TeamsbotSession(
instanceId=instanceId, instanceId=instanceId,
mandateId=mandateId, mandateId=mandateId,
moduleId=body.moduleId,
meetingLink=cleanMeetingUrl, meetingLink=cleanMeetingUrl,
botName=body.botName or config.botName, botName=body.botName or config.botName,
sessionContext=body.sessionContext, sessionContext=body.sessionContext,
@ -426,6 +434,54 @@ async def listSessions(
return {"sessions": sessions} return {"sessions": sessions}
@router.get("/{instanceId}/dashboard/stream")
@limiter.limit("60/minute")
async def streamDashboard(
request: Request,
instanceId: str,
context: RequestContext = Depends(getRequestContext),
):
"""
SSE channel for the Teamsbot dashboard: repeated snapshots of sessions and meeting modules.
Push interval: 3s while any own session is pending/joining/active, otherwise 20s.
Same session visibility rules as GET /sessions (own sessions unless platform admin).
"""
_validateInstanceAccess(instanceId, context)
interface = _getInterface(context, instanceId)
userId = None if context.isPlatformAdmin else str(context.user.id)
activeStatuses = {
TeamsbotSessionStatus.PENDING.value,
TeamsbotSessionStatus.JOINING.value,
TeamsbotSessionStatus.ACTIVE.value,
}
async def eventGenerator():
while True:
sessionRows = []
try:
sessionRows = interface.getSessions(instanceId, includeEnded=True, userId=userId)
moduleRows = interface.getModules(instanceId)
payload = {"type": "dashboardState", "sessions": sessionRows, "modules": moduleRows}
yield f"data: {json.dumps(payload, default=str)}\n\n"
except asyncio.CancelledError:
raise
except Exception as ex:
logger.warning("dashboard stream tick failed: %s", ex)
yield f"data: {json.dumps({'type': 'error', 'message': 'dashboard_tick_failed'})}\n\n"
hasActive = any((s.get("status") in activeStatuses) for s in sessionRows)
await asyncio.sleep(3.0 if hasActive else 20.0)
return StreamingResponse(
eventGenerator(),
media_type="text/event-stream",
headers={
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"X-Accel-Buffering": "no",
},
)
@router.get("/{instanceId}/sessions/{sessionId}") @router.get("/{instanceId}/sessions/{sessionId}")
@limiter.limit("30/minute") @limiter.limit("30/minute")
async def getSession( async def getSession(
@ -634,12 +690,10 @@ def _getEffectiveConfig(instanceId: str, userId: str, interface) -> TeamsbotConf
if not userSettings: if not userSettings:
return baseConfig return baseConfig
# Merge: user settings override instance defaults (only non-None values) # Merge: user settings override instance defaults (only non-None values).
# Derive mergeable fields from TeamsbotConfig so new fields are picked up automatically.
overrides = {} overrides = {}
for field in ["botName", "aiSystemPrompt", "responseMode", for field in TeamsbotConfig.model_fields:
"responseChannel", "transferMode", "language", "voiceId",
"triggerIntervalSeconds", "triggerCooldownSeconds", "contextWindowSegments",
"debugMode"]:
value = userSettings.get(field) value = userSettings.get(field)
if value is not None: if value is not None:
overrides[field] = value overrides[field] = value

View file

@ -83,10 +83,10 @@ _EPHEMERAL_PHRASE_INTENTS: Dict[str, str] = {
), ),
"agentRound": ( "agentRound": (
"One short sentence (max ~14 words) the assistant says BETWEEN rounds " "One short sentence (max ~14 words) the assistant says BETWEEN rounds "
"of a longer agent task to signal that work is still in progress. " "of a longer agent task to update the audience on what it is doing. "
"Include the placeholder tokens '{round}' and '{maxRounds}' so the " "Include the placeholder token '{activity}' which will be filled with "
"caller can substitute the actual numbers — e.g. 'Step {round} of " "the current activity — e.g. 'I am {activity}, one moment...' or "
"{maxRounds}, still working.'" "'Currently {activity}, almost there...'. Do NOT include step numbers."
), ),
} }
@ -602,6 +602,13 @@ class TeamsbotService:
self._lastTranscriptText: Optional[str] = None self._lastTranscriptText: Optional[str] = None
self._lastTranscriptId: Optional[str] = None self._lastTranscriptId: Optional[str] = None
self._lastSttTime: float = 0.0 self._lastSttTime: float = 0.0
# Audio chunk aggregation: collect chunks and send to STT only
# after a speech pause or when the buffer reaches a target duration.
self._audioBuffer: bytes = b""
self._audioBufferStartTime: float = 0.0
self._audioBufferLastChunkTime: float = 0.0
self._audioBufferSampleRate: int = 16000
self._lastBotResponseText: Optional[str] = None self._lastBotResponseText: Optional[str] = None
self._lastBotResponseTs: float = 0.0 self._lastBotResponseTs: float = 0.0
@ -732,6 +739,12 @@ class TeamsbotService:
hasAuth = bool(botAccountEmail and botAccountPassword) hasAuth = bool(botAccountEmail and botAccountPassword)
logger.info(f"Joining meeting for session {sessionId}: auth={hasAuth}, email={botAccountEmail or 'N/A'}, transferMode={self.config.transferMode}") logger.info(f"Joining meeting for session {sessionId}: auth={hasAuth}, email={botAccountEmail or 'N/A'}, transferMode={self.config.transferMode}")
avatarMediaData = None
avatarMediaType = None
avatarFileId = self._resolveAvatarFileId(session, interface)
if avatarFileId:
avatarMediaData, avatarMediaType = self._loadAvatarFileData(avatarFileId, interface)
result = await self.browserBotConnector.joinMeeting( result = await self.browserBotConnector.joinMeeting(
sessionId=sessionId, sessionId=sessionId,
meetingUrl=meetingLink, meetingUrl=meetingLink,
@ -743,6 +756,8 @@ class TeamsbotService:
botAccountPassword=botAccountPassword, botAccountPassword=botAccountPassword,
transferMode=self.config.transferMode if hasattr(self.config, 'transferMode') else "auto", transferMode=self.config.transferMode if hasattr(self.config, 'transferMode') else "auto",
debugMode=self.config.debugMode if hasattr(self.config, 'debugMode') else False, debugMode=self.config.debugMode if hasattr(self.config, 'debugMode') else False,
avatarMediaData=avatarMediaData,
avatarMediaType=avatarMediaType,
) )
if result.get("success"): if result.get("success"):
@ -767,6 +782,37 @@ class TeamsbotService:
}) })
await _emitSessionEvent(sessionId, "statusChange", {"status": "error", "errorMessage": str(e)}) await _emitSessionEvent(sessionId, "statusChange", {"status": "error", "errorMessage": str(e)})
def _resolveAvatarFileId(self, session, interface):
"""Resolve avatarFileId: module override > config default."""
moduleId = session.get("moduleId")
if moduleId:
module = interface.getModule(moduleId)
if module and module.get("defaultAvatarFileId"):
return module["defaultAvatarFileId"]
return getattr(self.config, "avatarFileId", None)
def _loadAvatarFileData(self, fileId, _teamsbotInterface):
"""Load avatar file as base64 data + mime type. Returns (data, mimeType) or (None, None)."""
import base64
from modules.interfaces import interfaceDbManagement
try:
mgmt = interfaceDbManagement.getInterface(self.currentUser, self.mandateId)
fileRecord = mgmt.getFile(fileId)
if not fileRecord:
logger.warning(f"Avatar file {fileId} not found")
return None, None
mimeType = getattr(fileRecord, "mimeType", None) or "image/png"
rawBytes = mgmt.getFileData(fileId)
if not rawBytes:
logger.warning(f"Avatar file {fileId} has no data")
return None, None
b64 = base64.b64encode(rawBytes).decode("ascii")
logger.info(f"Avatar file loaded: {fileId}, {mimeType}, {len(b64)} chars base64")
return b64, mimeType
except Exception as e:
logger.error(f"Failed to load avatar file {fileId}: {e}")
return None, None
async def leaveMeeting(self, sessionId: str): async def leaveMeeting(self, sessionId: str):
"""Send leave command to the Browser Bot service.""" """Send leave command to the Browser Bot service."""
from . import interfaceFeatureTeamsbot as interfaceDb from . import interfaceFeatureTeamsbot as interfaceDb
@ -1164,6 +1210,14 @@ class TeamsbotService:
interface.updateSession(sessionId, updates) interface.updateSession(sessionId, updates)
await _emitSessionEvent(sessionId, "statusChange", {"status": status, "errorMessage": errorMessage}) await _emitSessionEvent(sessionId, "statusChange", {"status": status, "errorMessage": errorMessage})
# Flush remaining audio buffer before generating summary
if dbStatus in [TeamsbotSessionStatus.ENDED.value, TeamsbotSessionStatus.ERROR.value]:
if self._audioBuffer:
logger.info(f"[AudioChunk] Flushing remaining buffer on session end ({len(self._audioBuffer)} bytes)")
self._audioBuffer = b""
self._audioBufferStartTime = 0.0
self._audioBufferLastChunkTime = 0.0
# Generate summary when session ends # Generate summary when session ends
if dbStatus == TeamsbotSessionStatus.ENDED.value: if dbStatus == TeamsbotSessionStatus.ENDED.value:
asyncio.create_task(self._generateMeetingSummary(sessionId)) asyncio.create_task(self._generateMeetingSummary(sessionId))
@ -1178,11 +1232,18 @@ class TeamsbotService:
voiceInterface, voiceInterface,
websocket: WebSocket, websocket: WebSocket,
): ):
"""Process an audio chunk from WebRTC capture — run STT and feed into transcript pipeline.""" """Process an audio chunk from WebRTC capture. The bot-side VAD
(AudioWorklet / ScriptProcessor) already segments speech into 1-8s
voiced chunks. Here we apply a minimum-duration safety net: very short
chunks (<1s) are buffered until they reach 1s; everything else goes
straight to STT. A wall-clock timeout flushes stale buffers."""
import base64 import base64
_MIN_CHUNK_SEC = 1.0
_STALE_TIMEOUT_SEC = 3.0
try: try:
audioBytes = base64.b64decode(audioBase64) audioBytes = base64.b64decode(audioBase64)
if len(audioBytes) < 1000: if len(audioBytes) < 500:
return return
if captureDiagnostics: if captureDiagnostics:
@ -1195,14 +1256,12 @@ class TeamsbotService:
f"rms={rms}, nativeRate={nativeSampleRate}, bytes={len(audioBytes)}" f"rms={rms}, nativeRate={nativeSampleRate}, bytes={len(audioBytes)}"
) )
# Use RMS from capture diagnostics to skip real silence. isSilent = False
# Byte-variation heuristics produced false positives and dropped valid speech.
if captureDiagnostics and captureDiagnostics.get("rms") is not None: if captureDiagnostics and captureDiagnostics.get("rms") is not None:
try: try:
rmsVal = float(captureDiagnostics.get("rms")) rmsVal = float(captureDiagnostics.get("rms"))
if rmsVal < 0.0003: if rmsVal < 0.0003:
logger.debug(f"[AudioChunk] Skipping silent audio ({len(audioBytes)} bytes, rms={rmsVal:.6f})") isSilent = True
return
except Exception: except Exception:
pass pass
@ -1210,21 +1269,51 @@ class TeamsbotService:
logger.warning(f"[AudioChunk] No voice interface available for session {sessionId}") logger.warning(f"[AudioChunk] No voice interface available for session {sessionId}")
return return
# Treat sampleRate=0 as unknown (triggers auto-detection) now = time.time()
effectiveSampleRate = sampleRate if sampleRate and sampleRate > 0 else None effectiveRate = sampleRate if sampleRate and sampleRate > 0 else 16000
if not isSilent:
if not self._audioBuffer:
self._audioBufferStartTime = now
self._audioBuffer += audioBytes
self._audioBufferLastChunkTime = now
self._audioBufferSampleRate = effectiveRate
bufferDuration = len(self._audioBuffer) / (effectiveRate * 2) if self._audioBuffer else 0.0
bufferAge = (now - self._audioBufferStartTime) if self._audioBuffer else 0.0
shouldFlush = (
self._audioBuffer
and (
bufferDuration >= _MIN_CHUNK_SEC
or (bufferAge >= _STALE_TIMEOUT_SEC and bufferDuration > 0.3)
)
)
if not shouldFlush:
return
flushBytes = self._audioBuffer
flushRate = self._audioBufferSampleRate
self._audioBuffer = b""
self._audioBufferStartTime = 0.0
self._audioBufferLastChunkTime = 0.0
flushDuration = len(flushBytes) / (flushRate * 2)
logger.info(f"[AudioChunk] Flushing buffer: {len(flushBytes)} bytes, {flushDuration:.1f}s, {flushRate}Hz")
phraseHints = list(self._knownSpeakers) phraseHints = list(self._knownSpeakers)
if self.config.botName: if self.config.botName:
phraseHints.append(self.config.botName) phraseHints.append(self.config.botName)
sttResult = await voiceInterface.speechToText( sttResult = await voiceInterface.speechToText(
audioContent=audioBytes, audioContent=flushBytes,
language=self.config.language or "de-DE", language=self.config.language or "de-DE",
sampleRate=effectiveSampleRate, sampleRate=flushRate,
channels=1, channels=1,
skipFallbacks=True, skipFallbacks=True,
phraseHints=phraseHints if phraseHints else None, phraseHints=phraseHints if phraseHints else None,
alternativeLanguages=["en-US"], audioFormat="linear16",
) )
if sttResult and sttResult.get("success") and sttResult.get("text"): if sttResult and sttResult.get("success") and sttResult.get("text"):
@ -1252,19 +1341,18 @@ class TeamsbotService:
def _registerSpeakerHint(self, speaker: str, text: str, sessionId: str = ""): def _registerSpeakerHint(self, speaker: str, text: str, sessionId: str = ""):
"""Track current speaker from captions for STT attribution. """Track current speaker from captions for STT attribution.
When the first non-bot caption arrives, retroactively attributes Retroactively attributes any unattributed STT segments whenever a
any STT segments that were created before a speaker was known.""" new non-bot caption speaker arrives (not just the first time)."""
if not speaker: if not speaker:
return return
normalizedSpeaker = speaker.strip() normalizedSpeaker = speaker.strip()
if not normalizedSpeaker or self._isBotSpeaker(normalizedSpeaker): if not normalizedSpeaker or self._isBotSpeaker(normalizedSpeaker):
return return
prevSpeaker = self._lastCaptionSpeaker
self._lastCaptionSpeaker = normalizedSpeaker self._lastCaptionSpeaker = normalizedSpeaker
self._knownSpeakers.add(normalizedSpeaker) self._knownSpeakers.add(normalizedSpeaker)
if prevSpeaker is None and self._unattributedTranscriptIds: if self._unattributedTranscriptIds:
from . import interfaceFeatureTeamsbot as interfaceDb from . import interfaceFeatureTeamsbot as interfaceDb
interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId) interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)
for tid in self._unattributedTranscriptIds: for tid in self._unattributedTranscriptIds:
@ -3243,17 +3331,53 @@ class TeamsbotService:
return await self._pickEphemeralPhrase("agentBusy") return await self._pickEphemeralPhrase("agentBusy")
async def _interimAgentRoundMessage( async def _interimAgentRoundMessage(
self, roundNum: int, maxRounds: int self, lastToolLabel: Optional[str] = None
) -> Optional[str]: ) -> Optional[str]:
"""Per-round progress notice for long agent runs (meeting voice / """Per-round progress notice for long agent runs (meeting voice /
chat, ephemeral). Phrasing is AI-localised once per session; chat, ephemeral). Generates a single short phrase in the bot's
``{round}`` and ``{maxRounds}`` placeholders are substituted at configured language that describes the current activity. Unlike
render time. Returns ``None`` if generation failed.""" the cached ephemeral phrases, this is a per-call AI generation
return await self._pickEphemeralPhrase( to avoid mixing English displayLabels into non-English speech."""
"agentRound", targetLang = (self.config.language or "").strip() or "en-US"
substitutions={"round": roundNum, "maxRounds": maxRounds}, botName = (self.config.botName or "the assistant").strip()
activityHint = lastToolLabel or "working on the task"
prompt = (
f"You are a meeting assistant named '{botName}'.\n"
f"Target spoken language (BCP-47): {targetLang}\n\n"
f"The assistant is currently busy with: {activityHint}\n\n"
f"Generate ONE short sentence (max 12 words) in {targetLang} "
f"that tells the audience what the assistant is doing right now. "
f"Natural, spoken style. No step numbers. No quotes around the output.\n"
f"Output ONLY the sentence, nothing else."
) )
try:
aiService = createAiService(
self.currentUser, self.mandateId, self.instanceId
)
await aiService.ensureAiObjectsInitialized()
request = AiCallRequest(
prompt=prompt,
context="",
options=AiCallOptions(
operationType=OperationTypeEnum.DATA_ANALYSE,
priority=PriorityEnum.SPEED,
),
)
response = await aiService.callAi(request)
except Exception as aiErr:
logger.debug(f"Agent round phrase generation failed: {aiErr}")
return None
if not response or response.errorCount != 0 or not response.content:
return None
result = response.content.strip().strip('"').strip("'")
if len(result) > 200:
result = result[:200]
return result
async def _notifyMeetingEphemeral(self, sessionId: str, text: str) -> None: async def _notifyMeetingEphemeral(self, sessionId: str, text: str) -> None:
"""Deliver a short line to the meeting (TTS + chat per config) without """Deliver a short line to the meeting (TTS + chat per config) without
persisting botResponses/transcripts, so the main agent answer stays the persisting botResponses/transcripts, so the main agent answer stays the
@ -3370,6 +3494,7 @@ class TeamsbotService:
finalText: str = "" finalText: str = ""
rounds = 0 rounds = 0
lastToolLabel: Optional[str] = None
try: try:
async for event in agentService.runAgent( async for event in agentService.runAgent(
prompt=taskText, prompt=taskText,
@ -3390,11 +3515,9 @@ class TeamsbotService:
"round": roundNum, "round": roundNum,
"maxRounds": maxR, "maxRounds": maxR,
}) })
# Runde 1: schon allgemeiner Start-Hinweis; ab Runde 2 ins Meeting melden.
# Director prompts bleiben still — keine Zwischen-Updates ins Meeting.
if roundNum >= 2 and not directorPromptMode: if roundNum >= 2 and not directorPromptMode:
try: try:
roundText = await self._interimAgentRoundMessage(roundNum, maxR) roundText = await self._interimAgentRoundMessage(lastToolLabel)
if roundText: if roundText:
await self._notifyMeetingEphemeral(sessionId, roundText) await self._notifyMeetingEphemeral(sessionId, roundText)
except Exception as roundNoticeErr: except Exception as roundNoticeErr:
@ -3402,12 +3525,26 @@ class TeamsbotService:
f"Session {sessionId}: Per-round agent notice failed: {roundNoticeErr}" f"Session {sessionId}: Per-round agent notice failed: {roundNoticeErr}"
) )
elif event.type == AgentEventTypeEnum.TOOL_CALL: elif event.type == AgentEventTypeEnum.TOOL_CALL:
toolName = (event.data or {}).get("toolName") if event.data else None evtData = event.data or {}
toolName = evtData.get("toolName")
lastToolLabel = evtData.get("displayLabel")
await _emitSessionEvent(sessionId, "agentRun", { await _emitSessionEvent(sessionId, "agentRun", {
"source": sourceLabel, "source": sourceLabel,
"promptId": promptId, "promptId": promptId,
"status": "toolCall", "status": "toolCall",
"toolName": toolName, "toolName": toolName,
"displayLabel": lastToolLabel,
})
elif event.type == AgentEventTypeEnum.TOOL_RESULT:
evtData = event.data or {}
resultSnippet = (evtData.get("data") or "")[:200]
await _emitSessionEvent(sessionId, "agentRun", {
"source": sourceLabel,
"promptId": promptId,
"status": "toolResult",
"toolName": evtData.get("toolName", ""),
"success": evtData.get("success", True),
"summary": resultSnippet,
}) })
elif event.type == AgentEventTypeEnum.FILE_CREATED: elif event.type == AgentEventTypeEnum.FILE_CREATED:
await _emitSessionEvent(sessionId, "documentCreated", event.data or {}) await _emitSessionEvent(sessionId, "documentCreated", event.data or {})

View file

@ -33,11 +33,6 @@ UI_OBJECTS = [
"label": t("Einstellungen", context="UI"), "label": t("Einstellungen", context="UI"),
"meta": {"area": "settings"} "meta": {"area": "settings"}
}, },
{
"objectKey": "ui.feature.workspace.rag-insights",
"label": t("Wissens-Insights", context="UI"),
"meta": {"area": "rag-insights"},
},
] ]
RESOURCE_OBJECTS = [ RESOURCE_OBJECTS = [
@ -86,7 +81,6 @@ TEMPLATE_ROLES = [
{"context": "UI", "item": "ui.feature.workspace.dashboard", "view": True}, {"context": "UI", "item": "ui.feature.workspace.dashboard", "view": True},
{"context": "UI", "item": "ui.feature.workspace.editor", "view": True}, {"context": "UI", "item": "ui.feature.workspace.editor", "view": True},
{"context": "UI", "item": "ui.feature.workspace.settings", "view": True}, {"context": "UI", "item": "ui.feature.workspace.settings", "view": True},
{"context": "UI", "item": "ui.feature.workspace.rag-insights", "view": True},
{"context": "DATA", "item": None, "view": True, "read": "m", "create": "n", "update": "n", "delete": "n"}, {"context": "DATA", "item": None, "view": True, "read": "m", "create": "n", "update": "n", "delete": "n"},
] ]
}, },
@ -97,7 +91,6 @@ TEMPLATE_ROLES = [
{"context": "UI", "item": "ui.feature.workspace.dashboard", "view": True}, {"context": "UI", "item": "ui.feature.workspace.dashboard", "view": True},
{"context": "UI", "item": "ui.feature.workspace.editor", "view": True}, {"context": "UI", "item": "ui.feature.workspace.editor", "view": True},
{"context": "UI", "item": "ui.feature.workspace.settings", "view": True}, {"context": "UI", "item": "ui.feature.workspace.settings", "view": True},
{"context": "UI", "item": "ui.feature.workspace.rag-insights", "view": True},
{"context": "RESOURCE", "item": "resource.feature.workspace.start", "view": True}, {"context": "RESOURCE", "item": "resource.feature.workspace.start", "view": True},
{"context": "RESOURCE", "item": "resource.feature.workspace.stop", "view": True}, {"context": "RESOURCE", "item": "resource.feature.workspace.stop", "view": True},
{"context": "RESOURCE", "item": "resource.feature.workspace.files", "view": True}, {"context": "RESOURCE", "item": "resource.feature.workspace.files", "view": True},

View file

@ -2192,49 +2192,4 @@ async def putWorkspaceUserSettings(
# ========================================================================= # =========================================================================
# RAG / Knowledge — anonymised instance statistics (presentation / KPIs) # RAG / Knowledge — anonymised instance statistics (presentation / KPIs)
# =========================================================================
def _collectWorkspaceFileIdsForStats(instanceId: str, mandateId: Optional[str]) -> List[str]:
"""All FileItem ids for this feature instance (any user). Knowledge rows are often stored
without featureInstanceId; we correlate by file id from the Management DB."""
from modules.datamodels.datamodelFiles import FileItem
from modules.interfaces.interfaceDbManagement import ComponentObjects
co = ComponentObjects()
rows = co.db.getRecordset(FileItem, recordFilter={"featureInstanceId": instanceId})
out: List[str] = []
m = str(mandateId) if mandateId else ""
for r in rows or []:
rid = r.get("id") if isinstance(r, dict) else getattr(r, "id", None)
if not rid:
continue
if m:
mid = r.get("mandateId") if isinstance(r, dict) else getattr(r, "mandateId", "") or ""
if mid and mid != m:
continue
out.append(str(rid))
return out
@router.get("/{instanceId}/rag-statistics")
@limiter.limit("60/minute")
async def getRagStatistics(
request: Request,
instanceId: str = Path(...),
days: int = Query(90, ge=7, le=365, description="Timeline window in days"),
context: RequestContext = Depends(getRequestContext),
):
"""Aggregated, non-identifying knowledge-store metrics for this workspace instance."""
mandateId, _instanceConfig = _validateInstanceAccess(instanceId, context)
workspaceFileIds = _collectWorkspaceFileIdsForStats(instanceId, mandateId)
kdb = getKnowledgeInterface(context.user)
stats = kdb.getRagStatisticsForInstance(
featureInstanceId=instanceId,
mandateId=str(mandateId) if mandateId else "",
timelineDays=days,
workspaceFileIds=workspaceFileIds,
)
if isinstance(stats, dict):
stats.setdefault("scope", {})
stats["scope"]["workspaceFileIdsResolved"] = len(workspaceFileIds)
return JSONResponse(stats)

View file

@ -133,6 +133,60 @@ class KnowledgeObjects:
return {"indexRows": indexCount, "chunks": chunkCount} return {"indexRows": indexCount, "chunks": chunkCount}
def deleteFileContentIndexByDataSource(self, dataSourceId: str) -> Dict[str, int]:
"""Delete all FileContentIndex rows whose provenance.dataSourceId matches.
Used when a user disables ragIndexEnabled on a DataSource to purge
only those chunks that were ingested from that specific tree element.
"""
if not dataSourceId:
return {"indexRows": 0, "chunks": 0}
allRows = self.db.getRecordset(FileContentIndex)
matchedRows = []
for row in allRows:
prov = row.get("provenance") if isinstance(row, dict) else getattr(row, "provenance", None)
if isinstance(prov, dict) and prov.get("dataSourceId") == dataSourceId:
matchedRows.append(row)
mandateIds: set = set()
chunkCount = 0
indexCount = 0
for row in matchedRows:
fid = row.get("id") if isinstance(row, dict) else getattr(row, "id", None)
mid = row.get("mandateId") if isinstance(row, dict) else getattr(row, "mandateId", "")
if not fid:
continue
chunks = self.db.getRecordset(ContentChunk, recordFilter={"fileId": fid})
for chunk in chunks:
if self.db.recordDelete(ContentChunk, chunk["id"]):
chunkCount += 1
if self.db.recordDelete(FileContentIndex, fid):
indexCount += 1
if mid:
mandateIds.add(str(mid))
for mid in mandateIds:
try:
from modules.interfaces.interfaceDbBilling import _getRootInterface
_getRootInterface().reconcileMandateStorageBilling(mid)
except Exception as ex:
logger.warning("reconcileMandateStorageBilling after datasource purge failed: %s", ex)
return {"indexRows": indexCount, "chunks": chunkCount}
def listFileContentIndexByDataSource(self, dataSourceId: str) -> List[Dict[str, Any]]:
"""List all FileContentIndex rows whose provenance.dataSourceId matches."""
if not dataSourceId:
return []
allRows = self.db.getRecordset(FileContentIndex)
out = []
for row in allRows:
prov = row.get("provenance") if isinstance(row, dict) else getattr(row, "provenance", None)
if isinstance(prov, dict) and prov.get("dataSourceId") == dataSourceId:
out.append(dict(row) if not isinstance(row, dict) else row)
return out
def deleteFileContentIndex(self, fileId: str) -> bool: def deleteFileContentIndex(self, fileId: str) -> bool:
"""Delete a FileContentIndex and all associated ContentChunks.""" """Delete a FileContentIndex and all associated ContentChunks."""
existing = self.getFileContentIndex(fileId) existing = self.getFileContentIndex(fileId)

View file

@ -1274,17 +1274,20 @@ class ComponentObjects:
if getattr(permissions, "update", None) != AccessLevel.ALL: if getattr(permissions, "update", None) != AccessLevel.ALL:
raise PermissionError("Setting global scope requires ALL permission") raise PermissionError("Setting global scope requires ALL permission")
self.db.recordModify(FileFolder, folderId, {"scope": scope}) allFolderIds = self._collectChildFolderIds(folderId)
for fid in allFolderIds:
self.db.recordModify(FileFolder, fid, {"scope": scope})
filesUpdated = 0 filesUpdated = 0
if cascadeToFiles: if cascadeToFiles:
items = self.db.getRecordset(FileItem, recordFilter={"folderId": folderId}) for fid in allFolderIds:
for item in items: items = self.db.getRecordset(FileItem, recordFilter={"folderId": fid})
owner = item.get("sysCreatedBy") if isinstance(item, dict) else getattr(item, "sysCreatedBy", None) for item in items:
if owner == self.userId: owner = item.get("sysCreatedBy") if isinstance(item, dict) else getattr(item, "sysCreatedBy", None)
iid = item.get("id") if isinstance(item, dict) else getattr(item, "id", None) if owner == self.userId:
self.db.recordModify(FileItem, iid, {"scope": scope}) iid = item.get("id") if isinstance(item, dict) else getattr(item, "id", None)
filesUpdated += 1 self.db.recordModify(FileItem, iid, {"scope": scope})
filesUpdated += 1
return {"folderId": folderId, "scope": scope, "filesUpdated": filesUpdated} return {"folderId": folderId, "scope": scope, "filesUpdated": filesUpdated}
@ -1294,16 +1297,19 @@ class ComponentObjects:
raise FileNotFoundError(f"Folder {folderId} not found") raise FileNotFoundError(f"Folder {folderId} not found")
self._requireFolderWriteAccess(folder, folderId, "update") self._requireFolderWriteAccess(folder, folderId, "update")
self.db.recordModify(FileFolder, folderId, {"neutralize": neutralize}) allFolderIds = self._collectChildFolderIds(folderId)
for fid in allFolderIds:
self.db.recordModify(FileFolder, fid, {"neutralize": neutralize})
items = self.db.getRecordset(FileItem, recordFilter={"folderId": folderId})
filesUpdated = 0 filesUpdated = 0
for item in items: for fid in allFolderIds:
owner = item.get("sysCreatedBy") if isinstance(item, dict) else getattr(item, "sysCreatedBy", None) items = self.db.getRecordset(FileItem, recordFilter={"folderId": fid})
if owner == self.userId: for item in items:
iid = item.get("id") if isinstance(item, dict) else getattr(item, "id", None) owner = item.get("sysCreatedBy") if isinstance(item, dict) else getattr(item, "sysCreatedBy", None)
self.db.recordModify(FileItem, iid, {"neutralize": neutralize}) if owner == self.userId:
filesUpdated += 1 iid = item.get("id") if isinstance(item, dict) else getattr(item, "id", None)
self.db.recordModify(FileItem, iid, {"neutralize": neutralize})
filesUpdated += 1
return {"folderId": folderId, "neutralize": neutralize, "filesUpdated": filesUpdated} return {"folderId": folderId, "neutralize": neutralize, "filesUpdated": filesUpdated}

View file

@ -69,7 +69,10 @@ class VoiceObjects:
sampleRate: int = None, channels: int = None, sampleRate: int = None, channels: int = None,
skipFallbacks: bool = False, skipFallbacks: bool = False,
phraseHints: list = None, phraseHints: list = None,
alternativeLanguages: list = None) -> Dict[str, Any]: alternativeLanguages: list = None,
model: str = "latest_long",
lightweight: bool = False,
audioFormat: Optional[str] = None) -> Dict[str, Any]:
""" """
Convert speech to text using Google Cloud Speech-to-Text API. Convert speech to text using Google Cloud Speech-to-Text API.
@ -81,6 +84,9 @@ class VoiceObjects:
skipFallbacks: If True, skip fallback attempts (use when audio format is known) skipFallbacks: If True, skip fallback attempts (use when audio format is known)
phraseHints: Optional list of phrases to boost recognition (names, terms) phraseHints: Optional list of phrases to boost recognition (names, terms)
alternativeLanguages: Optional list of additional language codes for multi-language alternativeLanguages: Optional list of additional language codes for multi-language
model: Google STT model (e.g. latest_long, latest_short)
lightweight: If True, omit word-level features and enhanced model
audioFormat: If set (webm_opus, linear16, ...), skip format auto-detection
Returns: Returns:
Dict containing transcribed text, confidence, and metadata Dict containing transcribed text, confidence, and metadata
@ -97,6 +103,9 @@ class VoiceObjects:
skipFallbacks=skipFallbacks, skipFallbacks=skipFallbacks,
phraseHints=phraseHints, phraseHints=phraseHints,
alternativeLanguages=alternativeLanguages, alternativeLanguages=alternativeLanguages,
model=model,
lightweight=lightweight,
audioFormat=audioFormat,
) )
if result["success"]: if result["success"]:
@ -120,13 +129,23 @@ class VoiceObjects:
audioQueue: asyncio.Queue, audioQueue: asyncio.Queue,
language: str = "de-DE", language: str = "de-DE",
phraseHints: Optional[list] = None, phraseHints: Optional[list] = None,
model: str = "latest_long",
lightweight: bool = False,
singleUtterance: bool = False,
) -> AsyncGenerator[Dict[str, Any], None]: ) -> AsyncGenerator[Dict[str, Any], None]:
""" """
Stream audio to Google Streaming STT and yield interim/final results. Stream audio to Google Streaming STT and yield interim/final results.
Billing is recorded for each final result. Billing is recorded for each final result.
""" """
connector = self._getGoogleSpeechConnector() connector = self._getGoogleSpeechConnector()
async for event in connector.streamingRecognize(audioQueue, language, phraseHints): async for event in connector.streamingRecognize(
audioQueue,
language,
phraseHints,
model=model,
lightweight=lightweight,
singleUtterance=singleUtterance,
):
if event.get("isFinal") and self.billingCallback: if event.get("isFinal") and self.billingCallback:
durationSec = event.get("audioDurationSec", 0) durationSec = event.get("audioDurationSec", 0)
priceCHF = connector.calculateSttCostCHF(durationSec) priceCHF = connector.calculateSttCostCHF(durationSec)

View file

@ -1986,10 +1986,10 @@ def getUserViewTransactions(
if not pagination: if not pagination:
raise HTTPException(status_code=400, detail="pagination required for groupSummary") raise HTTPException(status_code=400, detail="pagination required for groupSummary")
import json as _json import json as _json
from collections import defaultdict
from modules.interfaces.interfaceDbApp import getInterface as getAppInterface from modules.interfaces.interfaceDbApp import getInterface as getAppInterface
from modules.routes.routeHelpers import ( from modules.routes.routeHelpers import (
applyViewToParams, applyViewToParams,
build_group_summary_groups,
effective_group_by_levels, effective_group_by_levels,
resolveView, resolveView,
) )
@ -2018,28 +2018,7 @@ def getUserViewTransactions(
summary_params, summary_params,
ctx.user, ctx.user,
) )
counts: Dict[str, int] = defaultdict(int) groups_out = build_group_summary_groups(all_rows, field, null_label, groupByLevels=levels)
labels: Dict[str, str] = {}
null_key = "\x00NULL"
for item in all_rows:
raw = item.get(field)
if raw is None or raw == "":
nk = null_key
labels[nk] = null_label
else:
nk = str(raw)
if nk not in labels:
labels[nk] = nk
counts[nk] += 1
groups_out: List[Dict[str, Any]] = []
for nk in sorted(counts.keys(), key=lambda x: (x == null_key, labels.get(x, x).lower())):
groups_out.append(
{
"value": None if nk == null_key else nk,
"label": labels.get(nk, nk),
"totalCount": counts[nk],
}
)
return JSONResponse(content={"groups": groups_out}) return JSONResponse(content={"groups": groups_out})
paginationParams = None paginationParams = None

View file

@ -130,7 +130,7 @@ def get_auth_authority_options(
# ============================================================================ # ============================================================================
@router.get("/") @router.get("/")
@limiter.limit("30/minute") @limiter.limit("60/minute")
async def get_connections( async def get_connections(
request: Request, request: Request,
pagination: Optional[str] = Query(None, description="JSON-encoded PaginationParams object"), pagination: Optional[str] = Query(None, description="JSON-encoded PaginationParams object"),
@ -197,7 +197,9 @@ async def get_connections(
"lastChecked": connection.lastChecked, "lastChecked": connection.lastChecked,
"expiresAt": connection.expiresAt, "expiresAt": connection.expiresAt,
"tokenStatus": tokenStatus, "tokenStatus": tokenStatus,
"tokenExpiresAt": tokenExpiresAt "tokenExpiresAt": tokenExpiresAt,
"knowledgeIngestionEnabled": getattr(connection, "knowledgeIngestionEnabled", False),
"knowledgePreferences": getattr(connection, "knowledgePreferences", None) or {},
}) })
return items return items
@ -264,7 +266,7 @@ async def get_connections(
}) })
enrichRowsWithFkLabels(enhanced_connections_dict, UserConnection) enrichRowsWithFkLabels(enhanced_connections_dict, UserConnection)
filtered = apply_strategy_b_filters_and_sort(enhanced_connections_dict, paginationParams, currentUser) filtered = apply_strategy_b_filters_and_sort(enhanced_connections_dict, paginationParams, currentUser)
groups_out = build_group_summary_groups(filtered, field, null_label) groups_out = build_group_summary_groups(filtered, field, null_label, groupByLevels=groupByLevels)
return JSONResponse(content={"groups": groups_out}) return JSONResponse(content={"groups": groups_out})
try: try:
@ -725,3 +727,171 @@ def delete_connection(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Failed to delete connection: {str(e)}" detail=f"Failed to delete connection: {str(e)}"
) )
# =========================================================================
# Knowledge Consent & Control Endpoints
# =========================================================================
def _findOwnConnection(interface, userId: str, connectionId: str):
"""Find a connection owned by the user. Returns None if not found."""
connections = interface.getUserConnections(userId)
for conn in connections:
if conn.id == connectionId:
return conn
return None
@router.patch("/{connectionId}/knowledge-consent")
@limiter.limit("10/minute")
def _updateKnowledgeConsent(
request: Request,
connectionId: str = Path(..., description="Connection ID"),
enabled: bool = Body(..., embed=True),
currentUser: User = Depends(getCurrentUser),
) -> Dict[str, Any]:
"""Master switch: can PowerOn ingest data from this connection into the RAG knowledge store?
enabled=False: purge ALL chunks for this connection + cancel running jobs.
enabled=True: set flag; enqueue bootstrap only if rag-enabled DataSources exist.
"""
try:
interface = getInterface(currentUser)
connection = _findOwnConnection(interface, currentUser.id, connectionId)
if not connection:
raise HTTPException(status_code=404, detail=routeApiMsg("Connection not found"))
from modules.interfaces.interfaceDbApp import getRootInterface
rootIf = getRootInterface()
rootIf.db.recordModify(UserConnection, connectionId, {"knowledgeIngestionEnabled": enabled})
purged = None
cancelled = 0
bootstrapEnqueued = False
if not enabled:
from modules.interfaces.interfaceDbKnowledge import getInterface as getKnowledgeInterface
purged = getKnowledgeInterface(None).deleteFileContentIndexByConnectionId(connectionId)
from modules.serviceCenter.services.serviceBackgroundJobs import cancelJobsByConnection
cancelled = cancelJobsByConnection(connectionId)
else:
from modules.datamodels.datamodelDataSource import DataSource
dataSources = rootIf.db.getRecordset(DataSource, recordFilter={"connectionId": connectionId, "ragIndexEnabled": True})
if dataSources:
import asyncio
from modules.serviceCenter.services.serviceBackgroundJobs import startJob
authority = connection.authority.value if hasattr(connection.authority, "value") else str(connection.authority or "")
async def _enqueue():
await startJob(
"connection.bootstrap",
{"connectionId": connectionId, "authority": authority.lower()},
triggeredBy=str(currentUser.id),
)
try:
loop = asyncio.get_event_loop()
if loop.is_running():
loop.create_task(_enqueue())
else:
loop.run_until_complete(_enqueue())
except RuntimeError:
asyncio.run(_enqueue())
bootstrapEnqueued = True
import json as _json
from modules.shared.auditLogger import audit_logger
from modules.datamodels.datamodelAudit import AuditCategory
audit_logger.logEvent(
userId=str(currentUser.id),
mandateId=str(getattr(connection, "mandateId", "") or ""),
category=AuditCategory.PERMISSION.value,
action="knowledge_consent_changed",
details=_json.dumps({"connectionId": connectionId, "enabled": enabled}),
)
logger.info("Knowledge consent %s for connection %s by user %s",
"enabled" if enabled else "disabled", connectionId, currentUser.id)
return {
"connectionId": connectionId,
"knowledgeIngestionEnabled": enabled,
"purged": purged,
"cancelledJobs": cancelled,
"bootstrapEnqueued": bootstrapEnqueued,
}
except HTTPException:
raise
except Exception as e:
logger.error("Error updating knowledge consent: %s", e, exc_info=True)
raise HTTPException(status_code=500, detail=str(e))
@router.patch("/{connectionId}/knowledge-preferences")
@limiter.limit("20/minute")
def _updateKnowledgePreferences(
request: Request,
connectionId: str = Path(..., description="Connection ID"),
preferences: Dict[str, Any] = Body(..., embed=True),
currentUser: User = Depends(getCurrentUser),
) -> Dict[str, Any]:
"""Update per-connection knowledge ingestion preferences (mail depth, attachments, etc.)."""
_ALLOWED_KEYS = {"mailContentDepth", "mailIndexAttachments", "filesIndexBinaries",
"clickupScope", "clickupIndexAttachments", "maxAgeDays"}
try:
interface = getInterface(currentUser)
connection = _findOwnConnection(interface, currentUser.id, connectionId)
if not connection:
raise HTTPException(status_code=404, detail=routeApiMsg("Connection not found"))
existing = getattr(connection, "knowledgePreferences", None) or {}
cleaned = {k: v for k, v in preferences.items() if k in _ALLOWED_KEYS}
merged = {**existing, **cleaned, "schemaVersion": 1}
from modules.interfaces.interfaceDbApp import getRootInterface
getRootInterface().db.recordModify(UserConnection, connectionId, {"knowledgePreferences": merged})
logger.info("Knowledge preferences updated for connection %s", connectionId)
return {"connectionId": connectionId, "knowledgePreferences": merged, "updated": True}
except HTTPException:
raise
except Exception as e:
logger.error("Error updating knowledge preferences: %s", e, exc_info=True)
raise HTTPException(status_code=500, detail=str(e))
@router.post("/{connectionId}/knowledge-stop")
@limiter.limit("10/minute")
def _stopKnowledgeJobs(
request: Request,
connectionId: str = Path(..., description="Connection ID"),
currentUser: User = Depends(getCurrentUser),
) -> Dict[str, Any]:
"""Cancel all running/pending bootstrap jobs for this connection."""
try:
interface = getInterface(currentUser)
connection = _findOwnConnection(interface, currentUser.id, connectionId)
if not connection:
raise HTTPException(status_code=404, detail=routeApiMsg("Connection not found"))
from modules.serviceCenter.services.serviceBackgroundJobs import cancelJobsByConnection
cancelled = cancelJobsByConnection(connectionId)
import json as _json
from modules.shared.auditLogger import audit_logger
from modules.datamodels.datamodelAudit import AuditCategory
audit_logger.logEvent(
userId=str(currentUser.id),
mandateId=str(getattr(connection, "mandateId", "") or ""),
category=AuditCategory.PERMISSION.value,
action="knowledge_jobs_stopped",
details=_json.dumps({"connectionId": connectionId, "cancelledCount": cancelled}),
)
logger.info("Stopped %d knowledge jobs for connection %s", cancelled, connectionId)
return {"connectionId": connectionId, "cancelled": cancelled}
except HTTPException:
raise
except Exception as e:
logger.error("Error stopping knowledge jobs: %s", e, exc_info=True)
raise HTTPException(status_code=500, detail=str(e))

View file

@ -413,7 +413,7 @@ def patch_folder_scope(
scope = body.get("scope") scope = body.get("scope")
if not scope: if not scope:
raise HTTPException(status_code=400, detail="scope is required") raise HTTPException(status_code=400, detail="scope is required")
cascadeToFiles = body.get("cascadeToFiles", False) cascadeToFiles = body.get("cascadeChildren", body.get("cascadeToFiles", False))
managementInterface = interfaceDbManagement.getInterface( managementInterface = interfaceDbManagement.getInterface(
currentUser, currentUser,
mandateId=str(context.mandateId) if context.mandateId else None, mandateId=str(context.mandateId) if context.mandateId else None,
@ -543,7 +543,7 @@ def get_files(
FileItem, FileItem,
) )
filtered = apply_strategy_b_filters_and_sort(allItems, paginationParams, currentUser) filtered = apply_strategy_b_filters_and_sort(allItems, paginationParams, currentUser)
groups_out = build_group_summary_groups(filtered, field, null_label) groups_out = build_group_summary_groups(filtered, field, null_label, groupByLevels=groupByLevels)
return JSONResponse(content={"groups": groups_out}) return JSONResponse(content={"groups": groups_out})
if mode == "filterValues": if mode == "filterValues":

View file

@ -100,7 +100,7 @@ def get_prompts(
result if isinstance(result, list) else (result.items if hasattr(result, "items") else []) result if isinstance(result, list) else (result.items if hasattr(result, "items") else [])
) )
filtered = apply_strategy_b_filters_and_sort(allItems, paginationParams, currentUser) filtered = apply_strategy_b_filters_and_sort(allItems, paginationParams, currentUser)
groups_out = build_group_summary_groups(filtered, field, null_label) groups_out = build_group_summary_groups(filtered, field, null_label, groupByLevels=groupByLevels)
return JSONResponse(content={"groups": groups_out}) return JSONResponse(content={"groups": groups_out})
if mode == "filterValues": if mode == "filterValues":

View file

@ -1,6 +1,6 @@
# Copyright (c) 2025 Patrick Motsch # Copyright (c) 2025 Patrick Motsch
# All rights reserved. # All rights reserved.
"""PATCH endpoints for DataSource and FeatureDataSource scope/neutralize tagging.""" """PATCH endpoints for DataSource and FeatureDataSource scope/neutralize/rag-index tagging."""
import logging import logging
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
@ -125,3 +125,75 @@ def _updateNeutralizeFields(
except Exception as e: except Exception as e:
logger.error("Error updating neutralizeFields: %s", e) logger.error("Error updating neutralizeFields: %s", e)
raise HTTPException(status_code=500, detail=str(e)) raise HTTPException(status_code=500, detail=str(e))
@router.patch("/{sourceId}/rag-index")
@limiter.limit("30/minute")
def _updateDataSourceRagIndex(
request: Request,
sourceId: str = Path(..., description="ID of the DataSource"),
ragIndexEnabled: bool = Body(..., embed=True),
context: RequestContext = Depends(getRequestContext),
) -> Dict[str, Any]:
"""Toggle RAG indexing for a DataSource.
true: sets flag + enqueues mini-bootstrap for this DataSource only.
false: sets flag + synchronously purges all chunks from this DataSource.
"""
try:
from modules.interfaces.interfaceDbApp import getRootInterface
rootIf = getRootInterface()
rec = rootIf.db.getRecord(DataSource, sourceId)
if not rec:
raise HTTPException(status_code=404, detail=f"DataSource {sourceId} not found")
rootIf.db.recordModify(DataSource, sourceId, {"ragIndexEnabled": ragIndexEnabled})
logger.info("Updated ragIndexEnabled=%s for DataSource %s", ragIndexEnabled, sourceId)
if ragIndexEnabled:
from modules.serviceCenter.services.serviceBackgroundJobs import startJob
import asyncio
connectionId = rec.get("connectionId") or rec.get("connection_id") or ""
conn = rootIf.getUserConnectionById(connectionId) if connectionId else None
authority = ""
if conn:
authority = conn.authority.value if hasattr(conn.authority, "value") else str(conn.authority or "")
async def _enqueue():
await startJob(
"connection.bootstrap",
{"connectionId": connectionId, "authority": authority.lower(), "dataSourceIds": [sourceId]},
triggeredBy=str(context.user.id),
)
try:
loop = asyncio.get_event_loop()
if loop.is_running():
loop.create_task(_enqueue())
else:
loop.run_until_complete(_enqueue())
except RuntimeError:
asyncio.run(_enqueue())
else:
from modules.interfaces.interfaceDbKnowledge import getInterface as getKnowledgeInterface
purgeResult = getKnowledgeInterface(None).deleteFileContentIndexByDataSource(sourceId)
logger.info("Purged %d index rows / %d chunks for DataSource %s",
purgeResult.get("indexRows", 0), purgeResult.get("chunks", 0), sourceId)
import json
from modules.shared.auditLogger import audit_logger
from modules.datamodels.datamodelAudit import AuditCategory
audit_logger.logEvent(
userId=str(context.user.id),
mandateId=context.mandateId,
category=AuditCategory.PERMISSION.value,
action="rag_index_toggled",
details=json.dumps({"sourceId": sourceId, "ragIndexEnabled": ragIndexEnabled}),
)
return {"sourceId": sourceId, "ragIndexEnabled": ragIndexEnabled, "updated": True}
except HTTPException:
raise
except Exception as e:
logger.error("Error updating datasource ragIndexEnabled: %s", e)
raise HTTPException(status_code=500, detail=str(e))

View file

@ -825,45 +825,106 @@ def build_group_summary_groups(
items: List[Dict[str, Any]], items: List[Dict[str, Any]],
field: str, field: str,
null_label: str = "", null_label: str = "",
groupByLevels: List[Dict[str, Any]] | None = None,
) -> List[Dict[str, Any]]: ) -> List[Dict[str, Any]]:
""" """
Build {"value", "label", "totalCount"} for mode=groupSummary (single grouping level). Build {"value", "label", "totalCount"} summaries for mode=groupSummary.
When *groupByLevels* contains more than one level the function produces one
entry per unique combination of all level values (flat permutations).
``value`` becomes a ``///``-joined composite key and ``label`` the ``/``-joined
human-readable label so the frontend can split them back.
""" """
from collections import defaultdict from collections import defaultdict
counts: Dict[str, int] = defaultdict(int) fields: list[dict] = []
display_by_key: Dict[str, str] = {} if groupByLevels and len(groupByLevels) > 1:
null_key = "\x00NULL" for lvl in groupByLevels:
label_attr = f"{field}Label" f = lvl.get("field", "")
nl = str(lvl.get("nullLabel") or null_label)
if f:
fields.append({"field": f, "nullLabel": nl})
if not fields:
fields = [{"field": field, "nullLabel": null_label}]
nullKey = "\x00NULL"
if len(fields) == 1:
f = fields[0]["field"]
nl = fields[0]["nullLabel"]
counts: Dict[str, int] = defaultdict(int)
displayByKey: Dict[str, str] = {}
labelAttr = f"{f}Label"
for item in items:
raw = item.get(f)
if raw is None or raw == "":
nk = nullKey
display = nl
else:
nk = str(raw)
display = None
lbl = item.get(labelAttr)
if lbl is not None and lbl != "":
display = str(lbl)
if display is None:
display = nk
counts[nk] += 1
if nk not in displayByKey:
displayByKey[nk] = display
orderedKeys = sorted(
counts.keys(),
key=lambda x: (x == nullKey, str(displayByKey.get(x, x)).lower()),
)
return [
{
"value": None if nk == nullKey else nk,
"label": displayByKey.get(nk, nk),
"totalCount": counts[nk],
}
for nk in orderedKeys
]
counts = defaultdict(int)
displayByComposite: Dict[str, list] = {}
filtersByComposite: Dict[str, dict] = {}
for item in items: for item in items:
raw = item.get(field) parts: list[str] = []
if raw is None or raw == "": labels: list[str] = []
nk = null_key filterMap: dict = {}
display = null_label for fd in fields:
else: f = fd["field"]
nk = str(raw) nl = fd["nullLabel"]
display = None labelAttr = f"{f}Label"
lbl = item.get(label_attr) raw = item.get(f)
if lbl is not None and lbl != "": if raw is None or raw == "":
display = str(lbl) parts.append(nullKey)
if display is None: labels.append(nl)
display = nk filterMap[f] = None
counts[nk] += 1 else:
if nk not in display_by_key: parts.append(str(raw))
display_by_key[nk] = display lbl = item.get(labelAttr)
labels.append(str(lbl) if lbl not in (None, "") else str(raw))
filterMap[f] = str(raw)
compositeKey = "///".join(parts)
counts[compositeKey] += 1
if compositeKey not in displayByComposite:
displayByComposite[compositeKey] = labels
filtersByComposite[compositeKey] = filterMap
ordered_keys = sorted( orderedKeys = sorted(
counts.keys(), counts.keys(),
key=lambda x: (x == null_key, str(display_by_key.get(x, x)).lower()), key=lambda x: tuple(
(seg == nullKey, seg.lower()) for seg in x.split("///")
),
) )
return [ return [
{ {
"value": None if nk == null_key else nk, "value": ck.replace(nullKey, "__null__") if nullKey in ck else ck,
"label": display_by_key.get(nk, nk), "label": " / ".join(displayByComposite[ck]),
"totalCount": counts[nk], "totalCount": counts[ck],
"filters": filtersByComposite[ck],
} }
for nk in ordered_keys for ck in orderedKeys
] ]

View file

@ -0,0 +1,277 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""RAG Inventory API — global knowledge-store visibility for users, admins, platform."""
import logging
from typing import Any, Dict, List, Optional
from fastapi import APIRouter, HTTPException, Depends, Request
from modules.auth import limiter, getCurrentUser, getRequestContext, RequestContext
from modules.datamodels.datamodelUam import User
from modules.shared.i18nRegistry import apiRouteContext
# i18n message helper scoped to this route module.
routeApiMsg = apiRouteContext("routeRagInventory")
logger = logging.getLogger(__name__)
# All endpoints live under /api/rag/inventory; the shared error responses are
# declared once here so they show up consistently in the OpenAPI docs.
router = APIRouter(
    prefix="/api/rag/inventory",
    tags=["RAG Inventory"],
    responses={
        401: {"description": "Unauthorized"},
        403: {"description": "Forbidden"},
        500: {"description": "Internal server error"},
    },
)
def _buildConnectionInventory(connections, rootIf, knowledgeIf, jobService) -> List[Dict[str, Any]]:
    """Build one RAG-inventory entry per connection.

    Each entry contains the connection's DataSources (with chunk counts derived
    from FileContentIndex provenance), the total chunk count, currently running
    bootstrap jobs and the most recent bootstrap error, if any.

    Args:
        connections: iterable of connection records (model objects).
        rootIf: root DB interface used for DataSource recordsets.
        knowledgeIf: knowledge DB interface used for FileContentIndex recordsets.
        jobService: background-job service exposing ``listJobs``.

    Returns:
        List of inventory dicts, one per connection.
    """
    from modules.datamodels.datamodelDataSource import DataSource
    from modules.datamodels.datamodelKnowledge import FileContentIndex

    def _field(rec, name, default=None):
        # Recordsets may yield plain dicts or model objects; read either shape.
        return rec.get(name, default) if isinstance(rec, dict) else getattr(rec, name, default)

    # The job query does not depend on the connection, so fetch it once
    # instead of once per connection (it was loop-invariant before).
    recentJobs = jobService.listJobs(jobType="connection.bootstrap", limit=5)
    out = []
    for conn in connections:
        connectionId = str(conn.id)
        dataSources = rootIf.db.getRecordset(DataSource, recordFilter={"connectionId": connectionId})
        connIndexRows = knowledgeIf.db.getRecordset(FileContentIndex, recordFilter={"connectionId": connectionId})
        connChunkTotal = len(connIndexRows)
        chunksByDs: Dict[str, int] = {}
        unassigned = 0
        for idx in connIndexRows:
            prov = _field(idx, "provenance") or {}
            dsIdRef = prov.get("dataSourceId", "") if isinstance(prov, dict) else ""
            if dsIdRef:
                chunksByDs[dsIdRef] = chunksByDs.get(dsIdRef, 0) + 1
            else:
                unassigned += 1
        dsItems = []
        for ds in dataSources:
            dsId = _field(ds, "id", "")
            dsItems.append({
                "id": dsId,
                "label": _field(ds, "label", ""),
                "path": _field(ds, "path", ""),
                "sourceType": _field(ds, "sourceType", ""),
                "ragIndexEnabled": _field(ds, "ragIndexEnabled", False),
                "neutralize": _field(ds, "neutralize", False),
                "lastIndexed": _field(ds, "lastIndexed", None),
                "chunkCount": chunksByDs.get(dsId, 0),
            })
        # Chunks whose provenance carries no dataSourceId: attribute them to
        # the only DataSource when that is unambiguous.
        if unassigned > 0 and len(dsItems) == 1:
            dsItems[0]["chunkCount"] += unassigned
        connJobs = [j for j in recentJobs if (j.get("payload") or {}).get("connectionId") == connectionId]
        runningJobs = [
            {"jobId": j["id"], "progress": j.get("progress", 0), "progressMessage": j.get("progressMessage", "")}
            for j in connJobs
            if j.get("status") in ("PENDING", "RUNNING")
        ]
        # First ERROR job in list order (presumably newest-first — TODO confirm
        # the ordering contract of jobService.listJobs).
        lastError = next(
            (
                {"jobId": j["id"], "errorMessage": j.get("errorMessage", "")}
                for j in connJobs
                if j.get("status") == "ERROR"
            ),
            None,
        )
        out.append({
            "id": connectionId,
            "authority": conn.authority.value if hasattr(conn.authority, "value") else str(conn.authority),
            "externalEmail": getattr(conn, "externalEmail", ""),
            "knowledgeIngestionEnabled": getattr(conn, "knowledgeIngestionEnabled", False),
            "preferences": getattr(conn, "knowledgePreferences", None) or {},
            "dataSources": dsItems,
            "totalChunks": connChunkTotal,
            "runningJobs": runningJobs,
            "lastError": lastError,
        })
    return out
@router.get("/me")
@limiter.limit("30/minute")
def _getInventoryMe(
    request: Request,
    currentUser: User = Depends(getCurrentUser),
) -> Dict[str, Any]:
    """Return the caller's personal RAG inventory.

    Lists the user's own connections with their DataSources and chunk counts,
    plus an overall chunk total.
    """
    try:
        from modules.interfaces.interfaceDbApp import getRootInterface
        from modules.interfaces.interfaceDbKnowledge import getInterface as getKnowledgeInterface
        from modules.serviceCenter.services.serviceBackgroundJobs import mainBackgroundJobService as jobService
        rootInterface = getRootInterface()
        knowledgeInterface = getKnowledgeInterface(None)
        ownConnections = rootInterface.getUserConnections(currentUser.id)
        connectionItems = _buildConnectionInventory(ownConnections, rootInterface, knowledgeInterface, jobService)
        chunkTotal = sum(entry.get("totalChunks", 0) for entry in connectionItems)
        return {"connections": connectionItems, "totals": {"chunks": chunkTotal}}
    except Exception as e:
        logger.error("Error in RAG inventory /me: %s", e, exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/mandate")
@limiter.limit("20/minute")
def _getInventoryMandate(
    request: Request,
    context: RequestContext = Depends(getRequestContext),
) -> Dict[str, Any]:
    """Aggregate RAG inventory across the caller's mandate.

    Requires a mandate context (403 otherwise); includes total chunk and byte
    counts for the mandate.
    """
    if not context.mandateId:
        raise HTTPException(status_code=403, detail=routeApiMsg("Mandate context required"))
    try:
        from modules.interfaces.interfaceDbApp import getRootInterface
        from modules.interfaces.interfaceDbKnowledge import getInterface as getKnowledgeInterface, aggregateMandateRagTotalBytes
        from modules.serviceCenter.services.serviceBackgroundJobs import mainBackgroundJobService as jobService
        rootInterface = getRootInterface()
        knowledgeInterface = getKnowledgeInterface(None)
        mandateId = str(context.mandateId) if context.mandateId else ""
        from modules.datamodels.datamodelUam import UserConnection
        mandateRows = rootInterface.db.getRecordset(UserConnection, recordFilter={"mandateId": mandateId})
        # Dict rows are wrapped into lightweight attribute holders so the
        # inventory builder can read them like model objects.
        connectionObjects = []
        for row in mandateRows:
            connectionObjects.append(type("C", (), row)() if isinstance(row, dict) else row)
        connectionItems = _buildConnectionInventory(connectionObjects, rootInterface, knowledgeInterface, jobService)
        chunkTotal = sum(entry.get("totalChunks", 0) for entry in connectionItems)
        byteTotal = aggregateMandateRagTotalBytes(mandateId)
        return {"connections": connectionItems, "totals": {"chunks": chunkTotal, "bytes": byteTotal}}
    except HTTPException:
        raise
    except Exception as e:
        logger.error("Error in RAG inventory /mandate: %s", e, exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/platform")
@limiter.limit("10/minute")
def _getInventoryPlatform(
    request: Request,
    context: RequestContext = Depends(getRequestContext),
) -> Dict[str, Any]:
    """Return platform-wide RAG statistics; restricted to sysadmins (403 otherwise)."""
    if not context.isSysAdmin:
        raise HTTPException(status_code=403, detail=routeApiMsg("Platform admin required"))
    try:
        from modules.interfaces.interfaceDbApp import getRootInterface
        from modules.interfaces.interfaceDbKnowledge import getInterface as getKnowledgeInterface
        from modules.serviceCenter.services.serviceBackgroundJobs import mainBackgroundJobService as jobService
        from modules.datamodels.datamodelUam import UserConnection
        rootInterface = getRootInterface()
        knowledgeInterface = getKnowledgeInterface(None)
        # Unfiltered recordset: every connection on the platform.
        allRows = rootInterface.db.getRecordset(UserConnection)
        connectionObjects = []
        for row in allRows:
            connectionObjects.append(type("C", (), row)() if isinstance(row, dict) else row)
        connectionItems = _buildConnectionInventory(connectionObjects, rootInterface, knowledgeInterface, jobService)
        chunkTotal = sum(entry.get("totalChunks", 0) for entry in connectionItems)
        return {"connections": connectionItems, "totals": {"chunks": chunkTotal}}
    except HTTPException:
        raise
    except Exception as e:
        logger.error("Error in RAG inventory /platform: %s", e, exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
@router.post("/reindex/{connectionId}")
@limiter.limit("10/minute")
def _reindexConnection(
    request: Request,
    connectionId: str,
    currentUser: User = Depends(getCurrentUser),
) -> Dict[str, Any]:
    """Re-trigger bootstrap for a connection (re-index all ragIndexEnabled DataSources).

    Submits a new connection.bootstrap job, regardless of previous failures.

    Raises:
        HTTPException: 404 if the connection does not exist, 403 if it is not
            owned by the caller, 500 on unexpected errors.
    """
    try:
        from modules.interfaces.interfaceDbApp import getRootInterface
        from modules.serviceCenter.services.serviceBackgroundJobs import startJob
        from modules.datamodels.datamodelDataSource import DataSource
        import asyncio
        rootIf = getRootInterface()
        conn = rootIf.getUserConnectionById(connectionId)
        if conn is None:
            raise HTTPException(status_code=404, detail="Connection not found")
        if str(conn.userId) != str(currentUser.id):
            raise HTTPException(status_code=403, detail="Not your connection")
        dataSources = rootIf.db.getRecordset(DataSource, recordFilter={"connectionId": connectionId})
        ragDs = [ds for ds in dataSources if (ds.get("ragIndexEnabled") if isinstance(ds, dict) else getattr(ds, "ragIndexEnabled", False))]
        if not ragDs:
            # Nothing opted in to RAG indexing — not an error, just a no-op.
            return {"status": "skipped", "reason": "no_rag_enabled_datasources"}
        authority = conn.authority.value if hasattr(conn.authority, "value") else str(conn.authority or "")
        dsIds = [(ds.get("id") if isinstance(ds, dict) else getattr(ds, "id", "")) for ds in ragDs]

        async def _enqueue():
            return await startJob(
                "connection.bootstrap",
                {"connectionId": connectionId, "authority": authority.lower(), "dataSourceIds": dsIds},
                triggeredBy=str(currentUser.id),
            )

        # Sync route handlers run in a threadpool, so normally there is no
        # running loop here and we can wait for the job id. If a loop IS
        # running we must not block it: schedule fire-and-forget and report
        # jobId=None. (asyncio.get_event_loop() is deprecated when no loop is
        # running, hence get_running_loop()/asyncio.run; the previously unused
        # `future` binding is gone.)
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            jobId = asyncio.run(_enqueue())
        else:
            asyncio.ensure_future(_enqueue())
            jobId = None
        logger.info("Reindex triggered for connection %s (%d DataSources)", connectionId, len(dsIds))
        return {"status": "queued", "connectionId": connectionId, "dataSourceCount": len(dsIds), "jobId": jobId}
    except HTTPException:
        raise
    except Exception as e:
        logger.error("Error triggering reindex: %s", e, exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/jobs")
@limiter.limit("60/minute")
def _getActiveJobs(
    request: Request,
    currentUser: User = Depends(getCurrentUser),
) -> List[Dict[str, Any]]:
    """Active RAG jobs for the current user (used by header badge)."""
    try:
        from modules.serviceCenter.services.serviceBackgroundJobs import listJobs
        from modules.interfaces.interfaceDbApp import getRootInterface
        rootInterface = getRootInterface()
        ownConnections = rootInterface.getUserConnections(currentUser.id)
        connectionById = {str(c.id): c for c in ownConnections}
        activeJobs: List[Dict[str, Any]] = []
        for job in listJobs(jobType="connection.bootstrap", limit=50):
            if job.get("status") not in ("PENDING", "RUNNING"):
                continue
            connId = (job.get("payload") or {}).get("connectionId")
            conn = connectionById.get(connId)
            if conn is None:
                # Job belongs to someone else's connection — not ours to show.
                continue
            activeJobs.append({
                "jobId": job["id"],
                "connectionId": connId,
                "connectionLabel": getattr(conn, "displayLabel", None) or getattr(conn, "authority", connId),
                "jobType": job.get("jobType", "connection.bootstrap"),
                "progress": job.get("progress", 0),
                "progressMessage": job.get("progressMessage", ""),
            })
        return activeJobs
    except Exception as e:
        logger.error("Error in RAG inventory /jobs: %s", e, exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))

View file

@ -155,12 +155,13 @@ async def sttStream(
Protocol: Protocol:
Client sends JSON: Client sends JSON:
{"type": "open", "language": "de-DE"} {"type": "open", "language": "de-DE", "model": "latest_short", "lightweight": true, "singleUtterance": true}
{"type": "audio", "chunk": "<base64>"} {"type": "audio", "chunk": "<base64>"}
{"type": "close"} {"type": "close"}
Server sends JSON: Server sends JSON:
{"type": "interim", "text": "..."} {"type": "interim", "text": "..."}
{"type": "final", "text": "...", "confidence": 0.95} {"type": "final", "text": "...", "confidence": 0.95}
{"type": "end_of_single_utterance", "audioDurationSec": 0.0}
{"type": "error", "message": "..."} {"type": "error", "message": "..."}
{"type": "closed"} {"type": "closed"}
""" """
@ -205,7 +206,12 @@ async def sttStream(
logger.warning(f"STT billing pre-flight skipped: {e}") logger.warning(f"STT billing pre-flight skipped: {e}")
audioQueue: asyncio.Queue = asyncio.Queue() audioQueue: asyncio.Queue = asyncio.Queue()
language = "de-DE" sttOpenOptions: Dict[str, Any] = {
"language": "de-DE",
"model": "latest_long",
"lightweight": False,
"singleUtterance": False,
}
streamingTask: Optional[asyncio.Task] = None streamingTask: Optional[asyncio.Task] = None
voiceInterface: Optional[VoiceObjects] = None voiceInterface: Optional[VoiceObjects] = None
@ -233,10 +239,23 @@ async def sttStream(
voiceInterface.billingCallback = _billingCb voiceInterface.billingCallback = _billingCb
try: try:
async for event in voiceInterface.streamingSpeechToText(audioQueue, language): async for event in voiceInterface.streamingSpeechToText(
audioQueue,
sttOpenOptions["language"],
phraseHints=None,
model=sttOpenOptions["model"],
lightweight=sttOpenOptions["lightweight"],
singleUtterance=sttOpenOptions["singleUtterance"],
):
if event.get("reconnectRequired"): if event.get("reconnectRequired"):
await _sendJson({"type": "reconnect_required"}) await _sendJson({"type": "reconnect_required"})
return return
if event.get("endOfSingleUtterance"):
await _sendJson({
"type": "end_of_single_utterance",
"audioDurationSec": event.get("audioDurationSec", 0.0),
})
continue
if event.get("isFinal"): if event.get("isFinal"):
if event.get("transcript"): if event.get("transcript"):
await _sendJson({"type": "final", "text": event["transcript"], "confidence": event.get("confidence", 0.0)}) await _sendJson({"type": "final", "text": event["transcript"], "confidence": event.get("confidence", 0.0)})
@ -258,7 +277,10 @@ async def sttStream(
msgType = (msg.get("type") or "").strip() msgType = (msg.get("type") or "").strip()
if msgType == "open": if msgType == "open":
language = msg.get("language") or "de-DE" sttOpenOptions["language"] = msg.get("language") or "de-DE"
sttOpenOptions["model"] = msg.get("model") or "latest_long"
sttOpenOptions["lightweight"] = bool(msg.get("lightweight"))
sttOpenOptions["singleUtterance"] = bool(msg.get("singleUtterance"))
if streamingTask and not streamingTask.done(): if streamingTask and not streamingTask.done():
await audioQueue.put((b"", True)) await audioQueue.put((b"", True))
streamingTask.cancel() streamingTask.cancel()

View file

@ -335,9 +335,14 @@ async def runAgentLoop(
# Execute tool calls # Execute tool calls
for tc in toolCalls: for tc in toolCalls:
toolDef = toolRegistry.getTool(tc.name)
yield AgentEvent( yield AgentEvent(
type=AgentEventTypeEnum.TOOL_CALL, type=AgentEventTypeEnum.TOOL_CALL,
data={"toolName": tc.name, "args": tc.args} data={
"toolName": tc.name,
"displayLabel": toolDef.displayLabel if toolDef else None,
"args": tc.args,
}
) )
results = await _executeToolCalls(toolCalls, toolRegistry, { results = await _executeToolCalls(toolCalls, toolRegistry, {

View file

@ -184,4 +184,5 @@ def _registerConnectionTools(registry: ToolRegistry, services):
"required": ["connectionId", "to", "subject", "body"], "required": ["connectionId", "to", "subject", "body"],
}, },
readOnly=False, readOnly=False,
displayLabel="composing an email",
) )

View file

@ -297,6 +297,7 @@ def _registerMediaTools(registry: ToolRegistry, services):
}, },
}, },
readOnly=False, readOnly=False,
displayLabel="creating a document",
) )
# ── textToSpeech tool ────────────────────────────────────────────── # ── textToSpeech tool ──────────────────────────────────────────────
@ -573,6 +574,7 @@ def _registerMediaTools(registry: ToolRegistry, services):
"required": ["prompt"], "required": ["prompt"],
}, },
readOnly=False, readOnly=False,
displayLabel="generating an image",
) )
# ── createChart tool ───────────────────────────────────────────────── # ── createChart tool ─────────────────────────────────────────────────
@ -770,6 +772,7 @@ def _registerMediaTools(registry: ToolRegistry, services):
"required": ["datasets"], "required": ["datasets"],
}, },
readOnly=False, readOnly=False,
displayLabel="creating a chart",
) )
# ── Phase 3: speechToText, detectLanguage, neutralizeData, executeCode ── # ── Phase 3: speechToText, detectLanguage, neutralizeData, executeCode ──
@ -917,5 +920,6 @@ def _registerMediaTools(registry: ToolRegistry, services):
}, },
"required": ["code"] "required": ["code"]
}, },
readOnly=True readOnly=True,
displayLabel="running calculations",
) )

View file

@ -310,11 +310,15 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
return ToolResult(toolCallId="", toolName="writeFile", success=False, error="name is required for mode=create") return ToolResult(toolCallId="", toolName="writeFile", success=False, error="name is required for mode=create")
fileItem, _ = dbMgmt.saveUploadedFile(content.encode("utf-8"), name) fileItem, _ = dbMgmt.saveUploadedFile(content.encode("utf-8"), name)
fiId = context.get("featureInstanceId") or (services.featureInstanceId if services else "") fiId = context.get("featureInstanceId") or (services.featureInstanceId if services else "")
updateFields: Dict[str, Any] = {}
if fiId: if fiId:
dbMgmt.updateFile(fileItem.id, {"featureInstanceId": fiId}) updateFields["featureInstanceId"] = fiId
# File group tree removed — groupId arg and instance-group assignment no longer apply if args.get("folderId"):
updateFields["folderId"] = args["folderId"]
if args.get("tags"): if args.get("tags"):
dbMgmt.updateFile(fileItem.id, {"tags": args["tags"]}) updateFields["tags"] = args["tags"]
if updateFields:
dbMgmt.updateFile(fileItem.id, updateFields)
chatDocId = _attachFileAsChatDocument( chatDocId = _attachFileAsChatDocument(
services, fileItem, services, fileItem,
@ -359,7 +363,8 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
}, },
"required": ["fileId"] "required": ["fileId"]
}, },
readOnly=True readOnly=True,
displayLabel="reviewing a document",
) )
registry.register( registry.register(
@ -406,7 +411,8 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
"properties": {"query": {"type": "string", "description": "Search query"}}, "properties": {"query": {"type": "string", "description": "Search query"}},
"required": ["query"] "required": ["query"]
}, },
readOnly=True readOnly=True,
displayLabel="researching on the web",
) )
registry.register( registry.register(
@ -427,7 +433,7 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
"writeFile", _writeFile, "writeFile", _writeFile,
description=( description=(
"Create, append, or overwrite a file. Modes:\n" "Create, append, or overwrite a file. Modes:\n"
"- create (default): create a new file (name required).\n" "- create (default): create a new file (name required). Use folderId to place it in a specific folder.\n"
"- append: append content to an existing file (fileId required). " "- append: append content to an existing file (fileId required). "
"Use for large content that exceeds a single tool call (~8000 chars per call).\n" "Use for large content that exceeds a single tool call (~8000 chars per call).\n"
"- overwrite: replace entire file content (fileId required).\n" "- overwrite: replace entire file content (fileId required).\n"
@ -443,7 +449,7 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
"content": {"type": "string", "description": "Content to write/append"}, "content": {"type": "string", "description": "Content to write/append"},
"mode": {"type": "string", "enum": ["create", "append", "overwrite"], "description": "Write mode (default: create)"}, "mode": {"type": "string", "enum": ["create", "append", "overwrite"], "description": "Write mode (default: create)"},
"fileId": {"type": "string", "description": "File ID (required for mode=append/overwrite)"}, "fileId": {"type": "string", "description": "File ID (required for mode=append/overwrite)"},
"groupId": {"type": "string", "description": "Group ID to place the file in (mode=create only). Omit to use the instance default group."}, "folderId": {"type": "string", "description": "Folder ID to place the file in (mode=create only). Use listFolders to find IDs. Omit for root."},
"tags": {"type": "array", "items": {"type": "string"}, "description": "Tags (mode=create only)"}, "tags": {"type": "array", "items": {"type": "string"}, "description": "Tags (mode=create only)"},
}, },
"required": ["content"] "required": ["content"]
@ -581,7 +587,8 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
}, },
"required": ["url"] "required": ["url"]
}, },
readOnly=True readOnly=True,
displayLabel="reading a webpage",
) )
registry.register( registry.register(
@ -701,7 +708,147 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
readOnly=False readOnly=False
) )
# Group tree tools removed — file grouping now uses view-based display grouping (TableListView) # ---- Folder management tools ----
async def _createFolder(args: Dict[str, Any], context: Dict[str, Any]):
name = args.get("name", "")
parentId = args.get("parentId") or None
if not name:
return ToolResult(toolCallId="", toolName="createFolder", success=False, error="name is required")
try:
chatService = services.chat
dbMgmt = chatService.interfaceDbComponent
folder = dbMgmt.createFolder(name, parentId=parentId)
folderId = folder.get("id") if isinstance(folder, dict) else getattr(folder, "id", None)
folderName = folder.get("name") if isinstance(folder, dict) else getattr(folder, "name", name)
return ToolResult(
toolCallId="", toolName="createFolder", success=True,
data=f"Folder '{folderName}' created (id: {folderId})" + (f" inside parent {parentId}" if parentId else ""),
sideEvents=[{"type": "folderCreated", "data": {"folderId": folderId, "folderName": folderName, "parentId": parentId}}],
)
except Exception as e:
return ToolResult(toolCallId="", toolName="createFolder", success=False, error=str(e))
async def _listFolders(args: Dict[str, Any], context: Dict[str, Any]):
try:
chatService = services.chat
dbMgmt = chatService.interfaceDbComponent
folders = dbMgmt.getOwnFolderTree()
if not folders:
return ToolResult(toolCallId="", toolName="listFolders", success=True, data="No folders found.")
lines = []
folderMap: Dict[Optional[str], List] = {}
for f in folders:
pid = f.get("parentId") if isinstance(f, dict) else getattr(f, "parentId", None)
folderMap.setdefault(pid, []).append(f)
def _walk(parentId: Optional[str], indent: int):
for f in sorted(folderMap.get(parentId, []), key=lambda x: (x.get("name") if isinstance(x, dict) else getattr(x, "name", "")).lower()):
fId = f.get("id") if isinstance(f, dict) else getattr(f, "id", "")
fName = f.get("name") if isinstance(f, dict) else getattr(f, "name", "")
prefix = " " * indent
lines.append(f"{prefix}- {fName} (id: {fId})")
_walk(fId, indent + 1)
_walk(None, 0)
return ToolResult(toolCallId="", toolName="listFolders", success=True, data="\n".join(lines))
except Exception as e:
return ToolResult(toolCallId="", toolName="listFolders", success=False, error=str(e))
async def _moveFile(args: Dict[str, Any], context: Dict[str, Any]):
fileId = args.get("fileId", "")
folderId = args.get("folderId")
if not fileId:
return ToolResult(toolCallId="", toolName="moveFile", success=False, error="fileId is required")
try:
chatService = services.chat
dbMgmt = chatService.interfaceDbComponent
file = dbMgmt.getFile(fileId)
if not file:
return ToolResult(toolCallId="", toolName="moveFile", success=False, error=f"File {fileId} not found")
dbMgmt.updateFile(fileId, {"folderId": folderId or None})
targetLabel = f"folder {folderId}" if folderId else "root"
return ToolResult(
toolCallId="", toolName="moveFile", success=True,
data=f"File '{file.fileName}' (id: {fileId}) moved to {targetLabel}",
sideEvents=[{"type": "fileUpdated", "data": {"fileId": fileId, "fileName": file.fileName}}],
)
except Exception as e:
return ToolResult(toolCallId="", toolName="moveFile", success=False, error=str(e))
registry.register(
"createFolder", _createFolder,
description=(
"Create a new folder in the workspace file tree. "
"Use parentId to create nested folders. Returns the new folder ID."
),
parameters={
"type": "object",
"properties": {
"name": {"type": "string", "description": "Folder name"},
"parentId": {"type": "string", "description": "Parent folder ID for nesting. Omit to create at root level."},
},
"required": ["name"]
},
readOnly=False
)
registry.register(
"listFolders", _listFolders,
description=(
"List all folders in the workspace as an indented tree. "
"Use to find folder IDs for createFolder (parentId), writeFile (folderId), or moveFile."
),
parameters={"type": "object", "properties": {}},
readOnly=True
)
async def _renameFolder(args: Dict[str, Any], context: Dict[str, Any]):
folderId = args.get("folderId", "")
newName = args.get("newName", "")
if not folderId or not newName:
return ToolResult(toolCallId="", toolName="renameFolder", success=False, error="folderId and newName are required")
try:
chatService = services.chat
dbMgmt = chatService.interfaceDbComponent
folder = dbMgmt.renameFolder(folderId, newName)
return ToolResult(
toolCallId="", toolName="renameFolder", success=True,
data=f"Folder {folderId} renamed to '{newName}'",
sideEvents=[{"type": "folderUpdated", "data": {"folderId": folderId, "folderName": newName}}],
)
except Exception as e:
return ToolResult(toolCallId="", toolName="renameFolder", success=False, error=str(e))
registry.register(
"renameFolder", _renameFolder,
description="Rename an existing folder in the workspace file tree.",
parameters={
"type": "object",
"properties": {
"folderId": {"type": "string", "description": "The folder ID to rename"},
"newName": {"type": "string", "description": "New folder name"},
},
"required": ["folderId", "newName"]
},
readOnly=False
)
registry.register(
"moveFile", _moveFile,
description=(
"Move a file into a specific folder. Set folderId to null or omit to move the file back to the root level."
),
parameters={
"type": "object",
"properties": {
"fileId": {"type": "string", "description": "The file ID to move"},
"folderId": {"type": "string", "description": "Target folder ID. Omit or null to move to root."},
},
"required": ["fileId"]
},
readOnly=False
)
registry.register( registry.register(
"replaceInFile", _replaceInFile, "replaceInFile", _replaceInFile,

View file

@ -41,6 +41,12 @@ class ToolDefinition(BaseModel):
"""Schema for a tool available to the agent.""" """Schema for a tool available to the agent."""
name: str = Field(description="Unique tool name") name: str = Field(description="Unique tool name")
description: str = Field(description="What this tool does") description: str = Field(description="What this tool does")
displayLabel: Optional[str] = Field(
default=None,
description="Short human-readable activity phrase (e.g. 'researching on the web'). "
"Used for live progress messages in meetings. English gerund phrase; "
"localised by the caller."
)
parameters: Dict[str, Any] = Field( parameters: Dict[str, Any] = Field(
default_factory=dict, default_factory=dict,
description="JSON Schema for tool parameters" description="JSON Schema for tool parameters"

View file

@ -23,7 +23,7 @@ class ToolRegistry:
def register(self, name: str, handler: Callable[..., Awaitable[ToolResult]], def register(self, name: str, handler: Callable[..., Awaitable[ToolResult]],
description: str = "", parameters: Dict[str, Any] = None, description: str = "", parameters: Dict[str, Any] = None,
readOnly: bool = False, featureType: str = None, readOnly: bool = False, featureType: str = None,
toolSet: str = None): toolSet: str = None, displayLabel: str = None):
"""Register a tool with its handler function.""" """Register a tool with its handler function."""
if name in self._tools: if name in self._tools:
logger.warning(f"Tool '{name}' already registered, overwriting") logger.warning(f"Tool '{name}' already registered, overwriting")
@ -31,6 +31,7 @@ class ToolRegistry:
self._tools[name] = ToolDefinition( self._tools[name] = ToolDefinition(
name=name, name=name,
description=description, description=description,
displayLabel=displayLabel,
parameters=parameters or {}, parameters=parameters or {},
readOnly=readOnly, readOnly=readOnly,
featureType=featureType, featureType=featureType,

View file

@ -567,11 +567,14 @@ mit Web-Recherche, E-Mail-Versand, Dokumenten-Erzeugung und Datenquellen-Zugriff
Setze "needsAgent": true und "agentReason": "<kurze Beschreibung der Aufgabe in einem Satz>" Setze "needsAgent": true und "agentReason": "<kurze Beschreibung der Aufgabe in einem Satz>"
WENN die Aufgabe eines oder mehrere dieser Merkmale hat: WENN die Aufgabe eines oder mehrere dieser Merkmale hat:
- Recherche im Internet noetig (z.B. "recherchier was im Internet ueber XY", "schau mal nach", "google das") - Recherche im Internet oder aktuelle Informationen noetig
- E-Mail an Teilnehmer/Kontakte versenden - Informationen beschaffen die du NICHT im Transkript oder in deinem Vorwissen hast
- Dokument (PDF, Word, Excel) generieren oder im SharePoint/Drive ablegen - E-Mail versenden
- Mehrere Schritte oder Tool-Aufrufe noetig (Zusammenfassung + Versand, Recherche + Empfehlung etc.) - Dokument generieren oder in einer Datenquelle ablegen
- Daten aus externen Quellen abrufen (Outlook-Kontakte, SharePoint-Dateien, Kalender etc.) - Mehrere Schritte oder Tool-Aufrufe noetig
- Daten aus externen Quellen abrufen
Wenn du den gewuenschten Inhalt nicht selbst liefern kannst, setze needsAgent=true.
Wenn needsAgent=true: Wenn needsAgent=true:
- Setze shouldRespond=false (der Agent uebernimmt; du sprichst NICHT eigenstaendig). - Setze shouldRespond=false (der Agent uebernimmt; du sprichst NICHT eigenstaendig).

View file

@ -7,6 +7,9 @@ from .mainBackgroundJobService import (
startJob, startJob,
getJobStatus, getJobStatus,
listJobs, listJobs,
cancelJob,
cancelJobsByConnection,
isTerminalStatus,
JobProgressCallback, JobProgressCallback,
) )
@ -15,5 +18,8 @@ __all__ = [
"startJob", "startJob",
"getJobStatus", "getJobStatus",
"listJobs", "listJobs",
"cancelJob",
"cancelJobsByConnection",
"isTerminalStatus",
"JobProgressCallback", "JobProgressCallback",
] ]

View file

@ -30,6 +30,7 @@ clear message. No silent zombies.
import asyncio import asyncio
import logging import logging
import time
from datetime import datetime, timezone from datetime import datetime, timezone
from typing import Any, Awaitable, Callable, Dict, List, Optional from typing import Any, Awaitable, Callable, Dict, List, Optional
@ -49,7 +50,46 @@ JOBS_DATABASE = APP_CONFIG.get("DB_DATABASE", "poweron_app")
registerDatabase(JOBS_DATABASE) registerDatabase(JOBS_DATABASE)
JobProgressCallback = Callable[[int, Optional[str]], None] _CANCEL_CHECK_INTERVAL_S = 3.0
class JobProgressCallback:
"""Callable progress reporter with cooperative cancel-check for long-running walkers."""
def __init__(self, jobId: str):
self._jobId = jobId
self._cancelledCache: Optional[bool] = None
self._lastCheckedAt: float = 0.0
def __call__(self, progress: int, message: Optional[str] = None) -> None:
try:
clamped = max(0, min(100, int(progress)))
fields: Dict[str, Any] = {"progress": clamped}
if message is not None:
fields["progressMessage"] = message[:500]
_updateJob(self._jobId, fields)
except Exception as ex:
logger.warning("Progress update failed for job %s: %s", self._jobId, ex)
def isCancelled(self) -> bool:
"""Check if this job was cancelled. Reads DB at most every 3s to limit load."""
now = time.time()
if self._cancelledCache is True:
return True
if now - self._lastCheckedAt < _CANCEL_CHECK_INTERVAL_S:
return self._cancelledCache or False
self._lastCheckedAt = now
try:
job = _loadJob(self._jobId)
if job and job.get("status") == BackgroundJobStatusEnum.CANCELLED.value:
self._cancelledCache = True
return True
except Exception:
pass
self._cancelledCache = False
return False
JobHandler = Callable[[Dict[str, Any], JobProgressCallback], Awaitable[Optional[Dict[str, Any]]]] JobHandler = Callable[[Dict[str, Any], JobProgressCallback], Awaitable[Optional[Dict[str, Any]]]]
@ -155,16 +195,7 @@ def _markError(jobId: str, errorMessage: str) -> None:
def _makeProgressCallback(jobId: str) -> JobProgressCallback: def _makeProgressCallback(jobId: str) -> JobProgressCallback:
def _cb(progress: int, message: Optional[str] = None) -> None: return JobProgressCallback(jobId)
try:
clamped = max(0, min(100, int(progress)))
fields: Dict[str, Any] = {"progress": clamped}
if message is not None:
fields["progressMessage"] = message[:500]
_updateJob(jobId, fields)
except Exception as ex:
logger.warning("Progress update failed for job %s: %s", jobId, ex)
return _cb
async def _runJob(jobId: str) -> None: async def _runJob(jobId: str) -> None:
@ -220,12 +251,51 @@ def isTerminalStatus(status: str) -> bool:
return status in {s.value for s in TERMINAL_JOB_STATUSES} return status in {s.value for s in TERMINAL_JOB_STATUSES}
def cancelJob(jobId: str, *, reason: str = "user_requested") -> bool:
"""Mark a job as CANCELLED. Walkers detect this via JobProgressCallback.isCancelled().
Returns False if the job is already in a terminal state or does not exist.
"""
job = _loadJob(jobId)
if not job:
return False
if isTerminalStatus(job.get("status", "")):
return False
_updateJob(jobId, {
"status": BackgroundJobStatusEnum.CANCELLED.value,
"errorMessage": f"cancelled: {reason}"[:1000],
"finishedAt": datetime.now(timezone.utc).timestamp(),
})
logger.info("BackgroundJob %s cancelled (reason=%s)", jobId, reason)
return True
def cancelJobsByConnection(connectionId: str, *, jobType: str = "connection.bootstrap") -> int:
"""Cancel all RUNNING/PENDING jobs whose payload.connectionId matches.
Returns count of jobs marked as cancelled.
"""
db = _getDb()
rows = db.getRecordset(BackgroundJob, recordFilter={"jobType": jobType})
count = 0
for row in rows:
status = row.get("status", "")
if status not in (BackgroundJobStatusEnum.PENDING.value, BackgroundJobStatusEnum.RUNNING.value):
continue
payload = row.get("payload") or {}
if payload.get("connectionId") == connectionId:
if cancelJob(row["id"], reason=f"connection_stop:{connectionId[:8]}"):
count += 1
return count
def recoverInterruptedJobs() -> int: def recoverInterruptedJobs() -> int:
"""Flip any RUNNING jobs to ERROR (called at worker boot). """Flip any RUNNING jobs to ERROR and re-queue bootstrap jobs (called at worker boot).
A RUNNING job in the DB after process restart means the previous worker A RUNNING job in the DB after process restart means the previous worker
died mid-execution; the asyncio task is gone and the job will never died mid-execution; the asyncio task is gone and the job will never
finish on its own. finish on its own. For connection.bootstrap jobs, a fresh job is
automatically re-queued so the user doesn't have to manually retry.
""" """
db = _getDb() db = _getDb()
try: try:
@ -234,12 +304,34 @@ def recoverInterruptedJobs() -> int:
logger.warning("recoverInterruptedJobs: failed to scan RUNNING jobs: %s", ex) logger.warning("recoverInterruptedJobs: failed to scan RUNNING jobs: %s", ex)
return 0 return 0
count = 0 count = 0
requeued = 0
for row in rows: for row in rows:
try: try:
_markError(row["id"], "Interrupted by worker restart") _markError(row["id"], "Interrupted by worker restart")
count += 1 count += 1
except Exception as ex: except Exception as ex:
logger.warning("recoverInterruptedJobs: could not mark %s as ERROR: %s", row.get("id"), ex) logger.warning("recoverInterruptedJobs: could not mark %s as ERROR: %s", row.get("id"), ex)
continue
if row.get("jobType") == "connection.bootstrap":
payload = row.get("payload") or {}
if payload.get("connectionId"):
try:
newJob = BackgroundJob(
jobType="connection.bootstrap",
payload=payload,
triggeredBy="recovery.requeue",
)
record = db.recordCreate(BackgroundJob, _serialiseDatetimes(newJob.model_dump()))
asyncio.create_task(_runJob(record["id"]))
requeued += 1
logger.info(
"recoverInterruptedJobs: re-queued bootstrap for connectionId=%s (new jobId=%s)",
payload["connectionId"], record["id"],
)
except Exception as reqEx:
logger.warning("recoverInterruptedJobs: re-queue failed for %s: %s", row.get("id"), reqEx)
if count: if count:
logger.warning("Recovered %d interrupted background job(s) after restart", count) logger.warning("Recovered %d interrupted background job(s) after restart (re-queued %d)", count, requeued)
return count return count

View file

@ -77,6 +77,7 @@ class ContainerExtractor(Extractor):
"""Extract by recursively unpacking the container.""" """Extract by recursively unpacking the container."""
fileName = context.get("fileName", "archive") fileName = context.get("fileName", "archive")
mimeType = context.get("mimeType", "application/octet-stream") mimeType = context.get("mimeType", "application/octet-stream")
cascadeDepth = context.get("_cascadeDepth", 0)
rootId = makeId() rootId = makeId()
parts: List[ContentPart] = [ parts: List[ContentPart] = [
@ -97,7 +98,7 @@ class ContainerExtractor(Extractor):
parts.extend(lazy) parts.extend(lazy)
return parts return parts
state = {"totalSize": 0, "fileCount": 0} state = {"totalSize": 0, "fileCount": 0, "cascadeDepth": cascadeDepth}
try: try:
childParts = _resolveContainerRecursive( childParts = _resolveContainerRecursive(
fileBytes, mimeType, fileName, rootId, "", 0, state fileBytes, mimeType, fileName, rootId, "", 0, state
@ -209,7 +210,12 @@ def _addFilePart(
if extractor and not isinstance(extractor, ContainerExtractor): if extractor and not isinstance(extractor, ContainerExtractor):
try: try:
childParts = extractor.extract(data, {"fileName": fileName, "mimeType": detectedMime}) cascadeDepth = state.get("cascadeDepth", 0)
childParts = extractor.extract(data, {
"fileName": fileName,
"mimeType": detectedMime,
"_cascadeDepth": cascadeDepth + 1,
})
for part in childParts: for part in childParts:
part.parentId = parentId part.parentId = parentId
if not part.metadata: if not part.metadata:

View file

@ -53,12 +53,13 @@ class EmailExtractor(Extractor):
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]: def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
fileName = context.get("fileName", "email") fileName = context.get("fileName", "email")
lower = (fileName or "").lower() lower = (fileName or "").lower()
depth = context.get("_cascadeDepth", 0)
if lower.endswith(".msg"): if lower.endswith(".msg"):
return self._extractMsg(fileBytes, fileName) return self._extractMsg(fileBytes, fileName, depth)
return self._extractEml(fileBytes, fileName) return self._extractEml(fileBytes, fileName, depth)
def _extractEml(self, fileBytes: bytes, fileName: str) -> List[ContentPart]: def _extractEml(self, fileBytes: bytes, fileName: str, depth: int = 0) -> List[ContentPart]:
"""Parse standard EML (RFC 822) using stdlib email.""" """Parse standard EML (RFC 822) using stdlib email."""
rootId = makeId() rootId = makeId()
parts: List[ContentPart] = [] parts: List[ContentPart] = []
@ -91,7 +92,7 @@ class EmailExtractor(Extractor):
attachName = part.get_filename() or "attachment" attachName = part.get_filename() or "attachment"
attachData = part.get_payload(decode=True) attachData = part.get_payload(decode=True)
if attachData: if attachData:
parts.extend(_delegateAttachment(attachData, attachName, rootId)) parts.extend(_delegateAttachment(attachData, attachName, rootId, depth))
continue continue
if contentType == "text/plain": if contentType == "text/plain":
@ -113,7 +114,7 @@ class EmailExtractor(Extractor):
return parts return parts
def _extractMsg(self, fileBytes: bytes, fileName: str) -> List[ContentPart]: def _extractMsg(self, fileBytes: bytes, fileName: str, depth: int = 0) -> List[ContentPart]:
"""Parse Outlook MSG files using extract-msg (optional).""" """Parse Outlook MSG files using extract-msg (optional)."""
rootId = makeId() rootId = makeId()
parts: List[ContentPart] = [] parts: List[ContentPart] = []
@ -179,7 +180,7 @@ class EmailExtractor(Extractor):
attachName = getattr(attachment, "longFilename", None) or getattr(attachment, "shortFilename", None) or "attachment" attachName = getattr(attachment, "longFilename", None) or getattr(attachment, "shortFilename", None) or "attachment"
attachData = getattr(attachment, "data", None) attachData = getattr(attachment, "data", None)
if attachData: if attachData:
parts.extend(_delegateAttachment(attachData, attachName, rootId)) parts.extend(_delegateAttachment(attachData, attachName, rootId, depth))
try: try:
msgFile.close() msgFile.close()
@ -199,18 +200,39 @@ def _buildHeaderText(msg) -> str:
return "\n".join(lines) return "\n".join(lines)
def _delegateAttachment(attachData: bytes, attachName: str, parentId: str) -> List[ContentPart]: _MAX_CASCADE_DEPTH = 10
"""Delegate an attachment to the appropriate type-specific extractor."""
def _delegateAttachment(attachData: bytes, attachName: str, parentId: str, depth: int = 0) -> List[ContentPart]:
"""Delegate an attachment to the appropriate type-specific extractor.
Passes ``_cascadeDepth`` through the context so nested EmailContainerEmail
chains share a global depth counter and don't recurse infinitely.
"""
if depth >= _MAX_CASCADE_DEPTH:
logger.warning(f"Cascade depth {depth} reached for {attachName}, skipping extraction")
import base64
encodedData = base64.b64encode(attachData).decode("utf-8") if attachData else ""
return [ContentPart(
id=makeId(), parentId=parentId, label=attachName,
typeGroup="binary", mimeType="application/octet-stream",
data=encodedData,
metadata={"size": len(attachData), "emailAttachment": attachName, "cascadeDepthExceeded": True},
)]
guessedMime, _ = mimetypes.guess_type(attachName) guessedMime, _ = mimetypes.guess_type(attachName)
detectedMime = guessedMime or "application/octet-stream" detectedMime = guessedMime or "application/octet-stream"
from ..subRegistry import ExtractorRegistry from ..subRegistry import getExtractorRegistry
registry = ExtractorRegistry() registry = getExtractorRegistry()
extractor = registry.resolve(detectedMime, attachName) extractor = registry.resolve(detectedMime, attachName)
if extractor and not isinstance(extractor, EmailExtractor): if extractor:
try: try:
childParts = extractor.extract(attachData, {"fileName": attachName, "mimeType": detectedMime}) childParts = extractor.extract(attachData, {
"fileName": attachName,
"mimeType": detectedMime,
"_cascadeDepth": depth + 1,
})
for part in childParts: for part in childParts:
part.parentId = parentId part.parentId = parentId
if not part.metadata: if not part.metadata:

View file

@ -33,6 +33,7 @@ class ExtractionService:
self._interfaceDbComponent = getComponentInterface( self._interfaceDbComponent = getComponentInterface(
context.user, context.user,
mandateId=context.mandate_id, mandateId=context.mandate_id,
featureInstanceId=context.feature_instance_id,
) )
self._extractorRegistry = getExtractorRegistry() self._extractorRegistry = getExtractorRegistry()
if ExtractionService._sharedChunkerRegistry is None: if ExtractionService._sharedChunkerRegistry is None:

View file

@ -122,21 +122,54 @@ def _onConnectionRevoked(
) )
_SOURCE_TYPE_MAP = {
"msft": {
"sharepoint": ("sharepointFolder", "onedriveFolder"),
"outlook": ("outlookFolder", "calendarFolder", "contactFolder"),
},
"google": {
"drive": ("googleDriveFolder",),
"gmail": ("gmailFolder",),
},
"clickup": {
"clickup": ("clickupList",),
},
"infomaniak": {
"kdrive": ("kdriveFolder",),
},
}
def _loadRagEnabledDataSources(connectionId: str, dataSourceIds: Optional[list] = None):
"""Load DataSource rows with ragIndexEnabled=true for a connection.
If dataSourceIds is provided (mini-bootstrap), filter to only those IDs.
"""
from modules.interfaces.interfaceDbApp import getRootInterface
from modules.datamodels.datamodelDataSource import DataSource
rootIf = getRootInterface()
allDs = rootIf.db.getRecordset(DataSource, recordFilter={"connectionId": connectionId})
if dataSourceIds:
return [ds for ds in allDs if ds.get("id") in dataSourceIds and ds.get("ragIndexEnabled")]
return [ds for ds in allDs if ds.get("ragIndexEnabled")]
async def _bootstrapJobHandler( async def _bootstrapJobHandler(
job: Dict[str, Any], job: Dict[str, Any],
progressCb, progressCb,
) -> Dict[str, Any]: ) -> Dict[str, Any]:
"""Dispatch bootstrap by authority. Each authority runs its own sub-bootstraps.""" """Dispatch bootstrap by authority, iterating only over ragIndexEnabled DataSources."""
payload = job.get("payload") or {} payload = job.get("payload") or {}
connectionId = payload.get("connectionId") connectionId = payload.get("connectionId")
authority = (payload.get("authority") or "").lower() authority = (payload.get("authority") or "").lower()
dataSourceIds = payload.get("dataSourceIds")
if not connectionId: if not connectionId:
raise ValueError("connection.bootstrap requires payload.connectionId") raise ValueError("connection.bootstrap requires payload.connectionId")
progressCb(5, f"resolving {authority} connection") progressCb(5, f"resolving {authority} connection")
# Defensive consent check: if the connection has since disabled knowledge ingestion # Defensive consent check
# (e.g. user toggled setting after the job was enqueued), skip all walkers.
try: try:
from modules.interfaces.interfaceDbApp import getRootInterface from modules.interfaces.interfaceDbApp import getRootInterface
_root = getRootInterface() _root = getRootInterface()
@ -156,6 +189,21 @@ async def _bootstrapJobHandler(
except Exception as _guardErr: except Exception as _guardErr:
logger.debug("Could not load connection for consent guard: %s", _guardErr) logger.debug("Could not load connection for consent guard: %s", _guardErr)
# Load only ragIndexEnabled DataSources for this connection
dataSources = _loadRagEnabledDataSources(connectionId, dataSourceIds)
if not dataSources:
logger.info(
"ingestion.connection.bootstrap.skipped — no rag-enabled DataSources connectionId=%s",
connectionId,
extra={
"event": "ingestion.connection.bootstrap.skipped",
"connectionId": connectionId,
"authority": authority,
"reason": "no_data_sources",
},
)
return {"connectionId": connectionId, "authority": authority, "skipped": True, "reason": "no_data_sources"}
def _normalize(res: Any, label: str) -> Dict[str, Any]: def _normalize(res: Any, label: str) -> Dict[str, Any]:
if isinstance(res, Exception): if isinstance(res, Exception):
logger.error( logger.error(
@ -165,6 +213,10 @@ async def _bootstrapJobHandler(
return {"error": str(res)} return {"error": str(res)}
return res or {} return res or {}
def _filterDs(walkerKey: str) -> list:
sourceTypes = _SOURCE_TYPE_MAP.get(authority, {}).get(walkerKey, ())
return [ds for ds in dataSources if ds.get("sourceType") in sourceTypes]
if authority == "msft": if authority == "msft":
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncSharepoint import ( from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncSharepoint import (
bootstrapSharepoint, bootstrapSharepoint,
@ -174,9 +226,14 @@ async def _bootstrapJobHandler(
) )
progressCb(10, "sharepoint + outlook") progressCb(10, "sharepoint + outlook")
spDs = _filterDs("sharepoint")
olDs = _filterDs("outlook")
async def _noopResult():
return {"skipped": True, "reason": "no_datasources"}
spResult, olResult = await asyncio.gather( spResult, olResult = await asyncio.gather(
bootstrapSharepoint(connectionId=connectionId, progressCb=progressCb), bootstrapSharepoint(connectionId=connectionId, progressCb=progressCb, dataSources=spDs) if spDs else _noopResult(),
bootstrapOutlook(connectionId=connectionId, progressCb=progressCb), bootstrapOutlook(connectionId=connectionId, progressCb=progressCb, dataSources=olDs) if olDs else _noopResult(),
return_exceptions=True, return_exceptions=True,
) )
return { return {
@ -195,9 +252,14 @@ async def _bootstrapJobHandler(
) )
progressCb(10, "drive + gmail") progressCb(10, "drive + gmail")
gdDs = _filterDs("drive")
gmDs = _filterDs("gmail")
async def _noopResult():
return {"skipped": True, "reason": "no_datasources"}
gdResult, gmResult = await asyncio.gather( gdResult, gmResult = await asyncio.gather(
bootstrapGdrive(connectionId=connectionId, progressCb=progressCb), bootstrapGdrive(connectionId=connectionId, progressCb=progressCb, dataSources=gdDs) if gdDs else _noopResult(),
bootstrapGmail(connectionId=connectionId, progressCb=progressCb), bootstrapGmail(connectionId=connectionId, progressCb=progressCb, dataSources=gmDs) if gmDs else _noopResult(),
return_exceptions=True, return_exceptions=True,
) )
return { return {
@ -213,7 +275,8 @@ async def _bootstrapJobHandler(
) )
progressCb(10, "clickup tasks") progressCb(10, "clickup tasks")
cuResult = await bootstrapClickup(connectionId=connectionId, progressCb=progressCb) cuDs = _filterDs("clickup")
cuResult = await bootstrapClickup(connectionId=connectionId, progressCb=progressCb, dataSources=cuDs) if cuDs else {"skipped": True, "reason": "no_datasources"}
return { return {
"connectionId": connectionId, "connectionId": connectionId,
"authority": authority, "authority": authority,

View file

@ -9,7 +9,7 @@ is None).
from __future__ import annotations from __future__ import annotations
import logging import logging
from dataclasses import dataclass, field from dataclasses import dataclass
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -21,10 +21,11 @@ _DEFAULT_CLICKUP_SCOPE = "title_description"
@dataclass @dataclass
class ConnectionIngestionPrefs: class ConnectionIngestionPrefs:
"""Parsed per-connection preferences for knowledge ingestion walkers.""" """Parsed per-connection preferences for knowledge ingestion walkers.
# PII Neutralization is now controlled per DataSource.neutralize (not here).
neutralizeBeforeEmbed: bool = False Surface toggles are obsolete walker iterates only over ragIndexEnabled DataSources.
"""
# Mail (Outlook + Gmail) # Mail (Outlook + Gmail)
mailContentDepth: str = _DEFAULT_MAIL_DEPTH # "metadata" | "snippet" | "full" mailContentDepth: str = _DEFAULT_MAIL_DEPTH # "metadata" | "snippet" | "full"
@ -32,18 +33,11 @@ class ConnectionIngestionPrefs:
# Files (Drive / SharePoint / OneDrive) # Files (Drive / SharePoint / OneDrive)
filesIndexBinaries: bool = True filesIndexBinaries: bool = True
mimeAllowlist: List[str] = field(default_factory=list) # empty = all allowed
# ClickUp # ClickUp
clickupScope: str = _DEFAULT_CLICKUP_SCOPE # "titles" | "title_description" | "with_comments" clickupScope: str = _DEFAULT_CLICKUP_SCOPE # "titles" | "title_description" | "with_comments"
clickupIndexAttachments: bool = False clickupIndexAttachments: bool = False
# Per-authority surface toggles (default everything on)
gmailEnabled: bool = True
driveEnabled: bool = True
sharepointEnabled: bool = True
outlookEnabled: bool = True
# Time window # Time window
maxAgeDays: int = _DEFAULT_MAX_AGE_DAYS # 0 = no limit maxAgeDays: int = _DEFAULT_MAX_AGE_DAYS # 0 = no limit
@ -78,22 +72,12 @@ def loadConnectionPrefs(connectionId: str) -> ConnectionIngestionPrefs:
v = raw.get(key) v = raw.get(key)
return int(v) if isinstance(v, int) else default return int(v) if isinstance(v, int) else default
surface = raw.get("surfaceToggles") or {}
google_surf = surface.get("google") or {}
msft_surf = surface.get("msft") or {}
return ConnectionIngestionPrefs( return ConnectionIngestionPrefs(
neutralizeBeforeEmbed=_bool("neutralizeBeforeEmbed", False),
mailContentDepth=_str("mailContentDepth", ["metadata", "snippet", "full"], _DEFAULT_MAIL_DEPTH), mailContentDepth=_str("mailContentDepth", ["metadata", "snippet", "full"], _DEFAULT_MAIL_DEPTH),
mailIndexAttachments=_bool("mailIndexAttachments", False), mailIndexAttachments=_bool("mailIndexAttachments", False),
filesIndexBinaries=_bool("filesIndexBinaries", True), filesIndexBinaries=_bool("filesIndexBinaries", True),
mimeAllowlist=list(raw.get("mimeAllowlist") or []),
clickupScope=_str("clickupScope", ["titles", "title_description", "with_comments"], _DEFAULT_CLICKUP_SCOPE), clickupScope=_str("clickupScope", ["titles", "title_description", "with_comments"], _DEFAULT_CLICKUP_SCOPE),
clickupIndexAttachments=_bool("clickupIndexAttachments", False), clickupIndexAttachments=_bool("clickupIndexAttachments", False),
gmailEnabled=bool(google_surf.get("gmail", True)),
driveEnabled=bool(google_surf.get("drive", True)),
sharepointEnabled=bool(msft_surf.get("sharepoint", True)),
outlookEnabled=bool(msft_surf.get("outlook", True)),
maxAgeDays=_int("maxAgeDays", _DEFAULT_MAX_AGE_DAYS), maxAgeDays=_int("maxAgeDays", _DEFAULT_MAX_AGE_DAYS),
) )
except Exception as exc: except Exception as exc:

View file

@ -23,7 +23,7 @@ import logging
import time import time
from dataclasses import dataclass, field from dataclasses import dataclass, field
from datetime import datetime, timedelta, timezone from datetime import datetime, timedelta, timezone
from typing import Any, Callable, Dict, List, Optional from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -150,8 +150,6 @@ def _buildContentObjects(task: Dict[str, Any], limits: ClickupBootstrapLimits) -
"data": description, "data": description,
"contextRef": {"part": "description"}, "contextRef": {"part": "description"},
}) })
# text_content is ClickUp's rendered-markdown version; include if it adds
# something beyond the plain description (common for bullet lists, checklists).
textContent = _truncate(task.get("text_content"), limits.maxDescriptionChars) textContent = _truncate(task.get("text_content"), limits.maxDescriptionChars)
if textContent and textContent != description: if textContent and textContent != description:
parts.append({ parts.append({
@ -166,33 +164,35 @@ def _buildContentObjects(task: Dict[str, Any], limits: ClickupBootstrapLimits) -
async def bootstrapClickup( async def bootstrapClickup(
connectionId: str, connectionId: str,
*, *,
progressCb: Optional[Callable[[int, Optional[str]], None]] = None, dataSources: Optional[List[Dict[str, Any]]] = None,
progressCb: Optional[Any] = None,
adapter: Any = None, adapter: Any = None,
connection: Any = None, connection: Any = None,
knowledgeService: Any = None, knowledgeService: Any = None,
limits: Optional[ClickupBootstrapLimits] = None, limits: Optional[ClickupBootstrapLimits] = None,
) -> Dict[str, Any]: ) -> Dict[str, Any]:
"""Walk workspaces → lists → tasks and ingest each task as a virtual doc.""" """Walk workspaces → lists → tasks and ingest each task as a virtual doc.
from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs
prefs = loadConnectionPrefs(connectionId) Iterates only over explicitly provided dataSources (ragIndexEnabled=true).
Each DataSource defines the neutralize policy for its subtree.
"""
if not dataSources:
return {"connectionId": connectionId, "skipped": True, "reason": "no_datasources"}
if not limits: if not limits:
limits = ClickupBootstrapLimits( limits = ClickupBootstrapLimits()
maxAgeDays=prefs.maxAgeDays if prefs.maxAgeDays > 0 else None,
neutralize=prefs.neutralizeBeforeEmbed,
clickupScope=prefs.clickupScope,
)
startMs = time.time() startMs = time.time()
result = ClickupBootstrapResult(connectionId=connectionId) result = ClickupBootstrapResult(connectionId=connectionId)
logger.info( logger.info(
"ingestion.connection.bootstrap.started part=clickup connectionId=%s", "ingestion.connection.bootstrap.started part=clickup connectionId=%s dataSources=%d",
connectionId, connectionId, len(dataSources),
extra={ extra={
"event": "ingestion.connection.bootstrap.started", "event": "ingestion.connection.bootstrap.started",
"part": "clickup", "part": "clickup",
"connectionId": connectionId, "connectionId": connectionId,
"dataSourceCount": len(dataSources),
}, },
) )
@ -215,30 +215,56 @@ async def bootstrapClickup(
return _finalizeResult(connectionId, result, startMs) return _finalizeResult(connectionId, result, startMs)
teams = (teamsResp or {}).get("teams") or [] teams = (teamsResp or {}).get("teams") or []
for team in teams[: limits.maxWorkspaces]:
cancelled = False
for ds in dataSources:
if result.indexed + result.skippedDuplicate >= limits.maxTasks: if result.indexed + result.skippedDuplicate >= limits.maxTasks:
break break
teamId = str(team.get("id", "") or "") if progressCb and hasattr(progressCb, "isCancelled") and progressCb.isCancelled():
if not teamId: cancelled = True
continue break
result.workspaces += 1
try:
await _walkTeam(
svc=svc,
knowledgeService=knowledgeService,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
team=team,
limits=limits,
result=result,
progressCb=progressCb,
)
except Exception as exc:
logger.error("clickup team %s walk failed: %s", teamId, exc, exc_info=True)
result.errors.append(f"team({teamId}): {exc}")
return _finalizeResult(connectionId, result, startMs) dsId = ds.get("id", "")
dsNeutralize = ds.get("neutralize", False)
dsLimits = ClickupBootstrapLimits(
maxTasks=limits.maxTasks,
maxWorkspaces=limits.maxWorkspaces,
maxListsPerWorkspace=limits.maxListsPerWorkspace,
maxDescriptionChars=limits.maxDescriptionChars,
maxAgeDays=limits.maxAgeDays,
includeClosed=limits.includeClosed,
neutralize=dsNeutralize,
clickupScope=limits.clickupScope,
)
for team in teams[:dsLimits.maxWorkspaces]:
if result.indexed + result.skippedDuplicate >= dsLimits.maxTasks:
break
teamId = str(team.get("id", "") or "")
if not teamId:
continue
result.workspaces += 1
try:
await _walkTeam(
svc=svc,
knowledgeService=knowledgeService,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
team=team,
limits=dsLimits,
result=result,
progressCb=progressCb,
dataSourceId=dsId,
)
except Exception as exc:
logger.error("clickup team %s walk failed: %s", teamId, exc, exc_info=True)
result.errors.append(f"team({teamId}): {exc}")
finalResult = _finalizeResult(connectionId, result, startMs)
if cancelled:
finalResult["cancelled"] = True
return finalResult
async def _resolveDependencies(connectionId: str): async def _resolveDependencies(connectionId: str):
@ -280,8 +306,12 @@ async def _walkTeam(
team: Dict[str, Any], team: Dict[str, Any],
limits: ClickupBootstrapLimits, limits: ClickupBootstrapLimits,
result: ClickupBootstrapResult, result: ClickupBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]], progressCb: Optional[Any],
dataSourceId: str = "",
) -> None: ) -> None:
if progressCb and hasattr(progressCb, "isCancelled") and progressCb.isCancelled():
return
teamId = str(team.get("id", "") or "") teamId = str(team.get("id", "") or "")
spacesResp = await svc.getSpaces(teamId) spacesResp = await svc.getSpaces(teamId)
spaces = (spacesResp or {}).get("spaces") or [] spaces = (spacesResp or {}).get("spaces") or []
@ -294,14 +324,12 @@ async def _walkTeam(
if not spaceId: if not spaceId:
continue continue
# Folderless lists directly under the space
folderless = await svc.getFolderlessLists(spaceId) folderless = await svc.getFolderlessLists(spaceId)
for lst in (folderless or {}).get("lists") or []: for lst in (folderless or {}).get("lists") or []:
if len(listsCollected) >= limits.maxListsPerWorkspace: if len(listsCollected) >= limits.maxListsPerWorkspace:
break break
listsCollected.append({**lst, "_space": space}) listsCollected.append({**lst, "_space": space})
# Lists inside folders
foldersResp = await svc.getFolders(spaceId) foldersResp = await svc.getFolders(spaceId)
for folder in (foldersResp or {}).get("folders") or []: for folder in (foldersResp or {}).get("folders") or []:
if len(listsCollected) >= limits.maxListsPerWorkspace: if len(listsCollected) >= limits.maxListsPerWorkspace:
@ -318,6 +346,8 @@ async def _walkTeam(
for lst in listsCollected: for lst in listsCollected:
if result.indexed + result.skippedDuplicate >= limits.maxTasks: if result.indexed + result.skippedDuplicate >= limits.maxTasks:
return return
if progressCb and hasattr(progressCb, "isCancelled") and progressCb.isCancelled():
return
result.lists += 1 result.lists += 1
await _walkList( await _walkList(
svc=svc, svc=svc,
@ -330,6 +360,7 @@ async def _walkTeam(
limits=limits, limits=limits,
result=result, result=result,
progressCb=progressCb, progressCb=progressCb,
dataSourceId=dataSourceId,
) )
@ -344,13 +375,16 @@ async def _walkList(
lst: Dict[str, Any], lst: Dict[str, Any],
limits: ClickupBootstrapLimits, limits: ClickupBootstrapLimits,
result: ClickupBootstrapResult, result: ClickupBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]], progressCb: Optional[Any],
dataSourceId: str = "",
) -> None: ) -> None:
listId = str(lst.get("id", "") or "") listId = str(lst.get("id", "") or "")
if not listId: if not listId:
return return
page = 0 page = 0
while result.indexed + result.skippedDuplicate < limits.maxTasks: while result.indexed + result.skippedDuplicate < limits.maxTasks:
if progressCb and hasattr(progressCb, "isCancelled") and progressCb.isCancelled():
return
resp = await svc.getTasksInList( resp = await svc.getTasksInList(
listId, listId,
page=page, page=page,
@ -371,7 +405,6 @@ async def _walkList(
if not _isRecent(task.get("date_updated"), limits.maxAgeDays): if not _isRecent(task.get("date_updated"), limits.maxAgeDays):
result.skippedPolicy += 1 result.skippedPolicy += 1
continue continue
# Inject the list/folder/space metadata we already loaded.
task["list"] = task.get("list") or {"id": listId, "name": lst.get("name")} task["list"] = task.get("list") or {"id": listId, "name": lst.get("name")}
task["folder"] = task.get("folder") or lst.get("_folder") or {} task["folder"] = task.get("folder") or lst.get("_folder") or {}
task["space"] = task.get("space") or lst.get("_space") or {} task["space"] = task.get("space") or lst.get("_space") or {}
@ -385,9 +418,10 @@ async def _walkList(
limits=limits, limits=limits,
result=result, result=result,
progressCb=progressCb, progressCb=progressCb,
dataSourceId=dataSourceId,
) )
if len(tasks) < 100: # ClickUp page-size hint: fewer than 100 => last page if len(tasks) < 100:
return return
page += 1 page += 1
@ -402,7 +436,8 @@ async def _ingestTask(
task: Dict[str, Any], task: Dict[str, Any],
limits: ClickupBootstrapLimits, limits: ClickupBootstrapLimits,
result: ClickupBootstrapResult, result: ClickupBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]], progressCb: Optional[Any],
dataSourceId: str = "",
) -> None: ) -> None:
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
@ -431,6 +466,7 @@ async def _ingestTask(
neutralize=limits.neutralize, neutralize=limits.neutralize,
provenance={ provenance={
"connectionId": connectionId, "connectionId": connectionId,
"dataSourceId": dataSourceId,
"authority": "clickup", "authority": "clickup",
"service": "clickup", "service": "clickup",
"externalItemId": taskId, "externalItemId": taskId,
@ -456,8 +492,10 @@ async def _ingestTask(
else: else:
result.failed += 1 result.failed += 1
if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0: processed = result.indexed + result.skippedDuplicate
processed = result.indexed + result.skippedDuplicate if progressCb is not None and processed % 50 == 0:
if hasattr(progressCb, "isCancelled") and progressCb.isCancelled():
return
try: try:
progressCb( progressCb(
min(90, 10 + int(80 * processed / max(1, limits.maxTasks))), min(90, 10 + int(80 * processed / max(1, limits.maxTasks))),

View file

@ -12,6 +12,7 @@ via export), runs the standard extraction pipeline and routes results through
from __future__ import annotations from __future__ import annotations
import asyncio
import hashlib import hashlib
import logging import logging
import time import time
@ -30,7 +31,6 @@ SKIP_MIME_PREFIXES_DEFAULT = ("video/", "audio/")
MAX_DEPTH_DEFAULT = 4 MAX_DEPTH_DEFAULT = 4
MAX_AGE_DAYS_DEFAULT = 365 MAX_AGE_DAYS_DEFAULT = 365
# Google Drive uses virtual mime-types for folders and non-downloadable assets.
FOLDER_MIME = "application/vnd.google-apps.folder" FOLDER_MIME = "application/vnd.google-apps.folder"
@ -41,12 +41,8 @@ class GdriveBootstrapLimits:
maxFileSize: int = MAX_FILE_SIZE_DEFAULT maxFileSize: int = MAX_FILE_SIZE_DEFAULT
skipMimePrefixes: tuple = SKIP_MIME_PREFIXES_DEFAULT skipMimePrefixes: tuple = SKIP_MIME_PREFIXES_DEFAULT
maxDepth: int = MAX_DEPTH_DEFAULT maxDepth: int = MAX_DEPTH_DEFAULT
# Only ingest files modified within the last N days. None disables filter.
maxAgeDays: Optional[int] = MAX_AGE_DAYS_DEFAULT maxAgeDays: Optional[int] = MAX_AGE_DAYS_DEFAULT
# Pass-through to IngestionJob.neutralize
neutralize: bool = False neutralize: bool = False
# Whether to skip binary/non-text files
filesIndexBinaries: bool = True
@dataclass @dataclass
@ -95,10 +91,8 @@ def _isRecent(modifiedIso: Optional[str], maxAgeDays: Optional[int]) -> bool:
if not maxAgeDays: if not maxAgeDays:
return True return True
if not modifiedIso: if not modifiedIso:
# No timestamp -> be permissive (Drive native docs sometimes omit it on export).
return True return True
try: try:
# Google returns RFC 3339 with `Z` or offset; python 3.11+ parses both.
ts = datetime.fromisoformat(modifiedIso.replace("Z", "+00:00")) ts = datetime.fromisoformat(modifiedIso.replace("Z", "+00:00"))
except Exception: except Exception:
return True return True
@ -111,34 +105,36 @@ def _isRecent(modifiedIso: Optional[str], maxAgeDays: Optional[int]) -> bool:
async def bootstrapGdrive( async def bootstrapGdrive(
connectionId: str, connectionId: str,
*, *,
progressCb: Optional[Callable[[int, Optional[str]], None]] = None, dataSources: Optional[List[Dict[str, Any]]] = None,
progressCb: Optional[Any] = None,
adapter: Any = None, adapter: Any = None,
connection: Any = None, connection: Any = None,
knowledgeService: Any = None, knowledgeService: Any = None,
limits: Optional[GdriveBootstrapLimits] = None, limits: Optional[GdriveBootstrapLimits] = None,
runExtractionFn: Optional[Callable[..., Any]] = None, runExtractionFn: Optional[Callable[..., Any]] = None,
) -> Dict[str, Any]: ) -> Dict[str, Any]:
"""Walk My Drive starting from the virtual root folder.""" """Walk My Drive starting from the virtual root folder.
from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs
prefs = loadConnectionPrefs(connectionId) Iterates only over explicitly provided dataSources (ragIndexEnabled=true).
Each DataSource defines the root path + neutralize policy for its subtree.
"""
if not dataSources:
return {"connectionId": connectionId, "skipped": True, "reason": "no_datasources"}
if not limits: if not limits:
limits = GdriveBootstrapLimits( limits = GdriveBootstrapLimits()
maxAgeDays=prefs.maxAgeDays if prefs.maxAgeDays > 0 else None,
neutralize=prefs.neutralizeBeforeEmbed,
filesIndexBinaries=prefs.filesIndexBinaries,
)
startMs = time.time() startMs = time.time()
result = GdriveBootstrapResult(connectionId=connectionId) result = GdriveBootstrapResult(connectionId=connectionId)
logger.info( logger.info(
"ingestion.connection.bootstrap.started part=gdrive connectionId=%s", "ingestion.connection.bootstrap.started part=gdrive connectionId=%s dataSources=%d",
connectionId, connectionId, len(dataSources),
extra={ extra={
"event": "ingestion.connection.bootstrap.started", "event": "ingestion.connection.bootstrap.started",
"part": "gdrive", "part": "gdrive",
"connectionId": connectionId, "connectionId": connectionId,
"dataSourceCount": len(dataSources),
}, },
) )
@ -158,25 +154,51 @@ async def bootstrapGdrive(
mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else "" mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else ""
userId = str(getattr(connection, "userId", "") or "") if connection is not None else "" userId = str(getattr(connection, "userId", "") or "") if connection is not None else ""
try: cancelled = False
await _walkFolder( for ds in dataSources:
adapter=adapter, if result.indexed + result.skippedDuplicate >= limits.maxItems:
knowledgeService=knowledgeService, break
runExtractionFn=runExtractionFn, if progressCb and hasattr(progressCb, "isCancelled") and progressCb.isCancelled():
connectionId=connectionId, cancelled = True
mandateId=mandateId, break
userId=userId,
folderPath="/", # DriveAdapter.browse maps "" / "/" -> "root"
depth=0,
limits=limits,
result=result,
progressCb=progressCb,
)
except Exception as exc:
logger.error("gdrive walk failed for %s: %s", connectionId, exc, exc_info=True)
result.errors.append(f"walk: {exc}")
return _finalizeResult(connectionId, result, startMs) dsPath = ds.get("path", "/")
dsId = ds.get("id", "")
dsNeutralize = ds.get("neutralize", False)
dsMaxAgeDays = ds.get("maxAgeDays", limits.maxAgeDays)
dsLimits = GdriveBootstrapLimits(
maxItems=limits.maxItems,
maxBytes=limits.maxBytes,
maxFileSize=limits.maxFileSize,
skipMimePrefixes=limits.skipMimePrefixes,
maxDepth=limits.maxDepth,
maxAgeDays=dsMaxAgeDays,
neutralize=dsNeutralize,
)
try:
await _walkFolder(
adapter=adapter,
knowledgeService=knowledgeService,
runExtractionFn=runExtractionFn,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
folderPath=dsPath,
depth=0,
limits=dsLimits,
result=result,
progressCb=progressCb,
dataSourceId=dsId,
)
except Exception as exc:
logger.error("gdrive walk failed for ds %s path %s: %s", dsId, dsPath, exc, exc_info=True)
result.errors.append(f"walk({dsPath}): {exc}")
finalResult = _finalizeResult(connectionId, result, startMs)
if cancelled:
finalResult["cancelled"] = True
return finalResult
async def _resolveDependencies(connectionId: str): async def _resolveDependencies(connectionId: str):
@ -220,10 +242,13 @@ async def _walkFolder(
depth: int, depth: int,
limits: GdriveBootstrapLimits, limits: GdriveBootstrapLimits,
result: GdriveBootstrapResult, result: GdriveBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]], progressCb: Optional[Any],
dataSourceId: str = "",
) -> None: ) -> None:
if depth > limits.maxDepth: if depth > limits.maxDepth:
return return
if progressCb and hasattr(progressCb, "isCancelled") and progressCb.isCancelled():
return
try: try:
entries = await adapter.browse(folderPath) entries = await adapter.browse(folderPath)
except Exception as exc: except Exception as exc:
@ -236,6 +261,8 @@ async def _walkFolder(
return return
if result.bytesProcessed >= limits.maxBytes: if result.bytesProcessed >= limits.maxBytes:
return return
if progressCb and hasattr(progressCb, "isCancelled") and (result.indexed + result.skippedDuplicate) % 50 == 0 and progressCb.isCancelled():
return
entryPath = getattr(entry, "path", "") or "" entryPath = getattr(entry, "path", "") or ""
metadata = getattr(entry, "metadata", {}) or {} metadata = getattr(entry, "metadata", {}) or {}
@ -254,6 +281,7 @@ async def _walkFolder(
limits=limits, limits=limits,
result=result, result=result,
progressCb=progressCb, progressCb=progressCb,
dataSourceId=dataSourceId,
) )
continue continue
@ -288,6 +316,7 @@ async def _walkFolder(
limits=limits, limits=limits,
result=result, result=result,
progressCb=progressCb, progressCb=progressCb,
dataSourceId=dataSourceId,
) )
@ -306,7 +335,8 @@ async def _ingestOne(
revision: Optional[str], revision: Optional[str],
limits: GdriveBootstrapLimits, limits: GdriveBootstrapLimits,
result: GdriveBootstrapResult, result: GdriveBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]], progressCb: Optional[Any],
dataSourceId: str = "",
) -> None: ) -> None:
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
@ -321,14 +351,13 @@ async def _ingestOne(
result.errors.append(f"download({entryPath}): {exc}") result.errors.append(f"download({entryPath}): {exc}")
return return
# Adapter.download returns raw bytes today; guard DownloadResult shape too.
fileBytes: bytes fileBytes: bytes
if isinstance(downloaded, (bytes, bytearray)): if isinstance(downloaded, (bytes, bytearray)):
fileBytes = bytes(downloaded) fileBytes = bytes(downloaded)
else: else:
fileBytes = bytes(getattr(downloaded, "data", b"") or b"") fileBytes = bytes(getattr(downloaded, "data", b"") or b"")
if getattr(downloaded, "mimeType", None): if getattr(downloaded, "mimeType", None):
mimeType = downloaded.mimeType # export may have changed the type mimeType = downloaded.mimeType
if not fileBytes: if not fileBytes:
result.failed += 1 result.failed += 1
return return
@ -354,6 +383,15 @@ async def _ingestOne(
result.skippedPolicy += 1 result.skippedPolicy += 1
return return
provenance: Dict[str, Any] = {
"connectionId": connectionId,
"dataSourceId": dataSourceId,
"authority": "google",
"service": "drive",
"externalItemId": externalItemId,
"entryPath": entryPath,
"tier": "body",
}
try: try:
handle = await knowledgeService.requestIngestion( handle = await knowledgeService.requestIngestion(
IngestionJob( IngestionJob(
@ -366,14 +404,7 @@ async def _ingestOne(
contentObjects=contentObjects, contentObjects=contentObjects,
contentVersion=revision, contentVersion=revision,
neutralize=limits.neutralize, neutralize=limits.neutralize,
provenance={ provenance=provenance,
"connectionId": connectionId,
"authority": "google",
"service": "drive",
"externalItemId": externalItemId,
"entryPath": entryPath,
"tier": "body",
},
) )
) )
except Exception as exc: except Exception as exc:
@ -388,6 +419,8 @@ async def _ingestOne(
result.indexed += 1 result.indexed += 1
else: else:
result.failed += 1 result.failed += 1
if handle.error:
result.errors.append(f"ingest({entryPath}): {handle.error}")
if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0: if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0:
processed = result.indexed + result.skippedDuplicate processed = result.indexed + result.skippedDuplicate
@ -411,6 +444,8 @@ async def _ingestOne(
}, },
) )
await asyncio.sleep(0)
def _finalizeResult(connectionId: str, result: GdriveBootstrapResult, startMs: float) -> Dict[str, Any]: def _finalizeResult(connectionId: str, result: GdriveBootstrapResult, startMs: float) -> Dict[str, Any]:
durationMs = int((time.time() - startMs) * 1000) durationMs = int((time.time() - startMs) * 1000)

View file

@ -175,35 +175,36 @@ def _buildContentObjects(
async def bootstrapGmail( async def bootstrapGmail(
connectionId: str, connectionId: str,
*, *,
progressCb: Optional[Callable[[int, Optional[str]], None]] = None, dataSources: Optional[List[Dict[str, Any]]] = None,
progressCb: Optional[Any] = None,
adapter: Any = None, adapter: Any = None,
connection: Any = None, connection: Any = None,
knowledgeService: Any = None, knowledgeService: Any = None,
limits: Optional[GmailBootstrapLimits] = None, limits: Optional[GmailBootstrapLimits] = None,
googleGetFn: Optional[Callable[..., Any]] = None, googleGetFn: Optional[Callable[..., Any]] = None,
) -> Dict[str, Any]: ) -> Dict[str, Any]:
"""Enumerate Gmail labels (INBOX + SENT default) and ingest messages.""" """Enumerate Gmail labels (INBOX + SENT default) and ingest messages.
from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs
prefs = loadConnectionPrefs(connectionId) Iterates only over explicitly provided dataSources (ragIndexEnabled=true).
Each DataSource defines the neutralize policy for its scope.
"""
if not dataSources:
return {"connectionId": connectionId, "skipped": True, "reason": "no_datasources"}
if not limits: if not limits:
limits = GmailBootstrapLimits( limits = GmailBootstrapLimits()
includeAttachments=prefs.mailIndexAttachments,
maxAgeDays=prefs.maxAgeDays if prefs.maxAgeDays > 0 else None,
mailContentDepth=prefs.mailContentDepth,
neutralize=prefs.neutralizeBeforeEmbed,
)
startMs = time.time() startMs = time.time()
result = GmailBootstrapResult(connectionId=connectionId) result = GmailBootstrapResult(connectionId=connectionId)
logger.info( logger.info(
"ingestion.connection.bootstrap.started part=gmail connectionId=%s", "ingestion.connection.bootstrap.started part=gmail connectionId=%s dataSources=%d",
connectionId, connectionId, len(dataSources),
extra={ extra={
"event": "ingestion.connection.bootstrap.started", "event": "ingestion.connection.bootstrap.started",
"part": "gmail", "part": "gmail",
"connectionId": connectionId, "connectionId": connectionId,
"dataSourceCount": len(dataSources),
}, },
) )
@ -221,26 +222,51 @@ async def bootstrapGmail(
mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else "" mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else ""
userId = str(getattr(connection, "userId", "") or "") if connection is not None else "" userId = str(getattr(connection, "userId", "") or "") if connection is not None else ""
for labelId in limits.labels: cancelled = False
for ds in dataSources:
if result.indexed + result.skippedDuplicate >= limits.maxMessages: if result.indexed + result.skippedDuplicate >= limits.maxMessages:
break break
try: if progressCb and hasattr(progressCb, "isCancelled") and progressCb.isCancelled():
await _ingestLabel( cancelled = True
googleGetFn=googleGetFn, break
knowledgeService=knowledgeService,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
labelId=labelId,
limits=limits,
result=result,
progressCb=progressCb,
)
except Exception as exc:
logger.error("gmail ingestion label %s failed: %s", labelId, exc, exc_info=True)
result.errors.append(f"label({labelId}): {exc}")
return _finalizeResult(connectionId, result, startMs) dsId = ds.get("id", "")
dsNeutralize = ds.get("neutralize", False)
dsLimits = GmailBootstrapLimits(
maxMessages=limits.maxMessages,
labels=limits.labels,
maxBodyChars=limits.maxBodyChars,
includeAttachments=limits.includeAttachments,
maxAttachmentBytes=limits.maxAttachmentBytes,
maxAgeDays=limits.maxAgeDays,
mailContentDepth=limits.mailContentDepth,
neutralize=dsNeutralize,
)
for labelId in dsLimits.labels:
if result.indexed + result.skippedDuplicate >= dsLimits.maxMessages:
break
try:
await _ingestLabel(
googleGetFn=googleGetFn,
knowledgeService=knowledgeService,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
labelId=labelId,
limits=dsLimits,
result=result,
progressCb=progressCb,
dataSourceId=dsId,
)
except Exception as exc:
logger.error("gmail ingestion label %s failed: %s", labelId, exc, exc_info=True)
result.errors.append(f"label({labelId}): {exc}")
finalResult = _finalizeResult(connectionId, result, startMs)
if cancelled:
finalResult["cancelled"] = True
return finalResult
async def _resolveDependencies(connectionId: str): async def _resolveDependencies(connectionId: str):
@ -282,7 +308,8 @@ async def _ingestLabel(
labelId: str, labelId: str,
limits: GmailBootstrapLimits, limits: GmailBootstrapLimits,
result: GmailBootstrapResult, result: GmailBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]], progressCb: Optional[Any],
dataSourceId: str = "",
) -> None: ) -> None:
remaining = limits.maxMessages - (result.indexed + result.skippedDuplicate) remaining = limits.maxMessages - (result.indexed + result.skippedDuplicate)
if remaining <= 0: if remaining <= 0:
@ -316,6 +343,8 @@ async def _ingestLabel(
for stub in messageStubs: for stub in messageStubs:
if result.indexed + result.skippedDuplicate >= limits.maxMessages: if result.indexed + result.skippedDuplicate >= limits.maxMessages:
break break
if progressCb and hasattr(progressCb, "isCancelled") and (result.indexed + result.skippedDuplicate) % 50 == 0 and progressCb.isCancelled():
return
msgId = stub.get("id") msgId = stub.get("id")
if not msgId: if not msgId:
continue continue
@ -337,6 +366,7 @@ async def _ingestLabel(
limits=limits, limits=limits,
result=result, result=result,
progressCb=progressCb, progressCb=progressCb,
dataSourceId=dataSourceId,
) )
nextPageToken = page.get("nextPageToken") nextPageToken = page.get("nextPageToken")
@ -355,7 +385,8 @@ async def _ingestMessage(
message: Dict[str, Any], message: Dict[str, Any],
limits: GmailBootstrapLimits, limits: GmailBootstrapLimits,
result: GmailBootstrapResult, result: GmailBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]], progressCb: Optional[Any],
dataSourceId: str = "",
) -> None: ) -> None:
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
@ -386,6 +417,7 @@ async def _ingestMessage(
neutralize=limits.neutralize, neutralize=limits.neutralize,
provenance={ provenance={
"connectionId": connectionId, "connectionId": connectionId,
"dataSourceId": dataSourceId,
"authority": "google", "authority": "google",
"service": "gmail", "service": "gmail",
"externalItemId": messageId, "externalItemId": messageId,
@ -420,6 +452,7 @@ async def _ingestMessage(
parentSyntheticId=syntheticId, parentSyntheticId=syntheticId,
limits=limits, limits=limits,
result=result, result=result,
dataSourceId=dataSourceId,
) )
except Exception as exc: except Exception as exc:
logger.warning("gmail attachments %s failed: %s", messageId, exc) logger.warning("gmail attachments %s failed: %s", messageId, exc)
@ -461,6 +494,7 @@ async def _ingestAttachments(
parentSyntheticId: str, parentSyntheticId: str,
limits: GmailBootstrapLimits, limits: GmailBootstrapLimits,
result: GmailBootstrapResult, result: GmailBootstrapResult,
dataSourceId: str = "",
) -> None: ) -> None:
"""Child ingestion jobs for file attachments. Skips inline images (cid: refs).""" """Child ingestion jobs for file attachments. Skips inline images (cid: refs)."""
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
@ -561,6 +595,7 @@ async def _ingestAttachments(
contentObjects=contentObjects, contentObjects=contentObjects,
provenance={ provenance={
"connectionId": connectionId, "connectionId": connectionId,
"dataSourceId": dataSourceId,
"authority": "google", "authority": "google",
"service": "gmail", "service": "gmail",
"parentId": parentSyntheticId, "parentId": parentSyntheticId,

View file

@ -18,7 +18,7 @@ import hashlib
import logging import logging
import time import time
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional from typing import Any, Dict, List, Optional
from modules.serviceCenter.services.serviceKnowledge.subTextClean import cleanEmailBody from modules.serviceCenter.services.serviceKnowledge.subTextClean import cleanEmailBody
@ -139,34 +139,35 @@ def _buildContentObjects(
async def bootstrapOutlook( async def bootstrapOutlook(
connectionId: str, connectionId: str,
*, *,
progressCb: Optional[Callable[[int, Optional[str]], None]] = None, dataSources: Optional[List[Dict[str, Any]]] = None,
progressCb: Optional[Any] = None,
adapter: Any = None, adapter: Any = None,
connection: Any = None, connection: Any = None,
knowledgeService: Any = None, knowledgeService: Any = None,
limits: Optional[OutlookBootstrapLimits] = None, limits: Optional[OutlookBootstrapLimits] = None,
) -> Dict[str, Any]: ) -> Dict[str, Any]:
"""Enumerate Outlook folders (inbox + sent by default) and ingest messages.""" """Enumerate Outlook folders (inbox + sent by default) and ingest messages.
from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs
prefs = loadConnectionPrefs(connectionId) Iterates only over explicitly provided dataSources (ragIndexEnabled=true).
Each DataSource defines the neutralize policy for its messages.
"""
if not dataSources:
return {"connectionId": connectionId, "skipped": True, "reason": "no_datasources"}
if not limits: if not limits:
limits = OutlookBootstrapLimits( limits = OutlookBootstrapLimits()
includeAttachments=prefs.mailIndexAttachments,
maxAgeDays=prefs.maxAgeDays if prefs.maxAgeDays > 0 else None,
mailContentDepth=prefs.mailContentDepth,
neutralize=prefs.neutralizeBeforeEmbed,
)
startMs = time.time() startMs = time.time()
result = OutlookBootstrapResult(connectionId=connectionId) result = OutlookBootstrapResult(connectionId=connectionId)
logger.info( logger.info(
"ingestion.connection.bootstrap.started part=outlook connectionId=%s", "ingestion.connection.bootstrap.started part=outlook connectionId=%s dataSources=%d",
connectionId, connectionId, len(dataSources),
extra={ extra={
"event": "ingestion.connection.bootstrap.started", "event": "ingestion.connection.bootstrap.started",
"part": "outlook", "part": "outlook",
"connectionId": connectionId, "connectionId": connectionId,
"dataSourceCount": len(dataSources),
}, },
) )
@ -176,27 +177,52 @@ async def bootstrapOutlook(
mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else "" mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else ""
userId = str(getattr(connection, "userId", "") or "") if connection is not None else "" userId = str(getattr(connection, "userId", "") or "") if connection is not None else ""
folderIds = await _selectFolderIds(adapter, limits) cancelled = False
for folderId in folderIds: for ds in dataSources:
if result.indexed + result.skippedDuplicate >= limits.maxMessages: if result.indexed + result.skippedDuplicate >= limits.maxMessages:
break break
try: if progressCb and hasattr(progressCb, "isCancelled") and progressCb.isCancelled():
await _ingestFolder( cancelled = True
adapter=adapter, break
knowledgeService=knowledgeService,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
folderId=folderId,
limits=limits,
result=result,
progressCb=progressCb,
)
except Exception as exc:
logger.error("outlook ingestion folder %s failed: %s", folderId, exc, exc_info=True)
result.errors.append(f"folder({folderId}): {exc}")
return _finalizeResult(connectionId, result, startMs) dsId = ds.get("id", "")
dsNeutralize = ds.get("neutralize", False)
dsLimits = OutlookBootstrapLimits(
maxMessages=limits.maxMessages,
maxFolders=limits.maxFolders,
maxBodyChars=limits.maxBodyChars,
includeAttachments=limits.includeAttachments,
maxAttachmentBytes=limits.maxAttachmentBytes,
maxAgeDays=limits.maxAgeDays,
mailContentDepth=limits.mailContentDepth,
neutralize=dsNeutralize,
)
folderIds = await _selectFolderIds(adapter, dsLimits)
for folderId in folderIds:
if result.indexed + result.skippedDuplicate >= dsLimits.maxMessages:
break
try:
await _ingestFolder(
adapter=adapter,
knowledgeService=knowledgeService,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
folderId=folderId,
limits=dsLimits,
result=result,
progressCb=progressCb,
dataSourceId=dsId,
)
except Exception as exc:
logger.error("outlook ingestion folder %s failed: %s", folderId, exc, exc_info=True)
result.errors.append(f"folder({folderId}): {exc}")
finalResult = _finalizeResult(connectionId, result, startMs)
if cancelled:
finalResult["cancelled"] = True
return finalResult
async def _resolveDependencies(connectionId: str): async def _resolveDependencies(connectionId: str):
@ -266,8 +292,12 @@ async def _ingestFolder(
folderId: str, folderId: str,
limits: OutlookBootstrapLimits, limits: OutlookBootstrapLimits,
result: OutlookBootstrapResult, result: OutlookBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]], progressCb: Optional[Any],
dataSourceId: str = "",
) -> None: ) -> None:
if progressCb and hasattr(progressCb, "isCancelled") and progressCb.isCancelled():
return
remaining = limits.maxMessages - (result.indexed + result.skippedDuplicate) remaining = limits.maxMessages - (result.indexed + result.skippedDuplicate)
if remaining <= 0: if remaining <= 0:
return return
@ -307,6 +337,8 @@ async def _ingestFolder(
for message in page.get("value", []) or []: for message in page.get("value", []) or []:
if result.indexed + result.skippedDuplicate >= limits.maxMessages: if result.indexed + result.skippedDuplicate >= limits.maxMessages:
break break
if progressCb and hasattr(progressCb, "isCancelled") and (result.indexed + result.skippedDuplicate) % 50 == 0 and progressCb.isCancelled():
return
await _ingestMessage( await _ingestMessage(
adapter=adapter, adapter=adapter,
knowledgeService=knowledgeService, knowledgeService=knowledgeService,
@ -317,6 +349,7 @@ async def _ingestFolder(
limits=limits, limits=limits,
result=result, result=result,
progressCb=progressCb, progressCb=progressCb,
dataSourceId=dataSourceId,
) )
nextLink = page.get("@odata.nextLink") nextLink = page.get("@odata.nextLink")
@ -338,7 +371,8 @@ async def _ingestMessage(
message: Dict[str, Any], message: Dict[str, Any],
limits: OutlookBootstrapLimits, limits: OutlookBootstrapLimits,
result: OutlookBootstrapResult, result: OutlookBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]], progressCb: Optional[Any],
dataSourceId: str = "",
) -> None: ) -> None:
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
@ -369,6 +403,7 @@ async def _ingestMessage(
neutralize=limits.neutralize, neutralize=limits.neutralize,
provenance={ provenance={
"connectionId": connectionId, "connectionId": connectionId,
"dataSourceId": dataSourceId,
"authority": "msft", "authority": "msft",
"service": "outlook", "service": "outlook",
"externalItemId": messageId, "externalItemId": messageId,
@ -402,6 +437,7 @@ async def _ingestMessage(
parentSyntheticId=syntheticId, parentSyntheticId=syntheticId,
limits=limits, limits=limits,
result=result, result=result,
dataSourceId=dataSourceId,
) )
except Exception as exc: except Exception as exc:
logger.warning("outlook attachments %s failed: %s", messageId, exc) logger.warning("outlook attachments %s failed: %s", messageId, exc)
@ -443,6 +479,7 @@ async def _ingestAttachments(
parentSyntheticId: str, parentSyntheticId: str,
limits: OutlookBootstrapLimits, limits: OutlookBootstrapLimits,
result: OutlookBootstrapResult, result: OutlookBootstrapResult,
dataSourceId: str = "",
) -> None: ) -> None:
"""Child ingestion jobs for file attachments (skip inline & oversized).""" """Child ingestion jobs for file attachments (skip inline & oversized)."""
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
@ -531,6 +568,7 @@ async def _ingestAttachments(
neutralize=limits.neutralize, neutralize=limits.neutralize,
provenance={ provenance={
"connectionId": connectionId, "connectionId": connectionId,
"dataSourceId": dataSourceId,
"authority": "msft", "authority": "msft",
"service": "outlook", "service": "outlook",
"parentId": parentSyntheticId, "parentId": parentSyntheticId,

View file

@ -94,35 +94,36 @@ def _toContentObjects(extracted, fileName: str) -> List[Dict[str, Any]]:
async def bootstrapSharepoint( async def bootstrapSharepoint(
connectionId: str, connectionId: str,
*, *,
progressCb: Optional[Callable[[int, Optional[str]], None]] = None, dataSources: Optional[List[Dict[str, Any]]] = None,
progressCb: Optional[Any] = None,
adapter: Any = None, adapter: Any = None,
connection: Any = None, connection: Any = None,
knowledgeService: Any = None, knowledgeService: Any = None,
limits: Optional[SharepointBootstrapLimits] = None, limits: Optional[SharepointBootstrapLimits] = None,
runExtractionFn: Optional[Callable[..., Any]] = None, runExtractionFn: Optional[Callable[..., Any]] = None,
) -> Dict[str, Any]: ) -> Dict[str, Any]:
"""Enumerate SharePoint drives and ingest every reachable file via the façade. """Enumerate SharePoint drives and ingest files via the facade.
Parameters allow injection for tests; production callers pass only Iterates only over explicitly provided dataSources (ragIndexEnabled=true).
`connectionId` (and optionally a progressCb) and everything else is Each DataSource defines the root path + neutralize policy for its subtree.
resolved against the registered services.
""" """
from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs if not dataSources:
prefs = loadConnectionPrefs(connectionId) return {"connectionId": connectionId, "skipped": True, "reason": "no_datasources"}
if not limits: if not limits:
limits = SharepointBootstrapLimits(neutralize=prefs.neutralizeBeforeEmbed) limits = SharepointBootstrapLimits()
startMs = time.time() startMs = time.time()
result = SharepointBootstrapResult(connectionId=connectionId) result = SharepointBootstrapResult(connectionId=connectionId)
logger.info( logger.info(
"ingestion.connection.bootstrap.started part=sharepoint connectionId=%s", "ingestion.connection.bootstrap.started part=sharepoint connectionId=%s dataSources=%d",
connectionId, connectionId, len(dataSources),
extra={ extra={
"event": "ingestion.connection.bootstrap.started", "event": "ingestion.connection.bootstrap.started",
"part": "sharepoint", "part": "sharepoint",
"connectionId": connectionId, "connectionId": connectionId,
"dataSourceCount": len(dataSources),
}, },
) )
@ -142,17 +143,27 @@ async def bootstrapSharepoint(
mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else "" mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else ""
userId = str(getattr(connection, "userId", "") or "") if connection is not None else "" userId = str(getattr(connection, "userId", "") or "") if connection is not None else ""
try: cancelled = False
sites = await adapter.browse("/", limit=limits.maxSites) for ds in dataSources:
except Exception as exc:
logger.error("sharepoint site discovery failed for %s: %s", connectionId, exc, exc_info=True)
result.errors.append(f"site_discovery: {exc}")
return _finalizeResult(connectionId, result, startMs)
for site in sites[: limits.maxSites]:
if result.indexed + result.skippedDuplicate >= limits.maxItems: if result.indexed + result.skippedDuplicate >= limits.maxItems:
break break
sitePath = getattr(site, "path", "") or "" if progressCb and hasattr(progressCb, "isCancelled") and progressCb.isCancelled():
cancelled = True
break
dsPath = ds.get("path", "")
dsId = ds.get("id", "")
dsNeutralize = ds.get("neutralize", False)
dsLimits = SharepointBootstrapLimits(
maxItems=limits.maxItems,
maxBytes=limits.maxBytes,
maxFileSize=limits.maxFileSize,
skipMimePrefixes=limits.skipMimePrefixes,
maxDepth=limits.maxDepth,
maxSites=limits.maxSites,
neutralize=dsNeutralize,
)
try: try:
await _walkFolder( await _walkFolder(
adapter=adapter, adapter=adapter,
@ -161,17 +172,21 @@ async def bootstrapSharepoint(
connectionId=connectionId, connectionId=connectionId,
mandateId=mandateId, mandateId=mandateId,
userId=userId, userId=userId,
folderPath=sitePath, folderPath=dsPath,
depth=0, depth=0,
limits=limits, limits=dsLimits,
result=result, result=result,
progressCb=progressCb, progressCb=progressCb,
dataSourceId=dsId,
) )
except Exception as exc: except Exception as exc:
logger.error("sharepoint walk failed for site %s: %s", sitePath, exc, exc_info=True) logger.error("sharepoint walk failed for ds %s path %s: %s", dsId, dsPath, exc, exc_info=True)
result.errors.append(f"walk({sitePath}): {exc}") result.errors.append(f"walk({dsPath}): {exc}")
return _finalizeResult(connectionId, result, startMs) finalResult = _finalizeResult(connectionId, result, startMs)
if cancelled:
finalResult["cancelled"] = True
return finalResult
async def _resolveDependencies(connectionId: str): async def _resolveDependencies(connectionId: str):
@ -221,10 +236,13 @@ async def _walkFolder(
depth: int, depth: int,
limits: SharepointBootstrapLimits, limits: SharepointBootstrapLimits,
result: SharepointBootstrapResult, result: SharepointBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]], progressCb: Optional[Any],
dataSourceId: str = "",
) -> None: ) -> None:
if depth > limits.maxDepth: if depth > limits.maxDepth:
return return
if progressCb and hasattr(progressCb, "isCancelled") and progressCb.isCancelled():
return
try: try:
entries = await adapter.browse(folderPath) entries = await adapter.browse(folderPath)
except Exception as exc: except Exception as exc:
@ -237,6 +255,8 @@ async def _walkFolder(
return return
if result.bytesProcessed >= limits.maxBytes: if result.bytesProcessed >= limits.maxBytes:
return return
if progressCb and hasattr(progressCb, "isCancelled") and (result.indexed + result.skippedDuplicate) % 50 == 0 and progressCb.isCancelled():
return
entryPath = getattr(entry, "path", "") or "" entryPath = getattr(entry, "path", "") or ""
if getattr(entry, "isFolder", False): if getattr(entry, "isFolder", False):
@ -252,6 +272,7 @@ async def _walkFolder(
limits=limits, limits=limits,
result=result, result=result,
progressCb=progressCb, progressCb=progressCb,
dataSourceId=dataSourceId,
) )
continue continue
@ -283,6 +304,7 @@ async def _walkFolder(
limits=limits, limits=limits,
result=result, result=result,
progressCb=progressCb, progressCb=progressCb,
dataSourceId=dataSourceId,
) )
@ -301,7 +323,8 @@ async def _ingestOne(
revision: Optional[str], revision: Optional[str],
limits: SharepointBootstrapLimits, limits: SharepointBootstrapLimits,
result: SharepointBootstrapResult, result: SharepointBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]], progressCb: Optional[Any],
dataSourceId: str = "",
) -> None: ) -> None:
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
@ -339,6 +362,7 @@ async def _ingestOne(
provenance: Dict[str, Any] = { provenance: Dict[str, Any] = {
"connectionId": connectionId, "connectionId": connectionId,
"dataSourceId": dataSourceId,
"authority": "msft", "authority": "msft",
"service": "sharepoint", "service": "sharepoint",
"externalItemId": externalItemId, "externalItemId": externalItemId,

View file

@ -0,0 +1,78 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Resolve effective policies (neutralize, ragIndexEnabled) for DataSource tree hierarchies.
Tree-inheritance rule: nearest ancestor DataSource with an explicit value wins.
If no ancestor has a value, the default (False) is used.
"""
from __future__ import annotations
import logging
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
def resolveEffectiveNeutralize(
    ds: Dict[str, Any],
    allDataSources: List[Dict[str, Any]],
) -> bool:
    """Compute the effective neutralize policy for *ds* by walking up the path tree.

    An explicit value on the DataSource itself always wins; otherwise the
    nearest ancestor DataSource (longest path prefix) with an explicit value
    decides. A DataSource at /sites/HR/Documents inherits from /sites/HR if
    that ancestor has neutralize=True and the child has no explicit override.

    Args:
        ds: The DataSource dict whose policy is being resolved.
        allDataSources: All DataSource dicts that may contain ancestors.

    Returns:
        True when content from this DataSource must be neutralized before
        embedding, False otherwise (default when nothing is inherited).
    """
    ownValue = ds.get("neutralize")
    # Bug fix: the previous check (`is not None and is not False`) treated any
    # falsy-but-not-False value (0, "", []) as an explicit True. Coerce
    # explicit values to bool instead, mirroring the sibling resolver.
    if ownValue is not None:
        return bool(ownValue)
    return _findAncestorPolicy(ds, allDataSources, "neutralize")
def resolveEffectiveRagIndexEnabled(
    ds: Dict[str, Any],
    allDataSources: List[Dict[str, Any]],
) -> bool:
    """Compute the effective ragIndexEnabled policy by walking up the path tree.

    An explicit boolean on the DataSource itself short-circuits the lookup;
    any other value (typically None) defers to the nearest ancestor that
    carries an explicit value.
    """
    explicit = ds.get("ragIndexEnabled")
    # Only the exact booleans True/False count as an explicit override.
    if isinstance(explicit, bool):
        return explicit
    return _findAncestorPolicy(ds, allDataSources, "ragIndexEnabled")
def _findAncestorPolicy(
ds: Dict[str, Any],
allDataSources: List[Dict[str, Any]],
field: str,
) -> bool:
"""Walk ancestors (longest-prefix match) to find an inherited policy value."""
dsPath = ds.get("path", "")
connectionId = ds.get("connectionId", "")
if not dsPath:
return False
ancestors = []
for candidate in allDataSources:
if candidate.get("id") == ds.get("id"):
continue
if candidate.get("connectionId") != connectionId:
continue
candidatePath = candidate.get("path", "")
if not candidatePath:
continue
if dsPath.startswith(candidatePath) and len(candidatePath) < len(dsPath):
ancestors.append(candidate)
ancestors.sort(key=lambda a: len(a.get("path", "")), reverse=True)
for ancestor in ancestors:
val = ancestor.get(field)
if val is True:
return True
if val is False:
return False
return False

View file

@ -98,7 +98,8 @@ class WebService:
searchUrls = [] searchUrls = []
searchResultsWithContent = [] searchResultsWithContent = []
if needsSearch and (not allUrls or len(allUrls) < maxNumberPages): if needsSearch and (not allUrls or len(allUrls) < maxNumberPages):
self._get_service("chat").progressLogUpdate(operationId, 0.3, "Searching for URLs and content") if operationId:
self._get_service("chat").progressLogUpdate(operationId, 0.3, "Searching for URLs and content")
try: try:
searchUrls, searchResultsWithContent = await self._performWebSearch( searchUrls, searchResultsWithContent = await self._performWebSearch(
@ -113,16 +114,14 @@ class WebService:
searchUrls = [] searchUrls = []
searchResultsWithContent = [] searchResultsWithContent = []
# Prioritize Tavily search URLs over AI-extracted URLs (they're more relevant)
if searchUrls: if searchUrls:
# Prepend Tavily URLs to the list (they're more relevant)
allUrls = searchUrls + allUrls allUrls = searchUrls + allUrls
logger.info(f"Using {len(searchUrls)} Tavily URLs + {len(allUrls) - len(searchUrls)} other URLs = {len(allUrls)} total") logger.info(f"Using {len(searchUrls)} Tavily URLs + {len(allUrls) - len(searchUrls)} other URLs = {len(allUrls)} total")
else: else:
# If Tavily search failed, use AI-extracted URLs
logger.warning("Tavily search returned no URLs, using AI-extracted URLs only") logger.warning("Tavily search returned no URLs, using AI-extracted URLs only")
self._get_service("chat").progressLogUpdate(operationId, 0.5, f"Found {len(allUrls)} total URLs") if operationId:
self._get_service("chat").progressLogUpdate(operationId, 0.5, f"Found {len(allUrls)} total URLs")
# If we have search results (even without content), use them directly instead of crawling # If we have search results (even without content), use them directly instead of crawling
# Tavily search results are more relevant than generic AI-extracted URLs # Tavily search results are more relevant than generic AI-extracted URLs

View file

@ -144,6 +144,14 @@ NAVIGATION_SECTIONS = [
"path": "/automations", "path": "/automations",
"order": 30, "order": 30,
}, },
{
"id": "rag-inventory",
"objectKey": "ui.system.ragInventory",
"label": t("RAG-Inventar"),
"icon": "FaDatabase",
"path": "/rag-inventory",
"order": 35,
},
{ {
"id": "store", "id": "store",
"objectKey": "ui.system.store", "objectKey": "ui.system.store",

View file

@ -110,6 +110,9 @@ asyncpg==0.30.0
## Stripe payments ## Stripe payments
stripe>=11.0.0 stripe>=11.0.0
## Outlook MSG file extraction
extract-msg>=0.55.0
## Geospatial libraries for STAC connector ## Geospatial libraries for STAC connector
pyproj>=3.6.0 # For coordinate transformations (EPSG:2056 <-> EPSG:4326) pyproj>=3.6.0 # For coordinate transformations (EPSG:2056 <-> EPSG:4326)
shapely>=2.0.0 # For geometric operations (intersections, area calculations) shapely>=2.0.0 # For geometric operations (intersections, area calculations)

View file

@ -0,0 +1,88 @@
#!/usr/bin/env python3
"""Migration: Rename DataSource.autoSync -> ragIndexEnabled, lastSynced -> lastIndexed.
This is a one-off migration for the RAG consent & control unification.
Safe to run multiple times (checks column existence before acting).
Usage:
python script_db_migrate_datasource_rag.py [--dry-run]
"""
import os
import sys
import argparse
import logging
from pathlib import Path
# Make the gateway root importable and the CWD, so `modules.*` imports and
# relative config paths resolve when this script is run from scripts/.
scriptPath = Path(__file__).resolve()
gatewayPath = scriptPath.parent.parent
sys.path.insert(0, str(gatewayPath))
os.chdir(str(gatewayPath))
# force=True overrides any logging config pulled in by imported app modules.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", force=True)
logger = logging.getLogger(__name__)
# Imported after the sys.path tweak above so APP_CONFIG resolves.
import psycopg2
from modules.shared.configuration import APP_CONFIG
def _getConnection():
    """Open a psycopg2 connection using the application's DB settings."""
    dbParams = {
        "host": APP_CONFIG.get("DB_HOST", "localhost"),
        "port": int(APP_CONFIG.get("DB_PORT", "5432")),
        "database": APP_CONFIG.get("DB_DATABASE", "poweron_app"),
        "user": APP_CONFIG.get("DB_USER"),
        "password": APP_CONFIG.get("DB_PASSWORD_SECRET"),
    }
    return psycopg2.connect(**dbParams)
def _columnExists(cur, table: str, column: str) -> bool:
cur.execute(
"""SELECT 1 FROM information_schema.columns
WHERE table_schema = 'public' AND table_name = %s AND column_name = %s""",
(table, column),
)
return cur.fetchone() is not None
def migrate(dryRun: bool = False):
    """Rename DataSource.autoSync -> ragIndexEnabled and lastSynced -> lastIndexed.

    Idempotent: each rename is skipped when the target column already exists
    or the source column is missing. With dryRun=True the SQL is only logged
    and the transaction is rolled back instead of committed.

    Args:
        dryRun: When True, log what would run without executing the ALTERs.
    """
    renames = [
        ("DataSource", "autoSync", "ragIndexEnabled"),
        ("DataSource", "lastSynced", "lastIndexed"),
    ]
    conn = _getConnection()
    conn.autocommit = False
    try:
        cur = conn.cursor()
        try:
            executed = []
            for table, oldCol, newCol in renames:
                if _columnExists(cur, table, oldCol) and not _columnExists(cur, table, newCol):
                    sql = f'ALTER TABLE public."{table}" RENAME COLUMN "{oldCol}" TO "{newCol}";'
                    logger.info("EXEC: %s", sql)
                    if not dryRun:
                        cur.execute(sql)
                    executed.append(sql)
                elif _columnExists(cur, table, newCol):
                    logger.info("SKIP: %s.%s already exists (migration already applied)", table, newCol)
                else:
                    # Neither old nor new column present — schema differs.
                    logger.warning("SKIP: %s.%s does not exist (table schema may differ)", table, oldCol)
            if not dryRun and executed:
                conn.commit()
                logger.info("Migration committed (%d statements)", len(executed))
            elif dryRun and executed:
                conn.rollback()
                logger.info("DRY RUN — would execute %d statements", len(executed))
            else:
                logger.info("Nothing to do — schema already up to date")
        finally:
            cur.close()
    except Exception:
        # Bug fix: previously a failing ALTER left the connection open with a
        # dangling transaction. Roll back so a failed run leaves the schema
        # untouched, then re-raise for the caller/CLI to see.
        conn.rollback()
        raise
    finally:
        conn.close()
if __name__ == "__main__":
    # CLI entry point: pass --dry-run to log the SQL without executing it.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--dry-run", action="store_true", help="Print SQL without executing")
    args = parser.parse_args()
    migrate(dryRun=args.dry_run)

View file

@ -0,0 +1,23 @@
# Copyright (c) 2025 Patrick Motsch
"""Unit tests for Google STT helper config (no API calls)."""
from modules.connectors.connectorVoiceGoogle import _buildPrimarySttRecognitionFields
def test_buildPrimaryStt_lightweight_stripsHeavyFeatures():
    """Lightweight mode keeps punctuation but strips the heavy STT features."""
    fields = _buildPrimarySttRecognitionFields(model="latest_short", lightweight=True)
    assert fields["model"] == "latest_short"
    assert fields["max_alternatives"] == 1
    assert fields["enable_automatic_punctuation"] is True
    for key in ("enable_word_time_offsets", "enable_word_confidence", "use_enhanced"):
        assert fields[key] is False
def test_buildPrimaryStt_full_matchesLegacyDefaults():
    """Full (non-lightweight) mode keeps the legacy heavy-feature defaults."""
    fields = _buildPrimarySttRecognitionFields(model="latest_long", lightweight=False)
    assert fields["model"] == "latest_long"
    assert fields["max_alternatives"] == 3
    for key in ("enable_word_time_offsets", "enable_word_confidence", "use_enhanced"):
        assert fields[key] is True

View file

@ -100,6 +100,9 @@ def _adapter(svc):
return SimpleNamespace(_svc=svc) return SimpleNamespace(_svc=svc)
# Minimal dataSources payload for tests that don't exercise policy handling.
_DEFAULT_DS = [{"id": "ds-1", "neutralize": False}]
def test_bootstrap_walks_team_space_lists_and_tasks(): def test_bootstrap_walks_team_space_lists_and_tasks():
svc = _FakeClickupService(taskCount=2) svc = _FakeClickupService(taskCount=2)
knowledge = _FakeKnowledgeService() knowledge = _FakeKnowledgeService()
@ -108,6 +111,7 @@ def test_bootstrap_walks_team_space_lists_and_tasks():
async def _run(): async def _run():
return await bootstrapClickup( return await bootstrapClickup(
connectionId="c1", connectionId="c1",
dataSources=_DEFAULT_DS,
adapter=_adapter(svc), adapter=_adapter(svc),
connection=connection, connection=connection,
knowledgeService=knowledge, knowledgeService=knowledge,
@ -126,10 +130,10 @@ def test_bootstrap_walks_team_space_lists_and_tasks():
assert job.mimeType == "application/vnd.clickup.task+json" assert job.mimeType == "application/vnd.clickup.task+json"
assert job.mandateId == "m1" assert job.mandateId == "m1"
assert job.provenance["connectionId"] == "c1" assert job.provenance["connectionId"] == "c1"
assert job.provenance["dataSourceId"] == "ds-1"
assert job.provenance["authority"] == "clickup" assert job.provenance["authority"] == "clickup"
assert job.provenance["teamId"] == "team-1" assert job.provenance["teamId"] == "team-1"
assert job.contentVersion # numeric millisecond string assert job.contentVersion # numeric millisecond string
# At least the header content-object is present.
ids = [co["contentObjectId"] for co in job.contentObjects] ids = [co["contentObjectId"] for co in job.contentObjects]
assert "header" in ids assert "header" in ids
@ -146,6 +150,7 @@ def test_bootstrap_reports_duplicates_on_second_run():
async def _run(): async def _run():
return await bootstrapClickup( return await bootstrapClickup(
connectionId="c1", connectionId="c1",
dataSources=_DEFAULT_DS,
adapter=_adapter(svc), adapter=_adapter(svc),
connection=connection, connection=connection,
knowledgeService=knowledge, knowledgeService=knowledge,
@ -165,6 +170,7 @@ def test_bootstrap_skips_tasks_older_than_maxAgeDays():
async def _run(): async def _run():
return await bootstrapClickup( return await bootstrapClickup(
connectionId="c1", connectionId="c1",
dataSources=_DEFAULT_DS,
adapter=_adapter(svc), adapter=_adapter(svc),
connection=connection, connection=connection,
knowledgeService=knowledge, knowledgeService=knowledge,
@ -185,6 +191,7 @@ def test_bootstrap_maxTasks_caps_ingestion():
async def _run(): async def _run():
return await bootstrapClickup( return await bootstrapClickup(
connectionId="c1", connectionId="c1",
dataSources=_DEFAULT_DS,
adapter=_adapter(svc), adapter=_adapter(svc),
connection=connection, connection=connection,
knowledgeService=knowledge, knowledgeService=knowledge,
@ -195,9 +202,41 @@ def test_bootstrap_maxTasks_caps_ingestion():
assert result["indexed"] == 3 assert result["indexed"] == 3
def test_bootstrap_skips_when_no_datasources():
    """Without any dataSources the ClickUp bootstrap must bail out early."""
    result = asyncio.run(bootstrapClickup(connectionId="c1"))
    assert result["skipped"] is True
    assert result["reason"] == "no_datasources"
def test_bootstrap_honours_datasource_neutralize():
    """A dataSource with neutralize=True must propagate into every ingestion job."""
    svc = _FakeClickupService(taskCount=1)
    knowledge = _FakeKnowledgeService()
    connection = SimpleNamespace(mandateId="m1", userId="u1")
    coro = bootstrapClickup(
        connectionId="c1",
        dataSources=[{"id": "ds-n", "neutralize": True}],
        adapter=_adapter(svc),
        connection=connection,
        knowledgeService=knowledge,
        limits=ClickupBootstrapLimits(maxAgeDays=None),
    )
    asyncio.run(coro)
    for job in knowledge.calls:
        assert job.neutralize is True
        assert job.provenance["dataSourceId"] == "ds-n"
if __name__ == "__main__": if __name__ == "__main__":
test_bootstrap_walks_team_space_lists_and_tasks() test_bootstrap_walks_team_space_lists_and_tasks()
test_bootstrap_reports_duplicates_on_second_run() test_bootstrap_reports_duplicates_on_second_run()
test_bootstrap_skips_tasks_older_than_maxAgeDays() test_bootstrap_skips_tasks_older_than_maxAgeDays()
test_bootstrap_maxTasks_caps_ingestion() test_bootstrap_maxTasks_caps_ingestion()
test_bootstrap_skips_when_no_datasources()
test_bootstrap_honours_datasource_neutralize()
print("OK — bootstrapClickup tests passed") print("OK — bootstrapClickup tests passed")

View file

@ -119,6 +119,9 @@ def _fakeRunExtraction(data, name, mime, options):
) )
# Root-path dataSource shared by tests that don't exercise policy handling.
_DEFAULT_DS = [{"id": "ds1", "path": "/", "neutralize": False}]
def test_bootstrap_walks_drive_and_subfolders(): def test_bootstrap_walks_drive_and_subfolders():
adapter = _FakeDriveAdapter() adapter = _FakeDriveAdapter()
knowledge = _FakeKnowledgeService() knowledge = _FakeKnowledgeService()
@ -127,6 +130,7 @@ def test_bootstrap_walks_drive_and_subfolders():
async def _run(): async def _run():
return await bootstrapGdrive( return await bootstrapGdrive(
connectionId="c1", connectionId="c1",
dataSources=_DEFAULT_DS,
adapter=adapter, adapter=adapter,
connection=connection, connection=connection,
knowledgeService=knowledge, knowledgeService=knowledge,
@ -160,6 +164,7 @@ def test_bootstrap_reports_duplicates_on_second_run():
async def _run(): async def _run():
return await bootstrapGdrive( return await bootstrapGdrive(
connectionId="c1", connectionId="c1",
dataSources=_DEFAULT_DS,
adapter=adapter, adapter=adapter,
connection=connection, connection=connection,
knowledgeService=knowledge, knowledgeService=knowledge,
@ -180,11 +185,11 @@ def test_bootstrap_skips_files_older_than_maxAgeDays():
async def _run(): async def _run():
return await bootstrapGdrive( return await bootstrapGdrive(
connectionId="c1", connectionId="c1",
dataSources=[{"id": "ds1", "path": "/", "neutralize": False, "maxAgeDays": 180}],
adapter=adapter, adapter=adapter,
connection=connection, connection=connection,
knowledgeService=knowledge, knowledgeService=knowledge,
runExtractionFn=_fakeRunExtraction, runExtractionFn=_fakeRunExtraction,
limits=GdriveBootstrapLimits(maxAgeDays=180),
) )
result = asyncio.run(_run()) result = asyncio.run(_run())
@ -200,6 +205,7 @@ def test_bootstrap_passes_connection_provenance():
async def _run(): async def _run():
return await bootstrapGdrive( return await bootstrapGdrive(
connectionId="c1", connectionId="c1",
dataSources=_DEFAULT_DS,
adapter=adapter, adapter=adapter,
connection=connection, connection=connection,
knowledgeService=knowledge, knowledgeService=knowledge,
@ -212,14 +218,25 @@ def test_bootstrap_passes_connection_provenance():
assert job.sourceKind == "gdrive_item" assert job.sourceKind == "gdrive_item"
assert job.mandateId == "m1" assert job.mandateId == "m1"
assert job.provenance["connectionId"] == "c1" assert job.provenance["connectionId"] == "c1"
assert job.provenance["dataSourceId"] == "ds1"
assert job.provenance["authority"] == "google" assert job.provenance["authority"] == "google"
assert job.provenance["service"] == "drive" assert job.provenance["service"] == "drive"
assert job.contentVersion # modifiedTime ISO string assert job.contentVersion # modifiedTime ISO string
def test_bootstrap_skips_when_no_datasources():
    """Without any dataSources the GDrive bootstrap must bail out early."""
    result = asyncio.run(bootstrapGdrive(connectionId="c1"))
    assert result["skipped"] is True
    assert result["reason"] == "no_datasources"
if __name__ == "__main__": if __name__ == "__main__":
test_bootstrap_walks_drive_and_subfolders() test_bootstrap_walks_drive_and_subfolders()
test_bootstrap_reports_duplicates_on_second_run() test_bootstrap_reports_duplicates_on_second_run()
test_bootstrap_skips_files_older_than_maxAgeDays() test_bootstrap_skips_files_older_than_maxAgeDays()
test_bootstrap_passes_connection_provenance() test_bootstrap_passes_connection_provenance()
test_bootstrap_skips_when_no_datasources()
print("OK — bootstrapGdrive tests passed") print("OK — bootstrapGdrive tests passed")

View file

@ -111,6 +111,7 @@ def test_bootstrap_outlook_indexes_messages_from_inbox_and_sent():
async def _run(): async def _run():
return await bootstrapOutlook( return await bootstrapOutlook(
connectionId="c1", connectionId="c1",
dataSources=[{"id": "ds1", "neutralize": False}],
adapter=adapter, adapter=adapter,
connection=connection, connection=connection,
knowledgeService=knowledge, knowledgeService=knowledge,
@ -129,6 +130,7 @@ def test_bootstrap_outlook_indexes_messages_from_inbox_and_sent():
assert job.sourceKind == "outlook_message" assert job.sourceKind == "outlook_message"
assert job.mimeType == "message/rfc822" assert job.mimeType == "message/rfc822"
assert job.provenance["connectionId"] == "c1" assert job.provenance["connectionId"] == "c1"
assert job.provenance["dataSourceId"] == "ds1"
assert job.provenance["service"] == "outlook" assert job.provenance["service"] == "outlook"
assert job.contentVersion == "ck1" assert job.contentVersion == "ck1"
assert any(co["contentObjectId"] == "header" for co in job.contentObjects) assert any(co["contentObjectId"] == "header" for co in job.contentObjects)
@ -146,6 +148,7 @@ def test_bootstrap_outlook_follows_pagination():
async def _run(): async def _run():
return await bootstrapOutlook( return await bootstrapOutlook(
connectionId="c1", connectionId="c1",
dataSources=[{"id": "ds1", "neutralize": False}],
adapter=adapter, adapter=adapter,
connection=connection, connection=connection,
knowledgeService=knowledge, knowledgeService=knowledge,
@ -171,6 +174,7 @@ def test_bootstrap_outlook_reports_duplicates():
async def _run(): async def _run():
return await bootstrapOutlook( return await bootstrapOutlook(
connectionId="c1", connectionId="c1",
dataSources=[{"id": "ds1", "neutralize": False}],
adapter=adapter, adapter=adapter,
connection=connection, connection=connection,
knowledgeService=knowledge, knowledgeService=knowledge,