refactor: replace modules/services/ with serviceCenter + serviceHub

serviceCenter = DI container (resolver, registry, context) for service instantiation
serviceHub = consumer-facing aggregation (DB interfaces, runtime state, lazy service resolution via serviceCenter)

- created modules/serviceHub/: ServiceHub, PublicService, getInterface()
- migrated 22 consumer files (routes, features, tests): switched imports from modules.services to serviceHub or serviceCenter
- resolver.py: removed the legacy fallback to the old services/
- deleted modules/services/ entirely (83 files, incl. dead code mainAiChat.py)
- pre-extraction: progress callback propagated through the chunk pipeline; operationType DATA_EXTRACT->DATA_ANALYSE for a cheaper model
ValueOn AG 2026-03-14 11:51:45 +01:00
parent 6919a23d4f
commit c8b7517209
144 changed files with 1050 additions and 38313 deletions
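
For orientation, a minimal sketch of the new consumer-facing entry point, assuming only the call shapes visible in the migrated files below (context is a placeholder for a request context; the lazy attribute access is an assumption based on the old legacy-hub mapping):

from modules.serviceHub import getInterface as getServices

services = getServices(context.user, None, mandateId=str(context.mandateId))
messaging = services.messaging  # assumption: hub attributes resolve lazily via serviceCenter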

View file

@ -44,7 +44,7 @@ APP_FRONTEND_URL = http://localhost:5176
# AI configuration
Connector_AiOpenai_API_SECRET = DEV_ENC:Z0FBQUFBQnBaSnM4TWFRRmxVQmNQblVIYmc1Y0Q3aW9zZUtDWlNWdGZjbFpncGp2NHN2QjkxMWxibUJnZDBId252MWk5TXN3Yk14ajFIdi1CTkx2ZWx2QzF5OFR6LUx5azQ3dnNLaXJBOHNxc0tlWmtZcTFVelF4eXBSM2JkbHd2eTM0VHNXdHNtVUprZWtPVzctNlJsZHNmM20tU1N6Q1Q2cHFYSi1tNlhZNDNabTVuaEVGWmIydEhadTcyMlBURmw2aUJxOF9GTzR0dTZiNGZfOFlHaVpPZ1A1LXhhOEFtN1J5TEVNNWtMcGpyNkMzSl8xRnZsaTF1WTZrOUZmb0cxVURjSGFLS2dIYTQyZEJtTm90bEYxVWxNNXVPdTVjaVhYbXhxT3JsVDM5VjZMVFZKSE1tZnM9
Connector_AiAnthropic_API_SECRET = DEV_ENC:Z0FBQUFBQm8xSUpENmFBWG16STFQUVZxNzZZRzRLYTA4X3lRanF1VkF4cU45OExNMzlsQmdISGFxTUxud1dXODBKcFhMVG9KNjdWVnlTTFFROVc3NDlsdlNHLUJXeG41NDBHaXhHR0VHVWl5UW9RNkVWbmlhakRKVW5pM0R4VHk0LUw0TV9LdkljNHdBLXJua21NQkl2b3l4UkVkMGN1YjBrMmJEeWtMay1jbmxrYWJNbUV0aktCXzU1djR2d2RSQXZORTNwcG92ZUVvVGMtQzQzTTVncEZTRGRtZUFIZWQ0dz09
Connector_AiPerplexity_API_SECRET = DEV_ENC:Z0FBQUFBQm82Mzk2Q1MwZ0dNcUVBcUtuRDJIcTZkMXVvYnpjM3JEMzJiT1NKSHljX282ZDIyZTJYc09VSTdVNXAtOWU2UXp5S193NTk5dHJsWlFjRjhWektFOG1DVGY4ZUhHTXMzS0RPN1lNcF9nSlVWbW5BZ1hkZDVTejl6bVZNRFVvX29xamJidWRFMmtjQmkyRUQ2RUh6UTN1aWNPSUJBPT0=
Connector_AiPerplexity_API_SECRET = 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
Connector_AiTavily_API_SECRET = DEV_ENC:Z0FBQUFBQm8xSUpEQTdnUHMwd2pIaXNtMmtCTFREd0pyQXRKb1F5eGtHSnkyOGZiUnlBOFc0b3Vzcndrc3ViRm1nMDJIOEZKYWxqdWNkZGh5N0Z4R0JlQmxXSG5pVnJUR2VYckZhMWNMZ1FNeXJ3enJLVlpiblhOZTNleUg3ZzZyUzRZanFSeDlVMkI=
Connector_AiPrivateLlm_API_SECRET = DEV_ENC:Z0FBQUFBQnBudkpGRHM5eFdUVmVZU1R1cHBwN1RlMUx4T0NlLTJLUFFVX3J2OElDWFpuZmJHVmp4Z3BNNWMwZUVVZUd2TFhRSjVmVkVlcFlVRWtybXh0ZHloZ01ZcnVvX195YjdlWVdEcjZSWFFTTlNBWUlaTlNoLWhqVFBIb0thVlBiaWhjYjFQOFY=
Connector_AiMistral_API_SECRET = DEV_ENC:Z0FBQUFBQnBudkpGeEQxYUIxOHhia0JlQWpWQ2dWQWZzY3l6SWwyUnJoR1hRQWloX2lxb2lGNkc4UnA4U2tWNjJaYzB1d1hvNG9fWUp1N3V4OW9FMGhaWVhjSlVwWEc1X2loVDBSZDEtdHdfcTA5QkcxQTR4OHc4RkRzclJrU2d1RFZpNDJkRDRURlE=

View file

@ -44,7 +44,7 @@ APP_FRONTEND_URL = https://nyla-int.poweron-center.net
# AI configuration
Connector_AiOpenai_API_SECRET = INT_ENC:Z0FBQUFBQnBaSnM4MENkQ2xJVmE5WFZKUkh2SHJFby1YVXN3ZmVxRkptS3ZWRmlwdU93ZEJjSjlMV2NGbU5mS3NCdmFfcmFYTEJNZXFIQ3ozTWE4ZC1pemlQNk9wbjU1d3BPS0ZCTTZfOF8yWmVXMWx0TU1DamlJLVFhSTJXclZsY3hMVWlPcXVqQWtMdER4T252NHZUWEhUOTdIN1VGR3ltazEweXFqQ0lvb0hYWmxQQnpxb0JwcFNhRDNGWXdoRTVJWm9FalZpTUF5b1RqZlRaYnVKYkp0NWR5Vko1WWJ0Wmg2VWJzYXZ0Z3Q4UkpsTldDX2dsekhKMmM4YjRoa2RwemMwYVQwM2cyMFlvaU5mOTVTWGlROU8xY2ZVRXlxZzJqWkxURWlGZGI2STZNb0NpdEtWUnM9
Connector_AiAnthropic_API_SECRET = INT_ENC:Z0FBQUFBQm8xSVRjT1ZlRWVJdVZMT3ljSFJDcFdxRFBRVkZhS204NnN5RDBlQ0tpenhTM0FFVktuWW9mWHNwRWx2dHB0eDBSZ0JFQnZKWlp6c01pVGREWHd1eGpERnU0Q2xhaks1clQ1ZXVsdnd2ZzhpNXNQS1BhY3FjSkdkVEhHalNaRGR4emhpakZncnpDQUVxOHVXQzVUWmtQc0FsYmFwTF9TSG5FOUFtWk5Ick1NcHFvY2s1T1c2WXlRUFFJZnh6TWhuaVpMYmppcDR0QUx0a0R6RXlwbGRYb1R4dzJkUT09
Connector_AiPerplexity_API_SECRET = INT_ENC:Z0FBQUFBQm82Mzk2UWZJdUFhSW8yc3RKc0tKRXphd0xWMkZOVlFpSGZ4SGhFWnk0cTF5VjlKQVZjdS1QSWdkS0pUSWw4OFU5MjUxdTVQel9aeWVIZTZ5TXRuVmFkZG0zWEdTOGdHMHpsTzI0TGlWYURKU1Q0VVpKTlhxUk5FTmN6SUJScDZ3ZldIaUJZcWpaQVRiSEpyQm9tRTNDWk9KTnZBPT0=
Connector_AiPerplexity_API_SECRET = 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
Connector_AiTavily_API_SECRET = INT_ENC:Z0FBQUFBQm8xSVRkdkJMTDY0akhXNzZDWHVYSEt1cDZoOWEzSktneHZEV2JndTNmWlNSMV9KbFNIZmQzeVlrNE5qUEIwcUlBSGM1a0hOZ3J6djIyOVhnZzI3M1dIUkdicl9FVXF3RGktMmlEYmhnaHJfWTdGUkktSXVUSGdQMC1vSEV6VE8zR2F1SVk=
Connector_AiPrivateLlm_API_SECRET = INT_ENC:Z0FBQUFBQnBudkpGSjZ1NWh0aWc1R3Z4MHNaeS1HamtUbndhcUZFZDlqUDhjSmg5eHFfdlVkU0RsVkJ2UVRaMWs3aWhraG5jSlc0YkxNWHVmR2JoSW5ENFFCdkJBM0VienlKSnhzNnBKbTJOUTFKczRfWlQ3bWpmUkRTT1I1OGNUSTlQdExacGRpeXg=
Connector_AiMistral_API_SECRET = INT_ENC:Z0FBQUFBQnBudkpGZTNtZ1E4TWIxSEU1OUlreUpxZkJIR0Vxcm9xRHRUbnBxbTQ1cXlkbnltWkJVdTdMYWZ4c3Fsam42TERWUTVhNzZFMU9xVjdyRGFCYml6bmZsZFd2YmJzemlrSWN6Q3o3X0NXX2xXNUQteTNONHdKYzJ5YVpLLWdhU2JhSTJQZnI=

View file

@ -44,7 +44,7 @@ APP_FRONTEND_URL = https://nyla.poweron-center.net
# AI configuration
Connector_AiOpenai_API_SECRET = PROD_ENC:Z0FBQUFBQnBaSnM4TWJOVm4xVkx6azRlNDdxN3UxLUdwY2hhdGYxRGp4VFJqYXZIcmkxM1ZyOWV2M0Z4MHdFNkVYQ0ROb1d6LUZFUEdvMHhLMEtXYVBCRzM5TlYyY3ROYWtJRk41cDZxd0tYYi00MjVqMTh4QVcyTXl0bmVocEFHbXQwREpwNi1vODdBNmwzazE5bkpNelE2WXpvblIzWlQwbGdEelI2WXFqT1RibXVHcjNWbVhwYzBOM25XTzNmTDAwUjRvYk4yNjIyZHc5c2RSZzREQUFCdUwyb0ZuOXN1dzI2c2FKdXI4NGxEbk92czZWamJXU3ZSbUlLejZjRklRRk4tLV9aVUFZekI2bTU4OHYxNTUybDg3RVo0ZTh6dXNKRW5GNXVackZvcm9laGI0X3R6V3M9
Connector_AiAnthropic_API_SECRET = PROD_ENC:Z0FBQUFBQnBDM1Z3TnhYdlhSLW5RbXJyMHFXX0V0bHhuTDlTaFJsRDl2dTdIUTFtVFAwTE8tY3hLbzNSMnVTLXd3RUZualN3MGNzc1kwOTIxVUN2WW1rYi1TendFRVVBSVNqRFVjckEzNExyTGNaUkJLMmozazUwemI1cnhrcEtZVXJrWkdaVFFramp3MWZ6RmY2aGlRMXVEYjM2M3ZlbmxMdnNCRDM1QWR0Wmd6MWVnS1I1c01nV3hRLXg3d2NTZXVfTi1Wdm16UnRyNGsyRTZ0bG9TQ1g1OFB5Z002bmQ3QT09
Connector_AiPerplexity_API_SECRET = PROD_ENC:Z0FBQUFBQm82Mzk2Q1FGRkJEUkI4LXlQbHYzT2RkdVJEcmM4WGdZTWpJTEhoeUF1NW5LUVpJdDBYN3k1WFN4a2FQSWJSQmd0U0xJbzZDTmFFN05FcXl0Z3V1OEpsZjYydV94TXVjVjVXRTRYSWdLMkd5XzZIbFV6emRCZHpuOUpQeThadE5xcDNDVGV1RHJrUEN0c1BBYXctZFNWcFRuVXhRPT0=
Connector_AiPerplexity_API_SECRET = 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
Connector_AiTavily_API_SECRET = PROD_ENC:Z0FBQUFBQnBDM1Z3NmItcDh6V0JpcE5Jc0NlUWZqcmllRHB5eDlNZmVnUlNVenhNTm5xWExzbjJqdE1GZ0hTSUYtb2dvdWNhTnlQNmVWQ2NGVDgwZ0MwMWZBMlNKWEhzdlF3TlZzTXhCZWM4Z1Uwb18tSTRoU1JBVTVkSkJHOTJwX291b3dPaVphVFg=
Connector_AiPrivateLlm_API_SECRET = PROD_ENC:Z0FBQUFBQnBudkpGanZ6U3pzZWkwXzVPWGtIQ040XzFrTXc5QWRnazdEeEktaUJ0akJmNnEzbWUzNHczLTJfc2dIdzBDY0FTaXZYcDhxNFdNbTNtbEJTb2VRZ0ZYd05hdlNLR1h6SUFzVml2Z1FLY1BjTl90UWozUGxtak1URnhhZmNDRWFTb0dKVUo=
Connector_AiMistral_API_SECRET = PROD_ENC:Z0FBQUFBQnBudkpGc2tQc2lvMk1YZk01Q1dob1U5cnR0dG03WWE3WkpoOWo0SEpvLU9Rc2lCNDExdy1wZExaN3lpT2FEQkxnaHRmWmZUUUZUUUJmblZreGlpaFpOdnFhbzlEd1RsVVJtX216cmhxTm5BcTN2eUZ2T054cDE5bmlEamJ3NGR6MVpFQnA=

View file

@ -113,7 +113,7 @@ class AiAnthropic(BaseConnectorAi):
processingMode=ProcessingModeEnum.DETAILED,
operationTypes=createOperationTypeRatings(
(OperationTypeEnum.PLAN, 10),
(OperationTypeEnum.DATA_ANALYSE, 10),
(OperationTypeEnum.DATA_ANALYSE, 8),
(OperationTypeEnum.DATA_GENERATE, 10),
(OperationTypeEnum.DATA_EXTRACT, 9)
),

View file

@ -288,7 +288,16 @@ class AiTavily(BaseConnectorAi):
if maxResults < minResults or maxResults > maxAllowedResults:
raise ValueError(f"maxResults must be between {minResults} and {maxAllowedResults}")
# Perform actual API call
# Tavily enforces a 400-character query limit
TAVILY_MAX_QUERY_LENGTH = 400
if len(query) > TAVILY_MAX_QUERY_LENGTH:
truncated = query[:TAVILY_MAX_QUERY_LENGTH]
lastSpace = truncated.rfind(' ')
if lastSpace > TAVILY_MAX_QUERY_LENGTH // 2:
truncated = truncated[:lastSpace]
logger.warning(f"Tavily query truncated from {len(query)} to {len(truncated)} chars")
query = truncated
# Build kwargs only for provided options to avoid API rejections
kwargs: dict = {"query": query, "max_results": maxResults}
if searchDepth is not None:

View file

@ -123,6 +123,12 @@ class BillingTransaction(BaseModel):
aicoreProvider: Optional[str] = Field(None, description="AICore provider (anthropic, openai, etc.)")
aicoreModel: Optional[str] = Field(None, description="AICore model name (e.g., claude-4-sonnet, gpt-4o)")
createdByUserId: Optional[str] = Field(None, description="User who created/caused this transaction")
# AI call metadata (for per-call analytics)
processingTime: Optional[float] = Field(None, description="Processing time in seconds")
bytesSent: Optional[int] = Field(None, description="Bytes sent to AI model")
bytesReceived: Optional[int] = Field(None, description="Bytes received from AI model")
errorCount: Optional[int] = Field(None, description="Number of errors in this call")
registerModelLabels(

View file

@ -1,6 +1,6 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Chat models: ChatWorkflow, ChatMessage, ChatLog, ChatStat, ChatDocument."""
"""Chat models: ChatWorkflow, ChatMessage, ChatLog, ChatDocument."""
from typing import List, Dict, Any, Optional
from enum import Enum
@ -10,44 +10,6 @@ from modules.shared.timeUtils import getUtcTimestamp
import uuid
class ChatStat(BaseModel):
"""Statistics for chat operations. User-owned, no mandate context."""
model_config = {"populate_by_name": True, "extra": "allow"} # Allow DB system fields
id: str = Field(
default_factory=lambda: str(uuid.uuid4()), description="Primary key"
)
workflowId: Optional[str] = Field(
None, description="Foreign key to workflow (for workflow stats)"
)
processingTime: Optional[float] = Field(
None, description="Processing time in seconds"
)
bytesSent: Optional[int] = Field(None, description="Number of bytes sent")
bytesReceived: Optional[int] = Field(None, description="Number of bytes received")
errorCount: Optional[int] = Field(None, description="Number of errors encountered")
process: Optional[str] = Field(None, description="The process that delivers the stats data (e.g. 'action.outlook.readMails', 'ai.process.document.name')")
engine: Optional[str] = Field(None, description="The engine used (e.g. 'ai.anthropic.35', 'ai.tavily.basic', 'renderer.docx')")
priceCHF: Optional[float] = Field(None, description="Calculated price in CHF for the operation")
registerModelLabels(
"ChatStat",
{"en": "Chat Statistics", "fr": "Statistiques de chat"},
{
"id": {"en": "ID", "fr": "ID"},
"workflowId": {"en": "Workflow ID", "fr": "ID du workflow"},
"processingTime": {"en": "Processing Time", "fr": "Temps de traitement"},
"bytesSent": {"en": "Bytes Sent", "fr": "Octets envoyés"},
"bytesReceived": {"en": "Bytes Received", "fr": "Octets reçus"},
"errorCount": {"en": "Error Count", "fr": "Nombre d'erreurs"},
"process": {"en": "Process", "fr": "Processus"},
"engine": {"en": "Engine", "fr": "Moteur"},
"priceCHF": {"en": "Price CHF", "fr": "Prix CHF"},
},
)
class ChatLog(BaseModel):
"""Log entries for chat workflows. User-owned, no mandate context."""
id: str = Field(
@ -322,7 +284,6 @@ class ChatWorkflow(BaseModel):
startedAt: float = Field(default_factory=getUtcTimestamp, description="When the workflow started (UTC timestamp in seconds)", json_schema_extra={"frontend_type": "timestamp", "frontend_readonly": True, "frontend_required": False})
logs: List[ChatLog] = Field(default_factory=list, description="Workflow logs", json_schema_extra={"frontend_type": "text", "frontend_readonly": True, "frontend_required": False})
messages: List[ChatMessage] = Field(default_factory=list, description="Messages in the workflow", json_schema_extra={"frontend_type": "text", "frontend_readonly": True, "frontend_required": False})
stats: List[ChatStat] = Field(default_factory=list, description="Workflow statistics list", json_schema_extra={"frontend_type": "text", "frontend_readonly": True, "frontend_required": False})
tasks: list = Field(default_factory=list, description="List of tasks in the workflow", json_schema_extra={"frontend_type": "text", "frontend_readonly": True, "frontend_required": False})
workflowMode: WorkflowModeEnum = Field(default=WorkflowModeEnum.WORKFLOW_DYNAMIC, description="Workflow mode selector", json_schema_extra={"frontend_type": "select", "frontend_readonly": False, "frontend_required": False, "frontend_options": [
{

View file

@ -21,6 +21,7 @@ from modules.datamodels.datamodelChat import ChatWorkflow, ChatMessage, ChatLog
from modules.datamodels.datamodelPagination import PaginationParams, PaginatedResponse, PaginationMetadata, normalize_pagination_dict
from modules.shared.attributeUtils import getModelAttributeDefinitions
from modules.interfaces import interfaceDbChat
from modules.interfaces.interfaceDbBilling import getInterface as _getBillingInterface
# Configure logger
logger = logging.getLogger(__name__)
@ -682,7 +683,9 @@ def get_automation_workflow_chat_data(
workflow = chatInterface.getWorkflow(workflowId)
if not workflow:
raise HTTPException(status_code=404, detail=f"Workflow {workflowId} not found")
return chatInterface.getUnifiedChatData(workflowId, afterTimestamp)
billingInterface = _getBillingInterface(context.user, context.mandateId)
workflowCost = billingInterface.getWorkflowCost(workflowId)
return chatInterface.getUnifiedChatData(workflowId, afterTimestamp, workflowCost=workflowCost)
except HTTPException:
raise
except Exception as e:

View file

@ -1291,17 +1291,6 @@ class ChatObjects:
logger.error(f"Error updating message {messageId}: {str(e)}", exc_info=True)
raise ValueError(f"Error updating message {messageId}: {str(e)}")
def createStat(self, statData: Dict[str, Any]):
"""Create stat record. Compatibility with ChatService; stats may not be persisted in chatbot schema."""
from modules.datamodels.datamodelChat import ChatStat
stat = ChatStat(**statData)
try:
created = self.db.recordCreate(ChatStat, statData)
return ChatStat(**created)
except Exception as e:
logger.debug(f"createStat: not persisting (chatbot schema): {e}")
return stat
def deleteMessage(self, conversationId: str, messageId: str) -> bool:
"""Deletes a conversation message and related data if user has access."""
try:

View file

@ -306,12 +306,12 @@ def getChatbotServices(
Uses interfaceFeatureChatbot (ChatObjects) for interfaceDbChat to avoid
duplicate DB init - chatProcess reuses hub.interfaceDbChat.
"""
from modules.services import PublicService
from modules.serviceHub import PublicService
from modules.interfaces.interfaceDbApp import getInterface as getAppInterface
from modules.features.chatbot.interfaceFeatureChatbot import getInterface as getChatbotInterface
from modules.services.serviceChat.mainServiceChat import ChatService
from modules.services.serviceAi.mainServiceAi import AiService
from modules.services.serviceStreaming.mainServiceStreaming import StreamingService
from modules.serviceCenter.services.serviceChat.mainServiceChat import ChatService
from modules.serviceCenter.services.serviceAi.mainServiceAi import AiService
from modules.serviceCenter.core.serviceStreaming.mainServiceStreaming import StreamingService
hub = _ChatbotServiceHub()
hub.user = user

View file

@ -135,11 +135,3 @@ class ChatPlaygroundObjects:
def createLog(self, log) -> Dict[str, Any]:
"""Create a new log entry."""
return self._chatInterface.createLog(log)
def getStats(self, workflowId: str) -> List[Dict[str, Any]]:
"""Get stats for a workflow."""
return self._chatInterface.getStats(workflowId)
def createStat(self, stat) -> Dict[str, Any]:
"""Create a new stat entry."""
return self._chatInterface.createStat(stat)

View file

@ -15,6 +15,7 @@ from modules.auth import limiter, getRequestContext, RequestContext
# Import interfaces
from modules.interfaces import interfaceDbChat
from modules.interfaces.interfaceDbBilling import getInterface as _getBillingInterface
# Import models
from modules.datamodels.datamodelChat import (
@ -220,9 +221,11 @@ def get_workflow_chat_data(
detail=f"Workflow with ID {workflowId} not found"
)
# Get unified chat data
chatData = chatInterface.getUnifiedChatData(workflowId, afterTimestamp)
# Get workflow cost from billing transactions (single source of truth)
billingInterface = _getBillingInterface(context.user, context.mandateId)
workflowCost = billingInterface.getWorkflowCost(workflowId)
chatData = chatInterface.getUnifiedChatData(workflowId, afterTimestamp, workflowCost=workflowCost)
return chatData
except HTTPException:

View file

@ -17,7 +17,7 @@ from modules.auth import limiter, getRequestContext, RequestContext
from modules.interfaces import interfaceDbChat, interfaceDbManagement
from modules.interfaces.interfaceAiObjects import AiObjects
from modules.datamodels.datamodelChat import UserInputRequest
from modules.services.serviceStreaming import get_event_manager
from modules.serviceCenter.core.serviceStreaming import get_event_manager
from modules.features.codeeditor import codeEditorProcessor, fileContextManager
from modules.features.codeeditor.datamodelCodeeditor import FileEditProposal, EditStatusEnum

View file

@ -1011,7 +1011,7 @@ class CommcoachService:
async def _callAi(self, systemPrompt: str, userPrompt: str):
"""Call the AI service with the given prompts."""
from modules.services.serviceAi.mainServiceAi import AiService
from modules.serviceCenter.services.serviceAi.mainServiceAi import AiService
serviceContext = type('Ctx', (), {
'user': self.currentUser,

View file

@ -7,7 +7,7 @@ from urllib.parse import urlparse, unquote
from modules.datamodels.datamodelUam import User
from .datamodelFeatureNeutralizer import DataNeutralizerAttributes, DataNeutraliserConfig
from modules.services import getInterface as getServices
from modules.serviceHub import getInterface as getServices
logger = logging.getLogger(__name__)
@ -205,7 +205,7 @@ class NeutralizationPlayground:
async def processSharepointFiles(self, sourcePath: str, targetPath: str) -> Dict[str, Any]:
"""Process files from SharePoint source path and store neutralized files in target path"""
from modules.services.serviceSharepoint.mainServiceSharepoint import SharepointService
from modules.serviceCenter.services.serviceSharepoint.mainServiceSharepoint import SharepointService
processor = SharepointProcessor(self.currentUser, self.services)
return await processor.processSharepointFiles(sourcePath, targetPath)

View file

@ -262,8 +262,8 @@ class NeutralizationService:
fileId: Optional[str]
) -> Dict[str, Any]:
"""Extract -> neutralize -> adapt -> generate for PDF/DOCX/XLSX/PPTX."""
from modules.services.serviceExtraction.mainServiceExtraction import ExtractionService
from modules.services.serviceExtraction.subPipeline import runExtraction
from modules.serviceCenter.services.serviceExtraction.mainServiceExtraction import ExtractionService
from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
from modules.datamodels.datamodelExtraction import ExtractionOptions, MergeStrategy
# Ensure registries exist
@ -405,10 +405,10 @@ class NeutralizationService:
def _getRendererForMime(self, mimeType: str):
"""Get renderer instance and output mime for the given input MIME type."""
from modules.services.serviceGeneration.renderers.rendererPdf import RendererPdf
from modules.services.serviceGeneration.renderers.rendererDocx import RendererDocx
from modules.services.serviceGeneration.renderers.rendererXlsx import RendererXlsx
from modules.services.serviceGeneration.renderers.rendererPptx import RendererPptx
from modules.serviceCenter.services.serviceGeneration.renderers.rendererPdf import RendererPdf
from modules.serviceCenter.services.serviceGeneration.renderers.rendererDocx import RendererDocx
from modules.serviceCenter.services.serviceGeneration.renderers.rendererXlsx import RendererXlsx
from modules.serviceCenter.services.serviceGeneration.renderers.rendererPptx import RendererPptx
mime_map = {
"application/pdf": (RendererPdf, "application/pdf"),

View file

@ -284,7 +284,7 @@ from .datamodelFeatureRealEstate import (
Land,
DokumentTyp,
)
from modules.services import getInterface as getServices
from modules.serviceHub import getInterface as getServices
from .interfaceFeatureRealEstate import getInterface as getRealEstateInterface
from modules.interfaces.interfaceDbManagement import getInterface as getComponentInterface
from modules.connectors.connectorSwissTopoMapServer import SwissTopoMapServerConnector

View file

@ -843,7 +843,7 @@ async def testVoice(
):
"""Test TTS voice with AI-generated sample text in the correct language."""
from modules.interfaces.interfaceVoiceObjects import getVoiceInterface
from modules.services.serviceAi.mainServiceAi import AiService
from modules.serviceCenter.services.serviceAi.mainServiceAi import AiService
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum
mandateId = _validateInstanceAccess(instanceId, context)

View file

@ -1062,7 +1062,7 @@ class TeamsbotService:
# Call SPEECH_TEAMS
try:
from modules.services.serviceAi.mainServiceAi import AiService
from modules.serviceCenter.services.serviceAi.mainServiceAi import AiService
# Create minimal service context for AI billing
serviceContext = _ServiceContext(self.currentUser, self.mandateId, self.instanceId)
@ -1684,7 +1684,7 @@ class TeamsbotService:
"""Summarize a long user-provided session context to its essential points.
This reduces token usage in every subsequent AI call."""
try:
from modules.services.serviceAi.mainServiceAi import AiService
from modules.serviceCenter.services.serviceAi.mainServiceAi import AiService
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum
serviceContext = _ServiceContext(self.currentUser, self.mandateId, self.instanceId)
@ -1738,7 +1738,7 @@ class TeamsbotService:
lines.append(f"[{speaker}]: {text}")
textToSummarize = "\n".join(lines)
from modules.services.serviceAi.mainServiceAi import AiService
from modules.serviceCenter.services.serviceAi.mainServiceAi import AiService
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum
serviceContext = _ServiceContext(self.currentUser, self.mandateId, self.instanceId)
@ -1783,7 +1783,7 @@ class TeamsbotService:
for t in transcripts
)
from modules.services.serviceAi.mainServiceAi import AiService
from modules.serviceCenter.services.serviceAi.mainServiceAi import AiService
serviceContext = _ServiceContext(self.currentUser, self.mandateId, self.instanceId)
aiService = AiService(serviceCenter=serviceContext)

View file

@ -188,7 +188,7 @@ def get_mime_type_options(
"""Get supported MIME types from the document extraction service.
Returns: [{ value: "mime/type", label: "Description" }]
"""
from modules.services.serviceExtraction.subRegistry import ExtractorRegistry
from modules.serviceCenter.services.serviceExtraction.subRegistry import ExtractorRegistry
registry = ExtractorRegistry()
formats = registry.getSupportedFormats()

View file

@ -764,7 +764,11 @@ class BillingObjects:
featureCode: str = None,
aicoreProvider: str = None,
aicoreModel: str = None,
description: str = "AI Usage"
description: str = "AI Usage",
processingTime: float = None,
bytesSent: int = None,
bytesReceived: int = None,
errorCount: int = None
) -> Optional[Dict[str, Any]]:
"""
Record usage cost as a billing transaction.
@ -774,20 +778,6 @@ class BillingObjects:
- PREPAY_USER: deduct from user's own balance
- PREPAY_MANDATE: deduct from mandate pool balance
- CREDIT_POSTPAY: deduct from mandate pool balance
Args:
mandateId: Mandate ID
userId: User ID
priceCHF: Cost in CHF
workflowId: Optional workflow ID
featureInstanceId: Optional feature instance ID
featureCode: Optional feature code
aicoreProvider: AICore provider name (e.g., 'anthropic', 'openai')
aicoreModel: AICore model name (e.g., 'claude-4-sonnet', 'gpt-4o')
description: Transaction description
Returns:
Created transaction dict or None
"""
if priceCHF <= 0:
return None
@ -816,7 +806,11 @@ class BillingObjects:
featureCode=featureCode,
aicoreProvider=aicoreProvider,
aicoreModel=aicoreModel,
createdByUserId=userId
createdByUserId=userId,
processingTime=processingTime,
bytesSent=bytesSent,
bytesReceived=bytesReceived,
errorCount=errorCount
)
# Determine where to deduct balance
@ -828,6 +822,20 @@ class BillingObjects:
poolAccount = self.getOrCreateMandateAccount(mandateId)
return self.createTransaction(transaction, balanceAccountId=poolAccount["id"])
# =========================================================================
# Workflow Cost Query
# =========================================================================
def getWorkflowCost(self, workflowId: str) -> float:
"""Sum of all transaction amounts for a workflow."""
if not workflowId:
return 0.0
transactions = self.db.getRecordset(
BillingTransaction,
recordFilter={"workflowId": workflowId}
)
return sum(t.get("amount", 0.0) for t in transactions)
# =========================================================================
# Billing Model Switch Operations
# =========================================================================

View file

@ -18,7 +18,6 @@ from modules.datamodels.datamodelUam import AccessLevel
from modules.datamodels.datamodelChat import (
ChatDocument,
ChatStat,
ChatLog,
ChatMessage,
ChatWorkflow,
@ -663,10 +662,8 @@ class ChatObjects:
workflow = workflows[0]
try:
# Load related data from normalized tables
logs = self.getLogs(workflowId)
messages = self.getMessages(workflowId)
stats = self.getStats(workflowId)
# Validate workflow data against ChatWorkflow model
# Explicit type coercion: DB may store numeric fields as TEXT on some platforms
@ -694,8 +691,7 @@ class ChatObjects:
lastActivity=_toFloat(workflow.get("lastActivity")),
startedAt=_toFloat(workflow.get("startedAt")),
logs=logs,
messages=messages,
stats=stats
messages=messages
)
except Exception as e:
logger.error(f"Error validating workflow data: {str(e)}")
@ -731,7 +727,7 @@ class ChatObjects:
except Exception as e:
logger.warning(f"Could not get Root mandate: {e}")
# Note: ChatWorkflow has featureInstanceId for multi-tenancy isolation.
# Child tables (ChatMessage, ChatLog, ChatStat, ChatDocument) are user-owned
# Child tables (ChatMessage, ChatLog, ChatDocument) are user-owned
# and do NOT store featureInstanceId - they inherit isolation from ChatWorkflow.
# Ensure featureInstanceId is set from context if not already in workflowData
if "featureInstanceId" not in workflowData or not workflowData.get("featureInstanceId"):
@ -789,9 +785,7 @@ class ChatObjects:
# Load fresh data from normalized tables
logs = self.getLogs(workflowId)
messages = self.getMessages(workflowId)
stats = self.getStats(workflowId)
# Convert to ChatWorkflow model
return ChatWorkflow(
id=updated["id"],
status=updated.get("status", workflow.status),
@ -804,8 +798,7 @@ class ChatObjects:
lastActivity=updated.get("lastActivity", workflow.lastActivity),
startedAt=updated.get("startedAt", workflow.startedAt),
logs=logs,
messages=messages,
stats=stats
messages=messages
)
def deleteWorkflow(self, workflowId: str) -> bool:
@ -827,7 +820,6 @@ class ChatObjects:
messageId = message.id
if messageId:
# Delete message documents (but NOT the files!)
# Note: ChatStat does NOT have messageId - stats are only at workflow level
try:
existing_docs = self._getRecordset(ChatDocument, recordFilter={"messageId": messageId})
for doc in existing_docs:
@ -839,11 +831,7 @@ class ChatObjects:
self.db.recordDelete(ChatMessage, messageId)
# 2. Delete workflow stats
existing_stats = self._getRecordset(ChatStat, recordFilter={"workflowId": workflowId})
for stat in existing_stats:
self.db.recordDelete(ChatStat, stat["id"])
# 3. Delete workflow logs
# 2. Delete workflow logs
existing_logs = self._getRecordset(ChatLog, recordFilter={"workflowId": workflowId})
for log in existing_logs:
self.db.recordDelete(ChatLog, log["id"])
@ -1270,7 +1258,6 @@ class ChatObjects:
self.db.recordDelete(ChatDocument, doc["id"])
# 2. Finally delete the message itself
# Note: ChatStat has no messageId field -- stats are workflow-level, not message-level
success = self.db.recordDelete(ChatMessage, messageId)
return success
@ -1517,74 +1504,10 @@ class ChatObjects:
# Return validated ChatLog instance
return ChatLog(**createdLog)
# Stats methods
def getStats(self, workflowId: str) -> List[ChatStat]:
"""Returns list of statistics for a workflow if user has access."""
# Check workflow access first (without calling getWorkflow to avoid circular reference)
# Use RBAC filtering
workflows = self._getRecordset(ChatWorkflow, recordFilter={"id": workflowId})
if not workflows:
return []
# Get stats for this workflow from normalized table
stats = self._getRecordset(ChatStat, recordFilter={"workflowId": workflowId})
if not stats:
return []
# Return all stats records sorted by creation time.
# Use parseTimestamp to tolerate mixed DB types (float/string) on INT.
# DB uses _createdAt (camelCase system field).
stats.sort(key=lambda x: parseTimestamp(x.get("_createdAt"), default=0))
# Convert to ChatStat objects, preserving _createdAt via extra="allow"
result = []
for stat in stats:
chat_stat = ChatStat(**stat)
# Explicitly preserve _createdAt from raw DB record
if "_createdAt" in stat:
setattr(chat_stat, '_createdAt', stat["_createdAt"])
result.append(chat_stat)
return result
def createStat(self, statData: Dict[str, Any]) -> ChatStat:
"""Creates a new stats record and returns it."""
try:
# Ensure workflowId is present in statData
if "workflowId" not in statData:
raise ValueError("workflowId is required in statData")
# Note: Chat data is user-owned, no mandate/featureInstance context stored
# mandateId/featureInstanceId removed from ChatStat model
# Validate the stat data against ChatStat model
stat = ChatStat(**statData)
logger.debug(f"Creating stat for workflow {statData.get('workflowId')}: "
f"process={statData.get('process')}, "
f"priceCHF={statData.get('priceCHF', 0):.4f}, "
f"processingTime={statData.get('processingTime', 0):.2f}s")
# Create the stat record in the database
created = self.db.recordCreate(ChatStat, stat)
logger.info(f"Created stat {created.get('id')} for workflow {statData.get('workflowId')}")
# Return the created ChatStat
return ChatStat(**created)
except Exception as e:
logger.error(f"Error creating workflow stat: {str(e)}")
raise
def getUnifiedChatData(self, workflowId: str, afterTimestamp: Optional[float] = None) -> Dict[str, Any]:
def getUnifiedChatData(self, workflowId: str, afterTimestamp: Optional[float] = None, workflowCost: float = 0.0) -> Dict[str, Any]:
"""
Returns unified chat data (messages, logs, stats) for a workflow in chronological order.
Uses timestamp-based selective data transfer for efficient polling.
Returns unified chat data (messages, logs) for a workflow in chronological order,
plus workflowCost from billing transactions (single source of truth).
"""
# Check workflow access first
# Use RBAC filtering
@ -1652,29 +1575,10 @@ class ChatObjects:
"item": chatLog
})
# Get stats - ChatStat model supports _createdAt via model_config extra="allow"
stats = self.getStats(workflowId)
for stat in stats:
# Apply timestamp filtering in Python
# Use _createdAt (system field from DB, preserved via model_config extra="allow")
stat_timestamp = getattr(stat, '_createdAt', None) or getUtcTimestamp()
if afterTimestamp is not None and stat_timestamp <= afterTimestamp:
continue
# Convert to dict and include _createdAt for frontend
stat_dict = stat.model_dump() if hasattr(stat, 'model_dump') else stat.dict()
stat_dict['_createdAt'] = stat_timestamp
items.append({
"type": "stat",
"createdAt": stat_timestamp,
"item": stat_dict
})
# Sort all items by createdAt timestamp for chronological order
items.sort(key=lambda x: parseTimestamp(x.get("createdAt"), default=0))
return {"items": items}
return {"items": items, "workflowCost": workflowCost}
def getInterface(currentUser: Optional[User] = None, mandateId: Optional[str] = None, featureInstanceId: Optional[str] = None) -> 'ChatObjects':

View file

@ -58,7 +58,6 @@ TABLE_NAMESPACE = {
"ChatWorkflow": "chat",
"ChatMessage": "chat",
"ChatLog": "chat",
"ChatStat": "chat",
"ChatDocument": "chat",
"Prompt": "chat",
# Chatbot (poweron_chatbot) - per feature-instance isolation
@ -175,7 +174,7 @@ def getRecordsetWithRBAC(
whereValues = []
# CRITICAL: Only pass featureInstanceId to WHERE clause if the model actually has
# this column. Chat child tables (ChatMessage, ChatLog, ChatStat, ChatDocument)
# this column. Chat child tables (ChatMessage, ChatLog, ChatDocument)
# are user-owned and do NOT have featureInstanceId - only ChatWorkflow does.
# Without this check, the SQL query would reference a non-existent column,
# causing a silent error that returns empty results.
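# Illustrative sketch of the guard described above (hypothetical local names,
# not part of this commit), assuming Pydantic v2 models expose declared
# columns via model_fields:
#     if featureInstanceId and "featureInstanceId" in model.model_fields:
#         whereClauses.append("featureInstanceId = ?")
#         whereValues.append(featureInstanceId)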

View file

@ -21,7 +21,7 @@ from modules.auth import limiter, requireSysAdminRole, getRequestContext, Reques
# Import billing components
from modules.interfaces.interfaceDbBilling import getInterface as getBillingInterface, _getRootInterface
from modules.services.serviceBilling.mainServiceBilling import getService as getBillingService
from modules.serviceCenter.services.serviceBilling.mainServiceBilling import getService as getBillingService
from modules.datamodels.datamodelPagination import PaginationParams, PaginatedResponse, PaginationMetadata, normalize_pagination_dict
from modules.routes.routeDataUsers import _applyFiltersAndSort
from modules.datamodels.datamodelBilling import (
@ -162,6 +162,23 @@ def _isAdminOfMandate(ctx: RequestContext, targetMandateId: str) -> bool:
return False
def _isMemberOfMandate(ctx: RequestContext, targetMandateId: str) -> bool:
"""Check if user has any enabled membership in the specified mandate."""
try:
from modules.interfaces.interfaceDbApp import getRootInterface
rootInterface = getRootInterface()
userMandates = rootInterface.getUserMandates(str(ctx.user.id))
for um in userMandates:
if str(getattr(um, 'mandateId', None)) != str(targetMandateId):
continue
if not getattr(um, 'enabled', True):
continue
return True
return False
except Exception:
return False
def _filterTransactionsByScope(transactions: list, scope: BillingDataScope) -> list:
"""
Filter a list of transaction dicts based on the user's BillingDataScope.
@ -720,11 +737,11 @@ def createCheckoutSession(
targetMandateId: str = Path(..., description="Mandate ID"),
checkoutRequest: CheckoutCreateRequest = Body(...),
ctx: RequestContext = Depends(getRequestContext),
_admin = Depends(requireSysAdminRole)
):
"""
Create Stripe Checkout Session for credit top-up. Returns redirect URL.
SysAdmin only. Amount is validated server-side against allowed presets.
RBAC: PREPAY_USER requires mandate membership (user loads own account),
PREPAY_MANDATE requires mandate admin role.
"""
try:
billingInterface = getBillingInterface(ctx.user, targetMandateId)
@ -738,10 +755,17 @@ def createCheckoutSession(
if billingModel == BillingModelEnum.PREPAY_USER:
if not checkoutRequest.userId:
raise HTTPException(status_code=400, detail="userId is required for PREPAY_USER model")
elif billingModel not in [BillingModelEnum.PREPAY_MANDATE, BillingModelEnum.CREDIT_POSTPAY]:
if str(checkoutRequest.userId) != str(ctx.user.id):
raise HTTPException(status_code=403, detail="Users can only load credit to their own account")
if not _isMemberOfMandate(ctx, targetMandateId):
raise HTTPException(status_code=403, detail="User is not a member of this mandate")
elif billingModel in [BillingModelEnum.PREPAY_MANDATE, BillingModelEnum.CREDIT_POSTPAY]:
if not _isAdminOfMandate(ctx, targetMandateId):
raise HTTPException(status_code=403, detail="Mandate admin role required to load mandate credit")
else:
raise HTTPException(status_code=400, detail=f"Cannot add credit to {billingModel.value} billing model")
from modules.services.serviceBilling.stripeCheckout import create_checkout_session
from modules.serviceCenter.services.serviceBilling.stripeCheckout import create_checkout_session
redirect_url = create_checkout_session(
mandate_id=targetMandateId,
user_id=checkoutRequest.userId,
@ -768,7 +792,7 @@ async def stripeWebhook(
No JWT auth - Stripe authenticates via Stripe-Signature header.
"""
from modules.shared.configuration import APP_CONFIG
from modules.services.serviceBilling.stripeCheckout import ALLOWED_AMOUNTS_CHF
from modules.serviceCenter.services.serviceBilling.stripeCheckout import ALLOWED_AMOUNTS_CHF
webhook_secret = APP_CONFIG.get("STRIPE_WEBHOOK_SECRET")
if not webhook_secret:

View file

@ -764,7 +764,7 @@ def send_password_link(
expiryHours = int(APP_CONFIG.get("Auth_RESET_TOKEN_EXPIRY_HOURS", "24"))
try:
from modules.services import Services
from modules.serviceHub import Services
services = Services(targetUser)
emailSubject = "PowerOn - Passwort setzen"

View file

@ -395,7 +395,7 @@ def trigger_subscription(
)
# Get messaging service from request app state
from modules.services import getInterface as getServicesInterface
from modules.serviceHub import getInterface as getServicesInterface
services = getServicesInterface(context.user, None, mandateId=str(context.mandateId))
# Convert dict to Pydantic model

View file

@ -12,7 +12,7 @@ from fastapi import APIRouter, HTTPException, Depends, Path, Query, Request, sta
from modules.auth import limiter, getCurrentUser
from modules.datamodels.datamodelUam import User, UserConnection
from modules.interfaces.interfaceDbApp import getInterface
from modules.services import getInterface as getServices
from modules.serviceHub import getInterface as getServices
logger = logging.getLogger(__name__)

View file

@ -26,7 +26,6 @@ logger = logging.getLogger(__name__)
def getService(
key: str,
context: ServiceCenterContext,
legacy_hub: Optional[Any] = None,
) -> Any:
"""
Get a service instance by key for the given context.
@ -34,14 +33,13 @@ def getService(
Args:
key: Service key (e.g., "web", "extraction", "utils")
context: ServiceCenterContext with user, mandate_id, feature_instance_id, workflow
legacy_hub: Optional legacy Services instance for fallback when service not yet migrated
Returns:
Service instance
"""
cache = get_resolution_cache()
resolving = set()
return resolve(key, context, cache, resolving, legacy_hub=legacy_hub)
return resolve(key, context, cache, resolving)
def preWarm(service_keys: Optional[List[str]] = None) -> None:

View file

@ -2,7 +2,7 @@
# All rights reserved.
"""
Service Center Resolver.
Resolution logic, dependency injection, and optional legacy fallback.
Resolution logic and dependency injection for service instantiation.
"""
import importlib
@ -14,7 +14,6 @@ from modules.serviceCenter.registry import CORE_SERVICES, IMPORTABLE_SERVICES
logger = logging.getLogger(__name__)
# Type for get_service callable passed to services
GetServiceFunc = Callable[[str], Any]
@ -29,50 +28,15 @@ def _load_service_class(module_path: str, class_name: str):
return getattr(module, class_name)
def _create_legacy_hub(ctx: ServiceCenterContext) -> Any:
"""Create legacy Services instance for fallback when service not yet migrated."""
from modules.services import getInterface
return getInterface(
ctx.user,
workflow=ctx.workflow,
mandateId=ctx.mandate_id,
featureInstanceId=ctx.feature_instance_id,
)
def _get_from_legacy(legacy_hub: Any, key: str) -> Any:
"""Map service key to legacy hub attribute (for fallback when service center module fails)."""
key_to_attr = {
"utils": "utils",
"security": "security",
"streaming": "streaming",
"ticket": "ticket",
"messaging": "messaging",
"billing": "billing",
"sharepoint": "sharepoint",
"chat": "chat",
"extraction": "extraction",
"generation": "generation",
"ai": "ai",
"web": "web",
"neutralization": "neutralization",
}
attr = key_to_attr.get(key)
if attr and hasattr(legacy_hub, attr):
return getattr(legacy_hub, attr)
return None
def resolve(
key: str,
context: ServiceCenterContext,
cache: Dict[str, Any],
resolving: Set[str],
legacy_hub: Optional[Any] = None,
) -> Any:
"""
Resolve a service by key. Uses cache, resolves dependencies recursively.
Falls back to legacy_hub if service module cannot be loaded.
Raises KeyError if the service is not registered.
"""
cache_key = f"{_make_context_id(context)}_{key}"
if cache_key in cache:
@ -82,59 +46,20 @@ def resolve(
raise RuntimeError(f"Circular dependency detected for service: {key}")
def get_service(dep_key: str) -> Any:
return resolve(dep_key, context, cache, resolving, legacy_hub)
return resolve(dep_key, context, cache, resolving)
# Try core first
if key in CORE_SERVICES:
spec = CORE_SERVICES[key]
spec = CORE_SERVICES.get(key) or IMPORTABLE_SERVICES.get(key)
if spec:
cls = _load_service_class(spec["module"], spec["class"])
resolving.add(key)
try:
cls = _load_service_class(spec["module"], spec["class"])
resolving.add(key)
try:
for dep in spec.get("dependencies", []):
get_service(dep)
finally:
resolving.discard(key)
instance = cls(context, get_service)
cache[cache_key] = instance
return instance
except (ImportError, ModuleNotFoundError, AttributeError) as e:
logger.debug(f"Could not load core service '{key}' from service center: {e}")
if legacy_hub:
fallback = _get_from_legacy(legacy_hub, key)
if fallback is not None:
cache[cache_key] = fallback
return fallback
raise
# Try importable
if key in IMPORTABLE_SERVICES:
spec = IMPORTABLE_SERVICES[key]
try:
cls = _load_service_class(spec["module"], spec["class"])
resolving.add(key)
try:
for dep in spec.get("dependencies", []):
get_service(dep)
finally:
resolving.discard(key)
instance = cls(context, get_service)
cache[cache_key] = instance
return instance
except (ImportError, ModuleNotFoundError, AttributeError) as e:
logger.debug(f"Could not load importable service '{key}' from service center: {e}")
if legacy_hub:
fallback = _get_from_legacy(legacy_hub, key)
if fallback is not None:
cache[cache_key] = fallback
return fallback
raise
if legacy_hub:
fallback = _get_from_legacy(legacy_hub, key)
if fallback is not None:
cache[cache_key] = fallback
return fallback
for dep in spec.get("dependencies", []):
get_service(dep)
finally:
resolving.discard(key)
instance = cls(context, get_service)
cache[cache_key] = instance
return instance
raise KeyError(f"Unknown service: {key}")
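
A companion sketch of direct resolution through the service center, mirroring the AiService.create() call site later in this commit (currentUser and mandateId are placeholder variables):

from modules.serviceCenter import getService
from modules.serviceCenter.context import ServiceCenterContext

ctx = ServiceCenterContext(
    user=currentUser,
    mandate_id=mandateId,
    feature_instance_id=None,
    workflow=None,
)
aiService = getService("ai", ctx)  # cached per context; unknown keys raise KeyError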

View file

@ -64,6 +64,10 @@ class _ServicesAdapter:
def interfaceDbChat(self):
return self._get_service("chat").interfaceDbChat
@property
def interfaceDbComponent(self):
return self._get_service("chat").interfaceDbComponent
@property
def featureCode(self) -> Optional[str]:
w = self.workflow
@ -171,12 +175,8 @@ class AiService:
else:
response = await self.aiObjects.callWithTextContext(request)
finally:
# Clear callback after call completes
self.aiObjects.billingCallback = None
# Store workflow stats for analytics
self._storeAiCallStats(response, request)
return response
# =========================================================================
@ -295,9 +295,6 @@ class AiService:
except Exception as e:
logger.error(f"BILLING: Failed to record billing for SPEECH_TEAMS: {e}")
# Store stats
self._storeAiCallStats(response, request)
logger.info(f"SPEECH_TEAMS call completed: model={model.name}, time={processingTime:.2f}s, cost={priceCHF:.4f} CHF")
return response
@ -644,12 +641,12 @@ detectedIntent-Werte:
billingService = getBillingService(user, mandateId, featureInstanceId, featureCode)
def _billingCallback(response) -> None:
"""Record billing for a single AI model call."""
"""Record billing transaction with full AI call metadata."""
if not response or getattr(response, 'errorCount', 0) > 0:
return
priceCHF = getattr(response, 'priceCHF', 0.0)
if not priceCHF or priceCHF <= 0:
basePriceCHF = getattr(response, 'priceCHF', 0.0)
if not basePriceCHF or basePriceCHF <= 0:
return
provider = getattr(response, 'provider', None) or 'unknown'
@ -657,20 +654,24 @@ detectedIntent-Werte:
try:
billingService.recordUsage(
priceCHF=priceCHF,
priceCHF=basePriceCHF,
workflowId=workflowId,
aicoreProvider=provider,
aicoreModel=modelName,
description=f"AI: {modelName}"
description=f"AI: {modelName}",
processingTime=getattr(response, 'processingTime', None),
bytesSent=getattr(response, 'bytesSent', None),
bytesReceived=getattr(response, 'bytesReceived', None),
errorCount=getattr(response, 'errorCount', None)
)
logger.debug(
f"Billed model call: {priceCHF:.4f} CHF, "
f"Billed model call: {basePriceCHF:.4f} CHF, "
f"provider={provider}, model={modelName}, mandate={mandateId}"
)
except Exception as e:
logger.error(
f"BILLING: Failed to record transaction! "
f"Cost={priceCHF:.4f} CHF, user={user.id}, mandate={mandateId}, "
f"Cost={basePriceCHF:.4f} CHF, user={user.id}, mandate={mandateId}, "
f"provider={provider}, model={modelName}, error={e}"
)
@ -723,40 +724,6 @@ detectedIntent-Werte:
logger.warning(f"Error calculating effective providers: {e}")
return None
def _storeAiCallStats(self, response, request: AiCallRequest) -> None:
"""Store workflow stats after an AI call.
This method stores the AI call statistics (cost, processing time, bytes)
to the workflow stats collection for tracking and billing purposes.
Args:
response: AiCallResponse with cost/timing data
request: Original AiCallRequest for context
"""
try:
# Skip if no workflow context
workflow = getattr(self.services, 'workflow', None)
if not workflow or not hasattr(workflow, 'id') or not workflow.id:
logger.debug("No workflow context - skipping stats storage")
return
# Skip if response is an error
if not response or getattr(response, 'errorCount', 0) > 0:
logger.debug("Error response - skipping stats storage")
return
# Determine process name from operation type
opType = getattr(request.options, 'operationType', 'unknown') if request.options else 'unknown'
process = f"ai.call.{opType}"
# Store the stat
self.services.chat.storeWorkflowStat(workflow, response, process)
logger.debug(f"Stored AI call stat: {process}, cost={getattr(response, 'priceCHF', 0):.4f} CHF")
except Exception as e:
# Log but don't fail - stats storage is not critical
logger.debug(f"Could not store AI call stat: {str(e)}")
async def ensureAiObjectsInitialized(self):
"""Ensure aiObjects is initialized and submodules are ready."""
if self.aiObjects is None:
@ -766,17 +733,17 @@ detectedIntent-Werte:
self._initializeSubmodules()
@classmethod
async def create(cls, legacy_services) -> "AiService":
"""Create AiService from legacy Services hub. For backward compatibility with tests."""
async def create(cls, servicesHub) -> "AiService":
"""Create AiService from a ServiceHub instance."""
from modules.serviceCenter import getService
from modules.serviceCenter.context import ServiceCenterContext
ctx = ServiceCenterContext(
user=legacy_services.user,
mandate_id=legacy_services.mandateId,
feature_instance_id=legacy_services.featureInstanceId,
workflow=getattr(legacy_services, "workflow", None),
user=servicesHub.user,
mandate_id=servicesHub.mandateId,
feature_instance_id=servicesHub.featureInstanceId,
workflow=getattr(servicesHub, "workflow", None),
)
return getService("ai", ctx, legacy_hub=legacy_services)
return getService("ai", ctx)
# Helper methods

View file

@ -125,10 +125,11 @@ class AiCallLooper:
logger.error(errorMsg)
raise ValueError(errorMsg)
maxIterations = 50 # Prevent infinite loops
maxIterations = 10
iteration = 0
allSections = [] # Accumulate all sections across iterations
lastRawResponse = None # Store last raw JSON response for continuation
result = ""
allSections = []
lastRawResponse = None
# JSON Base Iteration System:
# - jsonBase: the merged JSON string (replaces accumulatedDirectJson array)

View file

@ -261,35 +261,34 @@ class ContentExtractor:
# Check if it's standardized JSON format (has "documents" or "sections")
if document.mimeType == "application/json":
try:
docBytes = self.services.interfaceDbComponent.getFileData(document.fileId)
if docBytes:
docBytes = self.services.interfaceDbComponent.getFileData(document.fileId)
if docBytes:
try:
docData = docBytes.decode('utf-8')
jsonData = json.loads(docData)
if isinstance(jsonData, dict) and ("documents" in jsonData or "sections" in jsonData):
logger.info(f"Document is already in standardized JSON format, using as reference")
# Create reference ContentPart for structured JSON
contentPart = ContentPart(
id=f"ref_{document.id}",
label=f"Reference: {document.fileName}",
typeGroup="structure",
mimeType="application/json",
data=docData,
metadata={
"contentFormat": "reference",
"documentId": document.id,
"documentReference": f"docItem:{document.id}:{document.fileName}",
"skipExtraction": True,
"intent": "reference"
}
)
allContentParts.append(contentPart)
logger.info(f"✅ Using JSON document directly without extraction")
continue # Skip normal extraction for this document
except Exception as e:
logger.warning(f"Could not parse JSON document {document.fileName}, will extract normally: {str(e)}")
# Continue with normal extraction
except (json.JSONDecodeError, UnicodeDecodeError) as e:
logger.warning(f"Could not parse JSON document {document.fileName}: {str(e)}")
jsonData = None
if isinstance(jsonData, dict) and ("documents" in jsonData or "sections" in jsonData):
logger.info(f"Document is already in standardized JSON format, using as reference")
contentPart = ContentPart(
id=f"ref_{document.id}",
label=f"Reference: {document.fileName}",
typeGroup="structure",
mimeType="application/json",
data=docData,
metadata={
"contentFormat": "reference",
"documentId": document.id,
"documentReference": f"docItem:{document.id}:{document.fileName}",
"skipExtraction": True,
"intent": "reference"
}
)
allContentParts.append(contentPart)
logger.info(f"✅ Using JSON document directly without extraction")
continue
# Normal extraction path
intent = getIntentForDocument(document.id, documentIntents)

View file

@ -230,9 +230,12 @@ class DocumentIntentAnalyzer:
else:
logger.debug(f"JSON document {document.id} has no documentData (actionType={actionType})")
return None
except (json.JSONDecodeError, UnicodeDecodeError) as e:
logger.debug(f"Error parsing document {document.fileName}: {str(e)}")
return None
except Exception as e:
logger.debug(f"Error resolving pre-extracted document {document.fileName}: {str(e)}")
logger.error(f"Error resolving pre-extracted document {document.fileName}: {str(e)}")
return None
def _buildIntentAnalysisPrompt(

View file

@ -330,17 +330,7 @@ class JsonMergeLogger:
except Exception as e:
logger.error(f"Failed to write merge log file: {e}")
else:
# No log file set - write individual file (fallback)
currentFileDir = os.path.dirname(os.path.abspath(__file__))
logDir = currentFileDir
os.makedirs(logDir, exist_ok=True)
logFilePath = os.path.join(logDir, f"{mergeId}.txt")
try:
with open(logFilePath, 'w', encoding='utf-8') as f:
f.write(logContent)
logger.info(f"JSON merge log written to: {logFilePath}")
except Exception as e:
logger.error(f"Failed to write merge log file: {e}")
logger.debug(f"JSON merge {mergeId} completed ({len(logContent)} chars log). Use initializeLogFile() to persist merge logs.")
# Clear buffer for next merge
JsonMergeLogger._logBuffer = []

View file

@ -25,7 +25,7 @@ class StructureFiller:
"""Handles filling document structure with content."""
# Default concurrency limit for parallel generation (chapters/sections)
DEFAULT_MAX_CONCURRENT_GENERATION = 16
DEFAULT_MAX_CONCURRENT_GENERATION = 5
def __init__(self, services, aiService):
"""Initialize StructureFiller with service center and AI service access."""
@ -568,11 +568,16 @@ class StructureFiller:
all_sections_list: List[Dict[str, Any]],
language: str,
outputFormat: str = "txt",
calculateOverallProgress: callable = None
calculateOverallProgress: callable = None,
preExtractedText: Optional[str] = None
) -> List[Dict[str, Any]]:
"""
Process a single section and return its elements.
Used for parallel processing of sections within a chapter.
When preExtractedText is provided, the section uses the pre-extracted
content directly in its prompt instead of sending raw content parts
through the heavy extraction pipeline (avoids chunking + N*M AI calls).
"""
sectionId = section.get("id")
sectionTitle = section.get("title", sectionId)
@ -600,6 +605,149 @@ class StructureFiller:
elements = []
# --- Fast path: use pre-extracted text instead of raw content parts ---
if preExtractedText and useAiCall and generationHint:
logger.info(
f"Section {sectionId}: Using pre-extracted text "
f"({len(preExtractedText):,} chars) - lightweight AI path"
)
for partId in contentPartIds:
part = self._findContentPartById(partId, contentParts)
if not part:
continue
cf = contentFormats.get(partId, part.metadata.get("contentFormat"))
if cf == "reference":
elements.append({
"type": "reference",
"documentReference": part.metadata.get("documentReference"),
"label": part.metadata.get("usageHint", part.label)
})
elif cf == "object":
if part.typeGroup == "image" and part.data:
caption = (
section.get("caption")
or section.get("metadata", {}).get("caption")
or part.metadata.get("caption", "")
)
elements.append({
"type": "image",
"content": {
"base64Data": part.data,
"altText": part.metadata.get("usageHint", part.label),
"caption": caption
},
"caption": caption
})
generationPrompt, templateStructure = self._buildSectionGenerationPrompt(
section=section,
contentParts=[],
userPrompt=userPrompt,
generationHint=generationHint,
allSections=all_sections_list,
sectionIndex=sectionIndex,
isAggregation=False,
language=language,
outputFormat=outputFormat,
preExtractedText=preExtractedText
)
sectionOperationId = f"{fillOperationId}_section_{sectionId}"
self.services.chat.progressLogStart(
sectionOperationId,
"Section Generation (Pre-extracted)",
f"Section {sectionIndex + 1}/{totalSections}",
f"{sectionTitle} (pre-extracted)",
parentOperationId=chapterOperationId
)
try:
self.services.chat.progressLogUpdate(sectionOperationId, 0.4, "Calling AI for content generation")
operationType = OperationTypeEnum.DATA_ANALYSE
options = AiCallOptions(
operationType=operationType,
priority=PriorityEnum.BALANCED,
processingMode=ProcessingModeEnum.DETAILED
)
checkWorkflowStopped(self.services)
aiResponseJson = await self.aiService.callAiWithLooping(
prompt=generationPrompt,
options=options,
debugPrefix=f"{chapterId}_section_{sectionId}",
promptBuilder=self.buildSectionPromptWithContinuation,
promptArgs={
"section": section,
"contentParts": [],
"userPrompt": userPrompt,
"generationHint": generationHint,
"allSections": all_sections_list,
"sectionIndex": sectionIndex,
"isAggregation": False,
"templateStructure": templateStructure,
"basePrompt": generationPrompt,
"language": language
},
operationId=sectionOperationId,
userPrompt=userPrompt,
contentParts=None,
useCaseId="section_content"
)
try:
from modules.shared.jsonUtils import tryParseJson, repairBrokenJson
if isinstance(aiResponseJson, str) and ("---" in aiResponseJson or aiResponseJson.count("```json") > 1):
generatedElements = self._extractAndMergeMultipleJsonBlocks(aiResponseJson, contentType, sectionId)
else:
parsedResponse, parseError, cleanedStr = tryParseJson(aiResponseJson)
if parsedResponse is None:
logger.warning(f"Section {sectionId}: tryParseJson failed, attempting repair")
repairedStr = repairBrokenJson(aiResponseJson)
parsedResponse, parseError2, _ = tryParseJson(repairedStr)
if parsedResponse and isinstance(parsedResponse, dict):
generatedElements = parsedResponse.get("elements", [])
elif parsedResponse and isinstance(parsedResponse, list):
generatedElements = parsedResponse
else:
generatedElements = []
except Exception as parseErr:
logger.error(f"Section {sectionId}: JSON parse error: {parseErr}")
generatedElements = []
self.services.chat.progressLogUpdate(sectionOperationId, 0.8, "Validating generated content")
class _AiResponse:
def __init__(self, content):
self.content = content
responseElements = await self._processAiResponseForSection(
aiResponse=_AiResponse(aiResponseJson),
contentType=contentType,
operationType=operationType,
sectionId=sectionId,
generationHint=generationHint,
generatedElements=generatedElements,
section=section
)
elements.extend(responseElements)
self.services.chat.progressLogFinish(sectionOperationId, True)
except Exception as e:
self.services.chat.progressLogFinish(sectionOperationId, False)
logger.error(f"Error in pre-extracted section {sectionId}: {e}")
elements.append({
"type": "error",
"message": f"Error processing section {sectionId}: {str(e)}",
"sectionId": sectionId
})
return elements
# --- Standard path: process content parts directly ---
# Check whether aggregation is needed
needsAggregation = self._needsAggregation(
contentType=contentType,
@ -1507,6 +1655,156 @@ class StructureFiller:
return elements
async def _preExtractSharedContent(
self,
contentParts: List[ContentPart],
allSectionTasks: List[Dict[str, Any]],
userPrompt: str,
parentOperationId: str
) -> Dict[str, str]:
"""
Pre-extract content from large/shared content parts ONCE before parallel
section filling. Returns dict mapping sectionId -> pre-extracted text.
Extracts a comprehensive plain-text summary per content part, then gives
ALL sections referencing that part the SAME summary. Each section's own
generationHint focuses the AI on the relevant aspect during generation.
This eliminates the N*M AI call explosion where N sections each independently
chunk and process the same M-byte content part through the extraction pipeline.
"""
SIZE_THRESHOLD = 100_000
MIN_SHARED_SECTIONS = 2
partToSections: Dict[str, List[Dict[str, Any]]] = {}
for task in allSectionTasks:
section = task["section"]
for partId in section.get("contentPartIds", []):
if partId not in partToSections:
partToSections[partId] = []
partToSections[partId].append(section)
if not partToSections:
return {}
preExtractedCache: Dict[str, str] = {}
for partId, sections in partToSections.items():
part = self._findContentPartById(partId, contentParts)
if not part:
continue
contentFormat = part.metadata.get("contentFormat", "unknown")
if contentFormat != "extracted":
continue
if part.typeGroup in ("image", "binary"):
continue
if part.mimeType and (
part.mimeType.startswith("image/")
or part.mimeType.startswith("video/")
or part.mimeType.startswith("audio/")
):
continue
partSize = len(part.data) if part.data else 0
numSections = len(sections)
if numSections < MIN_SHARED_SECTIONS and partSize < SIZE_THRESHOLD:
continue
fileName = part.metadata.get("originalFileName", partId)
logger.info(
f"Pre-extracting content part {partId} "
f"({partSize:,} bytes, referenced by {numSections} sections)"
)
topicLines = []
for section in sections:
hint = (
section.get("generationHint")
or section.get("generation_hint")
or section.get("title", "")
)
topicLines.append(f"- {hint}")
topicsText = "\n".join(topicLines)
extractionPrompt = (
"# TASK: Extract key information from this document\n\n"
"Extract ALL relevant information from the provided content as "
"plain text. The extracted content will be used to generate a report "
"covering the topics listed below.\n\n"
f"## User Request\n{userPrompt}\n\n"
f"## Report topics that need data\n{topicsText}\n\n"
"## Instructions\n"
"- Extract key facts, data points, timestamps, error messages, "
"statistics, and specific findings\n"
"- Organize by theme but output as PLAIN TEXT (not JSON)\n"
"- Be comprehensive but concise - include specific data, "
"skip generic filler\n"
"- Include concrete examples with exact values from the source\n"
"- Do NOT add commentary or analysis - just extract the raw data\n"
)
try:
self.services.chat.progressLogUpdate(
parentOperationId, 0.05,
f"Pre-extracting content from {fileName} ({partSize:,} bytes)..."
)
def _preExtractionProgress(chunkProgress, message):
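# Map chunk progress into the 0.05-0.10 band reserved for pre-extraction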
mapped = 0.05 + chunkProgress * 0.05
self.services.chat.progressLogUpdate(
parentOperationId, mapped,
f"Pre-extraction: {message}"
)
request = AiCallRequest(
prompt=extractionPrompt,
contentParts=[part],
options=AiCallOptions(
operationType=OperationTypeEnum.DATA_ANALYSE,
priority=PriorityEnum.BALANCED,
processingMode=ProcessingModeEnum.DETAILED
)
)
checkWorkflowStopped(self.services)
response = await self.aiService.callAi(request, progressCallback=_preExtractionProgress)
responseText = response.content if hasattr(response, "content") else str(response)
if responseText and len(responseText.strip()) > 50:
for section in sections:
sId = section.get("id", "unknown")
preExtractedCache[sId] = responseText
logger.info(
f"Pre-extraction of {partId} successful: "
f"{len(responseText):,} chars summary for {numSections} sections"
)
self.services.chat.progressLogUpdate(
parentOperationId, 0.10,
f"Pre-extraction complete ({len(responseText):,} chars). Starting section generation..."
)
else:
logger.warning(
f"Pre-extraction of {partId} returned empty/short response "
f"({len(responseText) if responseText else 0} chars), "
"sections will fall back to direct extraction"
)
except Exception as e:
logger.error(
f"Pre-extraction of {partId} failed: {e}. "
"Sections will fall back to direct extraction."
)
if preExtractedCache:
logger.info(
f"Pre-extraction complete: {len(preExtractedCache)} sections "
"have pre-extracted content (will use lightweight AI path)"
)
return preExtractedCache
async def _fillChapterSections(
self,
chapterStructure: Dict[str, Any],
@ -1564,27 +1862,42 @@ class StructureFiller:
"docFormat": docFormat # Include output format
})
MAX_TOTAL_SECTIONS = 35
if totalSections > MAX_TOTAL_SECTIONS:
logger.warning(
f"Structure has {totalSections} sections (limit {MAX_TOTAL_SECTIONS}). "
"Truncating to stay within budget."
)
allSectionTasks = allSectionTasks[:MAX_TOTAL_SECTIONS]
totalSections = len(allSectionTasks)
preExtractedCache = await self._preExtractSharedContent(
contentParts, allSectionTasks, userPrompt, fillOperationId
)
logger.info(f"Starting FULLY PARALLEL section generation: {totalSections} sections across {totalChapters} chapters")
# Create task wrapper for each section with progress tracking
async def processSectionWithSemaphore(taskInfo):
checkWorkflowStopped(self.services)
sectionId = taskInfo["section"].get("id", "unknown")
async with sectionSemaphore:
result = await self._processSingleSection(
section=taskInfo["section"],
sectionIndex=taskInfo["sectionIndex"],
totalSections=taskInfo["chapterSectionCount"],
chapterIndex=0,
totalChapters=totalChapters,
chapterId=taskInfo["chapterId"],
chapterOperationId=fillOperationId,
fillOperationId=fillOperationId,
contentParts=contentParts,
userPrompt=userPrompt,
all_sections_list=all_sections_list,
language=taskInfo["docLanguage"],
outputFormat=taskInfo.get("docFormat", "txt"), # Pass output format
calculateOverallProgress=lambda *args: completedSections[0] / totalSections if totalSections > 0 else 1.0
outputFormat=taskInfo.get("docFormat", "txt"),
calculateOverallProgress=lambda *args: completedSections[0] / totalSections if totalSections > 0 else 1.0,
preExtractedText=preExtractedCache.get(sectionId)
)
# Update progress after each section completes
@ -1810,6 +2123,7 @@ GENERATION HINT: {generationHint}
- Each section should serve a clear purpose with meaningful data
- If no relevant data exists for a topic, do NOT create a section for it
- Prefer ONE comprehensive section over multiple sparse sections
- HARD LIMIT: Maximum 5 sections per chapter. Combine related subtopics into single sections to stay within this limit.
**CRITICAL**: The chapter's generationHint above describes what content this chapter should generate. If the generationHint references documents/images/data, then EACH section that generates content for this chapter MUST assign the relevant ContentParts from AVAILABLE CONTENT PARTS below.
@ -1893,7 +2207,8 @@ Return only valid JSON. Do not include any explanatory text outside the JSON.
sectionIndex: Optional[int] = None,
isAggregation: bool = False,
language: str = "en",
outputFormat: str = "txt"
outputFormat: str = "txt",
preExtractedText: Optional[str] = None
) -> tuple[str, str]:
"""Baue Prompt für Section-Generierung mit vollständigem Kontext."""
# Filtere None-Werte
@ -2057,7 +2372,7 @@ LANGUAGE: Generate all content in {language.upper()} language. All text, titles,
5. For table: Extract all rows from the context. Return {{"headers": [...], "rows": []}} only if no data exists.
6. Format based on content_type ({effectiveContentType}).
7. No HTML/styling: Plain text only, no markup.
8. Focus on the MOST RELEVANT information for this section's topic. Extract key facts, data, and findings. Omit redundant, repetitive, or tangential content.
## OUTPUT FORMAT
@ -2083,6 +2398,62 @@ Output requirements:
{userPrompt}
```
## CONTEXT
{contextText if contextText else ""}
"""
elif preExtractedText:
prompt = f"""# TASK: Generate Section Content from Pre-Extracted Data
LANGUAGE: Generate all content in {language.upper()} language. All text, titles, headings, paragraphs, and content must be written in {language.upper()}.
## SECTION METADATA
- Section ID: {sectionId}
- Content Type: {effectiveContentType}
- Generation Hint: {generationHint}{formatNoteAggr}
## CONTENT EFFICIENCY PRINCIPLES
- Generate COMPACT content: Focus on essential facts only
- AVOID verbose text, filler phrases, or redundant explanations
- Be CONCISE and direct - every word should add value
- NO introductory phrases like "This section describes..." or "Here we present..."
- Minimize output size for efficient processing
## PRE-EXTRACTED CONTENT FOR THIS SECTION
```
{preExtractedText}
```
## INSTRUCTIONS
1. Use ONLY the pre-extracted content above. Never invent or generate data not present in it.
2. If the pre-extracted content is empty, return empty structures.
3. Format based on content_type ({effectiveContentType}).
4. Return only valid JSON with "elements" array.
5. No HTML/styling: Plain text only, no markup.
6. Focus on the MOST RELEVANT information. Be concise.
## OUTPUT FORMAT
Return a JSON object with this structure:
{{
"elements": [
{{
"type": "{effectiveContentType}",
"content": {contentStructureExample}
}}
]
}}
Output requirements:
- "content" must be an object (never a string)
- Return only valid JSON - no text before, no text after, no comments, no explanations, no markdown code fences
- Start with {{ and end with }} - return ONLY the JSON object itself
- No invented data: Return empty structures if pre-extracted content is empty
## USER REQUEST
```
{userPrompt}
```
## CONTEXT
{contextText if contextText else ""}
"""
@ -2117,7 +2488,7 @@ LANGUAGE: Generate all content in {language.upper()} language. All text, titles,
3. Format based on content_type ({effectiveContentType}).
4. Return only valid JSON with "elements" array.
5. No HTML/styling: Plain text only, no markup.
6. Focus on the MOST RELEVANT information for this section's topic. Extract key facts, data, and findings. Omit redundant, repetitive, or tangential content.
## OUTPUT FORMAT
Return a JSON object with this structure:
@ -430,6 +430,7 @@ Then chapters that generate those generic content types MUST assign the relevant
## CHAPTER STRUCTURE REQUIREMENTS
- Generate chapters based on USER REQUEST - analyze what structure the user wants
- Create ONLY the minimum chapters needed to cover the user's request - avoid over-structuring
- HARD LIMIT: Maximum 7 chapters per document. If the topic can be covered in fewer, prefer fewer. Combine related topics into single chapters rather than creating many small ones.
- IMPORTANT: Each chapter MUST have ALL these fields:
- id: Unique identifier (e.g., "chapter_1")
- level: Heading level (1, 2, 3, etc.)
@ -205,36 +205,20 @@ class BillingService:
workflowId: str = None,
aicoreProvider: str = None,
aicoreModel: str = None,
description: str = None,
processingTime: float = None,
bytesSent: int = None,
bytesReceived: int = None,
errorCount: int = None
) -> Optional[Dict[str, Any]]:
"""
Record AI usage cost as a billing transaction.
This method:
1. Applies the pricing markup
2. Creates a DEBIT transaction
3. Updates the account balance
Args:
priceCHF: Base price from AI model (before markup)
workflowId: Optional workflow ID
aicoreProvider: AICore provider name (e.g., 'anthropic', 'openai')
aicoreModel: AICore model name (e.g., 'claude-4-sonnet', 'gpt-4o')
description: Optional description
Returns:
Created transaction dict or None if not recorded
"""
"""Record AI usage cost as a billing transaction with markup applied."""
if priceCHF <= 0:
return None
# Apply markup
finalPrice = self.calculatePriceWithMarkup(priceCHF)
if finalPrice <= 0:
return None
# Build description
if not description:
description = f"AI Usage: {aicoreModel or aicoreProvider or 'unknown'}"
@ -247,9 +231,17 @@ class BillingService:
featureCode=self.featureCode,
aicoreProvider=aicoreProvider,
aicoreModel=aicoreModel,
description=description,
processingTime=processingTime,
bytesSent=bytesSent,
bytesReceived=bytesReceived,
errorCount=errorCount
)
def getWorkflowCost(self, workflowId: str) -> float:
"""Get total cost for a workflow from billing transactions."""
return self._billingInterface.getWorkflowCost(workflowId)
# =========================================================================
# Provider Permission Check (via RBAC)
# =========================================================================
@ -4,7 +4,7 @@
import logging
from typing import Dict, Any, List, Optional, Callable
from modules.datamodels.datamodelUam import User, UserConnection
from modules.datamodels.datamodelChat import ChatDocument, ChatMessage, ChatLog
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum, PriorityEnum, ProcessingModeEnum
from modules.shared.progressLogger import ProgressLogger
@ -688,35 +688,6 @@ class ChatService:
workflow.logs.append(chatLog)
return chatLog
def storeWorkflowStat(self, workflow: Any, aiResponse: Any, process: str) -> ChatStat:
"""Persist workflow-level ChatStat from AiCallResponse and append to workflow stats list.
Billing is handled at the AI call source (interfaceAiObjects._callWithModel)
via billingCallback - not here. This method only handles workflow stats.
"""
try:
statData = {
"workflowId": workflow.id,
"process": process,
"engine": aiResponse.modelName,
"priceCHF": aiResponse.priceCHF,
"processingTime": aiResponse.processingTime,
"bytesSent": aiResponse.bytesSent,
"bytesReceived": aiResponse.bytesReceived,
"errorCount": aiResponse.errorCount
}
stat = self.interfaceDbChat.createStat(statData)
if not hasattr(workflow, 'stats') or workflow.stats is None:
workflow.stats = []
workflow.stats.append(stat)
return stat
except Exception as e:
logger.error(f"Failed to store workflow stat: {e}")
raise
def updateMessage(self, messageId: str, messageData: Dict[str, Any]):
"""Update message by delegating to the chat interface"""
try:
@ -2,90 +2,147 @@
# All rights reserved.
from typing import Any, Dict, List
import json
import logging
from modules.datamodels.datamodelExtraction import ContentPart
from ..subRegistry import Chunker
logger = logging.getLogger(__name__)
class StructureChunker(Chunker):
def chunk(self, part: ContentPart, options: Dict[str, Any]) -> list[Dict[str, Any]]:
maxBytes = int(options.get("structureChunkSize", 40000))
data = part.data or ""
# best-effort: try JSON list/object bucketing; else fallback to line-based
chunks: List[Dict[str, Any]] = []
try:
obj = json.loads(data)
self._chunkValue(obj, maxBytes, chunks)
except (json.JSONDecodeError, ValueError):
self._chunkByLines(data, maxBytes, chunks)
return chunks
def _chunkValue(self, obj: Any, maxBytes: int, chunks: List[Dict[str, Any]]):
"""Recursively chunk a JSON value (list or dict) into pieces <= maxBytes."""
text = json.dumps(obj, ensure_ascii=False)
if len(text.encode('utf-8')) <= maxBytes:
self._emit(obj, chunks)
return
if isinstance(obj, list):
self._chunkList(obj, maxBytes, chunks)
elif isinstance(obj, dict):
self._chunkDict(obj, maxBytes, chunks)
else:
self._chunkByLines(text, maxBytes, chunks)
def _chunkList(self, items: list, maxBytes: int, chunks: List[Dict[str, Any]]):
"""Split a JSON array into sub-arrays that each fit within maxBytes."""
bucket: list = []
bucketSize = 2 # "[]" overhead
for item in items:
itemText = json.dumps(item, ensure_ascii=False)
itemSize = len(itemText.encode('utf-8'))
separator = 2 if bucket else 0 # ", "
if bucketSize + itemSize + separator > maxBytes and bucket:
self._emit(bucket, chunks)
bucket = []
bucketSize = 2
separator = 0
if itemSize + 2 > maxBytes:
if bucket:
self._emit(bucket, chunks)
bucket = []
bucketSize = 2
self._chunkValue(item, maxBytes, chunks)
else:
bucket.append(item)
bucketSize += itemSize + separator
if bucket:
self._emit(bucket, chunks)
def _chunkDict(self, obj: dict, maxBytes: int, chunks: List[Dict[str, Any]]):
"""Split a JSON object by keys. If a single key's value exceeds maxBytes, recurse into it."""
if not obj:
return
if len(obj) == 1:
key, value = next(iter(obj.items()))
if isinstance(value, (list, dict)):
self._chunkSingleKeyValue(key, value, maxBytes, chunks)
else:
text = json.dumps(obj, ensure_ascii=False)
self._chunkByLines(text, maxBytes, chunks)
return
currentChunk: Dict[str, Any] = {}
currentSize = 2 # "{}" overhead
for key, value in obj.items():
itemText = json.dumps({key: value}, ensure_ascii=False)
itemSize = len(itemText.encode('utf-8'))
separator = 2 if currentChunk else 0
if currentSize + itemSize + separator > maxBytes and currentChunk:
self._emit(currentChunk, chunks)
currentChunk = {}
currentSize = 2
separator = 0
if itemSize + 2 > maxBytes:
if currentChunk:
self._emit(currentChunk, chunks)
currentChunk = {}
currentSize = 2
if isinstance(value, (list, dict)):
self._chunkSingleKeyValue(key, value, maxBytes, chunks)
else:
self._chunkByLines(itemText, maxBytes, chunks)
else:
currentChunk[key] = value
currentSize += itemSize + separator
if currentChunk:
self._emit(currentChunk, chunks)
def _chunkSingleKeyValue(self, key: str, value: Any, maxBytes: int, chunks: List[Dict[str, Any]]):
"""Handle a single dict key whose value is too large. Wraps sub-chunks back in {key: subChunk}."""
subChunks: List[Dict[str, Any]] = []
self._chunkValue(value, maxBytes, subChunks)
for sub in subChunks:
subData = json.loads(sub["data"])
wrapped = {key: subData}
wrappedText = json.dumps(wrapped, ensure_ascii=False)
wrappedSize = len(wrappedText.encode('utf-8'))
if wrappedSize <= maxBytes:
self._emit(wrapped, chunks)
else:
self._chunkByLines(wrappedText, maxBytes, chunks)
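# Hedged example (hypothetical payload): {"rows": [r0, ..., r99999]} splits
# into {"rows": [r0..rk]}, {"rows": [rk+1..rm]}, ... so every emitted chunk
# remains valid JSON under the original key.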
def _emit(self, bucket: Any, chunks: List[Dict[str, Any]]):
text = json.dumps(bucket, ensure_ascii=False)
chunks.append({"data": text, "size": len(text.encode('utf-8')), "order": len(chunks)})
def _chunkByLines(self, data: str, maxBytes: int, chunks: List[Dict[str, Any]]):
"""Line-based fallback for content that cannot be split structurally."""
current: List[str] = []
size = 0
for line in data.split('\n'):
s = len(line.encode('utf-8')) + 1
if size + s > maxBytes and current:
text = '\n'.join(current)
chunks.append({"data": text, "size": len(text.encode('utf-8')), "order": len(chunks)})
current = [line]
size = s
else:
current.append(line)
size += s
if current:
text = '\n'.join(current)
chunks.append({"data": text, "size": len(text.encode('utf-8')), "order": len(chunks)})
@ -243,11 +243,7 @@ class ExtractionService:
errorCount=0
)
self._get_service("chat").storeWorkflowStat(
self._context.workflow,
aiResponse,
f"extraction.process.{doc.mimeType}"
)
# Cost is recorded via billingCallback in _callWithModel
# Write extraction results to debug file
try:
@ -1230,15 +1226,52 @@ class ExtractionService:
logger.info(f"Chunking {contentPart.typeGroup} part: contentSize={contentSize} bytes, textChunkSize={textChunkSize} bytes, structureChunkSize={structureChunkSize} bytes")
chunks = chunker.chunk(contentPart, chunkingOptions)
logger.info(f"Created {len(chunks)} chunks for {contentPart.typeGroup} part (contentSize={contentSize} bytes)")
# Post-chunking validation: force line-based split on any chunk still exceeding target
validatedChunks = []
for i, chunk in enumerate(chunks):
chunkData = chunk.get('data', '')
chunkSize = len(chunkData.encode('utf-8')) if chunkData else 0
if chunkSize > availableContentBytes and chunkData:
logger.warning(f" Chunk {i+1}/{len(chunks)}: {chunkSize} bytes exceeds target {availableContentBytes} bytes, force-splitting by lines")
subChunks = self._forceLineSplit(chunkData, availableContentBytes, len(validatedChunks))
validatedChunks.extend(subChunks)
else:
chunk["order"] = len(validatedChunks)
validatedChunks.append(chunk)
if len(validatedChunks) != len(chunks):
logger.info(f"Post-chunking validation: {len(chunks)} -> {len(validatedChunks)} chunks after force-splitting oversized chunks")
for i, chunk in enumerate(validatedChunks):
chunkSize = len(chunk.get('data', '').encode('utf-8')) if chunk.get('data') else 0
logger.info(f" Chunk {i+1}/{len(validatedChunks)}: {chunkSize} bytes")
return validatedChunks
except Exception as e:
logger.error(f"Chunking failed for {contentPart.typeGroup}: {str(e)}")
return []
def _forceLineSplit(self, data: str, maxBytes: int, startOrder: int) -> List[Dict[str, Any]]:
"""Line-based safety-net split for chunks that still exceed maxBytes after structured chunking."""
chunks: List[Dict[str, Any]] = []
current: List[str] = []
size = 0
for line in data.split('\n'):
s = len(line.encode('utf-8')) + 1
if size + s > maxBytes and current:
text = '\n'.join(current)
chunks.append({"data": text, "size": len(text.encode('utf-8')), "order": startOrder + len(chunks)})
current = [line]
size = s
else:
current.append(line)
size += s
if current:
text = '\n'.join(current)
chunks.append({"data": text, "size": len(text.encode('utf-8')), "order": startOrder + len(chunks)})
return chunks
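# Hedged note: a single line longer than maxBytes still yields one oversized
# chunk here; line-based splitting cannot break inside a line.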
async def processContentPartWithFallback(self, contentPart, prompt: str, options, failoverModelList, aiObjects, progressCallback=None) -> AiCallResponse:
"""Process a single content part with model-aware chunking and fallback.
@ -1386,73 +1419,210 @@ class ExtractionService:
logger.warning(f"⚠️ Content part ({contentTokens:.0f} tokens est.) exceeds available space ({availableContentBytes/TOKEN_SAFETY_FACTOR:.0f} tokens est.), chunking required")
# If either condition fails, chunk the content
# CRITICAL: IMAGE_GENERATE operations should NOT use chunking
if (totalTokens > maxTotalTokens or partSize > availableContentBytes) and options.operationType != OperationTypeEnum.IMAGE_GENERATE:
# Part too large or total exceeds limit - chunk it (but not for image generation)
chunks = await self.chunkContentPartForAi(contentPart, model, options, prompt)
if not chunks:
raise ValueError(f"Failed to chunk content part for model {model.name}")
logger.info(f"Starting to process {len(chunks)} chunks with model {model.name}")
if progressCallback:
progressCallback(0.0, f"Starting to process {len(chunks)} chunks")
chunkResults = []
for idx, chunk in enumerate(chunks):
chunkNum = idx + 1
chunkData = chunk.get('data', '')
logger.info(f"Processing chunk {chunkNum}/{len(chunks)} with model {model.name}")
if progressCallback:
progressCallback(chunkNum / len(chunks), f"Processing chunk {chunkNum}/{len(chunks)}")
try:
chunkResponse = await aiObjects._callWithModel(model, prompt, chunkData, options)
chunkResults.append(chunkResponse)
except Exception as chunkError:
logger.error(f"Error processing chunk {chunkNum}/{len(chunks)}: {str(chunkError)}")
# Continue with other chunks even if one fails
continue
# Merge chunk results
if not chunkResults:
raise ValueError(f"All chunks failed for content part")
# Pass original contentPart to preserve typeGroup for all chunks (one-to-many: 1 part -> N chunks)
mergedContent = self.mergePartResults(chunkResults, options, [contentPart])
# Parallel chunk processing with per-chunk failover
remainingModels = failoverModelList[attempt:]
allChunkResults, allResponses = await self._processChunksParallel(
chunks, prompt, options, remainingModels, aiObjects, progressCallback
)
if not allResponses:
raise ValueError("All chunks failed for content part")
mergedContent = self.mergePartResults(allChunkResults, options, [contentPart])  # merge in original chunk order
# Stitch pass: reconcile cross-chunk artifacts when multiple chunks were processed
if len(allResponses) > 1:
mergedContent = await self._stitchChunkResults(
mergedContent, len(allResponses), prompt, options, aiObjects
)
return AiCallResponse(
content=mergedContent,
modelName=model.name,
provider=model.connectorType,
priceCHF=sum(r.priceCHF for r in allResponses),
processingTime=sum(r.processingTime for r in allResponses),
bytesSent=sum(r.bytesSent for r in allResponses),
bytesReceived=sum(r.bytesReceived for r in allResponses),
errorCount=sum(r.errorCount for r in allResponses)
)
else:
# Part fits - call AI directly
logger.info(f"Content part fits within model limits, processing directly")
response = await aiObjects._callWithModel(model, prompt, contentPart.data, options)
logger.info(f"Content part processed successfully with model: {model.name}")
logger.info(f"Content part processed successfully with model: {model.name}")
return response
except Exception as e:
lastError = e
error_msg = str(e) if str(e) else f"{type(e).__name__}"
logger.warning(f"Model {model.name} failed for content part: {error_msg}", exc_info=True)
logger.warning(f"Model {model.name} failed for content part: {error_msg}", exc_info=True)
if attempt < len(failoverModelList) - 1:
logger.info(f"🔄 Trying next failover model...")
logger.info(f"Trying next failover model...")
continue
else:
logger.error(f"💥 All {len(failoverModelList)} models failed for content part")
logger.error(f"All {len(failoverModelList)} models failed for content part")
break
# All models failed
return self._createErrorResponse(f"All models failed: {str(lastError)}", 0, 0)
async def _processChunksParallel(
self,
chunks: List[Dict[str, Any]],
prompt: str,
options,
failoverModels: list,
aiObjects,
progressCallback=None,
maxRetries: int = 3
) -> tuple:
"""Process chunks in parallel. On failure, re-chunk only the failed chunks for the next model.
Returns (orderedResponses, allResponses) where orderedResponses is a list of
AiCallResponse objects sorted by original chunk order and allResponses is a flat list.
"""
if not failoverModels:
return {}, []
pendingChunks = [(chunk.get("order", i), chunk) for i, chunk in enumerate(chunks)]
completedResults: Dict[float, AiCallResponse] = {}
allResponses: List[AiCallResponse] = []
retryCount = 0
modelIdx = 0
currentModel = failoverModels[modelIdx]
maxConcurrent = 3
semaphore = asyncio.Semaphore(maxConcurrent)
logger.info(f"Starting parallel chunk processing: {len(pendingChunks)} chunks with model {currentModel.name}")
while pendingChunks and retryCount <= maxRetries and currentModel:
modelForRound = currentModel
totalInRound = len(pendingChunks)
completedInRound = [0]
async def _processOneChunk(order: float, chunkData: str, model=modelForRound):
async with semaphore:
result = await aiObjects._callWithModel(model, prompt, chunkData, options)
completedInRound[0] += 1
if progressCallback:
progressCallback(completedInRound[0] / totalInRound, f"Chunk {completedInRound[0]}/{totalInRound} completed")
return result
tasks = {}
for order, chunk in pendingChunks:
chunkData = chunk.get('data', '')
tasks[order] = asyncio.create_task(_processOneChunk(order, chunkData))
if progressCallback:
progressCallback(0.0, f"Processing {len(tasks)} chunks in parallel with {currentModel.name}")
results = await asyncio.gather(*tasks.values(), return_exceptions=True)
failedChunks = []
for (order, chunk), result in zip(pendingChunks, results):
if isinstance(result, Exception):
logger.warning(f"Chunk order={order} failed with {currentModel.name}: {result}")
failedChunks.append((order, chunk))
else:
completedResults[order] = result
allResponses.append(result)
logger.info(f"Round {retryCount}: {len(pendingChunks) - len(failedChunks)}/{len(pendingChunks)} chunks succeeded with {currentModel.name}")
if not failedChunks:
break
retryCount += 1
modelIdx += 1
if modelIdx >= len(failoverModels):
logger.error(f"No more failover models available, {len(failedChunks)} chunks remain failed")
break
currentModel = failoverModels[modelIdx]
logger.info(f"Failover: re-chunking {len(failedChunks)} failed chunks for model {currentModel.name}")
newPending = []
for order, failedChunk in failedChunks:
reChunked = await self._reChunkForModel(failedChunk, currentModel, prompt, options)
for i, subChunk in enumerate(reChunked):
subOrder = order + i * 0.001
newPending.append((subOrder, subChunk))
pendingChunks = newPending
orderedResponses = [completedResults[k] for k in sorted(completedResults.keys())]
return orderedResponses, allResponses
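# Hedged illustration of the fractional sub-orders assigned in the failover
# loop above: a failed chunk with order=2 re-chunked into three pieces gets
# 2.000/2.001/2.002, so sorted(completedResults.keys()) keeps them between
# chunks 1 and 3, e.g. sorted([0, 1, 2.0, 2.001, 2.002, 3]).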
async def _reChunkForModel(self, chunk: Dict[str, Any], model, prompt: str, options) -> List[Dict[str, Any]]:
"""Re-chunk a single failed chunk according to the new model's context limits."""
chunkData = chunk.get('data', '')
tempPart = ContentPart(
id=f"rechunk_{uuid.uuid4().hex[:8]}",
label="re-chunk",
typeGroup="structure" if chunkData.strip().startswith(('{', '[')) else "text",
mimeType="application/json" if chunkData.strip().startswith(('{', '[')) else "text/plain",
data=chunkData
)
reChunked = await self.chunkContentPartForAi(tempPart, model, options, prompt)
if not reChunked:
return [chunk]
return reChunked
async def _stitchChunkResults(
self,
mergedContent: str,
chunkCount: int,
originalPrompt: str,
options,
aiObjects
) -> str:
"""Reconcile cross-chunk artifacts in merged content.
Only called when chunkCount > 1. Delegates to aiObjects.callWithTextContext
which handles model selection, failover, and billing.
"""
mergedSize = len(mergedContent.encode('utf-8')) if mergedContent else 0
stitchPrompt = (
"The following content was assembled from multiple independently processed "
f"chunks ({chunkCount} chunks) of the same document. "
"Review and fix ONLY these issues, preserving all content:\n"
"1. Cross-references that point to content from other chunks\n"
"2. Duplicate text at chunk boundaries (remove duplicates)\n"
"3. Sentences or paragraphs split mid-thought (reconnect them)\n"
"4. Inconsistent terminology for the same entity\n\n"
"Do NOT add, remove, or rephrase content beyond these fixes. "
"Return the corrected content in the same format.\n\n"
f"Original processing instruction (truncated): {originalPrompt[:500]}"
)
try:
logger.info(f"Running stitch pass on {mergedSize} bytes")
request = AiCallRequest(
prompt=stitchPrompt,
context=mergedContent,
options=AiCallOptions(operationType=OperationTypeEnum.DATA_ANALYSE)
)
response = await aiObjects.callWithTextContext(request)
if hasattr(response, 'errorCount') and response.errorCount > 0:
logger.warning(f"Stitch pass returned error: {response.content[:200] if response.content else 'empty'}")
return mergedContent
resultSize = len(response.content.encode('utf-8')) if response.content else 0
logger.info(f"Stitch pass completed: {mergedSize} -> {resultSize} bytes")
return response.content
except Exception as e:
logger.warning(f"Stitch pass failed (non-fatal), returning unstitched content: {e}")
return mergedContent
def _createErrorResponse(self, errorMsg: str, inputBytes: int, outputBytes: int) -> AiCallResponse:
"""Create an error response."""
return AiCallResponse(
@ -1521,9 +1691,18 @@ class ExtractionService:
progressCallback(0.1 + (partIndex / totalParts) * 0.8, f"Processing {partLabel} ({partType}) - {partIndex+1}/{totalParts}")
try:
# Process the part
partProgressCb = None
if progressCallback:
partStart = 0.1 + (partIndex / totalParts) * 0.8
partRange = 0.8 / totalParts
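# Factory pins this part's start/range so each callback does not late-bind
# the loop variables.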
def _makePartProgressCb(start, rangeSize):
def _cb(chunkProgress, message):
progressCallback(start + chunkProgress * rangeSize, message)
return _cb
partProgressCb = _makePartProgressCb(partStart, partRange)
partResult = await self.processContentPartWithFallback(
contentPart, prompt, options, failoverModelList, aiObjects, partProgressCb
)
# Write debug files for generation phase (section content generation)
@ -375,7 +375,7 @@ USER PROVIDED:
- Language: {language or "Not specified"}
Extract and provide a JSON response with:
1. instruction: Formulate a concise search query (MAXIMUM 400 characters) stating WHAT you want to find on the web. Do not include URLs in the instruction. Keep it focused on the core question. Good example: "What is the company Xyz doing?". Bad example: "Conduct web research on the company Xyz and find all information about..."
2. urls: Put list of URLs found in the prompt text, and URL's you know, that are relevant to the research
3. needsSearch: true if web search is needed to identify url's to crawl, false if only crawling of provided URLs is wanted
4. maxNumberPages: Recommended number of URLs to crawl (based on research scope, typical: 2-20)
@ -1,13 +1,18 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Service Hub.
Consumer-facing aggregation layer for services, DB interfaces, and runtime state.
Architecture:
- serviceHub delegates service resolution to serviceCenter (DI container)
- serviceHub owns DB interface initialization and runtime state
- serviceCenter knows nothing about serviceHub (one-way dependency)
Import rules:
- Central modules (like this one) must NOT import feature containers
- Feature-specific services are loaded dynamically
- Shared services are resolved via serviceCenter
"""
import os
@ -23,7 +28,6 @@ if TYPE_CHECKING:
logger = logging.getLogger(__name__)
# Path to feature containers
_FEATURES_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "features")
@ -54,15 +58,19 @@ class PublicService:
])
class ServiceHub:
"""
Consumer-facing aggregation of services, DB interfaces, and runtime state.
Services are lazy-resolved via serviceCenter on first access.
DB interfaces and runtime state are initialized eagerly.
Feature services/interfaces are discovered dynamically from features/.
"""
_SERVICE_CENTER_WRAPPING = {
"ai": {"functionsOnly": False},
}
def __init__(self, user: User, workflow: "ChatWorkflow" = None, mandateId: Optional[str] = None, featureInstanceId: Optional[str] = None):
self.user: User = user
self.workflow = workflow
@ -71,123 +79,89 @@ class Services:
self.currentUserPrompt: str = ""
self.rawUserPrompt: str = ""
# Initialize central interfaces
from modules.serviceCenter.context import ServiceCenterContext
self._serviceCenterContext = ServiceCenterContext(
user=user,
workflow=workflow,
mandate_id=mandateId,
feature_instance_id=featureInstanceId,
)
from modules.interfaces.interfaceDbApp import getInterface as getAppInterface
self.interfaceDbApp = getAppInterface(user, mandateId=mandateId)
from modules.interfaces.interfaceDbManagement import getInterface as getComponentInterface
self.interfaceDbComponent = getComponentInterface(user, mandateId=mandateId)
self.rbac = self.interfaceDbApp.rbac if self.interfaceDbApp else None
# ============================================================
# CENTRAL INTERFACE (Chat/Workflow)
# ============================================================
from modules.interfaces.interfaceDbChat import getInterface as getChatInterface
self.interfaceDbChat = getChatInterface(user, mandateId=mandateId, featureInstanceId=featureInstanceId)
# ============================================================
# FEATURE INTERFACES (dynamically loaded)
# ============================================================
self._loadFeatureInterfaces()
self._loadFeatureServices()
def __getattr__(self, name: str):
"""Lazy-resolve services via serviceCenter on first access."""
if name.startswith('_'):
raise AttributeError(name)
try:
from modules.serviceCenter import getService
service = getService(name, self._serviceCenterContext)
wrapping = self._SERVICE_CENTER_WRAPPING.get(name, {})
functionsOnly = wrapping.get("functionsOnly", True)
wrapped = PublicService(service, functionsOnly=functionsOnly)
setattr(self, name, wrapped)
return wrapped
except KeyError:
raise AttributeError(f"'{type(self).__name__}' has no attribute '{name}'")
def _loadFeatureInterfaces(self):
"""Dynamically load interfaces from feature containers by filename pattern."""
# Find all interfaceFeature*.py files
pattern = os.path.join(_FEATURES_DIR, "*", "interfaceFeature*.py")
for filepath in glob.glob(pattern):
try:
# Extract feature name and interface name
featureDir = os.path.basename(os.path.dirname(filepath))
filename = os.path.basename(filepath)[:-3]
modulePath = f"modules.features.{featureDir}.{filename}"
module = importlib.import_module(modulePath)
# Get interface via getInterface()
if hasattr(module, "getInterface"):
interface = module.getInterface(self.user, mandateId=self.mandateId, featureInstanceId=self.featureInstanceId)
# Derive attribute name: interfaceFeatureAiChat -> interfaceDbChat
attrName = filename.replace("interfaceFeature", "interfaceDb")
setattr(self, attrName, interface)
logger.debug(f"Loaded interface: {attrName} from {modulePath}")
except Exception as e:
logger.debug(f"Could not load interface from {filepath}: {e}")
def _loadFeatureServices(self):
"""Dynamically load services from feature containers by filename pattern."""
# Find all service*/mainService*.py files in feature containers
pattern = os.path.join(_FEATURES_DIR, "*", "service*", "mainService*.py")
for filepath in glob.glob(pattern):
try:
# Extract paths
serviceDir = os.path.basename(os.path.dirname(filepath))
featureDir = os.path.basename(os.path.dirname(os.path.dirname(filepath)))
filename = os.path.basename(filepath)[:-3]
modulePath = f"modules.features.{featureDir}.{serviceDir}.{filename}"
module = importlib.import_module(modulePath)
# Find service class (ends with "Service")
serviceClass = None
for attrName in dir(module):
if attrName.endswith("Service") and not attrName.startswith("_"):
cls = getattr(module, attrName)
if isinstance(cls, type):
serviceClass = cls
break
if serviceClass:
# Derive attribute name: serviceAi -> ai, serviceExtraction -> extraction
attrName = serviceDir.replace("service", "").lower()
if not attrName:
attrName = serviceDir.lower()
# Check if it needs functionsOnly=False (for AI service)
functionsOnly = attrName != "ai"
serviceInstance = serviceClass(self)
setattr(self, attrName, PublicService(serviceInstance, functionsOnly=functionsOnly))
logger.debug(f"Loaded service: {attrName} from {modulePath}")
@ -195,6 +169,10 @@ class Services:
logger.debug(f"Could not load service from {filepath}: {e}")
# Backward-compatible alias
Services = ServiceHub
def getInterface(user: User, workflow: "ChatWorkflow" = None, mandateId: Optional[str] = None, featureInstanceId: Optional[str] = None) -> ServiceHub:
"""Get ServiceHub instance for the given user, mandate, and feature instance context."""
return ServiceHub(user, workflow, mandateId=mandateId, featureInstanceId=featureInstanceId)
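A hedged consumer sketch (module path as introduced by this commit; user, workflow, and opId are placeholders):
from modules.serviceHub import getInterface
services = getInterface(user, workflow)  # ServiceHub; the Services alias still works
services.chat.progressLogUpdate(opId, 0.5, "step")  # chat is lazy-resolved via serviceCenter on first access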
@ -1,166 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
AIChat Feature Container - Main Module.
Handles feature initialization and RBAC catalog registration.
AIChat is the dynamic chat workflow feature that handles:
- AI-powered document processing
- Dynamic workflow execution
- Automation definitions
"""
import logging
from typing import Dict, List, Any
logger = logging.getLogger(__name__)
# Feature metadata
FEATURE_CODE = "chatworkflow"
FEATURE_LABEL = {"en": "Chat Workflow", "de": "Chat-Workflow", "fr": "Workflow de Chat"}
FEATURE_ICON = "mdi-message-cog"
# UI Objects for RBAC catalog
UI_OBJECTS = [
{
"objectKey": "ui.feature.aichat.workflows",
"label": {"en": "Workflows", "de": "Workflows", "fr": "Workflows"},
"meta": {"area": "workflows"}
},
{
"objectKey": "ui.feature.aichat.automations",
"label": {"en": "Automations", "de": "Automatisierungen", "fr": "Automatisations"},
"meta": {"area": "automations"}
},
{
"objectKey": "ui.feature.aichat.logs",
"label": {"en": "Logs", "de": "Logs", "fr": "Journaux"},
"meta": {"area": "logs"}
},
]
# Resource Objects for RBAC catalog
RESOURCE_OBJECTS = [
{
"objectKey": "resource.feature.aichat.workflow.start",
"label": {"en": "Start Workflow", "de": "Workflow starten", "fr": "Démarrer workflow"},
"meta": {"endpoint": "/api/chat/playground/start", "method": "POST"}
},
{
"objectKey": "resource.feature.aichat.workflow.stop",
"label": {"en": "Stop Workflow", "de": "Workflow stoppen", "fr": "Arrêter workflow"},
"meta": {"endpoint": "/api/chat/playground/stop/{workflowId}", "method": "POST"}
},
{
"objectKey": "resource.feature.aichat.workflow.delete",
"label": {"en": "Delete Workflow", "de": "Workflow löschen", "fr": "Supprimer workflow"},
"meta": {"endpoint": "/api/chat/playground/workflow/{workflowId}", "method": "DELETE"}
},
]
# Template roles for this feature
TEMPLATE_ROLES = [
{
"roleLabel": "workflow-admin",
"description": {
"en": "Workflow Administrator - Full access to workflow configuration and execution",
"de": "Workflow-Administrator - Vollzugriff auf Workflow-Konfiguration und Ausführung",
"fr": "Administrateur workflow - Accès complet à la configuration et exécution"
}
},
{
"roleLabel": "workflow-editor",
"description": {
"en": "Workflow Editor - Create and modify workflows",
"de": "Workflow-Editor - Workflows erstellen und bearbeiten",
"fr": "Éditeur workflow - Créer et modifier les workflows"
}
},
{
"roleLabel": "workflow-viewer",
"description": {
"en": "Workflow Viewer - View workflows and execution results",
"de": "Workflow-Betrachter - Workflows und Ausführungsergebnisse einsehen",
"fr": "Visualiseur workflow - Consulter les workflows et résultats"
}
},
]
def getFeatureDefinition() -> Dict[str, Any]:
"""Return the feature definition for registration."""
return {
"code": FEATURE_CODE,
"label": FEATURE_LABEL,
"icon": FEATURE_ICON
}
def getUiObjects() -> List[Dict[str, Any]]:
"""Return UI objects for RBAC catalog registration."""
return UI_OBJECTS
def getResourceObjects() -> List[Dict[str, Any]]:
"""Return resource objects for RBAC catalog registration."""
return RESOURCE_OBJECTS
def getTemplateRoles() -> List[Dict[str, Any]]:
"""Return template roles for this feature."""
return TEMPLATE_ROLES
def registerFeature(catalogService) -> bool:
"""
Register this feature's RBAC objects in the catalog.
Args:
catalogService: The RBAC catalog service instance
Returns:
True if registration was successful
"""
try:
# Register UI objects
for uiObj in UI_OBJECTS:
catalogService.registerUiObject(
featureCode=FEATURE_CODE,
objectKey=uiObj["objectKey"],
label=uiObj["label"],
meta=uiObj.get("meta")
)
# Register Resource objects
for resObj in RESOURCE_OBJECTS:
catalogService.registerResourceObject(
featureCode=FEATURE_CODE,
objectKey=resObj["objectKey"],
label=resObj["label"],
meta=resObj.get("meta")
)
logger.info(f"Feature '{FEATURE_CODE}' registered {len(UI_OBJECTS)} UI objects and {len(RESOURCE_OBJECTS)} resource objects")
return True
except Exception as e:
logger.error(f"Failed to register feature '{FEATURE_CODE}': {e}")
return False
async def onStart(eventUser) -> None:
"""
Called when the feature container starts.
Initializes AI connectors for model registry.
"""
try:
from modules.aicore.aicoreModelRegistry import modelRegistry
modelRegistry.ensureConnectorsRegistered()
logger.info(f"Feature '{FEATURE_CODE}' started - AI connectors initialized")
except Exception as e:
logger.error(f"Feature '{FEATURE_CODE}' failed to initialize AI connectors: {e}")
async def onStop(eventUser) -> None:
"""Called when the feature container stops."""
logger.info(f"Feature '{FEATURE_CODE}' stopped")
File diff suppressed because it is too large
@ -1,513 +0,0 @@
================================================================================
JSON MERGE OPERATION #1
================================================================================
Timestamp: 2026-01-06T22:24:33.405726
INPUT:
Accumulated length: 40250 chars
New Fragment length: 2471 chars
Accumulated: 373 lines (showing first 5 and last 5)
{
"elements": [
{
"type": "table",
"content": {
... (363 lines omitted) ...
["04.12.25", "05.12.25", "TICKETCORNER CH, RUEMLANG", "CH", "285.70", "", "**** **** **** 1234"],
["05.12.25", "05.12.25", "NOII.CH DATING, WINTERTHUR", "CH", "74.10", "", "**** **** **** 1234"],
["06.12.25", "08.12.25", "TEMU.COM, BASEL", "CH", "72.50", "", "**** **** **** 1234"],
["06.12.25", "08.12.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "326.85", "", "**** **** **** 1234"],
["06.12.25", "08.12.25", "Decathlon, Hin
New Fragment: 33 lines (showing first 5 and last 5)
```json
["06.12.25", "08.12.25", "Decathlon, Hinwil", "CH", "150.00", "", "**** **** **** 1234"],
["07.12.25", "09.12.25", "Migros-Genossenschafts-Bund, Zürich", "CH", "200.00", "", "**** **** **** 1234"],
["08.12.25", "10.12.25", "Zürich HB, Zürich", "CH", "45.00", "", "**** **** **** 1234"],
["09.12.25", "11.12.25", "Amazon Marketplace, amazon.de", "DE", "120.00", "", "**** **** **** 1234"],
... (23 lines omitted) ...
}
}
]
}
```
Normalized Accumulated (40250 chars)
(showing first 5 and last 5 of 373 lines)
{
"elements": [
{
"type": "table",
"content": {
... (363 lines omitted) ...
["04.12.25", "05.12.25", "TICKETCORNER CH, RUEMLANG", "CH", "285.70", "", "**** **** **** 1234"],
["05.12.25", "05.12.25", "NOII.CH DATING, WINTERTHUR", "CH", "74.10", "", "**** **** **** 1234"],
["06.12.25", "08.12.25", "TEMU.COM, BASEL", "CH", "72.50", "", "**** **** **** 1234"],
["06.12.25", "08.12.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "326.85", "", "**** **** **** 1234"],
["06.12.25", "08.12.25", "Decathlon, Hin
Normalized New Fragment (2459 chars)
(showing first 5 and last 5 of 31 lines)
["06.12.25", "08.12.25", "Decathlon, Hinwil", "CH", "150.00", "", "**** **** **** 1234"],
["07.12.25", "09.12.25", "Migros-Genossenschafts-Bund, Zürich", "CH", "200.00", "", "**** **** **** 1234"],
["08.12.25", "10.12.25", "Zürich HB, Zürich", "CH", "45.00", "", "**** **** **** 1234"],
["09.12.25", "11.12.25", "Amazon Marketplace, amazon.de", "DE", "120.00", "", "**** **** **** 1234"],
["10.12.25", "12.12.25", "IKEA, Dietlikon", "CH", "350.00", "", "**** **** **** 1234"],
... (21 lines omitted) ...
]
}
}
]
}
STEP: PHASE 1
Description: Finding overlap between JSON strings
⏳ In progress...
Overlap Detection (string (exact)):
Overlap length: 40
✅ Found overlap of 40 chars
Accumulated suffix (COMPLETE, 40 chars):
============================================================================
["06.12.25", "08.12.25", "Decathlon, Hin
============================================================================
Fragment prefix (40 chars, 1 lines)
["06.12.25", "08.12.25", "Decathlon, Hin
Overlap found (40 chars):
Accumulated suffix: ["06.12.25", "08.12.25", "Decathlon, Hin
Fragment prefix: ["06.12.25", "08.12.25", "Decathlon, Hin
STEP: PHASE 2
Description: Merging strings (overlap: 40 chars)
⏳ In progress...
Merged String (42669 chars)
(showing first 5 and last 5 of 403 lines)
{
"elements": [
{
"type": "table",
"content": {
... (393 lines omitted) ...
]
}
}
]
}
STEP: PHASE 3
Description: Returning merged string (may be unclosed)
⏳ In progress...
Returning merged string (preserving incomplete element at end for next iteration)
================================================================================
MERGE RESULT: ✅ SUCCESS
================================================================================
Final result length: 42669 chars
Final result (COMPLETE):
================================================================================
{
"elements": [
{
"type": "table",
"content": {
"headers": [
"Date",
"Valuta",
"Details",
"Currency",
"Amount",
"Amount in CHF",
"Maskierte Kreditkarte"
],
"rows": [
["12.09.25", "15.09.25", "Coop-1911 Ruti, Ruti ZH", "CH", "102.05", "", "**** **** **** 1234"],
["13.09.25", "15.09.25", "food & drive GmbH, Durnten", "CH", "26.20", "", "**** **** **** 1234"],
["13.09.25", "15.09.25", "food & drive GmbH, Durnten", "CH", "4.50", "", "**** **** **** 1234"],
["13.09.25", "15.09.25", "Gartencenter Meier, Durnten", "CH", "88.40", "", "**** **** **** 1234"],
["13.09.25", "15.09.25", "Google PixVerse AI Vi, 650-2530000", "US", "5.20", "", "**** **** **** 1234"],
["14.09.25", "15.09.25", "THE ARTISAN ZUERICH, ZUERICH", "CH", "15.00", "", "**** **** **** 1234"],
["14.09.25", "15.09.25", "THE ARTISAN ZUERICH, ZUERICH", "CH", "15.00", "", "**** **** **** 1234"],
["14.09.25", "15.09.25", "THE ARTISAN ZUERICH, ZUERICH", "CH", "18.00", "", "**** **** **** 1234"],
["14.09.25", "15.09.25", "KONDITOREI VOLAND WALD, WALD ZH", "CH", "16.50", "", "**** **** **** 1234"],
["14.09.25", "15.09.25", "GITHUB, INC., GITHUB.COM", "US", "USD 0.02", "0.00", "**** **** **** 1234"],
["15.09.25", "16.09.25", "Coop-1252 Wald, Wald ZH", "CH", "50.80", "", "**** **** **** 1234"],
["15.09.25", "16.09.25", "CLAUDE.AI SUBSCRIPTION, ANTHROPIC.COM", "US", "USD 108.10", "88.60", "**** **** **** 1234"],
["16.09.25", "17.09.25", "Shell Waldhof, Wald ZH", "CH", "113.35", "", "**** **** **** 1234"],
["16.09.25", "17.09.25", "Shell Waldhof, Wald ZH", "CH", "3.60", "", "**** **** **** 1234"],
["18.09.25", "19.09.25", "Coop-4991 Fallanden, Fallanden", "CH", "116.00", "", "**** **** **** 1234"],
["18.09.25", "19.09.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "5.95", "", "**** **** **** 1234"],
["18.09.25", "19.09.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "7.00", "", "**** **** **** 1234"],
["19.09.25", "22.09.25", "Migros M Dubendorf Stettb, Dubendorf", "CH", "32.10", "", "**** **** **** 1234"],
["19.09.25", "22.09.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "14.80", "", "**** **** **** 1234"],
["19.09.25", "22.09.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "370.65", "", "**** **** **** 1234"],
["19.09.25", "22.09.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "11.50", "", "**** **** **** 1234"],
["20.09.25", "22.09.25", "Google PixVerse AI Vi, 650-2530000", "US", "5.20", "", "**** **** **** 1234"],
["21.09.25", "22.09.25", "Kreuzwirt, Weissensee", "AT", "EUR 278.00", "266.50", "**** **** **** 1234"],
["23.09.25", "24.09.25", "FILIALE, WALD ZH", "CH", "EUR 500.00", "492.15", "**** **** **** 1234"],
["24.09.25", "25.09.25", "P2 Parkhaus Ein- & Ausfah, Zurich", "CH", "5.00", "", "**** **** **** 1234"],
["24.09.25", "25.09.25", "A.I.R. Bakery, Zurich", "CH", "18.60", "", "**** **** **** 1234"],
["24.09.25", "25.09.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "23.35", "", "**** **** **** 1234"],
["25.09.25", "26.09.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "203.20", "", "**** **** **** 1234"],
["25.09.25", "26.09.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "44.10", "", "**** **** **** 1234"],
["26.09.25", "29.09.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "95.25", "", "**** **** **** 1234"],
["26.09.25", "29.09.25", "Puls Apotheke & Drogerie, Hinwil", "CH", "140.60", "", "**** **** **** 1234"],
["26.09.25", "29.09.25", "FILIALE, WALD ZH", "CH", "CHF 280.00", "287.00", "**** **** **** 1234"],
["27.09.25", "29.09.25", "NYX*LullySA, Lully", "CH", "1.00", "", "**** **** **** 1234"],
["27.09.25", "29.09.25", "Kisoque de Lully, Lully", "CH", "5.70", "", "**** **** **** 1234"],
["27.09.25", "29.09.25", "TOTAL MKT FR, NANTERRE", "FR", "EUR 79.95", "76.90", "**** **** **** 1234"],
["27.09.25", "29.09.25", "AREA NFC 4261525, 69BRON CEDEX", "FR", "EUR 33.50", "32.20", "**** **** **** 1234"],
["27.09.25", "29.09.25", "HOLIDAY APARTMENTS, PORT SAPLAYA", "ES", "EUR 1'118.15", "1'075.45", "**** **** **** 1234"],
["27.09.25", "29.09.25", "LE BISTROT DEL M, MEZE", "FR", "EUR 210.20", "202.15", "**** **** **** 1234"],
["27.09.25", "29.09.25", "Google PixVerse AI Vi, 650-2530000", "US", "5.20", "", "**** **** **** 1234"],
["28.09.25", "29.09.25", "SELVA HOSTELERIA, MACANET DE LA", "ES", "EUR 2.40", "2.30", "**** **** **** 1234"],
["28.09.25", "29.09.25", "E.S. LA SELVA, COMAJULIANA", "ES", "EUR 90.09", "86.65", "**** **** **** 1234"],
["28.09.25", "29.09.25", "E.S. LA SELVA, COMAJULIANA", "ES", "EUR 4.70", "4.50", "**** **** **** 1234"],
["28.09.25", "29.09.25", "E.S. LA SELVA, COMAJULIANA", "ES", "EUR 8.40", "8.10", "**** **** **** 1234"],
["28.09.25", "29.09.25", "AUTOROUTES ASF, VEDENE CEDEX", "FR", "EUR 15.60", "15.00", "**** **** **** 1234"],
["27.09.25", "29.09.25", "AUTOROUTES ASF, VEDENE CEDEX", "FR", "EUR 24.40", "23.45", "**** **** **** 1234"],
["29.09.25", "30.09.25", "OROMARKET SUPERMERCADOS, OROPESA", "ES", "EUR 17.32", "16.65", "**** **** **** 1234"],
["29.09.25", "30.09.25", "QUESADA CENTER SUPERMERCA, OROPESA", "ES", "EUR 40.40", "38.85", "**** **** **** 1234"],
["29.09.25", "30.09.25", "QUESADA CENTER SUPERMERCA, OROPESA", "ES", "EUR 22.55", "21.70", "**** **** **** 1234"],
["29.09.25", "30.09.25", "ALDI OROPESA, OROPESA", "ES", "EUR 129.39", "124.40", "**** **** **** 1234"],
["30.09.25", "01.10.25", "QUESADA CENTER, OROPESA DEL M", "ES", "EUR 84.05", "80.95", "**** **** **** 1234"],
["30.09.25", "01.10.25", "PASSION CREPES, OROPESA", "ES", "EUR 10.30", "9.90", "**** **** **** 1234"],
["30.09.25", "01.10.25", "MERCADONA MARINA DOR, ORPESA DEL MA", "ES", "EUR 17.53", "16.90", "**** **** **** 1234"],
["30.09.25", "01.10.25", "Restaurante DRAGON, OROPESA", "ES", "EUR 75.00", "72.25", "**** **** **** 1234"],
["30.09.25", "01.10.25", "OPENAI *CHATGPT SUBSCR, OPENAI.COM", "US", "USD 216.20", "177.55", "**** **** **** 1234"],
["01.10.25", "02.10.25", "GOOGLE *ADS5192965135, cc§google.com", "IE", "29.60", "", "**** **** **** 1234"],
["01.10.25", "02.10.25", "RTE PUERTA DEL SOL, OROPESA DEL M", "ES", "EUR 169.20", "163.10", "**** **** **** 1234"],
["01.10.25", "02.10.25", "TREN TURISTICO OROPESA, OROPESA DEL M", "ES", "EUR 15.00", "14.45", "**** **** **** 1234"],
["01.10.25", "02.10.25", "LANGDOCK GMBH, BERLIN", "DE", "EUR 25.00", "24.10", "**** **** **** 1234"],
["01.10.25", "02.10.25", "WWW.PERPLEXITY.AI, WWW.PERPLEXIT", "US", "USD 10.81", "8.90", "**** **** **** 1234"],
["02.10.25", "06.10.25", "GOOGLE *YouTubePremium, g.co/helppay#", "GB", "33.90", "", "**** **** **** 1234"],
["02.10.25", "06.10.25", "WILLY LA CONCHA, OROPESA DEL M", "ES", "EUR 98.93", "95.40", "**** **** **** 1234"],
["03.10.25", "06.10.25", "Netflix.com, Los Gatos", "NL", "20.90", "", "**** **** **** 1234"],
["03.10.25", "06.10.25", "COALIMENT LA CONCHA, OROPESA DEL M", "ES", "EUR 11.74", "11.30", "**** **** **** 1234"],
["03.10.25", "06.10.25", "DONA RESU, OROPESA", "ES", "EUR 7.30", "7.05", "**** **** **** 1234"],
["04.10.25", "06.10.25", "MERCADONA MARINA DOR, ORPESA DEL MA", "ES", "EUR 89.50", "86.30", "**** **** **** 1234"],
["04.10.25", "06.10.25", "QUESADA CENTER SUPERMERCA, OROPESA", "ES", "EUR 8.45", "8.15", "**** **** **** 1234"],
["04.10.25", "06.10.25", "HELADERIA LAS DELICIAS, OROPESA DEL M", "ES", "EUR 10.80", "10.40", "**** **** **** 1234"],
["04.10.25", "06.10.25", "REST. BISTROT, OROPESA DEL M", "ES", "EUR 117.90", "113.70", "**** **** **** 1234"],
["04.10.25", "06.10.25", "Google PixVerse AI Vi, 650-2530000", "US", "5.20", "", "**** **** **** 1234"],
["04.10.25", "06.10.25", "Google Duolingo Langu, 650-2530000", "US", "10.00", "", "**** **** **** 1234"],
["05.10.25", "06.10.25", "HABANA, OROPESA", "ES", "EUR 3.00", "2.90", "**** **** **** 1234"],
["05.10.25", "06.10.25", "HABANA, OROPESA", "ES", "EUR 9.00", "8.70", "**** **** **** 1234"],
["05.10.25", "06.10.25", "RESTAURANTE, ORPESA", "ES", "EUR 87.75", "84.60", "**** **** **** 1234"],
["05.10.25", "06.10.25", "HABANA, OROPESA", "ES", "EUR 15.50", "14.95", "**** **** **** 1234"],
["06.10.25", "07.10.25", "HABANA, OROPESA", "ES", "EUR 25.00", "24.05", "**** **** **** 1234"],
["06.10.25", "07.10.25", "QUESADA CENTER SUPERMERCA, OROPESA", "ES", "EUR 3.95", "3.80", "**** **** **** 1234"],
["06.10.25", "07.10.25", "QUESADA CENTER SUPERMERCA, OROPESA", "ES", "EUR 47.75", "45.95", "**** **** **** 1234"],
["07.10.25", "08.10.25", "MAGIC SPORT HALL OLYMPICS, OROPESA DEL M", "ES", "EUR 183.75", "176.70", "**** **** **** 1234"],
["07.10.25", "08.10.25", "BKG*HOTEL AT BOOKING.C, (888)850-3958", "NL", "EUR 172.55", "165.90", "**** **** **** 1234"],
["07.10.25", "08.10.25", "Wondershare, Hong Kong", "HK", "25.95", "", "**** **** **** 1234"],
["07.10.25", "08.10.25", "MERCADONA MARINA DOR, ORPESA DEL MA", "ES", "EUR 99.13", "95.30", "**** **** **** 1234"],
["07.10.25", "08.10.25", "RECEP HOTEL MAGIC SPORTS, OROPESA DEL M", "ES", "EUR 10.00", "9.60", "**** **** **** 1234"],
["07.10.25", "08.10.25", "Google One, 650-2530000", "US", "10.00", "", "**** **** **** 1234"],
["08.10.25", "09.10.25", "E.S.LA CLARIANA, MADRID", "ES", "EUR 98.07", "94.00", "**** **** **** 1234"],
["08.10.25", "09.10.25", "AUTOROUTES ASF, VEDENE CEDEX", "FR", "EUR 44.20", "42.35", "**** **** **** 1234"],
["08.10.25", "09.10.25", "A.R.E.A., 69671", "FR", "EUR 11.20", "10.75", "**** **** **** 1234"],
["09.10.25", "10.10.25", "SOCAR station-service, Bursins", "CH", "113.10", "", "**** **** **** 1234"],
["09.10.25", "10.10.25", "SOCAR station-service, Bursins", "CH", "6.80", "", "**** **** **** 1234"],
["09.10.25", "10.10.25", "A.R.E.A., 69671", "FR", "EUR 15.00", "14.40", "**** **** **** 1234"],
["08.10.25", "10.10.25", "DOMAINE DE ROZAN, LA TRONCHE", "FR", "EUR 110.00", "105.45", "**** **** **** 1234"],
["09.10.25", "10.10.25", "DOMAINE DE ROZAN, LA TRONCHE", "FR", "EUR 40.00", "38.35", "**** **** **** 1234"],
["10.10.25", "13.10.25", "Coop-1252 Wald, Wald ZH", "CH", "164.85", "", "**** **** **** 1234"],
["10.10.25", "13.10.25", "CURSOR, AI POWERED IDE, CURSOR.COM", "US", "USD 20.00", "16.60", "**** **** **** 1234"],
["11.10.25", "13.10.25", "Google PixVerse AI Vi, 650-2530000", "US", "5.20", "", "**** **** **** 1234"],
["12.10.25", "13.10.25", "Cafe Konditorei Voland, Laupen ZH", "CH", "37.70", "", "**** **** **** 1234"],
["12.10.25", "13.10.25", "KONDITOREI VOLAND LAUP, LAUPEN ZH", "CH", "17.35", "", "**** **** **** 1234"],
["12.10.25", "13.10.25", "KONDITOREI VOLAND LAUP, LAUPEN ZH", "CH", "5.40", "", "**** **** **** 1234"],
["12.09.25", "15.09.25", "SBB Bahnhof Ruti ZH, Ruti ZH", "CH", "54.00", "", "**** **** **** 1234"],
["12.09.25", "15.09.25", "Rest Volkshaus, Zurich", "CH", "18.00", "", "**** **** **** 1234"],
["12.09.25", "15.09.25", "Sora Sushi - HB Zurich, Zurich", "CH", "74.00", "", "**** **** **** 1234"],
["12.09.25", "15.09.25", "176.32 avec, Ruti ZH", "CH", "2.45", "", "**** **** **** 1234"],
["12.09.25", "15.09.25", "Baradox AG, Zurich", "CH", "15.00", "", "**** **** **** 1234"],
["12.09.25", "15.09.25", "Volkshausstiftung Zurich, Zurich", "CH", "3.00", "", "**** **** **** 1234"],
["13.09.25", "15.09.25", "SBB Bahnhof Ruti ZH, Ruti ZH", "CH", "9.20", "", "**** **** **** 1234"],
["13.09.25", "15.09.25", "SBB Bahnhof Wald, Wald ZH", "CH", "27.00", "", "**** **** **** 1234"],
["13.09.25", "15.09.25", "EXIL GMBH, ZUERICH", "CH", "14.00", "", "**** **** **** 1234"],
["13.09.25", "15.09.25", "EXIL GMBH, ZUERICH", "CH", "14.00", "", "**** **** **** 1234"],
["13.09.25", "15.09.25", "URBAN FOOD CLUTURE GMB, ZURICH", "CH", "135.00", "", "**** **** **** 1234"],
["14.09.25", "15.09.25", "Google One, 650-2530000", "US", "100.00", "", "**** **** **** 1234"],
["15.09.25", "16.09.25", "Ex Libris AG, Dietikon", "CH", "13.00", "", "**** **** **** 1234"],
["15.09.25", "16.09.25", "Coop-1252 Wald, Wald ZH", "CH", "51.45", "", "**** **** **** 1234"],
["16.09.25", "17.09.25", "Shell Waldhof, Wald ZH", "CH", "5.80", "", "**** **** **** 1234"],
["19.09.25", "22.09.25", "LS Pirates AG, Hinwil", "CH", "16.05", "", "**** **** **** 1234"],
["20.09.25", "22.09.25", "LS Pirates AG, Hinwil", "CH", "14.60", "", "**** **** **** 1234"],
["20.09.25", "22.09.25", "LS Pirates AG, Hinwil", "CH", "13.55", "", "**** **** **** 1234"],
["20.09.25", "22.09.25", "LS Pirates AG, Hinwil", "CH", "13.90", "", "**** **** **** 1234"],
["20.09.25", "22.09.25", "Coop-1252 Wald, Wald ZH", "CH", "60.75", "", "**** **** **** 1234"],
["20.09.25", "22.09.25", "MORE BAR GMBH, BUBIKON", "CH", "70.00", "", "**** **** **** 1234"],
["21.09.25", "22.09.25", "LS Pirates AG, Hinwil", "CH", "6.40", "", "**** **** **** 1234"],
["21.09.25", "22.09.25", "LS Pirates AG, Hinwil", "CH", "4.20", "", "**** **** **** 1234"],
["21.09.25", "22.09.25", "LS Pirates AG, Hinwil", "CH", "13.45", "", "**** **** **** 1234"],
["22.09.25", "23.09.25", "Migros M Wald, Wald ZH", "CH", "16.80", "", "**** **** **** 1234"],
["22.09.25", "23.09.25", "BLEICHI + HOTEL, WALD", "CH", "43.00", "", "**** **** **** 1234"],
["23.09.25", "24.09.25", "Coop-1252 Wald, Wald ZH", "CH", "155.75", "", "**** **** **** 1234"],
["24.09.25", "25.09.25", "BKG*HOTEL AT BOOKING.C, (888)850-3958", "NL", "EUR 177.35", "170.35", "**** **** **** 1234"],
["27.09.25", "29.09.25", "APODRO APOTHEKE WALD, WALD ZH", "CH", "21.50", "", "**** **** **** 1234"],
["28.09.25", "29.09.25", "SELVA HOSTELERIA, MACANET DE LA", "ES", "15.75", "", "**** **** **** 1234"],
["28.09.25", "29.09.25", "AREAS LA SELVA, BARCELONA", "ES", "EUR 19.11", "18.40", "**** **** **** 1234"],
["02.10.25", "06.10.25", "GOOGLE *YouTube Member, g.co/helppay#", "GB", "15.00", "", "**** **** **** 1234"],
["01.10.25", "06.10.25", "Eventfrog.c 737909203525, Olten", "CH", "114.95", "", "**** **** **** 1234"],
["06.10.25", "07.10.25", "digitec Galaxus (Online), Zurich", "CH", "23.80", "", "**** **** **** 1234"],
["08.10.25", "09.10.25", "E.S.LA CLARIANA, MADRID", "ES", "EUR 29.58", "28.35", "**** **** **** 1234"],
["10.10.25", "13.10.25", "B2BAND.COM, BRATISLAVA", "SK", "72.45", "", "**** **** **** 1234"],
["10.10.25", "13.10.25", "Ticketcorner*89987227, 410900800800", "CH", "199.80", "", "**** **** **** 1234"],
["10.10.25", "13.10.25", "SP NORAYA, RUMISBERG", "CH", "79.90", "", "**** **** **** 1234"],
["11.10.25", "13.10.25", "B2BAND.COM, BRATISLAVA", "SK", "139.95", "", "**** **** **** 1234"],
["11.10.25", "13.10.25", "TEMU.COM, BASEL", "CH", "81.20", "", "**** **** **** 1234"],
["11.10.25", "13.10.25", "NOII.CH DATING, WINTERTHUR", "CH", "74.10", "", "**** **** **** 1234"],
["12.10.25", "13.10.25", "Rest Volkshaus, Zurich", "CH", "9.00", "", "**** **** **** 1234"],
["12.10.25", "13.10.25", "Shell Heuberg, Forch", "CH", "100.10", "", "**** **** **** 1234"],
["12.10.25", "13.10.25", "Parkhaus Helvetiaplatz, Zurich", "CH", "8.00", "", "**** **** **** 1234"],
["14.10.25", "15.10.25", "P2 Parkhaus Ein- & Ausfah, Zurich CH", "CHF", "5.00", "", "**** **** **** 1234"],
["14.10.25", "15.10.25", "Migros Zurich Airport, Zurich CH", "CHF", "16.35", "", "**** **** **** 1234"],
["14.10.25", "15.10.25", "GITHUB, INC., GITHUB.COM US", "USD", "0.30", "0.25", "**** **** **** 1234"],
["15.10.25", "16.10.25", "Dosenbach Schuhe & Sport, Hinwil CH", "CHF", "50.00", "", "**** **** **** 1234"],
["15.10.25", "16.10.25", "Coop-4253 Hinwil Wasseri, Hinwil CH", "CHF", "257.20", "", "**** **** **** 1234"],
["15.10.25", "16.10.25", "Landi, Wald CH", "CHF", "67.85", "", "**** **** **** 1234"],
["15.10.25", "16.10.25", "Puls Apotheke & Drogerie, Hinwil CH", "CHF", "9.20", "", "**** **** **** 1234"],
["15.10.25", "16.10.25", "CLAUDE.AI SUBSCRIPTION, ANTHROPIC.COM US", "USD", "108.10", "89.50", "**** **** **** 1234"],
["17.10.25", "20.10.25", "SV (Schweiz) AG, 27960, Zurich ETH-Ze CH", "CHF", "7.80", "", "**** **** **** 1234"],
["17.10.25", "20.10.25", "SV (Schweiz) AG, 27960, Zurich ETH-Ze CH", "CHF", "14.50", "", "**** **** **** 1234"],
["17.10.25", "20.10.25", "SV (Schweiz) AG, 27960, Zurich ETH-Ze CH", "CHF", "4.20", "", "**** **** **** 1234"],
["17.10.25", "20.10.25", "Universitatsspital Zurich, Zurich CH", "CHF", "30.00", "", "**** **** **** 1234"],
["18.10.25", "20.10.25", "HubSpot Germany GmbH, Berlin DE", "EUR", "267.55", "256.05", "**** **** **** 1234"],
["18.10.25", "20.10.25", "Google PixVerse AI Vi, 650-2530000 US", "USD", "5.20", "", "**** **** **** 1234"],
["19.10.25", "20.10.25", "Shell Waldhof, Wald ZH CH", "CHF", "7.20", "", "**** **** **** 1234"],
["19.10.25", "20.10.25", "KONDITOREI VOLAND WALD, WALD ZH CH", "CHF", "20.30", "", "**** **** **** 1234"],
["19.10.25", "20.10.25", "KONDITOREI VOLAND WALD, WALD ZH CH", "CHF", "11.10", "", "**** **** **** 1234"],
["18.10.25", "20.10.25", "ANTHROPIC, ANTHROPIC.COM US", "USD", "108.10", "88.75", "**** **** **** 1234"],
["20.10.25", "21.10.25", "APCOA, Dubendorf CH", "CHF", "20.00", "", "**** **** **** 1234"],
["20.10.25", "21.10.25", "STWEG Ambassador House, Glattbrugg CH", "CHF", "5.00", "", "**** **** **** 1234"],
["23.10.25", "24.10.25", "Coop-1252 Wald, Wald ZH CH", "CHF", "199.85", "", "**** **** **** 1234"],
["24.10.25", "24.10.25", "Ticketcorner*90004263, 410900800800 CH", "CHF", "159.75", "", "**** **** **** 1234"],
["25.10.25", "27.10.25", "Google Duolingo Langu, 650-2530000 US", "USD", "10.00", "", "**** **** **** 1234"],
["25.10.25", "27.10.25", "Hornbach Baumarkt Galgene, Galgenen CH", "CHF", "1.50", "", "**** **** **** 1234"],
["25.10.25", "27.10.25", "Hornbach Baumarkt Galgene, Galgenen CH", "CHF", "814.10", "", "**** **** **** 1234"],
["25.10.25", "27.10.25", "REMO WUEST BACK. KOND., GALGENEN CH", "CHF", "20.00", "", "**** **** **** 1234"],
["25.10.25", "27.10.25", "Google PixVerse AI Vi, 650-2530000 US", "USD", "5.20", "", "**** **** **** 1234"],
["27.10.25", "28.10.25", "Coop-1252 Wald, Wald ZH CH", "CHF", "12.90", "", "**** **** **** 1234"],
["29.10.25", "30.10.25", "WAL*WESTHIVE, DUEBENDORF CH", "CHF", "15.30", "", "**** **** **** 1234"],
["29.10.25", "30.10.25", "WAL*WESTHIVE, DUEBENDORF CH", "CHF", "6.50", "", "**** **** **** 1234"],
["30.10.25", "31.10.25", "Coop-4253 Hinwil Wasseri, Hinwil CH", "CHF", "139.85", "", "**** **** **** 1234"],
["30.10.25", "31.10.25", "Coop-4054 Hinwil Restaura, Hinwil CH", "CHF", "34.95", "", "**** **** **** 1234"],
["31.10.25", "03.11.25", "Coop-1911 Ruti, Ruti ZH CH", "CHF", "66.50", "", "**** **** **** 1234"],
["31.10.25", "03.11.25", "OPENAI *CHATGPT SUBSCR, OPENAI.COM US", "USD", "216.20", "178.70", "**** **** **** 1234"],
["01.11.25", "03.11.25", "GOOGLE *ADS5192965135, cc§google.com IE", "", "79.15", "", "**** **** **** 1234"],
["01.11.25", "03.11.25", "KONDITOREI VOLAND WALD, WALD ZH CH", "CHF", "99.60", "", "**** **** **** 1234"],
["01.11.25", "03.11.25", "LANGDOCK GMBH, BERLIN DE", "EUR", "25.00", "23.90", "**** **** **** 1234"],
["01.11.25", "03.11.25", "Google PixVerse AI Vi, 650-2530000 US", "USD", "5.20", "", "**** **** **** 1234"],
["02.11.25", "03.11.25", "GOOGLE *YouTubePremium, g.co/helppay# GB", "", "33.90", "", "**** **** **** 1234"],
["02.11.25", "03.11.25", "Agrola TopShop Wald, Wald ZH CH", "CHF", "119.45", "", "**** **** **** 1234"],
["03.11.25", "03.11.25", "Netflix.com, Los Gatos NL", "", "20.90", "", "**** **** **** 1234"],
["03.11.25", "04.11.25", "www.fust.ch, Oberburen CH", "CHF", "1'560.90", "", "**** **** **** 1234"],
["06.11.25", "07.11.25", "Grand Casino Luzern AG, Luzern CH", "CHF", "100.00", "108.00", "**** **** **** 1234"],
["07.11.25", "10.11.25", "Migros M Mutschellen, Berikon CH", "CHF", "0.40", "", "**** **** **** 1234"],
["07.11.25", "10.11.25", "Migros M Mutschellen, Berikon CH", "CHF", "15.90", "", "**** **** **** 1234"],
["08.11.25", "10.11.25", "wondershare.com, Hong Kong HK", "", "25.95", "", "**** **** **** 1234"],
["07.11.25", "10.11.25", "WAL*WESTHIVE, DUEBENDORF CH", "CHF", "9.85", "", "**** **** **** 1234"],
["07.11.25", "10.11.25", "Google One, 650-2530000 US", "USD", "10.00", "", "**** **** **** 1234"],
["08.11.25", "10.11.25", "Steiner-Beck AG, Wald ZH CH", "CHF", "32.20", "", "**** **** **** 1234"],
["08.11.25", "10.11.25", "Google PixVerse AI Vi, 650-2530000 US", "USD", "5.20", "", "**** **** **** 1234"],
["09.11.25", "10.11.25", "KONDITOREI VOLAND WALD, WALD ZH CH", "CHF", "25.80", "", "**** **** **** 1234"],
["09.11.25", "11.11.25", "Starfood AG (Schweiz), Rothenburg CH", "CHF", "16.00", "", "**** **** **** 1234"],
["09.11.25", "11.11.25", "Starfood AG (Schweiz), Rothenburg CH", "CHF", "16.00", "", "**** **** **** 1234"],
["10.11.25", "11.11.25", "Coop-2253 Jona Eisenhof, Jona CH", "CHF", "161.25", "", "**** **** **** 1234"],
["12.11.25", "13.11.25", "Hess AG Erdbau + Recy, Laupen ZH CH", "CHF", "39.20", "", "**** **** **** 1234"],
["12.11.25", "13.11.25", "Jumbo-6017 Hinwil, Hinwil CH", "CHF", "173.70", "", "**** **** **** 1234"],
["13.11.25", "14.11.25", "Migros MM Ruti, Ruti ZH CH", "CHF", "57.90", "", "**** **** **** 1234"],
["13.11.25", "14.11.25", "Migros MM Ruti, Ruti ZH CH", "CHF", "140.10", "", "**** **** **** 1234"],
["13.11.25", "14.11.25", "WAL*WESTHIVE, DUEBENDORF CH", "CHF", "22.30", "", "**** **** **** 1234"],
["13.11.25", "14.11.25", "UDIO.COM, UDIO.COM US", "EUR", "36.00", "34.35", "**** **** **** 1234"],
["15.10.25", "16.10.25", "Coop-4958 Rapperswil, Rapperswil SG CH", "CHF", "4.95", "", "**** **** **** 1234"],
["16.10.25", "17.10.25", "TEMU.COM, BASEL CH", "CHF", "61.50", "", "**** **** **** 1234"],
["16.10.25", "17.10.25", "TEMU.COM, BASEL CH", "CHF", "12.95", "", "**** **** **** 1234"],
["16.10.25", "17.10.25", "TEMU.COM, BASEL CH", "CHF", "32.30", "", "**** **** **** 1234"],
["16.10.25", "17.10.25", "TEMU.COM, BASEL CH", "CHF", "17.95", "", "**** **** **** 1234"],
["17.10.25", "20.10.25", "SBB Bahnhof Ruti ZH, Ruti ZH CH", "CHF", "54.00", "", "**** **** **** 1234"],
["17.10.25", "20.10.25", "Candrian Catering AG 2, Zurich CH", "CHF", "15.50", "", "**** **** **** 1234"],
["20.10.25", "21.10.25", "Coop-1252 Wald, Wald ZH CH", "CHF", "178.95", "", "**** **** **** 1234"],
["21.10.25", "22.10.25", "Denner Ruti ZH, Ruti ZH CH", "CHF", "50.15", "", "**** **** **** 1234"],
["24.10.25", "27.10.25", "TEMU.COM, BASEL CH", "CHF", "100.65", "", "**** **** **** 1234"],
["24.10.25", "27.10.25", "WAL*HAAR SHOP CH AG, UETENDORF CH", "CHF", "70.35", "", "**** **** **** 1234"],
["25.10.25", "27.10.25", "Coop-1252 Wald, Wald ZH CH", "CHF", "47.00", "", "**** **** **** 1234"],
["25.10.25", "27.10.25", "Shell Waldhof, Wald ZH CH", "CHF", "3.20", "", "**** **** **** 1234"],
["26.10.25", "27.10.25", "TEMU.COM, BASEL CH", "CHF", "63.10", "", "**** **** **** 1234"],
["27.10.25", "28.10.25", "ONLY, Hinwil CH", "CHF", "222.60", "", "**** **** **** 1234"],
["27.10.25", "28.10.25", "Coop-4253 Hinwil Wasseri, Hinwil CH", "CHF", "104.10", "", "**** **** **** 1234"],
["27.10.25", "28.10.25", "Coop-4253 Hinwil Wasseri, Hinwil CH", "CHF", "24.95", "", "**** **** **** 1234"],
["27.10.25", "28.10.25", "Manor AG, Hinwil CH", "CHF", "177.25", "", "**** **** **** 1234"],
["27.10.25", "28.10.25", "H & M, Hinwil CH", "CHF", "43.85", "", "**** **** **** 1234"],
["27.10.25", "28.10.25", "Coop-4253 Hinwil Wasseri, Hinwil CH", "CHF", "52.30", "", "**** **** **** 1234"],
["27.10.25", "28.10.25", "Manor AG, Hinwil CH", "CHF", "59.05", "", "**** **** **** 1234"],
["28.10.25", "29.10.25", "Migros MM Rapperswil, Rapperswil SG CH", "CHF", "23.35", "", "**** **** **** 1234"],
["29.10.25", "30.10.25", "ROSSMANN Schweiz AG, Wallisellen CH", "CHF", "13.95", "", "**** **** **** 1234"],
["29.10.25", "30.10.25", "Migros MR Glattzentrum, Glattzentrum CH", "CHF", "42.20", "", "**** **** **** 1234"],
["29.10.25", "30.10.25", "Calzedonia, Wallisellen CH", "CHF", "178.25", "", "**** **** **** 1234"],
["29.10.25", "30.10.25", "Intimissimi, Wallisellen CH", "CHF", "90.20", "", "**** **** **** 1234"],
["29.10.25", "30.10.25", "New Yorker (Schweiz) GmbH, Wetzikon ZH CH", "CHF", "76.80", "", "**** **** **** 1234"],
["29.10.25", "30.10.25", "New Yorker (Schweiz) GmbH, Wetzikon ZH CH", "CHF", "7.95", "", "**** **** **** 1234"],
["29.10.25", "30.10.25", "Golden Bar GmbH, Wald ZH CH", "CHF", "40.00", "", "**** **** **** 1234"],
["30.10.25", "31.10.25", "Frischer Max, Zurich CH", "CHF", "12.60", "", "**** **** **** 1234"],
["30.10.25", "31.10.25", "Frischer Max, Zurich CH", "CHF", "4.20", "", "**** **** **** 1234"],
["30.10.25", "31.10.25", "Halle 622, Zurich CH", "CHF", "15.75", "", "**** **** **** 1234"],
["31.10.25", "03.11.25", "SBB Bahnhof Ruti ZH, Ruti ZH CH", "CHF", "27.00", "", "**** **** **** 1234"],
["31.10.25", "03.11.25", "Eventfrog.c 739003945141, Olten CH", "CHF", "67.85", "", "**** **** **** 1234"],
["01.11.25", "03.11.25", "SBB Bahnhof Ruti ZH, Ruti ZH CH", "CHF", "27.00", "", "**** **** **** 1234"],
["01.11.25", "03.11.25", "AMERON ZUERICH, ZUERICH CH", "CHF", "30.00", "", "**** **** **** 1234"],
["31.10.25", "03.11.25", "SKYLINE EVENTS, ZUERICH CH", "CHF", "13.50", "", "**** **** **** 1234"],
["02.11.25", "03.11.25", "AURA Event Saal, Zuerich CH", "CHF", "15.75", "", "**** **** **** 1234"],
["02.11.25", "03.11.25", "GOOGLE *YouTube Member, g.co/helppay# GB", "", "15.00", "", "**** **** **** 1234"],
["01.11.25", "03.11.25", "VBZ Bellevue, Zurich CH", "CHF", "2.80", "", "**** **** **** 1234"],
["01.11.25", "03.11.25", "WAL*CLUB BELLEVUE, HOERI CH", "CHF", "16.50", "", "**** **** **** 1234"],
["02.11.25", "03.11.25", "MCDONALDS ZUERICH 2016, ZUERICH CH", "CHF", "10.50", "", "**** **** **** 1234"],
["03.11.25", "04.11.25", "Coop-1252 Wald, Wald ZH CH", "CHF", "191.15", "", "**** **** **** 1234"],
["05.11.25", "06.11.25", "Coop-1252 Wald, Wald ZH CH", "CHF", "51.35", "", "**** **** **** 1234"],
["06.11.25", "07.11.25", "Ticketcorner*90024523, 410900800800 CH", "CHF", "158.75", "", "**** **** **** 1234"],
["07.11.25", "10.11.25", "SUMUP *JW BROW&LASH, LACHEN CH", "CHF", "290.00", "", "**** **** **** 1234"],
["07.11.25", "10.11.25", "Agrola TopShop Wald, Wald ZH CH", "CHF", "104.50", "", "**** **** **** 1234"],
["07.11.25", "10.11.25", "Agrola TopShop Wald, Wald ZH CH", "CHF", "10.30", "", "**** **** **** 1234"],
["08.11.25", "10.11.25", "Pizza Thal GmbH, Murgenthal CH", "CHF", "19.50", "", "**** **** **** 1234"],
["09.11.25", "10.11.25", "TEMU.COM, BASEL CH", "CHF", "190.85", "", "**** **** **** 1234"],
["10.11.25", "11.11.25", "Sinora GmbH, Bonstetten CH", "CHF", "115.20", "", "**** **** **** 1234"],
["10.11.25", "11.11.25", "WAL*HAAR SHOP CH AG, UETENDORF CH", "CHF", "33.85", "", "**** **** **** 1234"],
["11.11.25", "12.11.25", "Bleiche Fitness, Wald ZH CH", "CHF", "90.00", "", "**** **** **** 1234"],
["11.11.25", "12.11.25", "Parkhaus Urania, Zurich CH", "CHF", "14.00", "", "**** **** **** 1234"],
["12.11.25", "13.11.25", "Coop-4958 Rapperswil, Rapperswil SG CH", "CHF", "24.80", "", "**** **** **** 1234"],
["13.11.25", "14.11.25", "TEMU.COM, BASEL CH", "CHF", "56.00", "", "**** **** **** 1234"],
["13.11.25", "14.11.25", "TEMU.COM, BASEL CH", "CHF", "5.95", "", "**** **** **** 1234"],
["13.11.25", "14.11.25", "TEMU.COM, BASEL CH", "CHF", "15.25", "", "**** **** **** 1234"],
["14.11.25", "17.11.25", "Santa Lucia Altstetten, Zurich", "CH", "38.00", "", "**** **** **** 1234"],
["14.11.25", "17.11.25", "Agrola TopShop Wald, Wald ZH", "CH", "126.80", "", "**** **** **** 1234"],
["14.11.25", "17.11.25", "GITHUB, INC., GITHUB.COM", "US", "USD 0.70", "0.60", "**** **** **** 1234"],
["15.11.25", "17.11.25", "Jumbo-6017 Hinwil, Hinwil", "CH", "53.85", "", "**** **** **** 1234"],
["15.11.25", "17.11.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "57.00", "", "**** **** **** 1234"],
["15.11.25", "17.11.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "13.95", "", "**** **** **** 1234"],
["14.11.25", "17.11.25", "NEGISHI ALTSTETTEN BAH, ZUERICH", "CH", "31.90", "", "**** **** **** 1234"],
["15.11.25", "17.11.25", "Google PixVerse AI Vi, 650-2530000", "US", "5.20", "", "**** **** **** 1234"],
["15.11.25", "17.11.25", "CANVA* I04701-26464248, CANVA.COM", "US", "12.00", "", "**** **** **** 1234"],
["17.11.25", "18.11.25", "ANTHROPIC, ANTHROPIC.COM", "US", "USD 270.25", "220.65", "**** **** **** 1234"],
["18.11.25", "19.11.25", "Coop-1252 Wald, Wald ZH", "CH", "7.80", "", "**** **** **** 1234"],
["18.11.25", "19.11.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "19.30", "", "**** **** **** 1234"],
["18.11.25", "19.11.25", "TIERARZTPRAXIS BACHTEL, WALD ZH", "CH", "343.30", "", "**** **** **** 1234"],
["18.11.25", "19.11.25", "ANTHROPIC, ANTHROPIC.COM", "US", "USD 5.41", "4.45", "**** **** **** 1234"],
["18.11.25", "20.11.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "19.35", "", "**** **** **** 1234"],
["19.11.25", "20.11.25", "Wuest Partner, Zurich", "CH", "324.30", "", "**** **** **** 1234"],
["19.11.25", "21.11.25", "SHOP.ASFINAG.AT, WIEN", "AT", "EUR 12.40", "11.80", "**** **** **** 1234"],
["20.11.25", "21.11.25", "Coop-1252 Wald, Wald ZH", "CH", "85.35", "", "**** **** **** 1234"],
["20.11.25", "21.11.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "17.95", "", "**** **** **** 1234"],
["20.11.25", "21.11.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "6.30", "", "**** **** **** 1234"],
["21.11.25", "24.11.25", "STWEG Ambassador House, Glattbrugg", "CH", "7.50", "", "**** **** **** 1234"],
["21.11.25", "24.11.25", "Migros M Dubendorf Stettb, Dubendorf", "CH", "16.95", "", "**** **** **** 1234"],
["21.11.25", "24.11.25", "MCDONALDS RESTAURANT G, WALLISELLEN", "CH", "13.00", "", "**** **** **** 1234"],
["22.11.25", "24.11.25", "Ski- und Snowboard-Center, Neuhaus SG", "CH", "128.00", "", "**** **** **** 1234"],
["21.11.25", "24.11.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "408.25", "", "**** **** **** 1234"],
["22.11.25", "24.11.25", "GOOGLE *Duolingo Langu, g.co/HelpPay#", "US", "9.20", "", "**** **** **** 1234"],
["23.11.25", "24.11.25", "Migros Santispark Bad, Abtwil SG", "CH", "48.60", "", "**** **** **** 1234"],
["23.11.25", "24.11.25", "Migros Santispark Bad, Abtwil SG", "CH", "8.50", "", "**** **** **** 1234"],
["23.11.25", "24.11.25", "Migros ELS Santispark PH, Abtwil SG", "CH", "3.00", "", "**** **** **** 1234"],
["23.11.25", "24.11.25", "AVIA Service Autogrill, St. Margrethe", "CH", "121.80", "", "**** **** **** 1234"],
["23.11.25", "24.11.25", "AVIA Service Autogrill, St. Margrethe", "CH", "10.50", "", "**** **** **** 1234"],
["23.11.25", "24.11.25", "KONDITOREI VOLAND LAUP, LAUPEN ZH", "CH", "62.80", "", "**** **** **** 1234"],
["23.11.25", "25.11.25", "SHOP.ASFINAG.AT, WIEN", "AT", "EUR 9.30", "8.90", "**** **** **** 1234"],
["24.11.25", "25.11.25", "Landi, Wald", "CH", "27.15", "", "**** **** **** 1234"],
["24.11.25", "26.11.25", "SHOP.ASFINAG.AT, WIEN", "AT", "EUR 12.50", "11.95", "**** **** **** 1234"],
["26.11.25", "27.11.25", "MyPlace, Affoltern am", "CH", "10.30", "", "**** **** **** 1234"],
["27.11.25", "28.11.25", "Coop-1911 Ruti, Ruti ZH", "CH", "57.20", "", "**** **** **** 1234"],
["28.11.25", "01.12.25", "Manor AG, Hinwil", "CH", "10.10", "", "**** **** **** 1234"],
["28.11.25", "01.12.25", "Manor AG, Hinwil", "CH", "136.25", "", "**** **** **** 1234"],
["28.11.25", "01.12.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "205.35", "", "**** **** **** 1234"],
["01.12.25", "02.12.25", "GOOGLE *ADS5192965135, cc§google.com", "IE", "59.00", "", "**** **** **** 1234"],
["02.12.25", "03.12.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "112.50", "", "**** **** **** 1234"],
["02.12.25", "03.12.25", "Coop-1252 Wald, Wald ZH", "CH", "117.70", "", "**** **** **** 1234"],
["03.12.25", "03.12.25", "Autodesk ADY, Dublin 2", "IE", "1'989.05", "", "**** **** **** 1234"],
["03.12.25", "03.12.25", "NETFLIX.COM, Amsterdam", "NL", "22.90", "", "**** **** **** 1234"],
["02.12.25", "03.12.25", "TAVILY AI, WWW.TAVILY.CO", "US", "USD 17.48", "14.50", "**** **** **** 1234"],
["02.12.25", "03.12.25", "GOOGLE *YouTubePremium, g.co/HelpPay#", "US", "33.90", "", "**** **** **** 1234"],
["04.12.25", "05.12.25", "Migros M Dubendorf Stettb, Dubendorf", "CH", "103.20", "", "**** **** **** 1234"],
["04.12.25", "05.12.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "19.80", "", "**** **** **** 1234"],
["05.12.25", "08.12.25", "MICROSOFT#G127221615, MSBILL.INFO", "CH", "55.20", "", "**** **** **** 1234"],
["04.12.25", "08.12.25", "Ristorante Amalfi AG, Zurich", "CH", "67.00", "", "**** **** **** 1234"],
["05.12.25", "08.12.25", "Landi, Wald", "CH", "11.90", "", "**** **** **** 1234"],
["05.12.25", "08.12.25", "Notariat Wald, Wald ZH", "CH", "40.00", "", "**** **** **** 1234"],
["05.12.25", "08.12.25", "Coop-1252 Wald, Wald ZH", "CH", "149.75", "", "**** **** **** 1234"],
["06.12.25", "08.12.25", "TIERARZTPRAXIS BACHTEL, WALD ZH", "CH", "80.30", "", "**** **** **** 1234"],
["07.12.25", "08.12.25", "HERAHELP.COM, 0044330027088", "CY", "EUR 19.95", "19.25", "**** **** **** 1234"],
["07.12.25", "08.12.25", "Google One, 650-2530000", "US", "10.00", "", "**** **** **** 1234"],
["10.12.25", "11.12.25", "TAVILY AI, WWW.TAVILY.CO", "US", "USD 43.26", "35.95", "**** **** **** 1234"],
["11.12.25", "12.12.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "247.40", "", "**** **** **** 1234"],
["14.11.25", "17.11.25", "ONLY, Zurich", "CH", "101.75", "", "**** **** **** 1234"],
["14.11.25", "17.11.25", "SUMUP *MARYS COSMETICS, USTER", "CH", "419.00", "", "**** **** **** 1234"],
["14.11.25", "17.11.25", "S2P*Calzedonia, 0447554090", "IT", "86.75", "", "**** **** **** 1234"],
["14.11.25", "17.11.25", "Parkhaus Urania, Zurich", "CH", "12.00", "", "**** **** **** 1234"],
["15.11.25", "17.11.25", "JustEat, Zurich", "CH", "193.70", "", "**** **** **** 1234"],
["15.11.25", "17.11.25", "ONLY, Hinwil", "CH", "126.10", "", "**** **** **** 1234"],
["15.11.25", "17.11.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "242.70", "", "**** **** **** 1234"],
["15.11.25", "17.11.25", "Manor AG, Hinwil", "CH", "35.35", "", "**** **** **** 1234"],
["15.11.25", "17.11.25", "Valentyna Nails, R?ti", "CH", "160.00", "", "**** **** **** 1234"],
["13.11.25", "17.11.25", "redcare-apotheke, Sevenum", "NL", "79.90", "", "**** **** **** 1234"],
["16.11.25", "17.11.25", "NORDSTERN, Basel", "CH", "64.20", "", "**** **** **** 1234"],
["18.11.25", "19.11.25", "La Makeup Sp. z. o.o., Warsaw", "PL", "104.85", "", "**** **** **** 1234"],
["18.11.25", "19.11.25", "Bleiche Fitness, Wald ZH", "CH", "90.00", "", "**** **** **** 1234"],
["20.11.25", "21.11.25", "Bleiche Fitness, Wald ZH", "CH", "90.00", "", "**** **** **** 1234"],
["22.11.25", "24.11.25", "SHELL PETTNAU 5538, PETTNAU", "AT", "94.60", "", "**** **** **** 1234"],
["22.11.25", "24.11.25", "SHELL PETTNAU 5538, PETTNAU", "AT", "EUR 7.39", "7.05", "**** **** **** 1234"],
["22.11.25", "24.11.25", "SHELL PETTNAU 5538, PETTNAU", "AT", "EUR 4.39", "4.20", "**** **** **** 1234"],
["22.11.25", "24.11.25", "Coop-1252 Wald, Wald ZH", "CH", "57.85", "", "**** **** **** 1234"],
["22.11.25", "24.11.25", "ASFINAG S16 HMS ST JAKOB, ST.ANTON/ARLB", "AT", "EUR 12.50", "11.95", "**** **** **** 1234"],
["24.11.25", "25.11.25", "Posthotel Achenkirc, Achenkirch", "AT", "EUR 1'211.80", "1'160.25", "**** **** **** 1234"],
["24.11.25", "25.11.25", "MS* CITYPOP2NIGHTBASE, ZURICH", "CH", "15.00", "", "**** **** **** 1234"],
["24.11.25", "25.11.25", "MS* CITYPOP2NIGHTBASE, ZURICH", "CH", "8.40", "", "**** **** **** 1234"],
["24.11.25", "25.11.25", "BKG*BOOKING.COM HOTEL, (888)850-3958", "NL", "187.95", "", "**** **** **** 1234"],
["25.11.25", "26.11.25", "Coop-1252 Wald, Wald ZH", "CH", "63.00", "", "**** **** **** 1234"],
["25.11.25", "26.11.25", "Bleiche Fitness, Wald ZH", "CH", "90.00", "", "**** **** **** 1234"],
["26.11.25", "27.11.25", "Hallenbad Wald, Wald ZH", "CH", "54.00", "", "**** **** **** 1234"],
["27.11.25", "28.11.25", "Bestseller AS, Amsterdam", "NL", "35.90", "", "**** **** **** 1234"],
["29.11.25", "01.12.25", "CLUB NORDSTERN, BASEL", "CH", "6.00", "", "**** **** **** 1234"],
["29.11.25", "01.12.25", "CLUB NORDSTERN, BASEL", "CH", "6.00", "", "**** **** **** 1234"],
["02.12.25", "03.12.25", "TICKETCORNER CH, RUEMLANG", "CH", "84.90", "", "**** **** **** 1234"],
["02.12.25", "03.12.25", "Shell Waldhof, Wald ZH", "CH", "126.15", "", "**** **** **** 1234"],
["02.12.25", "03.12.25", "Shell Waldhof, Wald ZH", "CH", "3.70", "", "**** **** **** 1234"],
["02.12.25", "03.12.25", "Bleiche Fitness, Wald ZH", "CH", "90.00", "", "**** **** **** 1234"],
["02.12.25", "03.12.25", "GOOGLE *YouTube Member, g.co/HelpPay#", "US", "15.00", "", "**** **** **** 1234"],
["03.12.25", "04.12.25", "APODRO APOTHEKE WALD, WALD ZH", "CH", "54.90", "", "**** **** **** 1234"],
["04.12.25", "05.12.25", "TICKETCORNER CH, RUEMLANG", "CH", "285.70", "", "**** **** **** 1234"],
["05.12.25", "05.12.25", "NOII.CH DATING, WINTERTHUR", "CH", "74.10", "", "**** **** **** 1234"],
["06.12.25", "08.12.25", "TEMU.COM, BASEL", "CH", "72.50", "", "**** **** **** 1234"],
["06.12.25", "08.12.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "326.85", "", "**** **** **** 1234"],
["06.12.25", "08.12.25", "Decathlon, Hinwil", "CH", "150.00", "", "**** **** **** 1234"],
["07.12.25", "09.12.25", "Migros-Genossenschafts-Bund, Zürich", "CH", "200.00", "", "**** **** **** 1234"],
["08.12.25", "10.12.25", "Zürich HB, Zürich", "CH", "45.00", "", "**** **** **** 1234"],
["09.12.25", "11.12.25", "Amazon Marketplace, amazon.de", "DE", "120.00", "", "**** **** **** 1234"],
["10.12.25", "12.12.25", "IKEA, Dietlikon", "CH", "350.00", "", "**** **** **** 1234"],
["11.12.25", "13.12.25", "Manor, Zürich", "CH", "75.00", "", "**** **** **** 1234"],
["12.12.25", "14.12.25", "Zalando, zalando.ch", "CH", "90.00", "", "**** **** **** 1234"],
["13.12.25", "15.12.25", "SBB CFF FFS, Bern", "CH", "60.00", "", "**** **** **** 1234"],
["14.12.25", "16.12.25", "Apple Store, Zürich", "CH", "999.00", "", "**** **** **** 1234"],
["15.12.25", "17.12.25", "Migros-Genossenschafts-Bund, Zürich", "CH", "150.00", "", "**** **** **** 1234"],
["16.12.25", "18.12.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "250.00", "", "**** **** **** 1234"],
["17.12.25", "19.12.25", "Shell Waldhof, Wald ZH", "CH", "60.00", "", "**** **** **** 1234"],
["18.12.25", "20.12.25", "Zürich HB, Zürich", "CH", "30.00", "", "**** **** **** 1234"],
["19.12.25", "21.12.25", "Amazon Marketplace, amazon.de", "DE", "80.00", "", "**** **** **** 1234"],
["20.12.25", "22.12.25", "IKEA, Dietlikon", "CH", "400.00", "", "**** **** **** 1234"],
["21.12.25", "23.12.25", "Manor, Zürich", "CH", "100.00", "", "**** **** **** 1234"],
["22.12.25", "24.12.25", "Zalando, zalando.ch", "CH", "110.00", "", "**** **** **** 1234"],
["23.12.25", "25.12.25", "SBB CFF FFS, Bern", "CH", "70.00", "", "**** **** **** 1234"],
["24.12.25", "26.12.25", "Apple Store, Zürich", "CH", "1200.00", "", "**** **** **** 1234"],
["25.12.25", "27.12.25", "Migros-Genossenschafts-Bund, Zürich", "CH", "180.00", "", "**** **** **** 1234"],
["26.12.25", "28.12.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "300.00", "", "**** **** **** 1234"],
["27.12.25", "29.12.25", "Shell Waldhof, Wald ZH", "CH", "70.00", "", "**** **** **** 1234"],
["28.12.25", "30.12.25", "Zürich HB, Zürich", "CH", "40.00", "", "**** **** **** 1234"],
["29.12.25", "31.12.25", "Amazon Marketplace, amazon.de", "DE", "100.00", "", "**** **** **** 1234"],
["30.12.25", "01.01.26", "IKEA, Dietlikon", "CH", "450.00", "", "**** **** **** 1234"],
["31.12.25", "02.01.26", "Manor, Zürich", "CH", "125.00", "", "**** **** **** 1234"]
]
}
}
]
}
================================================================================

View file

@@ -1,239 +0,0 @@
# AI Call Iteration Flow - JSON Merging System
This document describes the iteration flow for handling large JSON responses from AI that may be truncated and need to be merged across multiple iterations.
## Overview
When an AI response is too large, it may be truncated (cut) at an arbitrary point. The iteration system:
1. Detects incomplete JSON
2. Requests continuation from the AI
3. Merges the continuation with the existing JSON
4. Repeats until complete or max failures reached
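The shape of that loop, as a condensed sketch (hypothetical: the helper callables are parameters here because the real implementations are spread across the modules listed under Files Involved, and this glosses over the CUT/CLOSED distinction explained below):
```python
from typing import Callable, Optional, Tuple

def iterateUntilComplete(
    prompt: str,
    callAi: Callable[[str], str],
    mergeWithOverlap: Callable[[str, str], Tuple[str, bool]],
    tryParse: Callable[[str], Optional[object]],
    buildContinuationPrompt: Callable[[str], str],
    maxFails: int = 3,
) -> Optional[str]:
    """Sketch of the detect/continue/merge loop (not the production code)."""
    jsonBase: Optional[str] = None
    fails = 0
    for _ in range(50):                      # hard iteration cap, as in the real loop
        fragment = callAi(prompt)
        if jsonBase is None:
            candidate = fragment             # first fragment, nothing to merge yet
        else:
            candidate, ok = mergeWithOverlap(jsonBase, fragment)
            if not ok:                       # merge failed: retry with same prompt
                fails += 1
                if fails >= maxFails:
                    break
                continue
        if tryParse(candidate) is not None:  # parses cleanly -> complete
            return candidate
        jsonBase = candidate                 # still truncated: ask for more
        prompt = buildContinuationPrompt(jsonBase)
        fails = 0
    return None                              # caller falls back to the last valid part
```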
---
## Key Variables
| Variable | Type | Purpose |
|----------|------|---------|
| `jsonBase` | `str \| None` | The merged JSON string (CUT version for overlap matching) |
| `candidateJson` | `str` | Temporary holder for merged result until validated |
| `lastValidCompletePart` | `str \| None` | Fallback - last successfully parsed CLOSED JSON |
| `lastOverlapContext` | `str` | Context for retry/continuation prompts |
| `lastHierarchyContextForPrompt` | `str` | Context for retry/continuation prompts |
| `mergeFailCount` | `int` | Global counter (max 3 failures) |
---
## Key Distinction: hierarchyContext vs completePart
| Field | Description | Use Case |
|-------|-------------|----------|
| `hierarchyContext` | **CUT JSON** - truncated at cut point | Used as `jsonBase` for merging with next AI fragment |
| `completePart` | **CLOSED JSON** - all structures properly closed | Used for validation, parsing, and fallback |
**Why this matters:**
- The next AI fragment starts with an **overlap** that matches the CUT point
- If we used `completePart` (closed), the overlap detection would FAIL
- We must use `hierarchyContext` (cut) so overlap matching works correctly
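A tiny illustration with made-up values:
```python
# CUT version (hierarchyContext): truncated exactly where the AI stopped.
hierarchyContext = '{"rows": [{"id": 1}, {"id'
# CLOSED version (completePart): same content with all structures closed.
completePart = '{"rows": [{"id": 1}]}'
# The next AI fragment re-sends a short overlap matching the CUT tail...
nextFragment = '{"id": 2}]}'
# ...so overlap matching finds the shared edge '{"id' in hierarchyContext,
# but would find nothing to match against the CLOSED completePart.
```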
---
## Flow Steps
### Step 1: BUILD PROMPT
**Location:** `subAiCallLooping.py` lines 163-212
**Function:** `buildContinuationContext()` from `modules/shared/jsonUtils.py`
- **First iteration:** Use original prompt
- **Continuation:** `buildContinuationContext(allSections, lastRawResponse, ...)`
- Internally calls `getContexts(lastRawResponse)` to get overlap/hierarchy
- Builds continuation prompt with `overlapContext` + `hierarchyContextForPrompt`
### Step 2: CALL AI
**Location:** `subAiCallLooping.py` lines 214-299
**Function:** `self.aiService.callAi(request)`
- Returns `response.content` as `result`
- NOTE: Do NOT update `lastRawResponse` yet! (only after successful merge)
### Step 4: MERGE
**Location:** `subAiCallLooping.py` lines 338-396
**Function:** `JsonResponseHandler.mergeJsonStringsWithOverlap()` from `modules/services/serviceAi/subJsonResponseHandling.py`
```
IF first iteration (jsonBase is None):
    → candidateJson = result
ELSE:
    → mergedJsonString, hasOverlap = mergeJsonStringsWithOverlap(jsonBase, result)
    IF hasOverlap = False (MERGE FAILED):
        → mergeFailCount++
        → If mergeFailCount >= 3: return lastValidCompletePart (fallback)
        → Else: continue (retry with unchanged jsonBase AND lastRawResponse!)
    ELSE:
        → candidateJson = mergedJsonString (don't update jsonBase yet!)
        → lastRawResponse = candidateJson (ONLY after first iteration or successful merge!)

TRY DIRECT PARSE of candidateJson:
    IF parse succeeds:
        → jsonBase = candidateJson (commit)
        → FINISHED! Return normalized result
    ELSE:
        → Proceed to Step 5
```
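The overlap merge itself can be sketched as a suffix/prefix match (a much-simplified stand-in for the real `ModularJsonMerger` logic; `minOverlap` is an invented knob):
```python
from typing import Tuple

def mergeWithOverlap(base: str, fragment: str, minOverlap: int = 4) -> Tuple[str, bool]:
    """Append fragment to base, dropping the duplicated overlap.

    Finds the longest suffix of base that is also a prefix of fragment;
    reports failure if no overlap of at least minOverlap chars exists.
    """
    for size in range(min(len(base), len(fragment)), minOverlap - 1, -1):
        if base.endswith(fragment[:size]):
            return base + fragment[size:], True
    return base, False

base = '{"rows": [{"id": 1}, {"id'      # CUT json (jsonBase)
fragment = '{"id": 2}, {"id": 3}]}'     # next fragment, starts with the overlap
merged, hasOverlap = mergeWithOverlap(base, fragment)
assert hasOverlap and merged == '{"rows": [{"id": 1}, {"id": 2}, {"id": 3}]}'
```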
### Step 5: GET CONTEXTS
**Location:** `subAiCallLooping.py` lines 420-427
**Function:** `getContexts()` from `modules/shared/jsonContinuation.py`
```python
contexts = getContexts(candidateJson)
```
Returns `JsonContinuationContexts`:
- `overlapContext`: `""` if JSON is complete (no cut point)
- `hierarchyContext`: CUT JSON (for merging with next fragment)
- `hierarchyContextForPrompt`: CUT JSON with budget limits (for prompts)
- `completePart`: CLOSED JSON (repaired if needed)
- `jsonParsingSuccess`: `True` if completePart is valid JSON
**Enhancement:** If the original JSON is already complete, `overlapContext` is set to `""`.
This signals "JSON is complete, no more continuation needed".
### Step 6: DECIDE
**Location:** `subAiCallLooping.py` lines 429-528
#### Case A: `jsonParsingSuccess=true` AND `overlapContext=""`
**→ FINISHED**
- JSON is complete (no cut point)
- `jsonBase = contexts.completePart` (use CLOSED version for final result)
- Return `completePart` as result
#### Case B: `jsonParsingSuccess=true` AND `overlapContext!=""`
**→ CONTINUE to next iteration**
- JSON parseable but has cut point
- `jsonBase = contexts.hierarchyContext` ← **CUT version for next merge!**
- `lastValidCompletePart = contexts.completePart` ← **CLOSED version for fallback**
- Store contexts for next prompt
- `mergeFailCount = 0` (reset on success)
- `lastRawResponse = jsonBase`
- Continue to next iteration
#### Case C: `jsonParsingSuccess=false`
**→ RETRY with same prompt**
- Do NOT update `jsonBase` (keep previous valid state)
- `mergeFailCount++`
- If `mergeFailCount >= 3`: return `lastValidCompletePart` (fallback)
- Else: continue (retry with unchanged jsonBase/lastRawResponse)
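Condensed into code (a sketch: `state` is a hypothetical holder for the loop variables; the authoritative logic is the DECIDE block in `subAiCallLooping.py`):
```python
def decide(contexts, state) -> str:
    """Map getContexts() output to the three cases above (sketch only)."""
    if contexts.jsonParsingSuccess and contexts.overlapContext == "":
        state.jsonBase = contexts.completePart        # Case A: CLOSED json, done
        return "finished"
    if contexts.jsonParsingSuccess:
        state.jsonBase = contexts.hierarchyContext    # Case B: CUT json for merge
        state.lastValidCompletePart = contexts.completePart
        state.lastRawResponse = state.jsonBase
        state.mergeFailCount = 0                      # reset on success
        return "continue"
    state.mergeFailCount += 1                         # Case C: keep previous base
    return "fallback" if state.mergeFailCount >= 3 else "retry"
```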
---
## Flow Diagram
```
┌───────────────────────────────────────────────────────────────┐
│ ITERATION START │
└───────────────────────────┬───────────────────────────────────┘
┌───────────────────────────▼───────────────────────────────────┐
│ STEP 1: BUILD PROMPT │
│ - First: original prompt │
│ - Next: buildContinuationContext(lastRawResponse) │
└───────────────────────────┬───────────────────────────────────┘
┌───────────────────────────▼───────────────────────────────────┐
│ STEP 2: CALL AI → result │
└───────────────────────────┬───────────────────────────────────┘
┌───────────────────────────▼───────────────────────────────────┐
│ STEP 4: MERGE jsonBase + result → candidateJson │
└───────────────────────────┬───────────────────────────────────┘
┌────────────▼────────────┐
│ Merge OK? │
└────────────┬────────────┘
┌─────────────────────┼─────────────────────┐
│ NO │ YES │
▼ ▼ │
┌──────────────┐ ┌──────────────────┐ │
│ fails++ │ │ TRY DIRECT PARSE │ │
│ if >=3: │ │ of candidateJson │ │
│ RETURN │ └────────┬─────────┘ │
│ fallback │ │ │
│ else: RETRY │ ┌────────▼─────────┐ │
│ (continue) │ │ Parse OK? │ │
└──────────────┘ └────────┬─────────┘ │
│ │
┌─────────────────────┼─────────────────────┐
│ YES │ NO │
▼ ▼ │
┌──────────────┐ ┌──────────────────────────────┐
│ FINISHED ✓ │ │ STEP 5: getContexts() │
│ Return │ │ → jsonParsingSuccess │
│ normalized │ │ → overlapContext │
│ result │ └────────────┬─────────────────┘
└──────────────┘ │
┌────────────▼────────────────────┐
│ STEP 6: DECIDE │
└────────────┬────────────────────┘
┌────────────────────────────┼────────────────────────────┐
│ │ │
▼ ▼ ▼
┌───────────────────┐ ┌───────────────────────┐ ┌───────────────────┐
│ success=true │ │ success=true │ │ success=false │
│ overlap="" │ │ overlap!="" │ │ │
│ ───────────── │ │ ───────────────── │ │ ───────────── │
│ FINISHED ✓ │ │ CONTINUE │ │ RETRY │
│ │ │ │ │ │
│ jsonBase = │ │ jsonBase = │ │ jsonBase unchanged│
│ completePart │ │ hierarchyContext │ │ fails++ │
│ (CLOSED) │ │ (CUT for merge!) │ │ │
│ │ │ │ │ if >=3: fallback │
│ Return result │ │ fallback = │ │ else: retry │
│ │ │ completePart │ │ │
│ │ │ (CLOSED) │ │ │
│ │ │ │ │ │
│ │ │ Next iteration → │ │ │
└───────────────────┘ └───────────────────────┘ └───────────────────┘
```
---
## Files Involved
| File | Purpose |
|------|---------|
| `modules/services/serviceAi/subAiCallLooping.py` | Main iteration loop |
| `modules/shared/jsonContinuation.py` | `getContexts()` - context extraction & repair |
| `modules/shared/jsonUtils.py` | `buildContinuationContext()` - prompt building |
| `modules/services/serviceAi/subJsonResponseHandling.py` | `mergeJsonStringsWithOverlap()` |
| `modules/services/serviceAi/subJsonMerger.py` | `ModularJsonMerger` - actual merge logic |
| `modules/datamodels/datamodelAi.py` | `JsonContinuationContexts` model |
---
## Error Handling
### Merge Failures
- Max 3 consecutive failures allowed
- On failure: retry with unchanged `jsonBase` (previous valid state)
- After 3 failures: return `lastValidCompletePart` as fallback
### Parse Failures
- If `getContexts()` cannot produce valid JSON: increment fail counter
- Retry with same prompt (don't update jsonBase)
- After 3 failures: return `lastValidCompletePart` as fallback
### Fallback Strategy
- `lastValidCompletePart` stores the last successfully parsed CLOSED JSON
- Always available as fallback when things go wrong
- Ensures we return valid JSON even after multiple failures
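Sketched (mirroring the fallback branch in `subAiCallLooping.py`; the helper name is hypothetical):
```python
import json
from typing import Optional

def fallbackResult(lastValidCompletePart: Optional[str], jsonBase: Optional[str]) -> str:
    """Prefer the last CLOSED JSON that parsed; else the merged base; else ''."""
    if lastValidCompletePart:
        try:
            parsed = json.loads(lastValidCompletePart)
            return json.dumps(parsed, indent=2, ensure_ascii=False)  # re-serialize
        except json.JSONDecodeError:
            return lastValidCompletePart                             # return as-is
    return jsonBase or ""
```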

View file

@@ -1,665 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
AI Call Looping Module
Handles AI calls with looping and repair logic, including:
- Looping with JSON repair and continuation
- KPI definition and tracking
- Progress tracking and iteration management
FLOW LOGIC
VARIABLES:
- jsonBase: str (merged JSON so far, starts empty)
- lastValidCompletePart: str (fallback for failures)
- mergeFailCount: int = 0 (max 3)
FLOW:
    1. BUILD PROMPT
       - First: original prompt
       - Next: buildContinuationContext(lastRawResponse)
    2. CALL AI → response fragment
    4. MERGE jsonBase + response
       FAILS: repeat prompt, fails++ (if >=3 return fallback)
       SUCCEEDS: try parse
           SUCCEEDS: FINISHED
           FAILS: step 5
    5. GET CONTEXTS (merge OK, parse failed)
       getContexts(mergedJson)
       - If no cut point: overlapContext = ""
       - Store contexts for next iteration
    6. DECIDE
       jsonParsingSuccess=true AND overlapContext="":
           FINISHED. return completePart
       jsonParsingSuccess=true AND overlapContext!="":
           CONTINUE, fails=0
       ELSE: repeat prompt, fails++
"""
import json
import logging
from typing import Dict, Any, List, Optional, Callable
from modules.datamodels.datamodelAi import (
AiCallRequest, AiCallOptions
)
from modules.datamodels.datamodelExtraction import ContentPart
from .subJsonResponseHandling import JsonResponseHandler
from .subLoopingUseCases import LoopingUseCaseRegistry
from modules.workflows.processing.shared.stateTools import checkWorkflowStopped
from modules.shared.jsonContinuation import getContexts
from modules.shared.jsonUtils import (
    buildContinuationContext, extractJsonString, tryParseJson,
    closeJsonStructures, stripCodeFences, normalizeJsonText
)
logger = logging.getLogger(__name__)
class AiCallLooper:
"""Handles AI calls with looping and repair logic."""
def __init__(self, services, aiService, responseParser):
"""Initialize AiCallLooper with service center, AI service, and response parser access."""
self.services = services
self.aiService = aiService
self.responseParser = responseParser
self.useCaseRegistry = LoopingUseCaseRegistry() # Initialize use case registry
async def callAiWithLooping(
self,
prompt: str,
options: AiCallOptions,
debugPrefix: str = "ai_call",
promptBuilder: Optional[Callable] = None,
promptArgs: Optional[Dict[str, Any]] = None,
operationId: Optional[str] = None,
userPrompt: Optional[str] = None,
contentParts: Optional[List[ContentPart]] = None, # ARCHITECTURE: Support ContentParts for large content
useCaseId: str = None # REQUIRED: Explicit use case ID - no auto-detection, no fallback
) -> str:
"""
Shared core function for AI calls with repair-based looping system.
Automatically repairs broken JSON and continues generation seamlessly.
Args:
prompt: The prompt to send to AI
options: AI call configuration options
debugPrefix: Prefix for debug file names
promptBuilder: Optional function to rebuild prompts for continuation
promptArgs: Optional arguments for prompt builder
operationId: Optional operation ID for progress tracking
userPrompt: Optional user prompt for KPI definition
contentParts: Optional content parts for first iteration
useCaseId: REQUIRED: Explicit use case ID - no auto-detection, no fallback
Returns:
Complete AI response after all iterations
"""
# REQUIRED: useCaseId must be provided - no auto-detection, no fallback
if not useCaseId:
errorMsg = (
"useCaseId is REQUIRED for callAiWithLooping. "
"No auto-detection - must explicitly specify use case ID. "
f"Available use cases: {list(self.useCaseRegistry.useCases.keys())}"
)
logger.error(errorMsg)
raise ValueError(errorMsg)
# Validate use case exists
useCase = self.useCaseRegistry.get(useCaseId)
if not useCase:
errorMsg = (
f"Use case '{useCaseId}' not found in registry. "
f"Available use cases: {list(self.useCaseRegistry.useCases.keys())}"
)
logger.error(errorMsg)
raise ValueError(errorMsg)
maxIterations = 50 # Prevent infinite loops
iteration = 0
allSections = [] # Accumulate all sections across iterations
lastRawResponse = None # Store last raw JSON response for continuation
# JSON Base Iteration System:
# - jsonBase: the merged JSON string (replaces accumulatedDirectJson array)
# - After each iteration, new response is merged with jsonBase
# - On merge success: check if complete, store contexts for next iteration
# - On merge fail: retry with same prompt, increment fails
jsonBase = None # Merged JSON string (starts None, set on first response)
# Merge fail tracking - stop after 3 consecutive merge failures
MAX_MERGE_FAILS = 3
mergeFailCount = 0 # Global counter for merge failures across entire loop
lastValidCompletePart = None # Store last successfully parsed completePart for fallback
# Get parent operation ID for iteration operations (parentId should be operationId, not log entry ID)
parentOperationId = operationId # Use the parent's operationId directly
while iteration < maxIterations:
iteration += 1
# Create separate operation for each iteration with parent reference
iterationOperationId = None
if operationId:
iterationOperationId = f"{operationId}_iter_{iteration}"
self.services.chat.progressLogStart(
iterationOperationId,
"AI Call",
f"Iteration {iteration}",
"",
parentOperationId=parentOperationId
)
# Build iteration prompt
# CRITICAL: Build continuation prompt if we have sections OR if we have a previous response (even if broken)
# This ensures continuation prompts are built even when JSON is so broken that no sections can be extracted
if (len(allSections) > 0 or lastRawResponse) and promptBuilder and promptArgs:
# Extract templateStructure and basePrompt from promptArgs (REQUIRED)
templateStructure = promptArgs.get("templateStructure")
if not templateStructure:
raise ValueError(
f"templateStructure is REQUIRED in promptArgs for use case '{useCaseId}'. "
"Prompt creation functions must return (prompt, templateStructure) tuple."
)
basePrompt = promptArgs.get("basePrompt")
if not basePrompt:
# Fallback: use prompt parameter (should be the same)
basePrompt = prompt
logger.warning(
f"basePrompt not found in promptArgs for use case '{useCaseId}', "
"using prompt parameter instead. This may indicate a bug."
)
# This is a continuation - build continuation context with raw JSON and rebuild prompt
continuationContext = buildContinuationContext(
allSections, lastRawResponse, useCaseId, templateStructure
)
if not lastRawResponse:
logger.warning(f"Iteration {iteration}: No previous response available for continuation!")
# Store valid completePart from continuation context for fallback on merge failures
# Use getContexts to check if completePart is parseable and store it
if lastRawResponse and not lastValidCompletePart:
try:
contexts = getContexts(lastRawResponse)
if contexts.jsonParsingSuccess and contexts.completePart:
lastValidCompletePart = contexts.completePart
logger.debug(f"Iteration {iteration}: Stored initial valid completePart ({len(lastValidCompletePart)} chars)")
except Exception as e:
logger.debug(f"Iteration {iteration}: Failed to extract completePart: {e}")
# Unified prompt builder call: Continuation builders only need continuationContext, templateStructure, and basePrompt
# All initial context (section, userPrompt, etc.) is already in basePrompt, so promptArgs is not needed
# Extract templateStructure and basePrompt from promptArgs (they're explicit parameters)
iterationPrompt = await promptBuilder(
continuationContext=continuationContext,
templateStructure=templateStructure,
basePrompt=basePrompt
)
else:
# First iteration - use original prompt
iterationPrompt = prompt
# Make AI call
try:
checkWorkflowStopped(self.services)
if iterationOperationId:
self.services.chat.progressLogUpdate(iterationOperationId, 0.3, "Calling AI model")
# ARCHITECTURE: Pass ContentParts directly to AiCallRequest
# This allows model-aware chunking to handle large content properly
# ContentParts are only passed in first iteration (continuations don't need them)
request = AiCallRequest(
prompt=iterationPrompt,
context="",
options=options,
contentParts=contentParts if iteration == 1 else None # Only pass ContentParts in first iteration
)
# Write the ACTUAL prompt sent to AI
# For section content generation: write prompt for first iteration and continuation iterations
# For document generation: write prompt for each iteration
isSectionContent = "_section_" in debugPrefix
if iteration == 1:
self.services.utils.writeDebugFile(iterationPrompt, f"{debugPrefix}_prompt")
elif isSectionContent:
# Save continuation prompts for section_content debugging
self.services.utils.writeDebugFile(iterationPrompt, f"{debugPrefix}_prompt_iteration_{iteration}")
else:
# Document generation - save all iteration prompts
self.services.utils.writeDebugFile(iterationPrompt, f"{debugPrefix}_prompt_iteration_{iteration}")
response = await self.aiService.callAi(request)
result = response.content
# Track bytes for progress reporting
bytesReceived = len(result.encode('utf-8')) if result else 0
totalBytesSoFar = sum(len(section.get('content', '').encode('utf-8')) if isinstance(section.get('content'), str) else 0 for section in allSections) + bytesReceived
# Update progress after AI call with byte information
if iterationOperationId:
# Format bytes for display (kB or MB)
if totalBytesSoFar < 1024:
bytesDisplay = f"{totalBytesSoFar}B"
elif totalBytesSoFar < 1024 * 1024:
bytesDisplay = f"{totalBytesSoFar / 1024:.1f}kB"
else:
bytesDisplay = f"{totalBytesSoFar / (1024 * 1024):.1f}MB"
self.services.chat.progressLogUpdate(iterationOperationId, 0.6, f"AI response received ({bytesDisplay})")
# Write raw AI response to debug file
# For section content generation: write response for first iteration and continuation iterations
# For document generation: write response for each iteration
if iteration == 1:
self.services.utils.writeDebugFile(result, f"{debugPrefix}_response")
elif isSectionContent:
# Save continuation responses for section_content debugging
self.services.utils.writeDebugFile(result, f"{debugPrefix}_response_iteration_{iteration}")
else:
# Document generation - save all iteration responses
self.services.utils.writeDebugFile(result, f"{debugPrefix}_response_iteration_{iteration}")
# Note: Stats are now stored centrally in callAi() - no need to duplicate here
# Check for error response using generic error detection (errorCount > 0 or modelName == "error")
if hasattr(response, 'errorCount') and response.errorCount > 0:
errorMsg = f"Iteration {iteration}: Error response detected (errorCount={response.errorCount}), stopping loop: {result[:200] if result else 'empty'}"
logger.error(errorMsg)
break
if hasattr(response, 'modelName') and response.modelName == "error":
errorMsg = f"Iteration {iteration}: Error response detected (modelName=error), stopping loop: {result[:200] if result else 'empty'}"
logger.error(errorMsg)
break
if not result or not result.strip():
logger.warning(f"Iteration {iteration}: Empty response, stopping")
break
# Check if this is a text response (not document generation)
# Text responses don't need JSON parsing - return immediately after first successful response
isTextResponse = (promptBuilder is None and promptArgs is None) or debugPrefix == "text"
if isTextResponse:
# For text responses, return the text immediately - no JSON parsing needed
logger.info(f"Iteration {iteration}: Text response received, returning immediately")
if iterationOperationId:
self.services.chat.progressLogFinish(iterationOperationId, True)
return result
# NOTE: Do NOT update lastRawResponse here!
# lastRawResponse should only be updated after successful merge
# This ensures retry iterations use the correct base context
# Handle use cases that return JSON directly (no section extraction needed)
# Check if use case supports direct return (all registered use cases do)
if useCase and not useCase.requiresExtraction:
# =====================================================================
# ITERATION FLOW (Simplified)
# =====================================================================
# Step 4: MERGE jsonBase + new response
# - FAILS: repeat prompt, increment fail count (if >=3 return fallback)
# - SUCCEEDS: try parse
# - SUCCEEDS: FINISHED
# - FAILS: proceed to Step 5
# Step 5: GET CONTEXTS (merge OK, parse failed)
# - getContexts() with repair
# - If no cut point: overlapContext = ""
# Step 6: DECIDE
# - jsonParsingSuccess=true AND overlapContext="": FINISHED
# - jsonParsingSuccess=true AND overlapContext!="": continue, fails=0
# - ELSE: repeat prompt, increment fail count
# =====================================================================
# STEP 4: MERGE jsonBase + new response
# Use candidateJson to hold merged result until we confirm it's valid
candidateJson = None
if jsonBase is None:
# First iteration - candidate is the current result
candidateJson = result
logger.debug(f"Iteration {iteration}: First response, candidateJson ({len(candidateJson)} chars)")
else:
# Merge jsonBase with new response
logger.info(f"Iteration {iteration}: Merging jsonBase ({len(jsonBase)} chars) with new response ({len(result)} chars)")
mergedJsonString, hasOverlap = JsonResponseHandler.mergeJsonStringsWithOverlap(jsonBase, result)
if not hasOverlap:
# MERGE FAILED - repeat prompt with unchanged jsonBase
mergeFailCount += 1
logger.warning(
f"Iteration {iteration}: Merge failed, no overlap found "
f"(fail {mergeFailCount}/{MAX_MERGE_FAILS})"
)
if mergeFailCount >= MAX_MERGE_FAILS:
# Max failures reached - return last valid completePart
logger.error(
f"Iteration {iteration}: Max merge failures ({MAX_MERGE_FAILS}) reached, "
"returning last valid completePart"
)
if iterationOperationId:
self.services.chat.progressLogFinish(iterationOperationId, False)
if lastValidCompletePart:
try:
extracted = extractJsonString(lastValidCompletePart)
parsed, parseErr, _ = tryParseJson(extracted)
if parseErr is None and parsed:
normalized = self._normalizeJsonStructure(parsed, useCase)
return json.dumps(normalized, indent=2, ensure_ascii=False)
except Exception:
pass
return lastValidCompletePart
else:
# No valid fallback - return whatever we have
return jsonBase if jsonBase else ""
# Not at max failures - retry with same prompt (jsonBase unchanged)
if iterationOperationId:
self.services.chat.progressLogUpdate(
iterationOperationId, 0.7,
f"Merge failed ({mergeFailCount}/{MAX_MERGE_FAILS}), retrying"
)
self.services.chat.progressLogFinish(iterationOperationId, True)
continue
# MERGE SUCCEEDED - set candidate (don't update jsonBase yet!)
candidateJson = mergedJsonString
logger.debug(f"Iteration {iteration}: Merge succeeded, candidateJson ({len(candidateJson)} chars)")
# Update lastRawResponse ONLY after we have a valid candidateJson
# (first iteration or successful merge - NOT on merge failure!)
# This ensures retry iterations use the correct base context
lastRawResponse = candidateJson
# Try direct parse of candidate
try:
extracted = extractJsonString(candidateJson)
parsed, parseErr, _ = tryParseJson(extracted)
if parseErr is None and parsed:
# Direct parse succeeded - FINISHED
# Commit candidate to jsonBase
jsonBase = candidateJson
logger.info(f"Iteration {iteration}: Direct parse succeeded, JSON is complete")
normalized = self._normalizeJsonStructure(parsed, useCase)
result = json.dumps(normalized, indent=2, ensure_ascii=False)
if iterationOperationId:
self.services.chat.progressLogFinish(iterationOperationId, True)
if not useCase.finalResultHandler:
raise ValueError(
f"Use case '{useCaseId}' is missing required 'finalResultHandler' callback."
)
return useCase.finalResultHandler(
result, normalized, extracted, debugPrefix, self.services
)
except Exception as e:
logger.debug(f"Iteration {iteration}: Direct parse failed: {e}")
# STEP 5: GET CONTEXTS (merge OK, parse failed = cut JSON)
# Use candidateJson for context extraction
contexts = getContexts(candidateJson)
overlapInfo = "(empty=complete)" if contexts.overlapContext == "" else f"({len(contexts.overlapContext)} chars)"
logger.debug(
f"Iteration {iteration}: getContexts() -> "
f"jsonParsingSuccess={contexts.jsonParsingSuccess}, "
f"overlapContext={overlapInfo}"
)
# STEP 6: DECIDE based on jsonParsingSuccess and overlapContext
if contexts.jsonParsingSuccess and contexts.overlapContext == "":
# JSON is complete (no cut point) - FINISHED
# Use completePart for final result (closed, repaired JSON)
# No more merging needed, so we don't need the cut version
jsonBase = contexts.completePart
logger.info(f"Iteration {iteration}: jsonParsingSuccess=true, overlapContext='', JSON complete")
# Store and parse completePart
lastValidCompletePart = contexts.completePart
try:
extracted = extractJsonString(contexts.completePart)
parsed, parseErr, _ = tryParseJson(extracted)
if parseErr is None and parsed:
normalized = self._normalizeJsonStructure(parsed, useCase)
result = json.dumps(normalized, indent=2, ensure_ascii=False)
if iterationOperationId:
self.services.chat.progressLogFinish(iterationOperationId, True)
if not useCase.finalResultHandler:
raise ValueError(
f"Use case '{useCaseId}' is missing required 'finalResultHandler' callback."
)
return useCase.finalResultHandler(
result, normalized, extracted, debugPrefix, self.services
)
except Exception as e:
logger.warning(f"Iteration {iteration}: Failed to parse completePart: {e}")
# Fallback: return completePart as-is
if iterationOperationId:
self.services.chat.progressLogFinish(iterationOperationId, True)
return contexts.completePart
elif contexts.jsonParsingSuccess and contexts.overlapContext != "":
# JSON parseable but has cut point - CONTINUE to next iteration
# CRITICAL: Use hierarchyContext (CUT json) as jsonBase for next merge!
# - hierarchyContext = the truncated JSON at cut point (needed for overlap matching)
# - completePart = closed JSON (for validation/fallback only)
# The next AI fragment's overlap must match the CUT point, not closed structures
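# Hedged illustration (hypothetical fragment) of the two context variants:
#   completePart:     '{"elements": [{"type": "paragraph"}]}'  <- repaired and closed
#   hierarchyContext: '{"elements": [{"type": "paragraph"},'   <- still open at the cut
# Only the open variant lets the next fragment's overlap line up at the cut point.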
jsonBase = contexts.hierarchyContext
logger.info(
f"Iteration {iteration}: jsonParsingSuccess=true, overlapContext not empty, "
f"continuing iteration (jsonBase updated to hierarchyContext: {len(jsonBase)} chars)"
)
# Store valid completePart as fallback (different from jsonBase!)
lastValidCompletePart = contexts.completePart
# Reset fail counter on successful progress
mergeFailCount = 0
# Update lastRawResponse for continuation prompt building
# Use the CUT version for prompt context as well
lastRawResponse = jsonBase
if iterationOperationId:
self.services.chat.progressLogUpdate(iterationOperationId, 0.7, "JSON incomplete, requesting continuation")
self.services.chat.progressLogFinish(iterationOperationId, True)
continue
else:
# JSON not parseable after repair - repeat prompt, increment fails
# Do NOT update jsonBase - keep previous valid state
mergeFailCount += 1
logger.warning(
f"Iteration {iteration}: jsonParsingSuccess=false, "
f"repeat prompt (fail {mergeFailCount}/{MAX_MERGE_FAILS})"
)
if mergeFailCount >= MAX_MERGE_FAILS:
# Max failures reached - return last valid completePart
logger.error(
f"Iteration {iteration}: Max failures ({MAX_MERGE_FAILS}) reached, "
"returning last valid completePart"
)
if iterationOperationId:
self.services.chat.progressLogFinish(iterationOperationId, False)
if lastValidCompletePart:
try:
extracted = extractJsonString(lastValidCompletePart)
parsed, parseErr, _ = tryParseJson(extracted)
if parseErr is None and parsed:
normalized = self._normalizeJsonStructure(parsed, useCase)
return json.dumps(normalized, indent=2, ensure_ascii=False)
except Exception:
pass
return lastValidCompletePart
else:
return jsonBase if jsonBase else ""
# Not at max - retry with same prompt
# Do NOT update jsonBase or lastRawResponse - keep previous for retry
if iterationOperationId:
self.services.chat.progressLogUpdate(
iterationOperationId, 0.7,
f"Parse failed ({mergeFailCount}/{MAX_MERGE_FAILS}), retrying"
)
self.services.chat.progressLogFinish(iterationOperationId, True)
continue
except Exception as e:
logger.error(f"Error in AI call iteration {iteration}: {str(e)}")
if iterationOperationId:
self.services.chat.progressLogFinish(iterationOperationId, False)
break
if iteration >= maxIterations:
logger.warning(f"AI call stopped after maximum iterations ({maxIterations})")
# This code path should never be reached because all registered use cases
# return early when JSON is complete. This would only execute for use cases that
# require section extraction, but no such use cases are currently registered.
logger.error(f"Unexpected code path: reached end of loop without return for use case '{useCaseId}'")
return result if result else ""
def _isJsonStringIncomplete(self, jsonString: str) -> bool:
"""
Check if JSON string is incomplete (truncated) BEFORE closing/parsing.
This is critical because if JSON is truncated, closing it makes it appear complete,
but we need to detect the truncation to continue iteration.
Args:
jsonString: JSON string to check
Returns:
True if JSON string appears incomplete/truncated, False otherwise
"""
if not jsonString or not jsonString.strip():
return False
# Normalize JSON string
normalized = stripCodeFences(normalizeJsonText(jsonString)).strip()
if not normalized:
return False
# Find first '{' or '[' to start
startIdx = -1
for i, char in enumerate(normalized):
if char in '{[':
startIdx = i
break
if startIdx == -1:
return False
jsonContent = normalized[startIdx:]
# Check if structures are balanced (all opened structures are closed)
braceCount = 0
bracketCount = 0
inString = False
escapeNext = False
for char in jsonContent:
if escapeNext:
escapeNext = False
continue
if char == '\\':
escapeNext = True
continue
if char == '"':
inString = not inString
continue
if not inString:
if char == '{':
braceCount += 1
elif char == '}':
braceCount -= 1
elif char == '[':
bracketCount += 1
elif char == ']':
bracketCount -= 1
# If structures are unbalanced, JSON is incomplete
if braceCount > 0 or bracketCount > 0:
return True
# Check if JSON ends with incomplete value (e.g., unclosed string, incomplete number, trailing comma)
trimmed = jsonContent.rstrip()
if not trimmed:
return False
# Check for trailing comma (might indicate incomplete)
if trimmed.endswith(','):
# Trailing comma might indicate incomplete, but could also be valid
# Check if there's a closing bracket/brace after the comma
return False # Trailing comma alone doesn't mean incomplete
# Check if ends with incomplete string (odd number of quotes)
quoteCount = jsonContent.count('"')
if quoteCount % 2 == 1:
# Odd number of quotes - string is not closed
return True
# Check if the string ends mid-value (e.g., ends with "417 instead of the complete "4170.41719"])
# Look for patterns that suggest truncation:
# - Ends with incomplete number (e.g., "417)
# - Ends with incomplete array element (e.g., ["417)
# - Ends with incomplete object property (e.g., {"key": "val)
# If JSON parses successfully without closing, it's complete
parsed, parseErr, _ = tryParseJson(jsonContent)
if parseErr is None:
# Parses successfully - it's complete
return False
# If it doesn't parse, try closing it and see if that helps
closed = closeJsonStructures(jsonContent)
parsedClosed, parseErrClosed, _ = tryParseJson(closed)
if parseErrClosed is None:
# Only parses after closing - it was incomplete
return True
# Doesn't parse even after closing - might be malformed, but assume incomplete to be safe
return True
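# Hedged examples of the check above (inputs are illustrative, assuming the
# normalization helpers pass them through unchanged):
#   self._isJsonStringIncomplete('{"a": [1, 2')    -> True   (open brace/bracket never closed)
#   self._isJsonStringIncomplete('{"a": [1, 2]}')  -> False  (parses as-is)
#   self._isJsonStringIncomplete('plain text')     -> False  (no '{' or '[' found)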
def _normalizeJsonStructure(self, parsed: Any, useCase) -> Any:
"""
Normalize JSON structure to ensure consistent format before merging.
Handles different response formats and converts them to expected structure.
Args:
parsed: Parsed JSON object (can be dict, list, or primitive)
useCase: LoopingUseCase instance with jsonNormalizer callback
Returns:
Normalized JSON structure
"""
# Use callback to normalize JSON structure (REQUIRED - no fallback)
if not useCase or not useCase.jsonNormalizer:
raise ValueError(
f"Use case '{useCase.useCaseId if useCase else 'unknown'}' is missing required 'jsonNormalizer' callback. "
"All use cases must provide a jsonNormalizer function."
)
return useCase.jsonNormalizer(parsed, useCase.useCaseId)


@@ -1,721 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Content Extraction Module
Handles content extraction and preparation, including:
- Extracting content from documents based on intents
- Processing pre-extracted documents
- Vision AI for image text extraction
- AI processing of text content
"""
import json
import logging
import base64
from typing import Dict, Any, List, Optional
from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelExtraction import ContentPart, DocumentIntent, ExtractionOptions, MergeStrategy
from modules.workflows.processing.shared.stateTools import checkWorkflowStopped
logger = logging.getLogger(__name__)
class ContentExtractor:
"""Handles content extraction and preparation."""
def __init__(self, services, aiService, intentAnalyzer):
"""Initialize ContentExtractor with service center, AI service, and intent analyzer access."""
self.services = services
self.aiService = aiService
self.intentAnalyzer = intentAnalyzer
async def extractAndPrepareContent(
self,
documents: List[ChatDocument],
documentIntents: List[DocumentIntent],
parentOperationId: str,
getIntentForDocument: callable
) -> List[ContentPart]:
"""
Phase 5B: Extracts content based on intents and prepares ContentParts with metadata.
Returns a list of ContentParts in the appropriate format.
IMPORTANT: A single document can produce multiple ContentParts when it has multiple intents.
Example: an image with intents=["extract", "render"] produces:
- ContentPart(contentFormat="object", ...) for rendering
- ContentPart(contentFormat="extracted", ...) for text analysis
Args:
documents: List of documents to process
documentIntents: List of DocumentIntent objects
parentOperationId: Parent operation ID for the ChatLog hierarchy
getIntentForDocument: Callable to get intent for document ID
Returns:
List of ContentParts with complete metadata
"""
# Create operation ID for extraction
extractionOperationId = f"{parentOperationId}_content_extraction"
# Start ChatLog with parent reference
self.services.chat.progressLogStart(
extractionOperationId,
"Content Extraction",
"Extraction",
f"Extracting from {len(documents)} documents",
parentOperationId=parentOperationId
)
try:
allContentParts = []
for document in documents:
checkWorkflowStopped(self.services)
# Check if document is already a ContentExtracted document (pre-extracted JSON)
logger.debug(f"Checking document {document.id} ({document.fileName}, mimeType={document.mimeType}) for pre-extracted content")
preExtracted = self.intentAnalyzer.resolvePreExtractedDocument(document)
if preExtracted:
logger.info(f"✅ Found pre-extracted document: {document.fileName} -> Original: {preExtracted['originalDocument']['fileName']}")
logger.info(f" Pre-extracted document ID: {document.id}, Original document ID: {preExtracted['originalDocument']['id']}")
logger.info(f" ContentParts count: {len(preExtracted['contentExtracted'].parts) if preExtracted['contentExtracted'].parts else 0}")
# Use the already extracted ContentParts directly
contentExtracted = preExtracted["contentExtracted"]
# IMPORTANT: The intent must be looked up for the JSON document, not for the original
# (intent analysis already maps back to the JSON document ID)
intent = getIntentForDocument(document.id, documentIntents)
logger.info(f" Intent lookup for document {document.id}: found={intent is not None}")
if intent:
logger.info(f" Intent: {intent.intents}, extractionPrompt: {intent.extractionPrompt[:100] if intent.extractionPrompt else None}...")
else:
logger.warning(f" ⚠️ No intent found for pre-extracted document {document.id}! Available intent documentIds: {[i.documentId for i in documentIntents]}")
if contentExtracted.parts:
# CRITICAL: Process pre-extracted parts - analyze structure parts for nested content
processedParts = []
for part in contentExtracted.parts:
# Skip empty parts (containers without data)
if not part.data or (isinstance(part.data, str) and len(part.data.strip()) == 0):
if part.typeGroup == "container":
continue # Skip empty containers
# CRITICAL: Check if structure part contains nested parts (e.g., JSON with documentData.parts)
if part.typeGroup == "structure" and part.mimeType == "application/json" and part.data:
nestedParts = self._extractNestedPartsFromStructure(part, document, preExtracted, intent)
if nestedParts:
# Replace structure part with extracted nested parts
processedParts.extend(nestedParts)
logger.info(f"✅ Extracted {len(nestedParts)} nested parts from structure part {part.id}")
continue # Skip original structure part
# Keep original part if no nested parts found
processedParts.append(part)
# Use processed parts (with nested parts extracted)
for part in processedParts:
if not part.metadata:
part.metadata = {}
# Ensure metadata is complete
if "documentId" not in part.metadata:
part.metadata["documentId"] = document.id
# IMPORTANT: Check the intent for this part
partIntent = intent.intents if intent else ["extract"]
# Debug logging for intent processing
logger.debug(f"Processing part {part.id}: typeGroup={part.typeGroup}, intents={partIntent}, hasData={bool(part.data)}, dataLength={len(str(part.data)) if part.data else 0}")
# IMPORTANT: A part can have multiple intents - create one ContentPart per intent
# Generic intent handling for ALL content types
hasReferenceIntent = "reference" in partIntent
hasRenderIntent = "render" in partIntent
hasExtractIntent = "extract" in partIntent
hasPartData = bool(part.data) and (not isinstance(part.data, str) or len(part.data.strip()) > 0)
logger.debug(f"Part {part.id}: reference={hasReferenceIntent}, render={hasRenderIntent}, extract={hasExtractIntent}, hasData={hasPartData}")
# SAFETY: For images with any intent, always ensure render is included
# This ensures the image object part is always available for later rendering
isImage = part.typeGroup == "image" or (part.mimeType and part.mimeType.startswith("image/"))
if isImage and hasPartData and not hasRenderIntent:
logger.info(f"🖼️ Auto-adding render intent for image {part.id} (original intents: {partIntent})")
hasRenderIntent = True
# Track whether the original part has already been added
originalPartAdded = False
# 1. Reference intent: create a reference ContentPart
if hasReferenceIntent:
referencePart = ContentPart(
id=f"ref_{document.id}_{part.id}",
label=f"Reference: {part.label or 'Content'}",
typeGroup="reference",
mimeType=part.mimeType or "application/octet-stream",
data="", # Leer - nur Referenz
metadata={
"contentFormat": "reference",
"documentId": document.id,
"documentReference": f"docItem:{document.id}:{preExtracted['originalDocument']['fileName']}",
"intent": "reference",
"usageHint": f"Reference: {preExtracted['originalDocument']['fileName']}",
"originalFileName": preExtracted["originalDocument"]["fileName"]
}
)
allContentParts.append(referencePart)
logger.debug(f"✅ Created reference ContentPart for {part.id}")
# 2. Render intent: create an object ContentPart (for binary/image rendering)
if hasRenderIntent and hasPartData:
# Check whether it is a binary/image (can be rendered)
isRenderable = (
part.typeGroup == "image" or
part.typeGroup == "binary" or
(part.mimeType and (
part.mimeType.startswith("image/") or
part.mimeType.startswith("video/") or
part.mimeType.startswith("audio/") or
self._isBinary(part.mimeType)
))
)
if isRenderable:
objectPart = ContentPart(
id=f"obj_{document.id}_{part.id}",
label=f"Object: {part.label or 'Content'}",
typeGroup=part.typeGroup,
mimeType=part.mimeType or "application/octet-stream",
data=part.data, # Base64/binary data is already present
metadata={
"contentFormat": "object",
"documentId": document.id,
"intent": "render",
"usageHint": f"Render as visual element: {preExtracted['originalDocument']['fileName']}",
"originalFileName": preExtracted["originalDocument"]["fileName"],
"relatedExtractedPartId": f"extracted_{document.id}_{part.id}" if hasExtractIntent else None
}
)
allContentParts.append(objectPart)
logger.debug(f"✅ Created object ContentPart for {part.id} (render intent)")
else:
logger.warning(f"⚠️ Part {part.id} has render intent but is not renderable (typeGroup={part.typeGroup}, mimeType={part.mimeType})")
elif hasRenderIntent and not hasPartData:
logger.warning(f"⚠️ Part {part.id} has render intent but no data, skipping render part")
# 3. Extract intent: create an extracted ContentPart (NO AI processing here - happens during section generation)
if hasExtractIntent:
# For images: Keep as image part with extract intent - Vision AI extraction happens during section generation
if part.typeGroup == "image" and hasPartData:
logger.info(f"📷 Image {part.id} with extract intent - will be processed with Vision AI during section generation")
# Keep image part as-is, mark with extract intent
part.metadata.update({
"contentFormat": "extracted", # Marked for extraction, but not yet extracted
"intent": "extract",
"originalFileName": preExtracted["originalDocument"]["fileName"],
"relatedObjectPartId": f"obj_{document.id}_{part.id}" if hasRenderIntent else None,
"extractionPrompt": intent.extractionPrompt if intent and intent.extractionPrompt else "Extract all text content from this image.",
"needsVisionExtraction": True # Flag to indicate Vision AI extraction needed
})
allContentParts.append(part)
originalPartAdded = True
else:
# For text/table content: Use directly as extracted (no AI processing here)
# AI processing with extractionPrompt happens during section generation
if not originalPartAdded:
part.metadata.update({
"contentFormat": "extracted",
"intent": "extract",
"fromExtractContent": True,
"skipExtraction": True, # Already extracted (raw extraction)
"originalFileName": preExtracted["originalDocument"]["fileName"],
"relatedObjectPartId": f"obj_{document.id}_{part.id}" if hasRenderIntent else None,
"extractionPrompt": intent.extractionPrompt if intent and intent.extractionPrompt else None
})
# Make sure contentFormat is set
if "contentFormat" not in part.metadata:
part.metadata["contentFormat"] = "extracted"
allContentParts.append(part)
originalPartAdded = True
logger.debug(f"✅ Using pre-extracted ContentPart {part.id} as extracted (no AI processing needed)")
# 4. Fallback: no intent present or the part was not added yet
# (should not normally happen, since the default is "extract")
if not hasReferenceIntent and not hasRenderIntent and not hasExtractIntent and not originalPartAdded:
logger.warning(f"⚠️ Part {part.id} has no recognized intents, adding as extracted by default")
part.metadata.update({
"contentFormat": "extracted",
"intent": "extract",
"fromExtractContent": True,
"skipExtraction": True,
"originalFileName": preExtracted["originalDocument"]["fileName"]
})
allContentParts.append(part)
originalPartAdded = True
logger.info(f"✅ Using {len([p for p in contentExtracted.parts if p.data and len(str(p.data)) > 0])} pre-extracted ContentParts from ContentExtracted document {document.fileName}")
logger.info(f" Original document: {preExtracted['originalDocument']['fileName']}")
continue # Skip normal extraction for this document
# Check if it's standardized JSON format (has "documents" or "sections")
if document.mimeType == "application/json":
try:
docBytes = self.services.interfaceDbComponent.getFileData(document.fileId)
if docBytes:
docData = docBytes.decode('utf-8')
jsonData = json.loads(docData)
if isinstance(jsonData, dict) and ("documents" in jsonData or "sections" in jsonData):
logger.info(f"Document is already in standardized JSON format, using as reference")
# Create reference ContentPart for structured JSON
contentPart = ContentPart(
id=f"ref_{document.id}",
label=f"Reference: {document.fileName}",
typeGroup="structure",
mimeType="application/json",
data=docData,
metadata={
"contentFormat": "reference",
"documentId": document.id,
"documentReference": f"docItem:{document.id}:{document.fileName}",
"skipExtraction": True,
"intent": "reference"
}
)
allContentParts.append(contentPart)
logger.info(f"✅ Using JSON document directly without extraction")
continue # Skip normal extraction for this document
except Exception as e:
logger.warning(f"Could not parse JSON document {document.fileName}, will extract normally: {str(e)}")
# Continue with normal extraction
# Normal extraction path
intent = getIntentForDocument(document.id, documentIntents)
if not intent:
# Try to find intent by similar UUID (fix for AI UUID hallucination)
correctedIntent = self._findIntentBySimilarId(document.id, documentIntents)
if correctedIntent:
logger.warning(f"Found intent for document {document.id} using UUID correction (original: {correctedIntent.documentId})")
# Create new intent with correct document ID
intent = DocumentIntent(
documentId=document.id,
intents=correctedIntent.intents,
extractionPrompt=correctedIntent.extractionPrompt,
reasoning=f"Intent matched by UUID similarity (original: {correctedIntent.documentId})"
)
else:
# Default: extract for all documents without an intent
logger.warning(f"No intent found for document {document.id}, using default 'extract'")
intent = DocumentIntent(
documentId=document.id,
intents=["extract"],
extractionPrompt="Extract all content from the document",
reasoning="Default intent: no specific intent found"
)
# IMPORTANT: Check all intents - one document can produce multiple ContentParts
if "reference" in intent.intents:
# Create a reference ContentPart
contentPart = ContentPart(
id=f"ref_{document.id}",
label=f"Reference: {document.fileName}",
typeGroup="reference",
mimeType=document.mimeType,
data="",
metadata={
"contentFormat": "reference",
"documentId": document.id,
"documentReference": f"docItem:{document.id}:{document.fileName}",
"intent": "reference",
"usageHint": f"Reference document: {document.fileName}"
}
)
allContentParts.append(contentPart)
# WICHTIG: "render" und "extract" können beide vorhanden sein!
# In diesem Fall erzeugen wir BEIDE ContentParts
# SAFETY: For images with any intent, always create object part for later rendering
isImageDocument = document.mimeType and document.mimeType.startswith("image/")
shouldAutoRender = isImageDocument and "render" not in intent.intents and ("extract" in intent.intents or "reference" in intent.intents)
if shouldAutoRender:
logger.info(f"🖼️ Auto-adding render for image document {document.id} (original intents: {intent.intents})")
if "render" in intent.intents or shouldAutoRender:
# For images/binaries: extract as an object
if document.mimeType.startswith("image/") or self._isBinary(document.mimeType):
try:
# Load binary data (getFileData is not async - no await needed)
binaryData = self.services.interfaceDbComponent.getFileData(document.fileId)
if not binaryData:
logger.warning(f"No binary data found for document {document.id}")
continue
base64Data = base64.b64encode(binaryData).decode('utf-8')
contentPart = ContentPart(
id=f"obj_{document.id}",
label=f"Object: {document.fileName}",
typeGroup="image" if document.mimeType.startswith("image/") else "binary",
mimeType=document.mimeType,
data=base64Data,
metadata={
"contentFormat": "object",
"documentId": document.id,
"intent": "render",
"usageHint": f"Render as visual element: {document.fileName}",
"originalFileName": document.fileName,
# Link to the extracted part (if present)
"relatedExtractedPartId": f"ext_{document.id}" if "extract" in intent.intents else None
}
)
allContentParts.append(contentPart)
except Exception as e:
logger.error(f"Failed to load binary data for document {document.id}: {str(e)}")
if "extract" in intent.intents:
# Extract content via the extraction service
extractionPrompt = intent.extractionPrompt or "Extract all content from the document"
# Debug log (harmonized)
self.services.utils.writeDebugFile(
extractionPrompt,
f"content_extraction_prompt_{document.id}"
)
# Run the extraction
extractionOptions = ExtractionOptions(
prompt=extractionPrompt,
mergeStrategy=MergeStrategy()
)
# extractContent is not async - no await needed
checkWorkflowStopped(self.services)
extractedResults = self.services.extraction.extractContent(
[document],
extractionOptions,
operationId=extractionOperationId,
parentOperationId=extractionOperationId
)
# Convert extracted results into ContentParts with metadata
# Check if object part exists (either explicit render or auto-render for images)
hasObjectPart = "render" in intent.intents or shouldAutoRender
for extracted in extractedResults:
for part in extracted.parts:
# Mark as extracted format
part.metadata.update({
"contentFormat": "extracted",
"documentId": document.id,
"extractionPrompt": extractionPrompt,
"intent": "extract",
"usageHint": f"Use extracted content from {document.fileName}",
# Link to the object part (if present - including auto-render for images)
"relatedObjectPartId": f"obj_{document.id}" if hasObjectPart else None
})
# For images: Mark that Vision AI extraction is needed during section generation
if part.typeGroup == "image":
part.metadata["needsVisionExtraction"] = True
logger.info(f"📷 Image part {part.id} marked for Vision AI extraction during section generation")
# Make sure the ID is unique (if an object part exists)
if hasObjectPart:
part.id = f"ext_{document.id}_{part.id}"
allContentParts.append(part)
# Debug log (harmonized)
self.services.utils.writeDebugFile(
json.dumps([part.dict() for part in allContentParts], indent=2, default=str),
"content_extraction_result"
)
# State 2 Validation: Validate and auto-fix ContentParts
validatedParts = []
for part in allContentParts:
# Validation 2.1: Skip ContentParts without documentId
if not part.metadata.get("documentId"):
logger.warning(f"Skipping ContentPart {part.id} - missing documentId in metadata")
continue
# Validation 2.2: Skip ContentParts with invalid contentFormat
contentFormat = part.metadata.get("contentFormat")
if contentFormat not in ["extracted", "object", "reference"]:
logger.warning(
f"Skipping ContentPart {part.id} - invalid contentFormat: {contentFormat}"
)
continue
validatedParts.append(part)
# Finish the ChatLog
self.services.chat.progressLogFinish(extractionOperationId, True)
return validatedParts
except Exception as e:
self.services.chat.progressLogFinish(extractionOperationId, False)
logger.error(f"Error in extractAndPrepareContent: {str(e)}")
raise
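# Hedged illustration of the fan-out described in the docstring above (IDs are
# illustrative): a PNG with intents=["extract", "render"] yields two parts:
#   ContentPart(id="obj_<docId>", metadata={"contentFormat": "object", "intent": "render"})
#   ContentPart(id="ext_<docId>_<partId>", metadata={"contentFormat": "extracted", "intent": "extract"})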
async def extractTextFromImage(self, imagePart: ContentPart, extractionPrompt: str) -> Optional[str]:
"""
Extract text from an image part using Vision AI.
Args:
imagePart: ContentPart with typeGroup="image"
extractionPrompt: Prompt for the text extraction
Returns:
Extracted text, or an "[ERROR: ...]" string on failure
"""
try:
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum
# Final extraction prompt
finalPrompt = extractionPrompt or "Extract all text content from this image. Return only the extracted text, no additional formatting."
# Debug log (harmonized)
self.services.utils.writeDebugFile(
finalPrompt,
f"content_extraction_prompt_image_{imagePart.id}"
)
# Build the AI call request with the image part
request = AiCallRequest(
prompt=finalPrompt,
context="",
options=AiCallOptions(operationType=OperationTypeEnum.IMAGE_ANALYSE),
contentParts=[imagePart]
)
# Use the AI service for Vision AI processing
checkWorkflowStopped(self.services)
response = await self.aiService.callAi(request)
# Debug log for the response (harmonized)
if response and response.content:
self.services.utils.writeDebugFile(
response.content,
f"content_extraction_response_image_{imagePart.id}"
)
if response and response.content:
return response.content.strip()
# No content returned - return an error message for debugging
errorMsg = f"Vision AI extraction failed: No content returned for image {imagePart.id}"
logger.warning(errorMsg)
return f"[ERROR: {errorMsg}]"
except Exception as e:
errorMsg = f"Vision AI extraction failed for image {imagePart.id}: {str(e)}"
logger.error(errorMsg)
import traceback
logger.debug(f"Traceback: {traceback.format_exc()}")
# Return an error message instead of None for debugging
return f"[ERROR: {errorMsg}]"
async def processTextContentWithAi(self, textPart: ContentPart, extractionPrompt: str) -> Optional[str]:
"""
Process text content with AI based on extractionPrompt.
IMPORTANT: Pre-extracted ContentParts from context.extractContent contain RAW extracted text
(e.g., from a PDF text layer). When an "extract" intent is present, this text must be
processed with AI (transformation, structuring, etc.) based on extractionPrompt.
Args:
textPart: ContentPart with typeGroup="text" (or another text-based type)
extractionPrompt: Prompt for the AI processing of the text
Returns:
AI-processed text, or an "[ERROR: ...]" string on failure
"""
try:
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum
# Final extraction prompt
finalPrompt = extractionPrompt or "Process and extract the key information from the following text content."
# Debug log (harmonized) - log prompt with text preview
textPreview = textPart.data[:500] + "..." if textPart.data and len(textPart.data) > 500 else (textPart.data or "")
promptWithContext = f"{finalPrompt}\n\n--- Text Content (preview) ---\n{textPreview}"
self.services.utils.writeDebugFile(
promptWithContext,
f"content_extraction_prompt_text_{textPart.id}"
)
# Build a text ContentPart for AI processing
# Use the existing text as input
textContentPart = ContentPart(
id=textPart.id,
label=textPart.label,
typeGroup="text",
mimeType="text/plain",
data=textPart.data if textPart.data else "",
metadata=textPart.metadata.copy() if textPart.metadata else {}
)
# Build the AI call request with the text part
request = AiCallRequest(
prompt=finalPrompt,
context="",
options=AiCallOptions(operationType=OperationTypeEnum.DATA_EXTRACT),
contentParts=[textContentPart]
)
# Use the AI service for text processing
checkWorkflowStopped(self.services)
response = await self.aiService.callAi(request)
# Debug log for the response (harmonized)
if response and response.content:
self.services.utils.writeDebugFile(
response.content,
f"content_extraction_response_text_{textPart.id}"
)
if response and response.content:
return response.content.strip()
# No content returned - return an error message for debugging
errorMsg = f"AI text processing failed: No content returned for text part {textPart.id}"
logger.warning(errorMsg)
return f"[ERROR: {errorMsg}]"
except Exception as e:
errorMsg = f"AI text processing failed for text part {textPart.id}: {str(e)}"
logger.error(errorMsg)
import traceback
logger.debug(f"Traceback: {traceback.format_exc()}")
# Return an error message instead of None for debugging
return f"[ERROR: {errorMsg}]"
def _isBinary(self, mimeType: str) -> bool:
"""Prüfe ob MIME-Type binary ist."""
binaryTypes = [
"application/octet-stream",
"application/pdf",
"application/zip",
"application/x-zip-compressed"
]
return mimeType in binaryTypes or mimeType.startswith("image/") or mimeType.startswith("video/") or mimeType.startswith("audio/")
def _extractNestedPartsFromStructure(
self,
structurePart: ContentPart,
document: ChatDocument,
preExtracted: Dict[str, Any],
intent: Optional[Any]
) -> List[ContentPart]:
"""
Extract nested parts from a structure ContentPart (e.g., JSON with documentData.parts).
This is a generic function that analyzes pre-processed ContentParts and extracts
any nested parts that are embedded in structure data (typically JSON).
Works with standard ContentExtracted format: documentData.parts array.
Each nested part is extracted as a separate ContentPart with proper metadata.
Args:
structurePart: ContentPart with typeGroup="structure" containing nested parts
document: The document this part belongs to
preExtracted: Pre-extracted document metadata
intent: Document intent for nested parts
Returns:
List of extracted ContentParts, empty if no nested parts found
"""
nestedParts = []
try:
# Parse JSON structure
jsonData = json.loads(structurePart.data)
# Check for standard ContentExtracted format: documentData.parts
if isinstance(jsonData, dict):
documentData = jsonData.get("documentData")
if isinstance(documentData, dict):
parts = documentData.get("parts", [])
if isinstance(parts, list) and len(parts) > 0:
# Extract each nested part
for nestedPartData in parts:
if not isinstance(nestedPartData, dict):
continue
nestedPartId = nestedPartData.get("id") or f"nested_{len(nestedParts)}"
nestedTypeGroup = nestedPartData.get("typeGroup", "text")
nestedMimeType = nestedPartData.get("mimeType", "text/plain")
nestedLabel = nestedPartData.get("label", structurePart.label)
nestedData = nestedPartData.get("data", "")
nestedMetadata = nestedPartData.get("metadata", {})
# Create ContentPart for nested part
nestedPart = ContentPart(
id=f"{structurePart.id}_{nestedPartId}",
parentId=structurePart.id,
label=nestedLabel,
typeGroup=nestedTypeGroup,
mimeType=nestedMimeType,
data=nestedData,
metadata={
**nestedMetadata,
"documentId": document.id,
"fromNestedStructure": True,
"parentStructurePartId": structurePart.id,
"originalFileName": preExtracted["originalDocument"]["fileName"]
}
)
nestedParts.append(nestedPart)
logger.debug(f"✅ Extracted nested part: {nestedPart.id} (typeGroup={nestedTypeGroup}, mimeType={nestedMimeType})")
# If no nested parts found, return empty list (original part will be kept)
if not nestedParts:
logger.debug(f"No nested parts found in structure part {structurePart.id}")
except json.JSONDecodeError as e:
logger.warning(f"Could not parse structure part {structurePart.id} as JSON: {str(e)}")
except Exception as e:
logger.error(f"Error extracting nested parts from structure part {structurePart.id}: {str(e)}")
return nestedParts
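# Hedged sketch of a structure part whose data embeds nested parts (values are
# illustrative; only documentData.parts is inspected above):
#   structurePart.data = '{"documentData": {"parts": [{"id": "p1",
#       "typeGroup": "text", "mimeType": "text/plain", "data": "..."}]}}'
# yields one ContentPart with id "<structurePart.id>_p1" and parentId set to
# the structure part's id.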
def _findIntentBySimilarId(self, documentId: str, documentIntents: List[DocumentIntent]) -> Optional[DocumentIntent]:
"""
Tries to find an intent whose UUID is similar to the given document ID.
This helps with AI UUID hallucinations (e.g., 4451 -> 4551).
Args:
documentId: The document ID to find an intent for
documentIntents: List of all available DocumentIntents
Returns:
DocumentIntent with a similar UUID if found, otherwise None
"""
if not documentId or len(documentId) != 36: # UUID Format: 8-4-4-4-12
return None
# Check whether it is a UUID (format: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx)
if documentId.count('-') != 4:
return None
for intent in documentIntents:
intentId = intent.documentId
if len(intentId) != 36:
continue
# Count differing characters
differences = sum(c1 != c2 for c1, c2 in zip(documentId, intentId))
# If only 1-2 characters differ, it is probably a typo
if differences <= 2:
# Check whether the structure is similar (same hyphen positions)
if documentId.count('-') == intentId.count('-'):
return intent
return None
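# Hedged example of the similarity check above (UUIDs are made up):
#   documentId:        "9b1deb4d-3b7d-4451-a2c8-2eb8f0b131fa"
#   intent.documentId: "9b1deb4d-3b7d-4551-a2c8-2eb8f0b131fa"
# One differing character at identical hyphen positions -> treated as the same
# document, compensating for single-digit AI hallucinations like 4451 -> 4551.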


@@ -1,369 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Document Intent Analysis Module
Handles analysis of document intents, including:
- Clarifying which documents need extraction vs reference
- Resolving pre-extracted documents
- Building intent analysis prompts
"""
import json
import logging
from typing import Dict, Any, List, Optional
from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelExtraction import DocumentIntent, ContentExtracted
from modules.workflows.processing.shared.stateTools import checkWorkflowStopped
logger = logging.getLogger(__name__)
class DocumentIntentAnalyzer:
"""Handles document intent analysis and resolution."""
def __init__(self, services, aiService):
"""Initialize DocumentIntentAnalyzer with service center and AI service access."""
self.services = services
self.aiService = aiService
async def clarifyDocumentIntents(
self,
documents: List[ChatDocument],
userPrompt: str,
actionParameters: Dict[str, Any],
parentOperationId: str
) -> List[DocumentIntent]:
"""
Phase 5A: Analyzes which documents need extraction vs. reference.
Returns a DocumentIntent for each document.
Args:
documents: List of documents to process
userPrompt: User request
actionParameters: Action-specific parameters (e.g., resultType, outputFormat)
parentOperationId: Parent operation ID for the ChatLog hierarchy
Returns:
List of DocumentIntent objects
"""
# Create operation ID for intent analysis
intentOperationId = f"{parentOperationId}_intent_analysis"
# Start ChatLog with parent reference
self.services.chat.progressLogStart(
intentOperationId,
"Document Intent Analysis",
"Intent Analysis",
f"Analyzing {len(documents)} documents",
parentOperationId=parentOperationId
)
try:
# Map pre-extracted JSONs to original document IDs for intent analysis
documentMapping = {} # Maps original doc ID -> JSON doc ID
resolvedDocuments = []
for doc in documents:
preExtracted = self.resolvePreExtractedDocument(doc)
if preExtracted:
originalDocId = preExtracted["originalDocument"]["id"]
documentMapping[originalDocId] = doc.id
# Create a temporary ChatDocument for the original document
originalDoc = ChatDocument(
id=originalDocId,
fileName=preExtracted["originalDocument"]["fileName"],
mimeType=preExtracted["originalDocument"]["mimeType"],
fileSize=preExtracted["originalDocument"].get("fileSize", doc.fileSize),
fileId=doc.fileId, # Keep the fileId from the JSON
messageId=doc.messageId if hasattr(doc, 'messageId') else None # Keep messageId if present
)
resolvedDocuments.append(originalDoc)
else:
resolvedDocuments.append(doc)
# Build the intent analysis prompt with the original documents
intentPrompt = self._buildIntentAnalysisPrompt(userPrompt, resolvedDocuments, actionParameters)
# AI call (use callAiPlanning for simple JSON responses)
# Debug logs are already written by callAiPlanning
checkWorkflowStopped(self.services)
aiResponse = await self.aiService.callAiPlanning(
prompt=intentPrompt,
debugType="document_intent_analysis"
)
# Parse the result and map back to JSON document IDs if needed
intentsData = json.loads(self.services.utils.jsonExtractString(aiResponse))
documentIntents = []
for intent in intentsData.get("intents", []):
docId = intent.get("documentId")
# If the intent targets the original document, map back to the JSON document ID
if docId in documentMapping:
intent["documentId"] = documentMapping[docId]
documentIntents.append(DocumentIntent(**intent))
# Debug log (harmonized)
self.services.utils.writeDebugFile(
json.dumps([intent.dict() for intent in documentIntents], indent=2),
"document_intent_analysis_result"
)
# State 1 Validation: Validate and auto-fix document intents
documentIds = {d.id for d in documents}
validatedIntents = []
for intent in documentIntents:
# Validation 1.2: Skip intents for unknown documents
if intent.documentId not in documentIds:
# Try to find similar UUID (fix AI hallucination/typo)
correctedDocId = self._findSimilarDocumentId(intent.documentId, documentIds)
if correctedDocId:
logger.warning(f"Corrected UUID typo in AI response: {intent.documentId} -> {correctedDocId}")
intent.documentId = correctedDocId
else:
logger.warning(f"Skipping intent for unknown document: {intent.documentId}")
continue
validatedIntents.append(intent)
# Validation 1.1: Documents without intents are OK (not needed)
# Intents for non-existing documents are already filtered above
documentIntents = validatedIntents
# Finish the ChatLog
self.services.chat.progressLogFinish(intentOperationId, True)
return documentIntents
except Exception as e:
self.services.chat.progressLogFinish(intentOperationId, False)
logger.error(f"Error in clarifyDocumentIntents: {str(e)}")
raise
def resolvePreExtractedDocument(self, document: ChatDocument) -> Optional[Dict[str, Any]]:
"""
Checks whether a JSON document already contains extracted ContentParts.
Returns a dict with:
- originalDocument: ChatDocument info of the original document
- contentExtracted: ContentExtracted object with parts
- parts: List of the ContentParts
Returns None if no pre-extracted format is detected.
"""
if document.mimeType != "application/json":
logger.debug(f"Document {document.id} is not JSON (mimeType={document.mimeType}), skipping pre-extracted check")
return None
try:
docBytes = self.services.interfaceDbComponent.getFileData(document.fileId)
if not docBytes:
return None
docData = docBytes.decode('utf-8')
jsonData = json.loads(docData)
if not isinstance(jsonData, dict):
return None
# Check for ContentExtracted format
# Only format 1 (ActionDocument format with validationMetadata) is supported
documentData = None
validationMetadata = jsonData.get("validationMetadata", {})
actionType = validationMetadata.get("actionType")
logger.debug(f"JSON document {document.id}: validationMetadata.actionType={actionType}, keys={list(jsonData.keys())}")
if actionType == "context.extractContent":
# Format: {"validationMetadata": {"actionType": "context.extractContent"}, "documentData": {...}}
documentData = jsonData.get("documentData")
logger.debug(f"Found ContentExtracted via validationMetadata for {document.fileName}, documentData keys: {list(documentData.keys()) if documentData else None}")
else:
logger.debug(f"JSON document {document.id} does not have actionType='context.extractContent' (got: {actionType})")
if documentData:
try:
# Stelle sicher, dass "id" vorhanden ist
if "id" not in documentData:
documentData["id"] = document.id
contentExtracted = ContentExtracted(**documentData)
if contentExtracted.parts:
# Extract the original document info from the parts
originalDocId = None
originalFileName = None
originalMimeType = None
for part in contentExtracted.parts:
if part.metadata:
# Try to find the original document info
if not originalDocId and part.metadata.get("documentId"):
originalDocId = part.metadata.get("documentId")
if not originalFileName and part.metadata.get("originalFileName"):
originalFileName = part.metadata.get("originalFileName")
if not originalMimeType and part.metadata.get("documentMimeType"):
originalMimeType = part.metadata.get("documentMimeType")
# If not found, try to derive it from the document name
if not originalFileName:
# Try to derive it from documentName (e.g., "B2025-02c_28_extracted_...json" -> "B2025-02c_28.pdf")
if document.fileName and "_extracted_" in document.fileName:
originalFileName = document.fileName.split("_extracted_")[0] + ".pdf"
return {
"originalDocument": {
"id": originalDocId or document.id,
"fileName": originalFileName or document.fileName,
"mimeType": originalMimeType or "application/pdf",
"fileSize": document.fileSize
},
"contentExtracted": contentExtracted,
"parts": contentExtracted.parts
}
except Exception as parseError:
logger.warning(f"Could not parse ContentExtracted format from {document.fileName}: {str(parseError)}")
logger.debug(f"JSON keys: {list(jsonData.keys())}, has parts: {'parts' in jsonData}")
import traceback
logger.debug(f"Parse error traceback: {traceback.format_exc()}")
return None
else:
logger.debug(f"JSON document {document.id} has no documentData (actionType={actionType})")
return None
except Exception as e:
logger.debug(f"Error resolving pre-extracted document {document.fileName}: {str(e)}")
return None
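# Hedged sketch of the pre-extracted JSON shape this method accepts (field
# values are illustrative; only the keys are required by the checks above):
# {
#   "validationMetadata": {"actionType": "context.extractContent"},
#   "documentData": {
#     "id": "<uuid>",
#     "parts": [
#       {"id": "p1", "typeGroup": "text", "mimeType": "text/plain", "data": "...",
#        "metadata": {"documentId": "<uuid>", "originalFileName": "report.pdf"}}
#     ]
#   }
# }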
def _buildIntentAnalysisPrompt(
self,
userPrompt: str,
documents: List[ChatDocument],
actionParameters: Dict[str, Any]
) -> str:
"""Baue Prompt für Intent-Analyse."""
# Build the document list - show original documents for pre-extracted JSONs
docListText = ""
for i, doc in enumerate(documents, 1):
# Check whether it is a pre-extracted JSON
preExtracted = self.resolvePreExtractedDocument(doc)
if preExtracted:
# Show the original document instead of the JSON
originalDoc = preExtracted["originalDocument"]
partsInfo = f" (contains {len(preExtracted['parts'])} pre-extracted parts: {', '.join([p.typeGroup for p in preExtracted['parts'] if p.data and len(str(p.data)) > 0])})"
docListText += f"\n{i}. Document ID: {originalDoc['id']}\n"
docListText += f" File Name: {originalDoc['fileName']}{partsInfo}\n"
docListText += f" MIME Type: {originalDoc['mimeType']}\n"
docListText += f" File Size: {originalDoc.get('fileSize', doc.fileSize)} bytes\n"
else:
# Regular document
docListText += f"\n{i}. Document ID: {doc.id}\n"
docListText += f" File Name: {doc.fileName}\n"
docListText += f" MIME Type: {doc.mimeType}\n"
docListText += f" File Size: {doc.fileSize} bytes\n"
outputFormat = actionParameters.get("outputFormat", "txt")
# FENCE user input to prevent prompt injection
fencedUserPrompt = f"""```user_request
{userPrompt}
```"""
prompt = f"""USER REQUEST:
{fencedUserPrompt}
DOCUMENTS TO ANALYZE:
{docListText}
TASK: For each document, determine its intents (can be multiple):
- "extract": Content extraction needed (text, structure, OCR, etc.)
- "render": Image/binary should be rendered as-is (visual element)
- "reference": Document reference/attachment (no extraction, just reference)
Note: Output format and language are NOT determined here - they will be
determined during structure generation (Phase 3) in the chapter structure JSON
OUTPUT FORMAT: {outputFormat} (global fallback - for reference only)
RETURN JSON:
{{
"intents": [
{{
"documentId": "doc_1",
"intents": ["extract"],
"extractionPrompt": "Extract all text content, preserving structure",
"reasoning": "User needs text content for document generation"
}},
{{
"documentId": "doc_2",
"intents": ["extract", "render"],
"extractionPrompt": "Extract text content from image using vision AI",
"reasoning": "Image contains text that needs extraction, but also should be rendered visually"
}},
{{
"documentId": "doc_3",
"intents": ["reference"],
"extractionPrompt": null,
"reasoning": "Document is only used as reference, no extraction needed"
}}
]
}}
CRITICAL RULES:
1. For images (mimeType starts with "image/"):
- If user wants to "include" or "show" images add "render"
- If user wants to "analyze", "read text", or "extract text" from images add "extract"
- Can have BOTH "extract" and "render" if image needs both text extraction and visual rendering
2. For text documents:
- If user mentions "template" or "structure" "reference" or "extract" based on context
- If user mentions "reference" or "context" "reference"
- Default "extract"
3. Consider output format:
- For formats like PDF, DOCX, PPTX: images usually need "render"
- For formats like CSV, JSON: usually "extract" only
- For HTML: can have both "extract" and "render"
Return ONLY valid JSON following the structure above.
"""
return prompt
def _findSimilarDocumentId(self, incorrectId: str, validIds: set) -> Optional[str]:
"""
Tries to find a similar document ID in case the AI altered the UUID.
Checks for UUID typos (e.g., 4451 -> 4551).
Args:
incorrectId: The incorrect UUID from the AI response
validIds: Set of valid document IDs
Returns:
Corrected UUID if found, otherwise None
"""
if not incorrectId or len(incorrectId) != 36: # UUID Format: 8-4-4-4-12
return None
# Check whether it is a UUID (format: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx)
if incorrectId.count('-') != 4:
return None
# Try a Levenshtein-like search: check whether only 1-2 characters differ
for validId in validIds:
if len(validId) != 36:
continue
# Count differing characters
differences = sum(c1 != c2 for c1, c2 in zip(incorrectId, validId))
# If only 1-2 characters differ, it is probably a typo
if differences <= 2:
# Check whether the structure is similar (same hyphen positions)
if incorrectId.count('-') == validId.count('-'):
return validId
return None

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -1,293 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Generic Looping Use Case System
Provides parametrized looping infrastructure supporting different JSON formats and use cases.
"""
import logging
from dataclasses import dataclass, field
from typing import Dict, Any, List, Optional, Callable
logger = logging.getLogger(__name__)
# Callback functions for use-case-specific logic
def _handleSectionContentFinalResult(result: str, parsedJsonForUseCase: Any, extractedJsonForUseCase: str,
debugPrefix: str, services: Any) -> str:
"""Handle final result for section_content: return raw result to preserve all JSON blocks."""
finalJson = result # Return the raw response to preserve all JSON blocks
# Write final merged result for section_content (overwrites iteration 1 response with complete merged result)
if services and hasattr(services, 'utils') and hasattr(services.utils, 'writeDebugFile'):
services.utils.writeDebugFile(finalJson, f"{debugPrefix}_response")
return finalJson
def _handleChapterStructureFinalResult(result: str, parsedJsonForUseCase: Any, extractedJsonForUseCase: str,
debugPrefix: str, services: Any) -> str:
"""Handle final result for chapter_structure: format JSON and write debug file."""
import json
finalJson = json.dumps(parsedJsonForUseCase, indent=2, ensure_ascii=False) if parsedJsonForUseCase else (extractedJsonForUseCase or result)
# Write final result for chapter structure
if services and hasattr(services, 'utils') and hasattr(services.utils, 'writeDebugFile'):
services.utils.writeDebugFile(finalJson, f"{debugPrefix}_final_result")
return finalJson
def _handleCodeStructureFinalResult(result: str, parsedJsonForUseCase: Any, extractedJsonForUseCase: str,
debugPrefix: str, services: Any) -> str:
"""Handle final result for code_structure: format JSON and write debug file."""
import json
finalJson = json.dumps(parsedJsonForUseCase, indent=2, ensure_ascii=False) if parsedJsonForUseCase else (extractedJsonForUseCase or result)
# Write final result for code structure
if services and hasattr(services, 'utils') and hasattr(services.utils, 'writeDebugFile'):
services.utils.writeDebugFile(finalJson, f"{debugPrefix}_final_result")
return finalJson
def _handleCodeContentFinalResult(result: str, parsedJsonForUseCase: Any, extractedJsonForUseCase: str,
debugPrefix: str, services: Any) -> str:
"""Handle final result for code_content: format JSON."""
import json
finalJson = json.dumps(parsedJsonForUseCase, indent=2, ensure_ascii=False) if parsedJsonForUseCase else (extractedJsonForUseCase or result)
return finalJson
def _normalizeSectionContentJson(parsed: Any, useCaseId: str) -> Any:
"""Normalize JSON structure for section_content use case."""
# For section_content, expect {"elements": [...]} structure
if isinstance(parsed, list):
# Check if list contains strings (invalid format) or element objects
if parsed and isinstance(parsed[0], str):
# Invalid format - list of strings instead of elements
# Try to convert strings to paragraph elements as fallback
logger.debug(f"Received list of strings instead of elements array, converting to paragraph elements")
elements = []
for text in parsed:
if isinstance(text, str) and text.strip():
elements.append({
"type": "paragraph",
"content": {
"text": text.strip()
}
})
return {"elements": elements} if elements else {"elements": []}
else:
# Convert plain list of elements to elements structure
return {"elements": parsed}
elif isinstance(parsed, dict):
# If it already has "elements", return as-is
if "elements" in parsed:
return parsed
# If it has "type" and looks like an element, wrap in elements array
elif parsed.get("type"):
return {"elements": [parsed]}
# Otherwise, assume it's already in correct format
else:
return parsed
# For other use cases, return as-is (they have their own structures)
return parsed
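# Hedged examples of the normalization above (inputs are illustrative):
#   ["foo", "bar"]              -> {"elements": [{"type": "paragraph", "content": {"text": "foo"}}, ...]}
#   [{"type": "heading"}]       -> {"elements": [{"type": "heading"}]}
#   {"type": "paragraph"}       -> {"elements": [{"type": "paragraph"}]}
#   {"elements": [...]}         -> returned unchanged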
def _normalizeDefaultJson(parsed: Any, useCaseId: str) -> Any:
"""Default normalizer: return as-is."""
return parsed
@dataclass
class LoopingUseCase:
"""Configuration for a specific looping use case."""
# Identification
useCaseId: str # "section_content", "chapter_structure", "code_structure", "code_content"
# JSON Format Detection
jsonTemplate: Dict[str, Any] # Expected JSON structure template
detectionKeys: List[str] # Keys to check for format detection (e.g., ["elements"], ["chapters"], ["files"])
detectionPath: str # JSONPath to check (e.g., "documents[0].chapters", "files[0].content")
# Prompt Building
initialPromptBuilder: Optional[Callable] = None # Function to build initial prompt
continuationPromptBuilder: Optional[Callable] = None # Function to build continuation prompt
# Accumulation & Merging
accumulator: Optional[Callable] = None # Function to accumulate fragments
merger: Optional[Callable] = None # Function to merge accumulated data
# Continuation Context
continuationContextBuilder: Optional[Callable] = None # Build continuation context for this format
# Result Building
resultBuilder: Optional[Callable] = None # Build final result from accumulated data
# Use-case-specific handlers (callbacks to avoid if/elif chains in generic code)
finalResultHandler: Optional[Callable] = None # Handle final result formatting and debug file writing
jsonNormalizer: Optional[Callable] = None # Normalize JSON structure for this use case
# Metadata
supportsAccumulation: bool = True # Whether this use case supports accumulation
requiresExtraction: bool = False # Whether this requires extraction (like sections)
class LoopingUseCaseRegistry:
"""Registry of all looping use cases."""
def __init__(self):
self.useCases: Dict[str, LoopingUseCase] = {}
self._registerDefaultUseCases()
def register(self, useCase: LoopingUseCase):
"""Register a new use case."""
self.useCases[useCase.useCaseId] = useCase
logger.debug(f"Registered looping use case: {useCase.useCaseId}")
def get(self, useCaseId: str) -> Optional[LoopingUseCase]:
"""Get use case by ID."""
return self.useCases.get(useCaseId)
def detectUseCase(self, parsedJson: Dict[str, Any]) -> Optional[str]:
"""Detect which use case matches the JSON structure."""
for useCaseId, useCase in self.useCases.items():
if self._matchesFormat(parsedJson, useCase):
return useCaseId
return None
def _matchesFormat(self, json: Dict[str, Any], useCase: LoopingUseCase) -> bool:
"""Check if JSON matches use case format."""
# Check top-level keys
for key in useCase.detectionKeys:
if key in json:
return True
# Check nested path using simple dictionary traversal (no jsonpath_ng needed)
if useCase.detectionPath:
try:
# Simple path matching without jsonpath_ng
# Format: "documents[0].chapters" or "files[0].content"
pathParts = useCase.detectionPath.split(".")
current = json
for part in pathParts:
# Handle array indices like "documents[0]"
if "[" in part and "]" in part:
key = part.split("[")[0]
index = int(part.split("[")[1].split("]")[0])
if isinstance(current, dict) and key in current:
if isinstance(current[key], list) and 0 <= index < len(current[key]):
current = current[key][index]
else:
return False
else:
return False
else:
# Regular key access
if isinstance(current, dict) and part in current:
current = current[part]
else:
return False
# If we successfully traversed the path, it matches
return True
except Exception as e:
logger.debug(f"Path matching failed for {useCase.useCaseId}: {e}")
return False
def _registerDefaultUseCases(self):
"""Register default use cases."""
# Use Case 1: Section Content Generation
# Returns JSON with "elements" array directly
self.register(LoopingUseCase(
useCaseId="section_content",
jsonTemplate={"elements": []},
detectionKeys=["elements"],
detectionPath="",
initialPromptBuilder=None, # Will use default prompt builder
continuationPromptBuilder=None, # Will use default continuation builder
accumulator=None, # Direct return, no accumulation
merger=None,
continuationContextBuilder=None, # Will use default continuation context
resultBuilder=None, # Return JSON directly
finalResultHandler=_handleSectionContentFinalResult,
jsonNormalizer=_normalizeSectionContentJson,
supportsAccumulation=False,
requiresExtraction=False
))
# Use Case 2: Chapter Structure Generation
# Returns JSON with "documents[0].chapters" structure
self.register(LoopingUseCase(
useCaseId="chapter_structure",
jsonTemplate={"documents": [{"chapters": []}]},
detectionKeys=["chapters"],
detectionPath="documents[0].chapters",
initialPromptBuilder=None,
continuationPromptBuilder=None,
accumulator=None, # Direct return, no accumulation
merger=None,
continuationContextBuilder=None,
resultBuilder=None, # Return JSON directly
finalResultHandler=_handleChapterStructureFinalResult,
jsonNormalizer=_normalizeDefaultJson,
supportsAccumulation=False,
requiresExtraction=False
))
# Use Case 3: Code Structure Generation
self.register(LoopingUseCase(
useCaseId="code_structure",
jsonTemplate={
"metadata": {
"language": "",
"projectType": "single_file|multi_file",
"projectName": ""
},
"files": [
{
"id": "",
"filename": "",
"fileType": "",
"dependencies": [],
"imports": [],
"functions": [],
"classes": []
}
]
},
detectionKeys=["files"],
detectionPath="files",
initialPromptBuilder=None,
continuationPromptBuilder=None,
accumulator=None, # Direct return
merger=None,
continuationContextBuilder=None,
resultBuilder=None,
finalResultHandler=_handleCodeStructureFinalResult,
jsonNormalizer=_normalizeDefaultJson,
supportsAccumulation=False,
requiresExtraction=False
))
# Use Case 4: Code Content Generation (NEW)
self.register(LoopingUseCase(
useCaseId="code_content",
jsonTemplate={"files": [{"content": "", "functions": []}]},
detectionKeys=["content", "functions"],
detectionPath="files[0].content",
initialPromptBuilder=None,
continuationPromptBuilder=None,
accumulator=None, # Will use default accumulator
merger=None, # Will use default merger
continuationContextBuilder=None,
resultBuilder=None, # Will use default result builder
finalResultHandler=_handleCodeContentFinalResult,
jsonNormalizer=_normalizeDefaultJson,
supportsAccumulation=True,
requiresExtraction=False
))
logger.info(f"Registered {len(self.useCases)} default looping use cases")


@@ -1,275 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Response Parsing Module
Handles parsing of AI responses, including:
- Section extraction from responses
- JSON completeness detection
- Loop detection
- Document metadata extraction
- Final result building
"""
import json
import logging
from typing import Dict, Any, List, Optional, Tuple
from modules.shared.jsonUtils import extractJsonString, repairBrokenJson, extractSectionsFromDocument
from .subJsonResponseHandling import JsonResponseHandler
from modules.datamodels.datamodelAi import JsonAccumulationState
logger = logging.getLogger(__name__)
class ResponseParser:
"""Handles parsing of AI responses and completion detection."""
def __init__(self, services):
"""Initialize ResponseParser with service center access."""
self.services = services
def extractSectionsFromResponse(
self,
result: str,
iteration: int,
debugPrefix: str,
allSections: List[Dict[str, Any]] = None,
accumulationState: Optional[JsonAccumulationState] = None
) -> Tuple[List[Dict[str, Any]], bool, Optional[Dict[str, Any]], Optional[JsonAccumulationState]]:
"""
Extract sections from AI response, handling both valid and broken JSON.
NEW BEHAVIOR:
- First iteration: Check if complete, if not start accumulation
- Subsequent iterations: Accumulate strings, parse when complete
Returns:
Tuple of:
- sections: Extracted sections
- wasJsonComplete: True if JSON is complete
- parsedResult: Parsed JSON object
- updatedAccumulationState: Updated accumulation state (None if not in accumulation mode)
"""
if allSections is None:
allSections = []
if iteration == 1:
# First iteration - check if complete
parsed = None
try:
extracted = extractJsonString(result)
parsed = json.loads(extracted)
# Check completeness
if JsonResponseHandler.isJsonComplete(parsed):
# Complete JSON - no accumulation needed
sections = extractSectionsFromDocument(parsed)
logger.info(f"Iteration 1: Complete JSON detected, no accumulation needed")
return sections, True, parsed, None # No accumulation
except Exception:
pass
# Incomplete - try to extract partial sections from broken JSON
logger.info(f"Iteration 1: Incomplete JSON detected, attempting to extract partial sections")
partialSections = []
if parsed:
# Try to extract sections from parsed (even if incomplete)
partialSections = extractSectionsFromDocument(parsed)
else:
# Try to repair broken JSON and extract sections
try:
repaired = repairBrokenJson(result)
if repaired:
partialSections = extractSectionsFromDocument(repaired)
parsed = repaired # Use repaired version for accumulation state
except Exception:
pass # If repair fails, continue with empty sections
# Define KPIs (async call - need to handle this)
# For now, create accumulation state without KPIs, will be updated after async call
accumulationState = JsonAccumulationState(
accumulatedJsonString=result,
isAccumulationMode=True,
lastParsedResult=parsed,
allSections=partialSections,
kpis=[]
)
# Note: KPI definition will be done in the caller (async context)
return partialSections, False, parsed, accumulationState
else:
# Subsequent iterations - accumulate
if accumulationState and accumulationState.isAccumulationMode:
accumulated, sections, isComplete, parsedResult = \
JsonResponseHandler.accumulateAndParseJsonFragments(
accumulationState.accumulatedJsonString,
result,
allSections,
iteration
)
# Update accumulation state
accumulationState.accumulatedJsonString = accumulated
accumulationState.lastParsedResult = parsedResult
accumulationState.allSections = allSections + sections if sections else allSections
accumulationState.isAccumulationMode = not isComplete
# Log accumulated JSON for debugging
if parsedResult:
accumulatedJsonStr = json.dumps(parsedResult, indent=2, ensure_ascii=False)
self.services.utils.writeDebugFile(accumulatedJsonStr, f"{debugPrefix}_accumulated_json_iteration_{iteration}.json")
return sections, isComplete, parsedResult, accumulationState
else:
# No accumulation mode - process normally (shouldn't happen)
logger.warning(f"Iteration {iteration}: No accumulation state but iteration > 1")
return [], False, None, None
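The accumulate-then-parse idea above boils down to concatenating fragments until json.loads succeeds. A self-contained sketch (without the repair and section-extraction steps of JsonResponseHandler):

```python
import json

# Self-contained illustration: accumulate fragments until the JSON parses.
fragments = ['{"documents": [{"sections": [', '{"id": "s1"}', ']}]}']

accumulated = ""
parsed = None
for iteration, fragment in enumerate(fragments, start=1):
    accumulated += fragment
    try:
        parsed = json.loads(accumulated)
        break  # complete - stop accumulating
    except json.JSONDecodeError:
        pass   # still incomplete - keep accumulating

assert parsed == {"documents": [{"sections": [{"id": "s1"}]}]}
```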
def shouldContinueGeneration(
self,
allSections: List[Dict[str, Any]],
iteration: int,
wasJsonComplete: bool,
rawResponse: str = None
) -> bool:
"""
Determine if AI generation loop should continue.
CRITICAL: This is ONLY about AI Loop Completion, NOT Action DoD!
Action DoD is checked AFTER the AI Loop completes in _refineDecide.
Simple logic:
- If JSON parsing failed or incomplete → continue (needs more content)
- If JSON parses successfully and is complete → stop (all content delivered)
- Loop detection prevents infinite loops
CRITICAL: JSON completeness is determined by parsing, NOT by last character check!
Returns True if we should continue, False if AI Loop is done.
"""
if len(allSections) == 0:
return True # No sections yet, continue
# CRITERION 1: If JSON was incomplete/broken (parsing failed or incomplete) - continue to repair/complete
if not wasJsonComplete:
logger.info(f"Iteration {iteration}: JSON incomplete/broken - continuing to complete")
return True
# CRITERION 2: JSON is complete (parsed successfully) - check for loop detection
if self._isStuckInLoop(allSections, iteration):
logger.warning(f"Iteration {iteration}: Detected potential infinite loop - stopping AI loop")
return False
# JSON is complete and not stuck in loop - done
logger.info(f"Iteration {iteration}: JSON complete - AI loop done")
return False
def _isStuckInLoop(
self,
allSections: List[Dict[str, Any]],
iteration: int
) -> bool:
"""
Detect if we're stuck in a loop (same content being repeated).
Generic approach: Check if recent iterations are adding minimal or duplicate content.
"""
if iteration < 3:
return False # Need at least 3 iterations to detect a loop
if len(allSections) == 0:
return False
# Check if last section is very small (might be stuck)
lastSection = allSections[-1]
elements = lastSection.get("elements", [])
if isinstance(elements, list) and elements:
lastElem = elements[-1] if elements else {}
else:
lastElem = elements if isinstance(elements, dict) else {}
# Check content size of last section
lastSectionSize = 0
if isinstance(lastElem, dict):
for key, value in lastElem.items():
if isinstance(value, str):
lastSectionSize += len(value)
elif isinstance(value, list):
lastSectionSize += len(str(value))
# If last section is very small and we've done many iterations, might be stuck
if lastSectionSize < 100 and iteration > 10:
logger.warning(f"Potential loop detected: iteration {iteration}, last section size {lastSectionSize}")
return True
return False
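The heuristic reduces to: after enough iterations, a tiny trailing section suggests the model is looping. A standalone sketch of that size check, with the thresholds (100 bytes, 10 iterations) mirroring the values above:

```python
# Standalone sketch of the size-based loop heuristic above.
def looksStuck(allSections, iteration, minSize=100, minIterations=10):
    if iteration <= minIterations or not allSections:
        return False
    elements = allSections[-1].get("elements") or [{}]
    lastElem = elements[-1] if isinstance(elements, list) else elements
    size = sum(len(v) for v in lastElem.values() if isinstance(v, str))
    return size < minSize

sections = [{"elements": [{"text": "ok"}]}]
assert looksStuck(sections, iteration=11)      # tiny section, many iterations
assert not looksStuck(sections, iteration=3)   # too early to judge
```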
def extractDocumentMetadata(
self,
parsedResult: Dict[str, Any]
) -> Optional[Dict[str, Any]]:
"""
Extract document metadata (title, filename) from parsed AI response.
Returns dict with 'title' and 'filename' keys if found, None otherwise.
"""
if not isinstance(parsedResult, dict):
return None
# Try to get from documents array (preferred structure)
if "documents" in parsedResult and isinstance(parsedResult["documents"], list) and len(parsedResult["documents"]) > 0:
firstDoc = parsedResult["documents"][0]
if isinstance(firstDoc, dict):
title = firstDoc.get("title")
filename = firstDoc.get("filename")
if title or filename:
return {
"title": title,
"filename": filename
}
return None
def buildFinalResultFromSections(
self,
allSections: List[Dict[str, Any]],
documentMetadata: Optional[Dict[str, Any]] = None
) -> str:
"""
Build final JSON result from accumulated sections.
Uses AI-provided metadata (title, filename) if available.
"""
if not allSections:
return ""
# Extract metadata from AI response if available
title = "Generated Document"
filename = "document.json"
if documentMetadata:
if documentMetadata.get("title"):
title = documentMetadata["title"]
if documentMetadata.get("filename"):
filename = documentMetadata["filename"]
# Build documents structure
# Assuming single document for now
documents = [{
"id": "doc_1",
"title": title,
"filename": filename,
"sections": allSections
}]
result = {
"metadata": {
"split_strategy": "single_document",
"source_documents": [],
"extraction_method": "ai_generation"
},
"documents": documents
}
return json.dumps(result, indent=2)
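For reference, the envelope this method produces looks like the following; the sample sections and metadata values are illustrative only.

```python
import json

# Illustrative input - sections and metadata values are made up.
sections = [{"id": "sec_1", "elements": [{"type": "text", "content": "Hello"}]}]
documentMetadata = {"title": "Quarterly Report", "filename": "report.json"}

result = {
    "metadata": {
        "split_strategy": "single_document",
        "source_documents": [],
        "extraction_method": "ai_generation",
    },
    "documents": [{
        "id": "doc_1",
        "title": documentMetadata.get("title") or "Generated Document",
        "filename": documentMetadata.get("filename") or "document.json",
        "sections": sections,
    }],
}
print(json.dumps(result, indent=2))
```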

File diff suppressed because it is too large


@@ -1,508 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Structure Generation Module
Handles document structure generation, including:
- Generating document structure with sections
- Building structure prompts
"""
import json
import logging
from typing import Dict, Any, List, Optional
from modules.datamodels.datamodelExtraction import ContentPart
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum, PriorityEnum, ProcessingModeEnum
from modules.workflows.processing.shared.stateTools import checkWorkflowStopped
logger = logging.getLogger(__name__)
class StructureGenerator:
"""Handles document structure generation."""
def __init__(self, services, aiService):
"""Initialize StructureGenerator with service center and AI service access."""
self.services = services
self.aiService = aiService
def _getUserLanguage(self) -> str:
"""Get user language for document generation"""
try:
if self.services:
# Prefer detected language if available (from user intention analysis)
if hasattr(self.services, 'currentUserLanguage') and self.services.currentUserLanguage:
return self.services.currentUserLanguage
# Fallback to user's preferred language
elif hasattr(self.services, 'user') and self.services.user and hasattr(self.services.user, 'language'):
return self.services.user.language
except Exception:
pass
return 'en' # Default fallback
async def generateStructure(
self,
userPrompt: str,
contentParts: List[ContentPart],
outputFormat: Optional[str] = None,
parentOperationId: str = None
) -> Dict[str, Any]:
"""
Phase 5C: Generiert Chapter-Struktur (Table of Contents).
Definiert für jedes Chapter:
- Level, Title
- contentParts (unified object with instruction and/or caption per part)
- generationHint
Generate document structure with per-document format determination.
Multiple documents can be produced with different formats (e.g., one PDF, one HTML).
AI determines formats per-document from user prompt. The outputFormat parameter is
only a validation fallback - used if AI doesn't return format per document.
Args:
userPrompt: User-Anfrage
contentParts: Alle vorbereiteten ContentParts mit Metadaten
outputFormat: Optional global format fallback. If omitted, formats are determined
from user prompt by AI. Used as validation fallback if AI doesn't
return format per document. Defaults to "txt" if not provided.
parentOperationId: Parent Operation-ID für ChatLog-Hierarchie
Returns:
Struktur-Dict mit documents und chapters (nicht sections!)
"""
# If outputFormat not provided, use "txt" as fallback for validation
# AI will determine formats per document from user prompt
if not outputFormat:
outputFormat = "txt"
logger.debug("outputFormat not provided - using 'txt' as validation fallback, formats determined from prompt")
# Create operation ID for structure generation
structureOperationId = f"{parentOperationId}_structure_generation"
# Start ChatLog with parent reference
formatDisplay = outputFormat if outputFormat else "auto-determined"
self.services.chat.progressLogStart(
structureOperationId,
"Chapter Structure Generation",
"Structure",
f"Generating chapter structure (format: {formatDisplay})",
parentOperationId=parentOperationId
)
try:
# Build the chapter structure prompt with content index (returns prompt and template structure)
structurePrompt, templateStructure = self._buildChapterStructurePrompt(
userPrompt=userPrompt,
contentParts=contentParts,
outputFormat=outputFormat
)
# AI call for chapter structure generation with looping support
# Use callAiWithLooping instead of callAiPlanning to support continuation if the response is cut off
options = AiCallOptions(
operationType=OperationTypeEnum.DATA_GENERATE,
priority=PriorityEnum.QUALITY,
processingMode=ProcessingModeEnum.DETAILED,
compressPrompt=False,
compressContext=False,
resultFormat="json"
)
# Create prompt builder for continuation support
async def buildChapterStructurePromptWithContinuation(
continuationContext: Any,
templateStructure: str,
basePrompt: str
) -> str:
"""Build chapter structure prompt with continuation context. Uses unified signature.
Note: All initial context (userPrompt, contentParts, outputFormat, etc.) is already
contained in basePrompt. This function only adds continuation-specific instructions.
"""
# Extract continuation context fields (only what's needed for continuation)
incompletePart = continuationContext.incomplete_part
lastRawJson = continuationContext.last_raw_json
# Generate both overlap context and hierarchy context using jsonContinuation
overlapContext = ""
unifiedContext = ""
if lastRawJson:
# Get contexts directly from jsonContinuation
from modules.shared.jsonContinuation import getContexts
contexts = getContexts(lastRawJson)
overlapContext = contexts.overlapContext
unifiedContext = contexts.hierarchyContextForPrompt
elif incompletePart:
unifiedContext = incompletePart
else:
unifiedContext = "Unable to extract context - response was completely broken"
# Build unified continuation prompt format
continuationPrompt = f"""{basePrompt}
--- CONTINUATION REQUEST ---
The previous JSON response was incomplete. Continue from where it stopped.
Context showing structure hierarchy with cut point:
```
{unifiedContext}
```
Overlap Requirement:
To ensure proper merging, your response MUST start EXACTLY with the overlap context shown below, then continue with new content.
Overlap context (start your response with this exact text):
```json
{overlapContext if overlapContext else "No overlap context available"}
```
TASK:
1. Start your response EXACTLY with the overlap context shown above (character by character)
2. Continue seamlessly from where the overlap context ends
3. Complete the remaining content following the JSON structure template above
4. Return ONLY valid JSON following the structure template - no overlap/continuation wrapper objects
CRITICAL:
- Your response MUST begin with the exact overlap context text (this enables automatic merging)
- Continue seamlessly after the overlap context with new content
- Your response must be valid JSON matching the structure template above"""
return continuationPrompt
# Call AI with looping support
# NOTE: Do NOT pass contentParts here - we only need metadata for structure generation
# The contentParts metadata is already included in the prompt (contentPartsIndex)
# Actual content extraction happens later during section generation
checkWorkflowStopped(self.services)
aiResponseJson = await self.aiService.callAiWithLooping(
prompt=structurePrompt,
options=options,
debugPrefix="chapter_structure_generation",
promptBuilder=buildChapterStructurePromptWithContinuation,
promptArgs={
"userPrompt": userPrompt,
"outputFormat": outputFormat,
"templateStructure": templateStructure,
"basePrompt": structurePrompt
},
useCaseId="chapter_structure", # REQUIRED: Explicit use case ID
operationId=structureOperationId,
userPrompt=userPrompt,
contentParts=None # Do not pass ContentParts - only metadata needed, not content extraction
)
# Parse the complete JSON response (looping system already handles completion)
extractedJson = self.services.utils.jsonExtractString(aiResponseJson)
parsedJson, parseError, cleanedJson = self.services.utils.jsonTryParse(extractedJson)
if parseError is not None:
# Even with looping, try repair as fallback
logger.warning(f"JSON parsing failed after looping: {str(parseError)}. Attempting repair...")
from modules.shared import jsonUtils
repairedJson = jsonUtils.repairBrokenJson(extractedJson)
if repairedJson:
parsedJson, parseError, _ = self.services.utils.jsonTryParse(json.dumps(repairedJson))
if parseError is None:
logger.info("Successfully repaired and parsed JSON structure after looping")
structure = parsedJson
else:
logger.error(f"Failed to parse repaired JSON: {str(parseError)}")
raise ValueError(f"Failed to parse JSON structure after repair: {str(parseError)}")
else:
logger.error(f"Failed to repair JSON. Parse error: {str(parseError)}")
logger.error(f"Cleaned JSON preview (first 500 chars): {cleanedJson[:500]}")
raise ValueError(f"Failed to parse JSON structure: {str(parseError)}")
else:
structure = parsedJson
# State 3 Validation: Validate and auto-fix structure
# Validation 3.1: Structure missing 'documents' field
if "documents" not in structure:
raise ValueError("Structure missing 'documents' field - cannot auto-fix")
documents = structure["documents"]
# Validation 3.2: Structure has no documents
if not isinstance(documents, list) or len(documents) == 0:
raise ValueError("Structure has no documents - cannot generate without documents")
# Import renderer registry for format validation (existing infrastructure)
from modules.services.serviceGeneration.renderers.registry import getRenderer
# Validate and fix each document
for doc in documents:
# Validation 3.3 & 3.4: Document outputFormat
# outputFormat parameter is optional - if omitted, formats determined from prompt by AI
# Use as fallback only if AI doesn't return format per document
# Multiple documents can have different formats (e.g., one PDF, one HTML)
globalFormatFallback = outputFormat or "txt" # Fallback for validation
if "outputFormat" not in doc or not doc["outputFormat"]:
# AI didn't return format or returned empty - use global fallback
doc["outputFormat"] = globalFormatFallback
logger.warning(f"Document {doc.get('id')} missing outputFormat - using fallback: {doc['outputFormat']}")
else:
# AI returned format - validate using existing renderer registry
formatName = str(doc["outputFormat"]).lower().strip()
renderer = getRenderer(formatName) # Uses existing infrastructure
if not renderer:
# Format doesn't match any renderer - use txt (simple approach)
logger.warning(f"Document {doc.get('id')} has format without renderer: {formatName}, using 'txt'")
doc["outputFormat"] = "txt"
else:
# Valid format with renderer - normalize and keep AI result
doc["outputFormat"] = formatName
logger.debug(f"Document {doc.get('id')} using AI-determined format: {formatName}")
# Validation 3.5 & 3.6: Document language
# Use validated currentUserLanguage (always valid, validated during user intention analysis)
# Access via _getUserLanguage() which uses self.services.currentUserLanguage
userPromptLanguage = self._getUserLanguage() # Uses validated currentUserLanguage infrastructure
if "language" not in doc or not isinstance(doc["language"], str) or len(doc["language"]) != 2:
# AI didn't return language or invalid format - use validated currentUserLanguage
doc["language"] = userPromptLanguage
if "language" not in doc:
logger.warning(f"Document {doc.get('id')} missing language - using currentUserLanguage: {userPromptLanguage}")
else:
logger.warning(f"Document {doc.get('id')} has invalid language format from AI: {doc['language']}, using currentUserLanguage")
else:
# AI returned valid language format - normalize
doc["language"] = doc["language"].lower().strip()[:2]
logger.debug(f"Document {doc.get('id')} using AI-determined language: {doc['language']}")
# Validation 3.7: Document missing 'chapters' field
if "chapters" not in doc:
raise ValueError(f"Document {doc.get('id')} missing 'chapters' field - cannot auto-fix")
# Validation 3.8: Chapter missing 'contentParts' field
for chapter in doc["chapters"]:
if "contentParts" not in chapter:
raise ValueError(f"Chapter {chapter.get('id')} missing 'contentParts' field - cannot auto-fix")
# Finish the ChatLog
self.services.chat.progressLogFinish(structureOperationId, True)
return structure
except Exception as e:
self.services.chat.progressLogFinish(structureOperationId, False)
logger.error(f"Error in generateStructure: {str(e)}")
raise
def _buildChapterStructurePrompt(
self,
userPrompt: str,
contentParts: List[ContentPart],
outputFormat: str
) -> tuple[str, str]:
"""Baue Prompt für Chapter-Struktur-Generierung."""
# Baue ContentParts-Index - filtere leere Parts heraus
contentPartsIndex = ""
validParts = []
filteredParts = []
for part in contentParts:
contentFormat = part.metadata.get("contentFormat", "unknown")
# IMPORTANT: Reference parts intentionally have empty data - always include them
if contentFormat == "reference":
validParts.append(part)
logger.debug(f"Including reference ContentPart {part.id} (intentionally empty data)")
continue
# Skip empty parts (no data, or containers without content)
# BUT: Reference parts were already handled above
if not part.data or (isinstance(part.data, str) and len(part.data.strip()) == 0):
# Skip container parts without data
if part.typeGroup == "container" and not part.data:
filteredParts.append((part.id, "container without data"))
continue
# Skip other empty parts (but not reference parts, which were already handled)
if not part.data:
filteredParts.append((part.id, f"no data (format: {contentFormat})"))
continue
validParts.append(part)
logger.debug(f"Including ContentPart {part.id}: format={contentFormat}, type={part.typeGroup}, dataLength={len(str(part.data)) if part.data else 0}")
if filteredParts:
logger.debug(f"Filtered out {len(filteredParts)} empty ContentParts: {filteredParts}")
logger.info(f"Building structure prompt with {len(validParts)} valid ContentParts (from {len(contentParts)} total)")
# Build the index only for valid parts
for i, part in enumerate(validParts, 1):
contentFormat = part.metadata.get("contentFormat", "unknown")
originalFileName = part.metadata.get('originalFileName', 'N/A')
contentPartsIndex += f"\n{i}. ContentPart ID: {part.id}\n"
contentPartsIndex += f" Format: {contentFormat}\n"
contentPartsIndex += f" Type: {part.typeGroup}\n"
contentPartsIndex += f" MIME Type: {part.mimeType or 'N/A'}\n"
contentPartsIndex += f" Source: {part.metadata.get('documentId', 'unknown')}\n"
contentPartsIndex += f" Original file name: {originalFileName}\n"
contentPartsIndex += f" Usage hint: {part.metadata.get('usageHint', 'N/A')}\n"
if not contentPartsIndex:
contentPartsIndex = "\n(No content parts available)"
# Get language from services (user intention analysis)
language = self._getUserLanguage()
logger.debug(f"Using language from services (user intention analysis) for structure generation: {language}")
# Create template structure explicitly (not extracted from prompt)
# This ensures exact identity between initial and continuation prompts
templateStructure = f"""{{
"metadata": {{
"title": "Document Title",
"language": "{language}"
}},
"documents": [{{
"id": "doc_1",
"title": "Document Title",
"filename": "document.{outputFormat}",
"outputFormat": "{outputFormat}",
"language": "{language}",
"chapters": [
{{
"id": "chapter_1",
"level": 1,
"title": "Chapter Title",
"contentParts": {{
"extracted_part_id": {{
"instruction": "Use extracted content with ALL relevant details from user request"
}}
}},
"generationHint": "Detailed description including ALL relevant details from user request for this chapter",
"sections": []
}}
]
}}]
}}"""
prompt = f"""# TASK: Plan Document Structure (Documents + Chapters)
This is a STRUCTURE PLANNING task. You define which documents to create and which chapters each document will have.
Chapter CONTENT will be generated in a later step - here you only plan the STRUCTURE and assign content references.
Return EXACTLY ONE complete JSON object. Do not generate multiple JSON objects, alternatives, or variations. Do not use separators like "---" between JSON objects.
## USER REQUEST (for context)
```
{userPrompt}
```
## AVAILABLE CONTENT PARTS
{contentPartsIndex}
## CONTENT ASSIGNMENT RULE
CRITICAL: Every chapter MUST have contentParts assigned if it relates to documents/images/data from the user request.
If the user request mentions documents/images/data, then EVERY chapter that generates content related to those references MUST assign the relevant ContentParts explicitly.
Assignment logic:
- If chapter DISPLAYS a document/image assign "object" format ContentPart with "caption"
- If chapter generates text content ABOUT a document/image/data assign ContentPart with "instruction":
- Prefer "extracted" format if available (contains analyzed/extracted content)
- If only "object" format is available, use "object" format with "instruction" (to write ABOUT the image/document)
- If chapter's generationHint or purpose relates to a document/image/data mentioned in user request → it MUST have ContentParts assigned
- Multiple chapters might assign the same ContentPart (e.g., one chapter displays image, another writes about it)
- Use ContentPart IDs exactly as listed in AVAILABLE CONTENT PARTS above
- Empty contentParts are only allowed if chapter generates content WITHOUT referencing any documents/images/data from the user request
CRITICAL RULE: If the user request mentions BOTH:
a) Documents/images/data (listed in AVAILABLE CONTENT PARTS above), AND
b) Generic content types (article text, main content, body text, etc.)
Then chapters that generate those generic content types MUST assign the relevant ContentParts, because the content should relate to or be based on the provided documents/images/data.
## CONTENT EFFICIENCY PRINCIPLES
- Generate COMPACT content: Focus on essential information only
- AVOID verbose, lengthy, or repetitive text - be concise and direct
- Prioritize FACTS over filler text - no introductions like "In this chapter..."
- Minimize system resources: shorter content = faster processing
- Quality over quantity: precise, meaningful content rather than padding
## CHAPTER STRUCTURE REQUIREMENTS
- Generate chapters based on USER REQUEST - analyze what structure the user wants
- Create ONLY the minimum chapters needed to cover the user's request - avoid over-structuring
- IMPORTANT: Each chapter MUST have ALL these fields:
- id: Unique identifier (e.g., "chapter_1")
- level: Heading level (1, 2, 3, etc.)
- title: Chapter title
- contentParts: Object mapping ContentPart IDs to usage instructions (MUST assign if chapter relates to documents/data from user request)
- generationHint: Description of what content to generate (including formatting/styling requirements)
- sections: Empty array [] (REQUIRED - sections are generated in next phase)
- contentParts: {{"partId": {{"instruction": "..."}} or {{"caption": "..."}} or both}} - Assign ContentParts as required by CONTENT ASSIGNMENT RULE above
- The "instruction" field for each ContentPart MUST contain ALL relevant details from the USER REQUEST that apply to content extraction for this specific chapter. Include all formatting rules, data requirements, constraints, and specifications mentioned in the user request that are relevant for processing this ContentPart in this chapter.
- generationHint: Keep CONCISE but include relevant details from the USER REQUEST. Focus on WHAT to generate, not HOW to phrase it verbosely.
- The number of chapters depends on the user request - create only what is requested. Do NOT create chapters for topics without available data.
CRITICAL: Only create chapters for CONTENT sections, not for formatting/styling requirements. Formatting/styling requirements are to be included in each generationHint if needed.
## DOCUMENT STRUCTURE
For each document, determine:
- outputFormat: From USER REQUEST (explicit mention or infer from purpose/content type). Default: "{outputFormat}". Multiple documents can have different formats.
- language: From USER REQUEST (map to ISO 639-1: de, en, fr, it...). Default: "{language}". Multiple documents can have different languages.
- chapters: Structure appropriately for the format (e.g., pptx=slides, docx=sections, xlsx=worksheets). Match format capabilities and constraints.
Required JSON fields:
- metadata: {{"title": "...", "language": "..."}}
- documents: Array with id, title, filename, outputFormat, language, chapters[]
- chapters: Array with id, level, title, contentParts, generationHint, sections[]
EXAMPLE STRUCTURE (for reference only - adapt to user request):
{{
"metadata": {{
"title": "Document Title",
"language": "{language}"
}},
"documents": [{{
"id": "doc_1",
"title": "Document Title",
"filename": "document.{outputFormat}",
"outputFormat": "{outputFormat}",
"language": "{language}",
"chapters": [
{{
"id": "chapter_1",
"level": 1,
"title": "Chapter Title",
"contentParts": {{
"extracted_part_id": {{
"instruction": "Use extracted content with ALL relevant details from user request"
}}
}},
"generationHint": "Detailed description including ALL relevant details from user request for this chapter",
"sections": []
}}
]
}}]
}}
CRITICAL INSTRUCTIONS:
- Generate chapters based on USER REQUEST, NOT based on the example above
- The example shows the JSON structure format, NOT the required chapters
- Create only the chapters that match the user's request
- Adapt chapter titles and structure to match the user's specific request
- Determine outputFormat and language for each document by analyzing the USER REQUEST above
- The example shows placeholders "{outputFormat}" and "{language}" - YOU MUST REPLACE THESE with actual values determined from the USER REQUEST
MANDATORY CONTENT ASSIGNMENT CHECK:
For each chapter, verify:
1. Does the user request mention documents/images/data? (e.g., "photo", "image", "document", "data", "based on", "about")
2. Does this chapter's generationHint, title, or purpose relate to those documents/images/data mentioned in step 1?
- Examples: "article about the photo", "text describing the image", "analysis of the document", "content based on the data"
- Even if chapter doesn't explicitly say "about the image", if user request mentions both the image AND this chapter's content type → relate them
3. If YES to both → chapter MUST have contentParts assigned (cannot be empty {{}})
4. If ContentPart is "object" format and chapter needs to write ABOUT it → assign with "instruction" field, not just "caption"
OUTPUT FORMAT: Start with {{ and end with }}. Do NOT use markdown code fences (```json). Do NOT add explanatory text before or after the JSON. Return ONLY the JSON object itself.
"""
return prompt, templateStructure
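The per-document format validation in generateStructure reduces to a lookup-with-fallback; a self-contained sketch, where knownRenderers stands in for the real renderer registry:

```python
# Self-contained sketch; knownRenderers stands in for the renderer registry.
knownRenderers = {"pdf", "html", "docx", "xlsx", "pptx", "txt"}

def validateOutputFormat(doc: dict, fallback: str = "txt") -> None:
    fmt = str(doc.get("outputFormat") or fallback).lower().strip()
    doc["outputFormat"] = fmt if fmt in knownRenderers else "txt"

doc = {"id": "doc_1", "outputFormat": "PDF"}
validateOutputFormat(doc)
assert doc["outputFormat"] == "pdf"   # normalized and kept

doc = {"id": "doc_2", "outputFormat": "exe"}
validateOutputFormat(doc)
assert doc["outputFormat"] == "txt"   # no renderer -> safe fallback
```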


@@ -1,7 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Billing service module."""
from .mainServiceBilling import BillingService, getService
__all__ = ["BillingService", "getService"]


@@ -1,417 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Billing Service - Central service for billing operations.
Handles:
- Balance checks before AI operations
- Cost recording after AI operations
- Provider permission checks via RBAC
- Price calculation with markup
"""
import logging
from typing import Dict, Any, List, Optional
from datetime import datetime
from modules.datamodels.datamodelUam import User
from modules.datamodels.datamodelBilling import (
BillingModelEnum,
BillingCheckResult,
TransactionTypeEnum,
ReferenceTypeEnum,
BillingTransaction,
BillingBalanceResponse,
)
from modules.interfaces.interfaceDbBilling import getInterface as getBillingInterface
logger = logging.getLogger(__name__)
# Markup percentage for internal pricing (+50% for infrastructure and platform service + 50% for currency risk => factor 2.0)
BILLING_MARKUP_PERCENT = 100
# Singleton cache
_billingServices: Dict[str, "BillingService"] = {}
def getService(currentUser: User, mandateId: str, featureInstanceId: str = None, featureCode: str = None) -> "BillingService":
"""
Factory function to get or create a BillingService instance.
Args:
currentUser: Current user object
mandateId: Mandate ID for context
featureInstanceId: Optional feature instance ID
featureCode: Optional feature code (e.g., 'chatplayground', 'automation')
Returns:
BillingService instance
"""
cacheKey = f"{currentUser.id}_{mandateId}_{featureInstanceId}"
if cacheKey not in _billingServices:
_billingServices[cacheKey] = BillingService(currentUser, mandateId, featureInstanceId, featureCode)
else:
_billingServices[cacheKey].setContext(currentUser, mandateId, featureInstanceId, featureCode)
return _billingServices[cacheKey]
class BillingService:
"""
Central billing service for AI operations.
Responsibilities:
- Check balance before operations
- Record usage costs
- Apply pricing markup
- Check provider permissions via RBAC
"""
def __init__(
self,
currentUser: User,
mandateId: str,
featureInstanceId: str = None,
featureCode: str = None
):
"""
Initialize the billing service.
Args:
currentUser: Current user object
mandateId: Mandate ID
featureInstanceId: Optional feature instance ID
featureCode: Optional feature code
"""
self.currentUser = currentUser
self.mandateId = mandateId
self.featureInstanceId = featureInstanceId
self.featureCode = featureCode
# Get billing interface
self._billingInterface = getBillingInterface(currentUser, mandateId)
# Cache settings
self._settingsCache = None
def setContext(
self,
currentUser: User,
mandateId: str,
featureInstanceId: str = None,
featureCode: str = None
):
"""Update service context."""
self.currentUser = currentUser
self.mandateId = mandateId
self.featureInstanceId = featureInstanceId
self.featureCode = featureCode
self._billingInterface = getBillingInterface(currentUser, mandateId)
self._settingsCache = None
def _getSettings(self) -> Optional[Dict[str, Any]]:
"""Get billing settings with caching."""
if self._settingsCache is None:
self._settingsCache = self._billingInterface.getSettings(self.mandateId)
return self._settingsCache
# =========================================================================
# Price Calculation
# =========================================================================
def calculatePriceWithMarkup(self, basePriceCHF: float) -> float:
"""
Calculate final price with markup.
The AICore plugins return prices in their original currency (USD).
This method applies the configured markup percentage.
Args:
basePriceCHF: Base price from AI model (actually USD from provider)
Returns:
Final price in CHF with markup applied
"""
if basePriceCHF <= 0:
return 0.0
# Apply markup (100% = multiply by 2.0)
markupMultiplier = 1 + (BILLING_MARKUP_PERCENT / 100)
return round(basePriceCHF * markupMultiplier, 6)
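With BILLING_MARKUP_PERCENT = 100 this doubles the base price; a minimal worked example:

```python
# Minimal sketch of the markup arithmetic, assuming BILLING_MARKUP_PERCENT = 100.
BILLING_MARKUP_PERCENT = 100

def priceWithMarkup(base: float) -> float:
    if base <= 0:
        return 0.0
    return round(base * (1 + BILLING_MARKUP_PERCENT / 100), 6)

assert priceWithMarkup(0.0123) == 0.0246  # 100% markup = factor 2.0
assert priceWithMarkup(-1.0) == 0.0       # non-positive prices are never billed
```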
# =========================================================================
# Balance Operations
# =========================================================================
def checkBalance(self, estimatedCost: float = 0.0) -> BillingCheckResult:
"""
Check if the current user/mandate has sufficient balance.
Args:
estimatedCost: Estimated cost of the operation (with markup applied)
Returns:
BillingCheckResult indicating if operation is allowed
"""
return self._billingInterface.checkBalance(
self.mandateId,
self.currentUser.id,
estimatedCost
)
def hasBalance(self, estimatedCost: float = 0.0) -> bool:
"""
Quick check if balance is sufficient.
Args:
estimatedCost: Estimated cost with markup
Returns:
True if operation is allowed
"""
result = self.checkBalance(estimatedCost)
return result.allowed
def getCurrentBalance(self) -> float:
"""
Get current balance for the user/mandate.
Returns:
Current balance in CHF
"""
result = self.checkBalance(0.0)
return result.currentBalance or 0.0
# =========================================================================
# Usage Recording
# =========================================================================
def recordUsage(
self,
priceCHF: float,
workflowId: str = None,
aicoreProvider: str = None,
aicoreModel: str = None,
description: str = None
) -> Optional[Dict[str, Any]]:
"""
Record AI usage cost as a billing transaction.
This method:
1. Applies the pricing markup
2. Creates a DEBIT transaction
3. Updates the account balance
Args:
priceCHF: Base price from AI model (before markup)
workflowId: Optional workflow ID
aicoreProvider: AICore provider name (e.g., 'anthropic', 'openai')
aicoreModel: AICore model name (e.g., 'claude-4-sonnet', 'gpt-4o')
description: Optional description
Returns:
Created transaction dict or None if not recorded
"""
if priceCHF <= 0:
return None
# Apply markup
finalPrice = self.calculatePriceWithMarkup(priceCHF)
if finalPrice <= 0:
return None
# Build description
if not description:
description = f"AI Usage: {aicoreModel or aicoreProvider or 'unknown'}"
return self._billingInterface.recordUsage(
mandateId=self.mandateId,
userId=self.currentUser.id,
priceCHF=finalPrice,
workflowId=workflowId,
featureInstanceId=self.featureInstanceId,
featureCode=self.featureCode,
aicoreProvider=aicoreProvider,
aicoreModel=aicoreModel,
description=description
)
# =========================================================================
# Provider Permission Check (via RBAC)
# =========================================================================
def isProviderAllowed(self, provider: str) -> bool:
"""
Check if the user has permission to use an AICore provider.
Uses RBAC to check for resource permission:
resource.aicore.{provider}
Args:
provider: Provider name (e.g., 'anthropic', 'openai')
Returns:
True if provider is allowed
"""
try:
from modules.security.rbac import RbacClass
from modules.datamodels.datamodelRbac import AccessRuleContext
from modules.security.rootAccess import getRootDbAppConnector
# Get database connector via established pattern
dbApp = getRootDbAppConnector()
rbac = RbacClass(dbApp, dbApp)
resourceKey = f"resource.aicore.{provider}"
# Check if user has view permission for this resource (view = use for RESOURCE context)
permissions = rbac.getUserPermissions(
self.currentUser,
AccessRuleContext.RESOURCE,
resourceKey,
mandateId=self.mandateId
)
return permissions.view
except Exception as e:
logger.warning(f"Error checking provider permission: {e}")
# Default to allowed if RBAC check fails
return True
def getAllowedProviders(self) -> List[str]:
"""
Get list of AICore providers the user is allowed to use.
Returns:
List of allowed provider names
"""
try:
from modules.aicore.aicoreModelRegistry import modelRegistry
# Get all available providers
connectors = modelRegistry.discoverConnectors()
allProviders = [c.getConnectorType() for c in connectors]
# Filter by RBAC permissions
return [p for p in allProviders if self.isProviderAllowed(p)]
except Exception as e:
logger.warning(f"Error getting allowed providers: {e}")
return []
# =========================================================================
# Admin Operations
# =========================================================================
def addCredit(
self,
amount: float,
description: str = "Manual credit",
referenceType: ReferenceTypeEnum = ReferenceTypeEnum.ADMIN
) -> Optional[Dict[str, Any]]:
"""
Add credit to the account (admin operation).
Args:
amount: Amount to credit (positive)
description: Transaction description
referenceType: Reference type (ADMIN, PAYMENT, SYSTEM)
Returns:
Created transaction dict or None
"""
if amount <= 0:
return None
settings = self._getSettings()
if not settings:
logger.warning(f"No billing settings for mandate {self.mandateId}")
return None
billingModel = BillingModelEnum(settings.get("billingModel", BillingModelEnum.UNLIMITED.value))
# Get or create account
if billingModel == BillingModelEnum.PREPAY_USER:
account = self._billingInterface.getOrCreateUserAccount(
self.mandateId,
self.currentUser.id,
initialBalance=0.0
)
else:
account = self._billingInterface.getOrCreateMandateAccount(
self.mandateId,
initialBalance=0.0
)
# Create credit transaction
transaction = BillingTransaction(
accountId=account["id"],
transactionType=TransactionTypeEnum.CREDIT,
amount=amount,
description=description,
referenceType=referenceType
)
return self._billingInterface.createTransaction(transaction)
# =========================================================================
# Statistics & Reporting
# =========================================================================
def getBalancesForUser(self) -> List[BillingBalanceResponse]:
"""
Get all billing balances for the current user.
Returns:
List of balance responses for each mandate
"""
return self._billingInterface.getBalancesForUser(self.currentUser.id)
def getTransactionHistory(self, limit: int = 100) -> List[Dict[str, Any]]:
"""
Get transaction history for the user across all mandates.
Args:
limit: Maximum number of transactions
Returns:
List of transactions
"""
return self._billingInterface.getTransactionsForUser(self.currentUser.id, limit=limit)
# ============================================================================
# Exception Classes
# ============================================================================
class InsufficientBalanceException(Exception):
"""Raised when there's insufficient balance for an operation."""
def __init__(self, currentBalance: float, requiredAmount: float, message: str = None):
self.currentBalance = currentBalance
self.requiredAmount = requiredAmount
self.message = message or f"Insufficient balance. Current: {currentBalance:.2f} CHF, Required: {requiredAmount:.2f} CHF"
super().__init__(self.message)
class ProviderNotAllowedException(Exception):
"""Raised when a user doesn't have permission to use an AI provider."""
def __init__(self, provider: str, message: str = None):
self.provider = provider
self.message = message or f"Provider '{provider}' is not allowed for your role"
super().__init__(self.message)
class BillingContextError(Exception):
"""Raised when billing context is incomplete (missing mandateId, user, etc.).
This is a FAIL-SAFE error: AI calls MUST NOT proceed without valid billing context.
Acts like a 0 CHF credit card pre-authorization check - validates that billing
CAN be recorded before any expensive AI operation starts.
"""
def __init__(self, message: str = None):
self.message = message or "Billing context incomplete - AI call blocked"
super().__init__(self.message)
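A hypothetical end-to-end flow for a single AI call might look as follows; currentUser, the mandate ID, and the provider/model names are assumptions, not values from this repository.

```python
# Hypothetical flow - currentUser, IDs, and model names are assumptions.
billing = getService(currentUser, mandateId="mandate_1", featureCode="chatplayground")

estimated = billing.calculatePriceWithMarkup(0.01)  # pre-check with markup applied
if not billing.hasBalance(estimated):
    raise InsufficientBalanceException(billing.getCurrentBalance(), estimated)

# ... run the AI call, then record the provider's base price (markup applied inside) ...
billing.recordUsage(0.01, aicoreProvider="anthropic", aicoreModel="claude-4-sonnet")
```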


@@ -1,104 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Stripe Checkout service for billing credit top-ups.
Creates Checkout Sessions for redirect-based payment flow.
"""
import logging
from typing import Optional
from modules.shared.configuration import APP_CONFIG
logger = logging.getLogger(__name__)
# Server-side allowed amounts in CHF - never trust client
ALLOWED_AMOUNTS_CHF = [10, 25, 50, 100, 250, 500]
def create_checkout_session(
mandate_id: str,
user_id: Optional[str],
amount_chf: float
) -> str:
"""
Create a Stripe Checkout Session for credit top-up.
Amount and currency are validated server-side. The client-provided amount
must match an allowed preset.
Args:
mandate_id: Target mandate ID
user_id: Target user ID (for PREPAY_USER) or None (for mandate pool)
amount_chf: Amount in CHF (must be in ALLOWED_AMOUNTS_CHF)
Returns:
Stripe Checkout Session URL for redirect
Raises:
ValueError: If amount is invalid
"""
import stripe
# Validate amount server-side
if amount_chf not in ALLOWED_AMOUNTS_CHF:
raise ValueError(
f"Invalid amount {amount_chf} CHF. Allowed: {ALLOWED_AMOUNTS_CHF}"
)
# Pin API version from config (match Stripe Dashboard)
api_version = APP_CONFIG.get("STRIPE_API_VERSION")
if api_version:
stripe.api_version = api_version
# Get secrets
secret_key = APP_CONFIG.get("STRIPE_SECRET_KEY_SECRET") or APP_CONFIG.get("STRIPE_SECRET_KEY")
if not secret_key:
raise ValueError("STRIPE_SECRET_KEY_SECRET not configured")
stripe.api_key = secret_key
frontend_url = APP_CONFIG.get("APP_FRONTEND_URL", "https://nyla-int.poweron-center.net")
base_path = "/admin/billing"
success_url = f"{frontend_url.rstrip('/')}{base_path}?success=true&session_id={{CHECKOUT_SESSION_ID}}"
cancel_url = f"{frontend_url.rstrip('/')}{base_path}?canceled=true"
# Amount in cents for Stripe (CHF uses 2 decimal places)
amount_cents = int(round(amount_chf * 100))
metadata = {
"mandateId": mandate_id,
"amountChf": str(amount_chf),
}
if user_id:
metadata["userId"] = user_id
session = stripe.checkout.Session.create(
mode="payment",
line_items=[
{
"price_data": {
"currency": "chf",
"unit_amount": amount_cents,
"product_data": {
"name": "Guthaben aufladen",
"description": "AI Service Guthaben (CHF)",
},
},
"quantity": 1,
}
],
success_url=success_url,
cancel_url=cancel_url,
metadata=metadata,
)
if not session or not session.url:
raise ValueError("Stripe Checkout Session creation failed")
logger.info(
f"Created Stripe Checkout Session {session.id} for mandate {mandate_id}, "
f"amount {amount_chf} CHF"
)
return session.url
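A hypothetical caller in the web layer; the surrounding route handling is an assumption.

```python
# Hypothetical caller - the surrounding web layer is an assumption.
try:
    url = create_checkout_session(
        mandate_id="mandate_1",
        user_id=None,    # None -> credit the mandate pool instead of a user
        amount_chf=50,   # must be one of ALLOWED_AMOUNTS_CHF
    )
    # e.g. redirect the browser to `url` from the route handler
except ValueError as e:
    print(f"Top-up rejected: {e}")
```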

File diff suppressed because it is too large


@@ -1,7 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from .mainServiceExtraction import ExtractionService
__all__ = ["ExtractionService"]


@@ -1,4 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.


@@ -1,184 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List
import base64
import io
from modules.datamodels.datamodelExtraction import ContentPart
from ..subRegistry import Chunker
class ImageChunker(Chunker):
"""Chunker for reducing image size through resizing, compression, and tiling."""
def chunk(self, part: ContentPart, options: Dict[str, Any]) -> list[Dict[str, Any]]:
"""
Chunk an image by reducing its size through various strategies.
Args:
part: ContentPart containing image data (base64 encoded)
options: Chunking options including:
- imageChunkSize: Maximum size in bytes for each chunk
- imageMaxPixels: Maximum pixels (width*height) for the image
- imageQuality: JPEG quality (0-100, default 85)
- imageTileSize: Size for tiling if image is still too large
Returns:
List of image chunks with reduced size
"""
maxBytes = int(options.get("imageChunkSize", 1000000)) # 1MB default
maxPixels = int(options.get("imageMaxPixels", 1024 * 1024)) # 1MP default
quality = int(options.get("imageQuality", 85))
tileSize = int(options.get("imageTileSize", 512)) # 512x512 tiles
chunks: List[Dict[str, Any]] = []
try:
# Lazy import PIL to avoid hanging during module import
from PIL import Image
# Decode base64 image data
imageData = base64.b64decode(part.data)
image = Image.open(io.BytesIO(imageData))
# Get original dimensions
originalWidth, originalHeight = image.size
originalPixels = originalWidth * originalHeight
# Strategy 1: If image is small enough, return as-is
if len(part.data) <= maxBytes and originalPixels <= maxPixels:
chunks.append({
"data": part.data,
"size": len(part.data),
"order": 0,
"metadata": {
"originalSize": len(part.data),
"originalPixels": originalPixels,
"strategy": "original"
}
})
return chunks
# Strategy 2: Resize to fit within pixel limit
if originalPixels > maxPixels:
# Calculate new dimensions maintaining aspect ratio
scale = (maxPixels / originalPixels) ** 0.5
newWidth = int(originalWidth * scale)
newHeight = int(originalHeight * scale)
# Ensure minimum size
newWidth = max(newWidth, 64)
newHeight = max(newHeight, 64)
image = image.resize((newWidth, newHeight), Image.Resampling.LANCZOS)
# Strategy 3: Compress with quality reduction
currentSize = len(part.data)
currentQuality = quality
while currentSize > maxBytes and currentQuality > 10:
# Compress image
output = io.BytesIO()
image.save(output, format='JPEG', quality=currentQuality, optimize=True)
compressedData = output.getvalue()
compressedB64 = base64.b64encode(compressedData).decode('utf-8')
currentSize = len(compressedB64)
if currentSize <= maxBytes:
chunks.append({
"data": compressedB64,
"size": currentSize,
"order": 0,
"metadata": {
"originalSize": len(part.data),
"originalPixels": originalPixels,
"compressedSize": currentSize,
"quality": currentQuality,
"strategy": "compressed"
}
})
return chunks
currentQuality -= 10
# Strategy 4: Tile the image if still too large
if currentSize > maxBytes:
chunks = self._tileImage(image, maxBytes, tileSize, quality, originalPixels)
return chunks
# Fallback: Return compressed version even if over limit
output = io.BytesIO()
image.save(output, format='JPEG', quality=10, optimize=True)
compressedData = output.getvalue()
compressedB64 = base64.b64encode(compressedData).decode('utf-8')
chunks.append({
"data": compressedB64,
"size": len(compressedB64),
"order": 0,
"metadata": {
"originalSize": len(part.data),
"originalPixels": originalPixels,
"compressedSize": len(compressedB64),
"quality": 10,
"strategy": "fallback_compressed"
}
})
except Exception as e:
# Fallback: Return original data with error metadata
chunks.append({
"data": part.data,
"size": len(part.data),
"order": 0,
"metadata": {
"originalSize": len(part.data),
"strategy": "error_fallback",
"error": str(e)
}
})
return chunks
def _tileImage(self, image: Any, maxBytes: int, tileSize: int, quality: int, originalPixels: int) -> List[Dict[str, Any]]:
"""Split image into tiles if it's still too large after compression."""
chunks = []
width, height = image.size
# Calculate tile grid
tilesX = (width + tileSize - 1) // tileSize
tilesY = (height + tileSize - 1) // tileSize
for y in range(tilesY):
for x in range(tilesX):
# Calculate tile boundaries
left = x * tileSize
top = y * tileSize
right = min(left + tileSize, width)
bottom = min(top + tileSize, height)
# Extract tile
tile = image.crop((left, top, right, bottom))
# Compress tile
output = io.BytesIO()
tile.save(output, format='JPEG', quality=quality, optimize=True)
tileData = output.getvalue()
tileB64 = base64.b64encode(tileData).decode('utf-8')
chunks.append({
"data": tileB64,
"size": len(tileB64),
"order": y * tilesX + x,
"metadata": {
"originalSize": len(image.tobytes()),
"originalPixels": originalPixels,
"tileSize": tileSize,
"tilePosition": f"{x},{y}",
"tileBounds": f"{left},{top},{right},{bottom}",
"quality": quality,
"strategy": "tiled"
}
})
return chunks
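The tile grid uses ceiling division, and edge tiles are clipped to the image bounds; a small worked example:

```python
# Worked example of the ceil-division tile grid and edge clipping above.
width, height, tileSize = 1300, 700, 512

tilesX = (width + tileSize - 1) // tileSize   # ceil(1300 / 512) = 3
tilesY = (height + tileSize - 1) // tileSize  # ceil(700 / 512)  = 2
assert (tilesX, tilesY) == (3, 2)

# The bottom-right tile is clipped to the image bounds:
left, top = 2 * tileSize, 1 * tileSize
right, bottom = min(left + tileSize, width), min(top + tileSize, height)
assert (right, bottom) == (1300, 700)
```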


@@ -1,91 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List
import json
from modules.datamodels.datamodelExtraction import ContentPart
from ..subRegistry import Chunker
class StructureChunker(Chunker):
def chunk(self, part: ContentPart, options: Dict[str, Any]) -> list[Dict[str, Any]]:
maxBytes = int(options.get("structureChunkSize", 40000))
data = part.data or ""
# Best effort: try JSON list/object bucketing; otherwise fall back to line-based chunking
chunks: List[Dict[str, Any]] = []
try:
obj = json.loads(data)
def emit(bucket: Any):
text = json.dumps(bucket, ensure_ascii=False)
chunks.append({"data": text, "size": len(text.encode('utf-8')), "order": len(chunks)})
if isinstance(obj, list):
bucket: list[Any] = []
size = 0
for item in obj:
text = json.dumps(item, ensure_ascii=False)
s = len(text.encode('utf-8'))
if size + s > maxBytes and bucket:
emit(bucket)
bucket = [item]
size = s
else:
bucket.append(item)
size += s
if bucket:
emit(bucket)
else:
# JSON object (dict) - check if it fits
text = json.dumps(obj, ensure_ascii=False)
textSize = len(text.encode('utf-8'))
if textSize <= maxBytes:
emit(obj)
else:
# Object too large - try to split by keys if possible
# For large objects, we need to chunk by character boundaries
# since we can't split JSON objects arbitrarily
if isinstance(obj, dict) and len(obj) > 1:
# Try to split object into multiple chunks by keys
# This preserves JSON structure better than line-based chunking
currentChunk: Dict[str, Any] = {}
currentSize = 2 # Start with "{}" overhead
for key, value in obj.items():
itemText = json.dumps({key: value}, ensure_ascii=False)
itemSize = len(itemText.encode('utf-8'))
# Account for comma and spacing between items
if currentChunk:
itemSize += 2 # ", " separator
if currentSize + itemSize > maxBytes and currentChunk:
# Current chunk is full, emit it
emit(currentChunk)
currentChunk = {key: value}
currentSize = len(itemText.encode('utf-8'))
else:
currentChunk[key] = value
currentSize += itemSize
# Emit remaining chunk
if currentChunk:
emit(currentChunk)
else:
# Single large value or can't split - fallback to line chunking
raise ValueError("too large")
except Exception:
current: List[str] = []
size = 0
for line in data.split('\n'):
s = len(line.encode('utf-8')) + 1
if size + s > maxBytes and current:
text = '\n'.join(current)
chunks.append({"data": text, "size": len(text.encode('utf-8')), "order": len(chunks)})
current = [line]
size = s
else:
current.append(line)
size += s
if current:
text = '\n'.join(current)
chunks.append({"data": text, "size": len(text.encode('utf-8')), "order": len(chunks)})
return chunks
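The list branch greedily packs items into buckets capped by serialized byte size; a self-contained sketch of that strategy:

```python
import json

# Self-contained sketch of the greedy list-bucketing strategy above.
def bucketJsonList(items, maxBytes=70):
    chunks, bucket, size = [], [], 0
    for item in items:
        s = len(json.dumps(item, ensure_ascii=False).encode("utf-8"))
        if size + s > maxBytes and bucket:
            chunks.append(bucket)
            bucket, size = [item], s
        else:
            bucket.append(item)
            size += s
    if bucket:
        chunks.append(bucket)
    return chunks

items = [{"id": i, "name": "x" * 10} for i in range(5)]  # 31 bytes each
assert [len(c) for c in bucketJsonList(items)] == [2, 2, 1]
```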


@@ -1,30 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List
from modules.datamodels.datamodelExtraction import ContentPart
from ..subRegistry import Chunker
class TableChunker(Chunker):
def chunk(self, part: ContentPart, options: Dict[str, Any]) -> list[Dict[str, Any]]:
maxBytes = int(options.get("tableChunkSize", 40000))
chunks: List[Dict[str, Any]] = []
current: List[str] = []
size = 0
for line in part.data.split('\n'):
lineSize = len(line.encode('utf-8')) + 1
if size + lineSize > maxBytes and current:
data = '\n'.join(current)
chunks.append({"data": data, "size": len(data.encode('utf-8')), "order": len(chunks)})
current = [line]
size = lineSize
else:
current.append(line)
size += lineSize
if current:
data = '\n'.join(current)
chunks.append({"data": data, "size": len(data.encode('utf-8')), "order": len(chunks)})
return chunks


@@ -1,58 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List
import logging
from modules.datamodels.datamodelExtraction import ContentPart
from ..subRegistry import Chunker
logger = logging.getLogger(__name__)
class TextChunker(Chunker):
def chunk(self, part: ContentPart, options: Dict[str, Any]) -> list[Dict[str, Any]]:
maxBytes = int(options.get("textChunkSize", 40000))
logger.debug(f"TextChunker: textChunkSize from options: {options.get('textChunkSize', 'NOT_FOUND')}")
logger.debug(f"TextChunker: using maxBytes: {maxBytes}")
chunks: List[Dict[str, Any]] = []
# Split by lines first (preferred method for text)
lines = part.data.split('\n')
current: List[str] = []
size = 0
for line in lines:
lineSize = len(line.encode('utf-8')) + 1 # +1 for newline character
if size + lineSize > maxBytes and current:
# Current chunk is full, save it and start new one
data = '\n'.join(current)
chunks.append({"data": data, "size": len(data.encode('utf-8')), "order": len(chunks)})
current = []
size = 0
# If a single line is larger than maxBytes, split it by character boundaries
if lineSize > maxBytes:
# Split the long line into chunks
lineBytes = line.encode('utf-8')
lineStart = 0
while lineStart < len(lineBytes):
chunkBytes = lineBytes[lineStart:lineStart + maxBytes]
chunkText = chunkBytes.decode('utf-8', errors='ignore')
chunks.append({"data": chunkText, "size": len(chunkBytes), "order": len(chunks)})
lineStart += maxBytes
# Don't add this line to current, it's already chunked
continue
# Add line to current chunk
current.append(line)
size += lineSize
# Add remaining lines as final chunk
if current:
data = '\n'.join(current)
chunks.append({"data": data, "size": len(data.encode('utf-8')), "order": len(chunks)})
logger.debug(f"TextChunker: Created {len(chunks)} chunks, total input size: {len(part.data.encode('utf-8'))} bytes")
return chunks
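
The byte-boundary fallback for oversize lines deserves a caveat: decoding with errors='ignore' silently drops any multi-byte character that straddles a cut. A small sketch (the 10- and 5-byte budgets are arbitrary) showing both the clean and the lossy case:

def splitLongLine(line: str, maxBytes: int) -> list[str]:
    # Cut the UTF-8 byte stream at fixed offsets, as TextChunker does above.
    lineBytes = line.encode('utf-8')
    return [lineBytes[start:start + maxBytes].decode('utf-8', errors='ignore')
            for start in range(0, len(lineBytes), maxBytes)]

print(splitLongLine("abcdefghij" * 3, maxBytes=10))  # clean: ['abcdefghij'] * 3
print(splitLongLine("ää" * 10, maxBytes=5))          # lossy: each 2-byte 'ä' straddling a cut is dropped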

@@ -1,4 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.

@@ -1,47 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List
import base64
from ..subUtils import makeId
from modules.datamodels.datamodelExtraction import ContentPart
from ..subRegistry import Extractor
class BinaryExtractor(Extractor):
"""
Fallback extractor for unsupported file types.
This extractor handles any file type that doesn't match other extractors.
It encodes the file as base64 and marks it as binary data.
Supported formats:
- All file types (fallback)
- MIME types: application/octet-stream (default)
- File extensions: All (fallback)
"""
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
return True
def getSupportedExtensions(self) -> list[str]:
"""Return list of supported file extensions (all)."""
return [] # Accepts all extensions as fallback
def getSupportedMimeTypes(self) -> list[str]:
"""Return list of supported MIME types (all)."""
return [] # Accepts all MIME types as fallback
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
mimeType = context.get("mimeType") or "application/octet-stream"
return [ContentPart(
id=makeId(),
parentId=None,
label="binary",
typeGroup="binary",
mimeType=mimeType,
data=base64.b64encode(fileBytes).decode("utf-8"),
metadata={"size": len(fileBytes), "warning": "Unsupported file type"}
)]

@@ -1,45 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List
from modules.datamodels.datamodelExtraction import ContentPart
from ..subUtils import makeId
from ..subRegistry import Extractor
class CsvExtractor(Extractor):
"""
Extractor for CSV files.
Supported formats:
- MIME types: text/csv
- File extensions: .csv
- Special handling: Treats as table data
"""
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
return mimeType == "text/csv" or (fileName or "").lower().endswith(".csv")
def getSupportedExtensions(self) -> list[str]:
"""Return list of supported file extensions."""
return [".csv"]
def getSupportedMimeTypes(self) -> list[str]:
"""Return list of supported MIME types."""
return ["text/csv"]
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
fileName = context.get("fileName")
mimeType = context.get("mimeType") or "text/csv"
data = fileBytes.decode("utf-8", errors="replace")
return [ContentPart(
id=makeId(),
parentId=None,
label="main",
typeGroup="table",
mimeType=mimeType,
data=data,
metadata={"size": len(fileBytes)}
)]

@@ -1,109 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List
import io
from ..subUtils import makeId
from modules.datamodels.datamodelExtraction import ContentPart
from ..subRegistry import Extractor
class DocxExtractor(Extractor):
"""
Extractor for Microsoft Word documents.
Supported formats:
- MIME types: application/vnd.openxmlformats-officedocument.wordprocessingml.document
- File extensions: .docx
- Special handling: Extracts paragraphs and tables (converts tables to CSV)
- Dependencies: python-docx
"""
def __init__(self):
self._loaded = False
self._haveLibs = False
def _load(self):
if self._loaded:
return
self._loaded = True
try:
global docx
import docx # python-docx
self._haveLibs = True
except Exception:
self._haveLibs = False
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
return mimeType == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" or (fileName or "").lower().endswith(".docx")
def getSupportedExtensions(self) -> list[str]:
"""Return list of supported file extensions."""
return [".docx"]
def getSupportedMimeTypes(self) -> list[str]:
"""Return list of supported MIME types."""
return ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"]
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
self._load()
parts: List[ContentPart] = []
rootId = makeId()
parts.append(ContentPart(
id=rootId,
parentId=None,
label="docx",
typeGroup="container",
mimeType="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
data="",
metadata={"size": len(fileBytes)}
))
if not self._haveLibs:
parts.append(ContentPart(
id=makeId(),
parentId=rootId,
label="binary",
typeGroup="binary",
mimeType="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
data="",
metadata={"size": len(fileBytes), "warning": "DOCX lib not available"}
))
return parts
with io.BytesIO(fileBytes) as buf:
d = docx.Document(buf)
# paragraphs
for i, para in enumerate(d.paragraphs):
text = para.text or ""
if text.strip():
parts.append(ContentPart(
id=makeId(),
parentId=rootId,
label=f"p_{i+1}",
typeGroup="text",
mimeType="text/plain",
data=text,
metadata={"size": len(text.encode('utf-8'))}
))
# tables → CSV rows
for ti, table in enumerate(d.tables):
rows: list[str] = []
for row in table.rows:
cells = [ (cell.text or "").replace('"', '""') for cell in row.cells ]
rows.append(",".join([f'"{c}"' for c in cells]))
csvData = "\n".join(rows)
if csvData:
parts.append(ContentPart(
id=makeId(),
parentId=rootId,
label=f"table_{ti+1}",
typeGroup="table",
mimeType="text/csv",
data=csvData,
metadata={"size": len(csvData.encode('utf-8'))}
))
return parts
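
The table handling above applies standard CSV escaping: every cell is quoted and embedded quotes are doubled. The rule in isolation, with hypothetical cell values:

def rowsToCsv(rows: list[list[str]]) -> str:
    lines = []
    for row in rows:
        cells = [(cell or "").replace('"', '""') for cell in row]
        lines.append(",".join(f'"{c}"' for c in cells))
    return "\n".join(lines)

print(rowsToCsv([["Name", "Note"], ["Alice", 'said "hi"']]))
# "Name","Note"
# "Alice","said ""hi"""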

@ -1,50 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List
from bs4 import BeautifulSoup
from modules.datamodels.datamodelExtraction import ContentPart
from ..subUtils import makeId
from ..subRegistry import Extractor
class HtmlExtractor(Extractor):
"""
Extractor for HTML files.
Supported formats:
- MIME types: text/html
- File extensions: .html, .htm
- Special handling: Uses BeautifulSoup for parsing
- Dependencies: beautifulsoup4
"""
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
return mimeType == "text/html" or (fileName or "").lower().endswith((".html", ".htm"))
def getSupportedExtensions(self) -> list[str]:
"""Return list of supported file extensions."""
return [".html", ".htm"]
def getSupportedMimeTypes(self) -> list[str]:
"""Return list of supported MIME types."""
return ["text/html"]
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
mimeType = context.get("mimeType") or "text/html"
text = fileBytes.decode("utf-8", errors="replace")
try:
# parse once to validate the markup; the parse result is intentionally discarded
BeautifulSoup(text, "html.parser")
except Exception:
pass
return [ContentPart(
id=makeId(),
parentId=None,
label="main",
typeGroup="structure",
mimeType=mimeType,
data=text,
metadata={"size": len(fileBytes)}
)]

@@ -1,77 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List
import base64
import logging
from ..subUtils import makeId
from modules.datamodels.datamodelExtraction import ContentPart
from ..subRegistry import Extractor
logger = logging.getLogger(__name__)
class ImageExtractor(Extractor):
"""
Extractor for image files.
Supported formats:
- MIME types: image/jpeg, image/png, image/gif, image/webp, image/bmp, image/tiff
- File extensions: .jpg, .jpeg, .png, .gif, .webp, .bmp, .tiff
- Special handling: GIF files are converted to PNG during extraction
"""
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
return ((mimeType or "").startswith("image/") or
(fileName or "").lower().endswith((".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff")))
def getSupportedExtensions(self) -> list[str]:
"""Return list of supported file extensions."""
return [".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff"]
def getSupportedMimeTypes(self) -> list[str]:
"""Return list of supported MIME types."""
return ["image/jpeg", "image/png", "image/gif", "image/webp", "image/bmp", "image/tiff"]
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
mimeType = context.get("mimeType") or "image/unknown"
fileName = context.get("fileName", "")
# Convert GIF to PNG during extraction
if mimeType.lower() == "image/gif":
try:
from PIL import Image
import io
# Open GIF and convert to PNG
with Image.open(io.BytesIO(fileBytes)) as img:
# Convert to RGB (removes animation)
if img.mode in ('RGBA', 'LA', 'P'):
img = img.convert('RGB')
# Save as PNG in memory
png_buffer = io.BytesIO()
img.save(png_buffer, format='PNG')
png_data = png_buffer.getvalue()
# Update mimeType and fileBytes (capture the original size first, since fileBytes is overwritten next)
originalSize = len(fileBytes)
mimeType = "image/png"
fileBytes = png_data
logger.info(f"GIF converted to PNG during extraction: {fileName}, original={originalSize} bytes, converted={len(png_data)} bytes")
except Exception as e:
logger.warning(f"GIF conversion failed during extraction for {fileName}: {str(e)}, using original")
# Keep original GIF data if conversion fails
return [ContentPart(
id=makeId(),
parentId=None,
label="image",
typeGroup="image",
mimeType=mimeType,
data=base64.b64encode(fileBytes).decode("utf-8"),
metadata={"size": len(fileBytes)}
)]
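
The GIF normalization can be reproduced outside the extractor. A minimal sketch using Pillow (gifBytes is an assumed variable holding raw GIF data); note that animated GIFs collapse to their first frame:

import io
from PIL import Image

def gifToPng(gifBytes: bytes) -> bytes:
    with Image.open(io.BytesIO(gifBytes)) as img:
        if img.mode in ('RGBA', 'LA', 'P'):
            img = img.convert('RGB')  # flattens palette/alpha
        buffer = io.BytesIO()
        img.save(buffer, format='PNG')
        return buffer.getvalue()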

@@ -1,50 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List
import json
from modules.datamodels.datamodelExtraction import ContentPart
from ..subUtils import makeId
from ..subRegistry import Extractor
class JsonExtractor(Extractor):
"""
Extractor for JSON files.
Supported formats:
- MIME types: application/json
- File extensions: .json
- Special handling: Validates JSON format, falls back to text if invalid
"""
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
return mimeType == "application/json" or (fileName or "").lower().endswith(".json")
def getSupportedExtensions(self) -> list[str]:
"""Return list of supported file extensions."""
return [".json"]
def getSupportedMimeTypes(self) -> list[str]:
"""Return list of supported MIME types."""
return ["application/json"]
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
mimeType = context.get("mimeType") or "application/json"
text = fileBytes.decode("utf-8", errors="replace")
# verify JSON is well-formed; fall back to the plain "text" typeGroup if not
typeGroup = "structure"
try:
json.loads(text)
except Exception:
typeGroup = "text"
return [ContentPart(
id=makeId(),
parentId=None,
label="main",
typeGroup=typeGroup,
mimeType=mimeType,
data=text,
metadata={"size": len(fileBytes)}
)]

@@ -1,156 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List
import base64
import io
from ..subUtils import makeId
from modules.datamodels.datamodelExtraction import ContentPart
from ..subRegistry import Extractor
class PdfExtractor(Extractor):
"""
Extractor for PDF files.
Supported formats:
- MIME types: application/pdf
- File extensions: .pdf
- Special handling: Extracts text per page and embedded images
- Dependencies: PyPDF2, PyMuPDF (fitz)
"""
def __init__(self):
self._loaded = False
self._haveLibs = False
def _load(self):
if self._loaded:
return
self._loaded = True
try:
global PyPDF2, fitz
import PyPDF2
import fitz # PyMuPDF
self._haveLibs = True
except Exception:
self._haveLibs = False
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
return mimeType == "application/pdf" or (fileName or "").lower().endswith(".pdf")
def getSupportedExtensions(self) -> list[str]:
"""Return list of supported file extensions."""
return [".pdf"]
def getSupportedMimeTypes(self) -> list[str]:
"""Return list of supported MIME types."""
return ["application/pdf"]
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
self._load()
parts: List[ContentPart] = []
rootId = makeId()
parts.append(ContentPart(
id=rootId,
parentId=None,
label="pdf",
typeGroup="container",
mimeType="application/pdf",
data="",
metadata={"size": len(fileBytes)}
))
if not self._haveLibs:
parts.append(ContentPart(
id=makeId(),
parentId=rootId,
label="binary",
typeGroup="binary",
mimeType="application/pdf",
data=base64.b64encode(fileBytes).decode("utf-8"),
metadata={"size": len(fileBytes), "warning": "PDF libs not available"}
))
return parts
# Extract text per page with PyMuPDF (same lib as in-place search - ensures extraction matches PDF text layer)
try:
with io.BytesIO(fileBytes) as buf:
doc = fitz.open(stream=buf.getvalue(), filetype="pdf")
for i in range(len(doc)):
try:
page = doc[i]
text = page.get_text() or ""
if text.strip():
parts.append(ContentPart(
id=makeId(),
parentId=rootId,
label=f"page_{i+1}",
typeGroup="text",
mimeType="text/plain",
data=text,
metadata={"pages": 1, "pageIndex": i, "size": len(text.encode('utf-8'))}
))
except Exception:
continue
doc.close()
except Exception:
pass
# Fallback to PyPDF2 if PyMuPDF text extraction failed or returned nothing
has_text = any(getattr(p, 'typeGroup', '') == "text" for p in parts)
if not has_text:
try:
with io.BytesIO(fileBytes) as buf:
reader = PyPDF2.PdfReader(buf)
for i, page in enumerate(reader.pages):
try:
text = page.extract_text() or ""
if text.strip():
parts.append(ContentPart(
id=makeId(),
parentId=rootId,
label=f"page_{i+1}",
typeGroup="text",
mimeType="text/plain",
data=text,
metadata={"pages": 1, "pageIndex": i, "size": len(text.encode('utf-8'))}
))
except Exception:
continue
except Exception:
pass
# Extract images with PyMuPDF
try:
with io.BytesIO(fileBytes) as buf2:
doc = fitz.open(stream=buf2.getvalue(), filetype="pdf")
for i in range(len(doc)):
page = doc[i]
images = page.get_images(full=True)
for j, img in enumerate(images):
try:
xref = img[0]
baseImage = doc.extract_image(xref)
if baseImage:
imgBytes = baseImage.get("image", b"")
ext = baseImage.get("ext", "png")
if imgBytes:
parts.append(ContentPart(
id=makeId(),
parentId=rootId,
label=f"image_{i+1}_{j}",
typeGroup="image",
mimeType=f"image/{ext}",
data=base64.b64encode(imgBytes).decode("utf-8"),
metadata={"pageIndex": i, "size": len(imgBytes)}
))
except Exception:
continue
doc.close()
except Exception:
pass
return parts
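
Taken together, extract() yields one container part plus per-page text parts and embedded image parts. A usage sketch against the class as defined above (pdfBytes is an assumed variable; PyMuPDF and PyPDF2 must be installed):

extractor = PdfExtractor()
parts = extractor.extract(pdfBytes, {"fileName": "report.pdf", "mimeType": "application/pdf"})
pageTexts = [p.data for p in parts if p.typeGroup == "text"]   # one entry per page with text
imageParts = [p for p in parts if p.typeGroup == "image"]      # embedded images, base64-encoded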

@@ -1,227 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
import logging
import base64
from typing import List, Dict, Any, Optional
from modules.datamodels.datamodelExtraction import ContentPart, ContentExtracted
from ..subRegistry import Extractor
logger = logging.getLogger(__name__)
class PptxExtractor(Extractor):
"""
Extractor for PowerPoint files.
Supported formats:
- MIME types: application/vnd.openxmlformats-officedocument.presentationml.presentation, application/vnd.ms-powerpoint
- File extensions: .pptx, .ppt
- Special handling: Extracts slide content, tables, and images
- Dependencies: python-pptx
"""
def __init__(self):
self._loaded = False
self._haveLibs = False
def _load(self):
if self._loaded:
return
self._loaded = True
try:
global Presentation
from pptx import Presentation
self._haveLibs = True
except Exception:
self._haveLibs = False
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
return (mimeType in [
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
"application/vnd.ms-powerpoint"
]) or (fileName or "").lower().endswith((".pptx", ".ppt"))
def getSupportedExtensions(self) -> list[str]:
"""Return list of supported file extensions."""
return [".pptx", ".ppt"]
def getSupportedMimeTypes(self) -> list[str]:
"""Return list of supported MIME types."""
return [
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
"application/vnd.ms-powerpoint"
]
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
"""
Extract content from PowerPoint files.
Args:
fileBytes: Raw file data as bytes
context: Context dictionary with file information
Returns:
List of ContentPart objects with extracted content
"""
self._load()
if not self._haveLibs:
logger.error("python-pptx library not installed. Install with: pip install python-pptx")
return [ContentPart(
id="error",
label="PowerPoint Extraction Error",
typeGroup="text",
mimeType="text/plain",
data="Error: python-pptx library not installed",
metadata={"error": True, "error_message": "python-pptx library not installed"}
)]
try:
import io
# Load presentation from bytes
presentation = Presentation(io.BytesIO(fileBytes))
parts = []
slide_index = 0
# Extract content from each slide
for slide in presentation.slides:
slide_index += 1
slide_content = []
# Extract text from slide
for shape in slide.shapes:
if hasattr(shape, "text") and shape.text.strip():
slide_content.append(shape.text.strip())
# Extract table data
for shape in slide.shapes:
if shape.has_table:
table = shape.table
table_data = []
for row in table.rows:
row_data = []
for cell in row.cells:
row_data.append(cell.text.strip())
table_data.append(row_data)
if table_data:
# Convert table to markdown format
table_md = self._table_to_markdown(table_data)
slide_content.append(table_md)
# Extract images
for shape in slide.shapes:
if shape.shape_type == 13: # MSO_SHAPE_TYPE.PICTURE
try:
image = shape.image
image_bytes = image.blob
image_b64 = base64.b64encode(image_bytes).decode('utf-8')
# Create image part
image_part = ContentPart(
id=f"slide_{slide_index}_image_{len(parts)}",
label=f"Slide {slide_index} Image",
typeGroup="image",
mimeType="image/png", # Default to PNG
data=image_b64,
metadata={
"slide_number": slide_index,
"shape_type": "image",
"extracted_from": "powerpoint"
}
)
parts.append(image_part)
except Exception as e:
logger.warning(f"Failed to extract image from slide {slide_index}: {str(e)}")
# Create slide content part
if slide_content:
slide_text = f"# Slide {slide_index}\n\n" + "\n\n".join(slide_content)
slide_part = ContentPart(
id=f"slide_{slide_index}",
label=f"Slide {slide_index} Content",
typeGroup="structure",
mimeType="text/plain",
data=slide_text,
metadata={
"slide_number": slide_index,
"content_type": "slide",
"extracted_from": "powerpoint",
"text_length": len(slide_text)
}
)
parts.append(slide_part)
# Create presentation overview
file_name = context.get("fileName", "presentation.pptx")
overview_text = f"# PowerPoint Presentation: {file_name}\n\n"
overview_text += f"**Total Slides:** {len(presentation.slides)}\n\n"
overview_text += f"**Content Parts:** {len(parts)}\n\n"
# Add slide summaries
for i, slide in enumerate(presentation.slides, 1):
slide_text_parts = []
for shape in slide.shapes:
if hasattr(shape, "text") and shape.text.strip():
slide_text_parts.append(shape.text.strip())
if slide_text_parts:
overview_text += f"## Slide {i}\n"
overview_text += "\n".join(slide_text_parts[:3]) # First 3 text elements
overview_text += "\n\n"
# Create overview part
overview_part = ContentPart(
id="presentation_overview",
label="Presentation Overview",
typeGroup="text",
mimeType="text/plain",
data=overview_text,
metadata={
"content_type": "overview",
"extracted_from": "powerpoint",
"total_slides": len(presentation.slides),
"text_length": len(overview_text)
}
)
parts.insert(0, overview_part) # Insert at beginning
return parts
except Exception as e:
logger.error(f"Error extracting PowerPoint content: {str(e)}")
return [ContentPart(
id="error",
label="PowerPoint Extraction Error",
typeGroup="text",
mimeType="text/plain",
data=f"Error extracting PowerPoint content: {str(e)}",
metadata={"error": True, "error_message": str(e)}
)]
def _table_to_markdown(self, table_data: List[List[str]]) -> str:
"""Convert table data to markdown format."""
if not table_data:
return ""
markdown_lines = []
# Header row
if table_data:
header = "| " + " | ".join(table_data[0]) + " |"
markdown_lines.append(header)
# Separator row
separator = "| " + " | ".join(["---"] * len(table_data[0])) + " |"
markdown_lines.append(separator)
# Data rows
for row in table_data[1:]:
data_row = "| " + " | ".join(row) + " |"
markdown_lines.append(data_row)
return "\n".join(markdown_lines)

@@ -1,58 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List
from modules.datamodels.datamodelExtraction import ContentPart
from ..subUtils import makeId
from ..subRegistry import Extractor
class SqlExtractor(Extractor):
"""
Extractor for SQL files.
Supported formats:
- MIME types: text/x-sql, application/sql
- File extensions: .sql, .ddl, .dml, .dcl, .tcl
- Special handling: Treats as structured text with SQL syntax
"""
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
return (mimeType in ("text/x-sql", "application/sql") or
(fileName or "").lower().endswith((".sql", ".ddl", ".dml", ".dcl", ".tcl")))
def getSupportedExtensions(self) -> list[str]:
"""Return list of supported file extensions."""
return [".sql", ".ddl", ".dml", ".dcl", ".tcl"]
def getSupportedMimeTypes(self) -> list[str]:
"""Return list of supported MIME types."""
return ["text/x-sql", "application/sql"]
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
fileName = context.get("fileName")
mimeType = context.get("mimeType") or "text/x-sql"
data = fileBytes.decode("utf-8", errors="replace")
# Add SQL-specific metadata
metadata = {
"size": len(fileBytes),
"file_type": "sql",
"line_count": len(data.splitlines()),
"has_select": "SELECT" in data.upper(),
"has_insert": "INSERT" in data.upper(),
"has_update": "UPDATE" in data.upper(),
"has_delete": "DELETE" in data.upper(),
"has_create": "CREATE" in data.upper(),
"has_drop": "DROP" in data.upper()
}
return [ContentPart(
id=makeId(),
parentId=None,
label="main",
typeGroup="structure",
mimeType=mimeType,
data=data,
metadata=metadata
)]

@@ -1,105 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List
from modules.datamodels.datamodelExtraction import ContentPart
from ..subUtils import makeId
from ..subRegistry import Extractor
class TextExtractor(Extractor):
"""
Extractor for plain text files and code files.
Supported formats:
- MIME types: text/plain, text/markdown, text/x-python, text/x-java-source, text/javascript, etc.
- File extensions: .txt, .md, .log, .java, .js, .jsx, .ts, .tsx, .py, .config, .ini, .cfg, .conf, .properties, .yaml, .yml, .toml, .sh, .bat, .ps1, .sql, .css, .scss, .sass, .less, .xml, .json, .csv, .tsv, .rtf, .tex, .rst, .adoc, .org, .pod, .man, .1, .2, .3, .4, .5, .6, .7, .8, .9, .n, .l, .m, .r, .t, .x, .y, .z
"""
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
# Check MIME types
if mimeType and mimeType.startswith("text/"):
return True
# Check file extensions
if fileName:
ext = fileName.lower()
return ext.endswith((
# Basic text files
".txt", ".md", ".log", ".rtf", ".tex", ".rst", ".adoc", ".org", ".pod",
# Programming languages
".java", ".js", ".jsx", ".ts", ".tsx", ".py", ".rb", ".go", ".rs", ".cpp", ".c", ".h", ".hpp", ".cc", ".cxx",
".cs", ".php", ".swift", ".kt", ".scala", ".clj", ".hs", ".ml", ".fs", ".vb", ".dart", ".r", ".m", ".pl", ".sh",
# Web technologies
".html", ".htm", ".css", ".scss", ".sass", ".less", ".vue", ".svelte",
# Configuration files
".config", ".ini", ".cfg", ".conf", ".properties", ".yaml", ".yml", ".toml", ".json", ".xml",
# Scripts and automation
".bat", ".ps1", ".psm1", ".psd1", ".vbs", ".wsf", ".cmd", ".com",
# Data files
".csv", ".tsv", ".tab", ".dat", ".data",
# Documentation
".man", ".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".n", ".l", ".m", ".r", ".t", ".x", ".y", ".z",
# Other text formats
".diff", ".patch", ".gitignore", ".dockerignore", ".editorconfig", ".gitattributes",
".env", ".env.local", ".env.development", ".env.production", ".env.test",
".lock", ".lockb", ".lockfile", ".pkg-lock", ".yarn-lock"
))
return False
def getSupportedExtensions(self) -> list[str]:
"""Return list of supported file extensions."""
return [
# Basic text files
".txt", ".md", ".log", ".rtf", ".tex", ".rst", ".adoc", ".org", ".pod",
# Programming languages
".java", ".js", ".jsx", ".ts", ".tsx", ".py", ".rb", ".go", ".rs", ".cpp", ".c", ".h", ".hpp", ".cc", ".cxx",
".cs", ".php", ".swift", ".kt", ".scala", ".clj", ".hs", ".ml", ".fs", ".vb", ".dart", ".r", ".m", ".pl", ".sh",
# Web technologies
".html", ".htm", ".css", ".scss", ".sass", ".less", ".vue", ".svelte",
# Configuration files
".config", ".ini", ".cfg", ".conf", ".properties", ".yaml", ".yml", ".toml", ".json", ".xml",
# Scripts and automation
".bat", ".ps1", ".psm1", ".psd1", ".vbs", ".wsf", ".cmd", ".com",
# Data files
".csv", ".tsv", ".tab", ".dat", ".data",
# Documentation
".man", ".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".n", ".l", ".m", ".r", ".t", ".x", ".y", ".z",
# Other text formats
".diff", ".patch", ".gitignore", ".dockerignore", ".editorconfig", ".gitattributes",
".env", ".env.local", ".env.development", ".env.production", ".env.test",
".lock", ".lockb", ".lockfile", ".pkg-lock", ".yarn-lock"
]
def getSupportedMimeTypes(self) -> list[str]:
"""Return list of supported MIME types."""
return [
"text/plain", "text/markdown", "text/x-python", "text/x-java-source",
"text/javascript", "text/x-javascript", "text/typescript", "text/x-typescript",
"text/x-c", "text/x-c++", "text/x-csharp", "text/x-php", "text/x-ruby",
"text/x-go", "text/x-rust", "text/x-scala", "text/x-swift", "text/x-kotlin",
"text/x-sql", "text/x-sh", "text/x-shellscript", "text/x-yaml", "text/x-toml",
"text/x-ini", "text/x-config", "text/x-properties", "text/x-log",
"text/html", "text/css", "text/x-scss", "text/x-sass", "text/x-less",
"text/xml", "text/csv", "text/tab-separated-values", "text/rtf",
"text/x-tex", "text/x-rst", "text/x-asciidoc", "text/x-org",
"application/x-yaml", "application/x-toml", "application/x-ini",
"application/x-config", "application/x-properties", "application/x-log"
]
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
fileName = context.get("fileName")
mimeType = context.get("mimeType") or "text/plain"
data = fileBytes.decode("utf-8", errors="replace")
return [ContentPart(
id=makeId(),
parentId=None,
label="main",
typeGroup="text",
mimeType=mimeType,
data=data,
metadata={"size": len(fileBytes)}
)]

@@ -1,114 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List
import io
from datetime import datetime
from ..subUtils import makeId
from modules.datamodels.datamodelExtraction import ContentPart
from ..subRegistry import Extractor
class XlsxExtractor(Extractor):
"""
Extractor for Microsoft Excel spreadsheets.
Supported formats:
- MIME types: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
- File extensions: .xlsx, .xlsm
- Special handling: Extracts all sheets as CSV data
- Dependencies: openpyxl
"""
def __init__(self):
self._loaded = False
self._haveLibs = False
def _load(self):
if self._loaded:
return
self._loaded = True
try:
global openpyxl
import openpyxl
self._haveLibs = True
except Exception:
self._haveLibs = False
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
mt = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
return mimeType == mt or (fileName or "").lower().endswith((".xlsx", ".xlsm"))
def getSupportedExtensions(self) -> list[str]:
"""Return list of supported file extensions."""
return [".xlsx", ".xlsm"]
def getSupportedMimeTypes(self) -> list[str]:
"""Return list of supported MIME types."""
return ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
self._load()
parts: List[ContentPart] = []
rootId = makeId()
parts.append(ContentPart(
id=rootId,
parentId=None,
label="xlsx",
typeGroup="container",
mimeType="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
data="",
metadata={"size": len(fileBytes)}
))
if not self._haveLibs:
parts.append(ContentPart(
id=makeId(),
parentId=rootId,
label="binary",
typeGroup="binary",
mimeType="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
data="",
metadata={"size": len(fileBytes), "warning": "openpyxl not available"}
))
return parts
with io.BytesIO(fileBytes) as buf:
wb = openpyxl.load_workbook(buf, data_only=True)
for sheetName in wb.sheetnames:
ws = wb[sheetName]
# extract rectangular data region by min/max
min_row = ws.min_row
max_row = ws.max_row
min_col = ws.min_column
max_col = ws.max_column
lines: list[str] = []
for r in range(min_row, max_row + 1):
cells: list[str] = []
for c in range(min_col, max_col + 1):
cell = ws.cell(row=r, column=c)
v = cell.value
if v is None:
cells.append("")
elif isinstance(v, (int, float)):
cells.append(str(v))
elif isinstance(v, datetime):
cells.append(v.strftime("%Y-%m-%d %H:%M:%S"))
else:
escaped_value = str(v).replace('"', '""')
cells.append(f'"{escaped_value}"')
lines.append(",".join(cells))
csvData = "\n".join(lines)
parts.append(ContentPart(
id=makeId(),
parentId=rootId,
label=f"sheet_{sheetName}",
typeGroup="table",
mimeType="text/csv",
data=csvData,
metadata={"sheet": sheetName, "size": len(csvData.encode('utf-8'))}
))
return parts
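
The cell loop above normalizes each openpyxl value to a CSV-safe string. The same rules as a standalone sketch:

from datetime import datetime

def normalizeCell(v) -> str:
    if v is None:
        return ""
    if isinstance(v, (int, float)):
        return str(v)
    if isinstance(v, datetime):
        return v.strftime("%Y-%m-%d %H:%M:%S")
    return '"' + str(v).replace('"', '""') + '"'

print(normalizeCell(None))                  # ''
print(normalizeCell(3.14))                  # '3.14'
print(normalizeCell(datetime(2025, 1, 2)))  # '2025-01-02 00:00:00'
print(normalizeCell('say "hi"'))            # '"say ""hi"""'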

@@ -1,49 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List
import xml.etree.ElementTree as ET
from modules.datamodels.datamodelExtraction import ContentPart
from ..subUtils import makeId
from ..subRegistry import Extractor
class XmlExtractor(Extractor):
"""
Extractor for XML files.
Supported formats:
- MIME types: application/xml
- File extensions: .xml, .rss, .atom
- Special handling: Uses ElementTree for parsing
"""
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
return mimeType == "application/xml" or (fileName or "").lower().endswith((".xml", ".rss", ".atom"))
def getSupportedExtensions(self) -> list[str]:
"""Return list of supported file extensions."""
return [".xml", ".rss", ".atom"]
def getSupportedMimeTypes(self) -> list[str]:
"""Return list of supported MIME types."""
return ["application/xml"]
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
mimeType = context.get("mimeType") or "application/xml"
text = fileBytes.decode("utf-8", errors="replace")
try:
# parse once to validate well-formedness; the parse result is intentionally discarded
ET.fromstring(text)
except Exception:
pass
return [ContentPart(
id=makeId(),
parentId=None,
label="main",
typeGroup="structure",
mimeType=mimeType,
data=text,
metadata={"size": len(fileBytes)}
)]

File diff suppressed because it is too large

@@ -1,2 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.

@@ -1,13 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List
from modules.datamodels.datamodelExtraction import ContentPart, MergeStrategy
class DefaultMerger:
def merge(self, parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
"""
Default merger that passes through parts unchanged.
Used for image, binary, metadata, container typeGroups.
"""
return parts

@@ -1,154 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List
from modules.datamodels.datamodelExtraction import ContentPart, MergeStrategy
from ..subUtils import makeId
class TableMerger:
def merge(self, parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
"""
Merge table parts based on strategy.
Strategy options:
- groupBy: "parentId" (default), "documentId", "sheet", "none"
- maxSize: maximum size per merged part
- combineSheets: bool - whether to combine multiple sheets into one table
"""
if not parts:
return parts
groupBy = strategy.groupBy
maxSize = strategy.maxSize or 0
combineSheets = strategy.tableMerge.get("combineSheets", False) if strategy.tableMerge else False
# Group parts
groups = self._groupParts(parts, groupBy, combineSheets)
merged: List[ContentPart] = []
for groupKey, groupParts in groups.items():
if maxSize > 0:
merged.extend(self._mergeWithSizeLimit(groupParts, maxSize, groupKey))
else:
merged.extend(self._mergeGroup(groupParts, groupKey))
return merged
def _groupParts(self, parts: List[ContentPart], groupBy: str, combineSheets: bool) -> Dict[str, List[ContentPart]]:
groups: Dict[str, List[ContentPart]] = {}
for part in parts:
if part.typeGroup != "table":
# Non-table parts go in their own group
key = f"nontable_{part.id}"
if key not in groups:
groups[key] = []
groups[key].append(part)
continue
if groupBy == "parentId":
key = part.parentId or "root"
elif groupBy == "documentId":
key = part.metadata.get("documentId", "unknown")
elif groupBy == "sheet" and not combineSheets:
key = part.metadata.get("sheet", "unknown")
else: # "none" or combineSheets=True
key = "all_tables"
if key not in groups:
groups[key] = []
groups[key].append(part)
return groups
def _mergeGroup(self, parts: List[ContentPart], groupKey: str) -> List[ContentPart]:
if not parts:
return []
if len(parts) == 1:
return parts
# For tables, we typically keep them separate unless explicitly combining
# But we can add metadata about the group
for i, part in enumerate(parts):
part.metadata["groupKey"] = groupKey
part.metadata["groupIndex"] = i
part.metadata["groupSize"] = len(parts)
return parts
def _mergeWithSizeLimit(self, parts: List[ContentPart], maxSize: int, groupKey: str) -> List[ContentPart]:
if not parts:
return []
# For tables, we typically don't merge across different tables
# Instead, we chunk individual large tables
merged: List[ContentPart] = []
for part in parts:
partSize = part.metadata.get("size", 0)
if partSize <= maxSize:
# Part fits within limit
part.metadata["groupKey"] = groupKey
merged.append(part)
else:
# Chunk the large table
chunks = self._chunkTable(part, maxSize)
merged.extend(chunks)
return merged
def _chunkTable(self, part: ContentPart, maxSize: int) -> List[ContentPart]:
"""Chunk a large table by rows while preserving CSV structure."""
lines = part.data.split('\n')
if not lines:
return [part]
chunks: List[ContentPart] = []
currentChunk: List[str] = []
currentSize = 0
for line in lines:
lineSize = len(line.encode('utf-8')) + 1 # +1 for newline
if currentSize + lineSize > maxSize and currentChunk:
# Flush current chunk
chunkData = '\n'.join(currentChunk)
chunks.append(ContentPart(
id=makeId(),
parentId=part.parentId,
label=f"{part.label}_chunk_{len(chunks)}",
typeGroup="table",
mimeType=part.mimeType,
data=chunkData,
metadata={
"size": len(chunkData.encode('utf-8')),
"chunk": True,
"originalPart": part.id,
"chunkIndex": len(chunks)
}
))
currentChunk = [line]
currentSize = lineSize
else:
currentChunk.append(line)
currentSize += lineSize
# Flush remaining chunk
if currentChunk:
chunkData = '\n'.join(currentChunk)
chunks.append(ContentPart(
id=makeId(),
parentId=part.parentId,
label=f"{part.label}_chunk_{len(chunks)}",
typeGroup="table",
mimeType=part.mimeType,
data=chunkData,
metadata={
"size": len(chunkData.encode('utf-8')),
"chunk": True,
"originalPart": part.id,
"chunkIndex": len(chunks)
}
))
return chunks

@@ -1,138 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List
from modules.datamodels.datamodelExtraction import ContentPart, MergeStrategy
from ..subUtils import makeId
class TextMerger:
def merge(self, parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
"""
Merge text parts based on strategy.
Strategy options:
- groupBy: "parentId" (default), "documentId", "none"
- orderBy: "label", "pageIndex", "sheetIndex", "none"
- maxSize: maximum size per merged part
"""
if not parts:
return parts
groupBy = strategy.groupBy
orderBy = strategy.orderBy
maxSize = strategy.maxSize or 0
# Group parts
groups = self._groupParts(parts, groupBy)
merged: List[ContentPart] = []
for groupKey, groupParts in groups.items():
# Sort within group
sortedParts = self._sortParts(groupParts, orderBy)
# Merge respecting maxSize
if maxSize > 0:
merged.extend(self._mergeWithSizeLimit(sortedParts, maxSize))
else:
merged.extend(self._mergeGroup(sortedParts, groupKey))
return merged
def _groupParts(self, parts: List[ContentPart], groupBy: str) -> Dict[str, List[ContentPart]]:
groups: Dict[str, List[ContentPart]] = {}
for part in parts:
if part.typeGroup != "text":
# Non-text parts go in their own group
key = f"nontext_{part.id}"
if key not in groups:
groups[key] = []
groups[key].append(part)
continue
if groupBy == "parentId":
key = part.parentId or "root"
elif groupBy == "documentId":
key = part.metadata.get("documentId", "unknown")
else: # "none"
key = "all"
if key not in groups:
groups[key] = []
groups[key].append(part)
return groups
def _sortParts(self, parts: List[ContentPart], orderBy: str) -> List[ContentPart]:
if orderBy == "pageIndex":
return sorted(parts, key=lambda p: p.metadata.get("pageIndex", 0))
elif orderBy == "sheetIndex":
return sorted(parts, key=lambda p: p.metadata.get("sheetIndex", 0))
elif orderBy == "label":
return sorted(parts, key=lambda p: p.label)
else: # "none"
return parts
def _mergeGroup(self, parts: List[ContentPart], groupKey: str) -> List[ContentPart]:
if not parts:
return []
if len(parts) == 1:
return parts
# Merge all text parts in group
textParts = [p for p in parts if p.typeGroup == "text"]
nonTextParts = [p for p in parts if p.typeGroup != "text"]
if not textParts:
return nonTextParts
# Combine text data
combinedData = "\n".join([p.data for p in textParts])
totalSize = sum(p.metadata.get("size", 0) for p in textParts)
mergedPart = ContentPart(
id=makeId(),
parentId=textParts[0].parentId,
label=f"merged_{groupKey}",
typeGroup="text",
mimeType="text/plain",
data=combinedData,
metadata={
"size": totalSize,
"merged": len(textParts),
"originalParts": [p.id for p in textParts]
}
)
return [mergedPart] + nonTextParts
def _mergeWithSizeLimit(self, parts: List[ContentPart], maxSize: int) -> List[ContentPart]:
if not parts:
return []
textParts = [p for p in parts if p.typeGroup == "text"]
nonTextParts = [p for p in parts if p.typeGroup != "text"]
if not textParts:
return nonTextParts
merged: List[ContentPart] = []
currentGroup: List[ContentPart] = []
currentSize = 0
for part in textParts:
partSize = part.metadata.get("size", 0)
if currentSize + partSize > maxSize and currentGroup:
# Flush current group
merged.extend(self._mergeGroup(currentGroup, f"chunk_{len(merged)}"))
currentGroup = [part]
currentSize = partSize
else:
currentGroup.append(part)
currentSize += partSize
# Flush remaining group
if currentGroup:
merged.extend(self._mergeGroup(currentGroup, f"chunk_{len(merged)}"))
return merged + nonTextParts
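
A typical invocation, assuming MergeStrategy can be constructed with the fields read above (groupBy, orderBy, maxSize) and parts is a List[ContentPart] from an extractor:

strategy = MergeStrategy(groupBy="parentId", orderBy="pageIndex", maxSize=40000)
merged = TextMerger().merge(parts, strategy)
# Text parts sharing a parentId are ordered by page and packed into merged
# parts of at most ~40 kB; non-text parts pass through untouched.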

@@ -1,211 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Intelligent Token-Aware Merger for optimizing AI calls based on LLM token limits.
"""
from typing import List, Dict, Any
import logging
from modules.datamodels.datamodelExtraction import ContentPart
from .subUtils import makeId
logger = logging.getLogger(__name__)
class IntelligentTokenAwareMerger:
"""
Intelligent merger that groups chunks based on LLM token limits to minimize AI calls.
Strategy:
1. Calculate token count for each chunk
2. Group chunks to maximize token usage without exceeding limits
3. Preserve document structure and semantic boundaries
4. Minimize total number of AI calls
"""
def __init__(self, modelCapabilities: Dict[str, Any]):
self.maxTokens = modelCapabilities.get("maxTokens", 4000)
self.safetyMargin = modelCapabilities.get("safetyMargin", 0.1)
self.effectiveMaxTokens = int(self.maxTokens * (1 - self.safetyMargin))
self.charsPerToken = modelCapabilities.get("charsPerToken", 4) # Rough estimation
def mergeChunksIntelligently(self, chunks: List[ContentPart], prompt: str = "") -> List[ContentPart]:
"""
Merge chunks intelligently based on token limits.
Args:
chunks: List of ContentPart chunks to merge
prompt: AI prompt to account for in token calculation
Returns:
List of optimally merged ContentPart objects
"""
if not chunks:
return chunks
logger.info(f"🧠 Intelligent merging: {len(chunks)} chunks, maxTokens={self.effectiveMaxTokens}")
# Calculate tokens for prompt
promptTokens = self._estimateTokens(prompt)
availableTokens = self.effectiveMaxTokens - promptTokens
logger.info(f"📊 Prompt tokens: {promptTokens}, Available for content: {availableTokens}")
# Group chunks by document and type for semantic coherence
groupedChunks = self._groupChunksByDocumentAndType(chunks)
mergedParts = []
for groupKey, groupChunks in groupedChunks.items():
logger.info(f"📁 Processing group: {groupKey} ({len(groupChunks)} chunks)")
# Merge chunks within this group optimally
groupMerged = self._mergeGroupOptimally(groupChunks, availableTokens)
mergedParts.extend(groupMerged)
logger.info(f"✅ Intelligent merging complete: {len(chunks)}{len(mergedParts)} parts")
return mergedParts
def _groupChunksByDocumentAndType(self, chunks: List[ContentPart]) -> Dict[str, List[ContentPart]]:
"""Group chunks by document and type for semantic coherence."""
groups = {}
for chunk in chunks:
# Create group key: document_id + type_group
docId = chunk.metadata.get("documentId", "unknown")
typeGroup = chunk.typeGroup
groupKey = f"{docId}_{typeGroup}"
if groupKey not in groups:
groups[groupKey] = []
groups[groupKey].append(chunk)
return groups
def _mergeGroupOptimally(self, chunks: List[ContentPart], availableTokens: int) -> List[ContentPart]:
"""Merge chunks within a group optimally to minimize AI calls."""
if not chunks:
return []
# Sort chunks by size (smallest first for better packing)
sortedChunks = sorted(chunks, key=lambda c: self._estimateTokens(c.data))
mergedParts = []
currentGroup = []
currentTokens = 0
for chunk in sortedChunks:
chunkTokens = self._estimateTokens(chunk.data)
# Special case: If single chunk is already at max size, process it alone
if chunkTokens >= availableTokens * 0.9: # 90% of available tokens
# Finalize current group if it exists
if currentGroup:
mergedPart = self._createMergedPart(currentGroup, currentTokens)
mergedParts.append(mergedPart)
currentGroup = []
currentTokens = 0
# Process large chunk individually
mergedParts.append(chunk)
logger.debug(f"🔍 Large chunk processed individually: {chunkTokens} tokens")
continue
# If adding this chunk would exceed limit, finalize current group
if currentTokens + chunkTokens > availableTokens and currentGroup:
mergedPart = self._createMergedPart(currentGroup, currentTokens)
mergedParts.append(mergedPart)
currentGroup = [chunk]
currentTokens = chunkTokens
else:
currentGroup.append(chunk)
currentTokens += chunkTokens
# Finalize remaining group
if currentGroup:
mergedPart = self._createMergedPart(currentGroup, currentTokens)
mergedParts.append(mergedPart)
logger.info(f"📦 Group merged: {len(chunks)}{len(mergedParts)} parts")
return mergedParts
def _createMergedPart(self, chunks: List[ContentPart], totalTokens: int) -> ContentPart:
"""Create a merged ContentPart from multiple chunks."""
if len(chunks) == 1:
return chunks[0] # No need to merge single chunk
# Combine data with semantic separators
combinedData = self._combineChunkData(chunks)
# Use metadata from first chunk as base
baseChunk = chunks[0]
mergedMetadata = baseChunk.metadata.copy()
mergedMetadata.update({
"merged": True,
"originalChunkCount": len(chunks),
"totalTokens": totalTokens,
"originalChunkIds": [c.id for c in chunks],
"size": len(combinedData.encode('utf-8'))
})
mergedPart = ContentPart(
id=makeId(),
parentId=baseChunk.parentId,
label=f"merged_{len(chunks)}_chunks",
typeGroup=baseChunk.typeGroup,
mimeType=baseChunk.mimeType,
data=combinedData,
metadata=mergedMetadata
)
logger.debug(f"🔗 Created merged part: {len(chunks)} chunks, {totalTokens} tokens")
return mergedPart
def _combineChunkData(self, chunks: List[ContentPart]) -> str:
"""Combine chunk data with appropriate separators."""
if not chunks:
return ""
# Use different separators based on content type
if chunks[0].typeGroup == "text":
separator = "\n\n---\n\n" # Clear text separation
elif chunks[0].typeGroup == "table":
separator = "\n\n[TABLE BREAK]\n\n" # Table separation
else:
separator = "\n\n---\n\n" # Default separation
return separator.join([chunk.data for chunk in chunks])
def _estimateTokens(self, text: str) -> int:
"""Estimate token count for text."""
if not text:
return 0
return len(text) // self.charsPerToken
def calculateOptimizationStats(self, originalChunks: List[ContentPart], mergedParts: List[ContentPart]) -> Dict[str, Any]:
"""Calculate optimization statistics with detailed analysis."""
originalCalls = len(originalChunks)
optimizedCalls = len(mergedParts)
reductionPercent = ((originalCalls - optimizedCalls) / originalCalls * 100) if originalCalls > 0 else 0
# Analyze chunk sizes
largeChunks = [c for c in originalChunks if self._estimateTokens(c.data) >= self.effectiveMaxTokens * 0.9]
smallChunks = [c for c in originalChunks if self._estimateTokens(c.data) < self.effectiveMaxTokens * 0.9]
# Calculate theoretical maximum optimization (if all small chunks could be merged)
theoreticalMinCalls = len(largeChunks) + (max(1, len(smallChunks) // 3) if smallChunks else 0)  # Assume 3 small chunks per call; no extra call when there are none
theoreticalReduction = ((originalCalls - theoreticalMinCalls) / originalCalls * 100) if originalCalls > 0 else 0
return {
"original_ai_calls": originalCalls,
"optimized_ai_calls": optimizedCalls,
"reduction_percent": round(reductionPercent, 1),
"cost_savings": f"{reductionPercent:.1f}%",
"efficiency_gain": f"{originalCalls / optimizedCalls:.1f}x" if optimizedCalls > 0 else "",
"analysis": {
"large_chunks": len(largeChunks),
"small_chunks": len(smallChunks),
"theoretical_min_calls": theoreticalMinCalls,
"theoretical_reduction": round(theoreticalReduction, 1),
"optimization_potential": "high" if reductionPercent > 50 else "moderate" if reductionPercent > 20 else "low"
}
}
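
The merger is driven entirely by the modelCapabilities dict. A usage sketch with assumed limits (chunks and extractionPrompt are placeholder variables):

merger = IntelligentTokenAwareMerger({
    "maxTokens": 8000,      # model context budget
    "safetyMargin": 0.1,    # keep 10% headroom -> effective 7200 tokens
    "charsPerToken": 4,     # rough chars-per-token estimate
})
mergedParts = merger.mergeChunksIntelligently(chunks, prompt=extractionPrompt)
stats = merger.calculateOptimizationStats(chunks, mergedParts)
print(stats["reduction_percent"], stats["cost_savings"])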

@@ -1,48 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import List
import logging
from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, ExtractionOptions, MergeStrategy
from .subUtils import makeId
from .subRegistry import ExtractorRegistry, ChunkerRegistry
logger = logging.getLogger(__name__)
# REMOVED: _mergeParts function - unused, functionality replaced by applyMerging in interfaceAiObjects.py
def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: ChunkerRegistry, documentBytes: bytes, fileName: str, mimeType: str, options: ExtractionOptions) -> ContentExtracted:
extractor = extractorRegistry.resolve(mimeType, fileName)
if extractor is None:
# fallback: single binary part
part = ContentPart(
id=makeId(),
parentId=None,
label="file",
typeGroup="binary",
mimeType=mimeType or "application/octet-stream",
data="",
metadata={"warning": "No extractor registered"}
)
return ContentExtracted(id=makeId(), parts=[part])
parts = extractor.extract(documentBytes, {"fileName": fileName, "mimeType": mimeType})
# REMOVED: poolAndLimit(parts, chunkerRegistry, options)
# REMOVED: Chunking logic - now handled in AI call phase
# Apply merging strategy if provided (preserve existing logic)
if options.mergeStrategy:
# Use module-level applyMerging function
from .mainServiceExtraction import applyMerging
parts = applyMerging(parts, options.mergeStrategy)
return ContentExtracted(id=makeId(), parts=parts)
# REMOVED: poolAndLimit function - chunking now handled in AI call phase
# REMOVED: applyMerging function - moved to interfaceAiObjects.py for proper interface-level access
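
End to end, the pipeline wires the registries into a single call. A hedged sketch (fileBytes is assumed; the ExtractionOptions constructor call is an assumption based on the mergeStrategy attribute read above):

extracted = runExtraction(
    ExtractorRegistry(),   # auto-discovers extractors on init
    ChunkerRegistry(),     # unused here, since chunking moved to the AI call phase
    documentBytes=fileBytes,
    fileName="report.xlsx",
    mimeType="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    options=ExtractionOptions(mergeStrategy=None),
)
for part in extracted.parts:
    print(part.label, part.typeGroup, part.metadata.get("size"))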

@@ -1,214 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Prompt builder for document extraction.
This module builds prompts for extracting content from documents.
"""
import json
import logging
from typing import Dict, Any, Optional
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum
# Type hint for renderer parameter
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from modules.services.serviceGeneration.renderers.documentRendererBaseTemplate import BaseRenderer
_RendererLike = BaseRenderer
else:
_RendererLike = Any
logger = logging.getLogger(__name__)
async def buildExtractionPrompt(
outputFormat: str,
userPrompt: str,
title: str,
aiService=None,
services=None,
renderer: _RendererLike = None
) -> str:
"""
Build unified extraction prompt for extracting content from documents.
Always uses multi-file format (single doc = multi with n=1).
Args:
outputFormat: Target output format
userPrompt: User's prompt describing what to extract
title: Document title
aiService: Optional AI service for intent parsing
services: Services instance
renderer: Optional renderer for format-specific guidelines
Returns:
Complete extraction prompt string
"""
# Flat extraction format - returns extracted content as structured data, not documents/sections
# This format allows merging multiple contentParts into one response
json_example = {
"extracted_content": {
"text": "Extracted text content from the document...",
"tables": [
{
"headers": ["Column 1", "Column 2"],
"rows": [
["Value 1", "Value 2"],
["Value 3", "Value 4"]
]
}
],
"headings": [
{
"level": 1,
"text": "Main Heading"
},
{
"level": 2,
"text": "Subheading"
}
],
"lists": [
{
"type": "bullet",
"items": ["Item 1", "Item 2", "Item 3"]
}
],
"images": [
{
"description": "Description of image content, including all visible text, tables, and visual elements"
}
]
}
}
structure_instruction = """CRITICAL EXTRACTION REQUIREMENTS:
1. Extract content from the provided ContentPart(s) - process what is provided in this call
2. If this ContentPart contains tables, extract them with proper structure (headers and rows)
3. If this ContentPart contains text, extract it as structured text
4. Return ONE JSON object with extracted content from this ContentPart
5. Preserve all original data - do not summarize or interpret
6. The system will merge results from multiple ContentParts automatically - focus on extracting this ContentPart's content accurately"""
# Parse extraction intent if AI service is available
extraction_intent = await _parseExtractionIntent(userPrompt, outputFormat, aiService, services) if aiService else userPrompt
# Extract user language for document language instruction
userLanguage = 'en' # Default fallback
if services:
try:
# Prefer detected language if available
if hasattr(services, 'currentUserLanguage') and services.currentUserLanguage:
userLanguage = services.currentUserLanguage
elif hasattr(services, 'user') and services.user and hasattr(services.user, 'language'):
userLanguage = services.user.language
except Exception:
pass
# Build base prompt with clear user prompt markers
sanitized_user_prompt = services.utils.sanitizePromptContent(userPrompt, 'userinput') if services else userPrompt
adaptive_prompt = f"""
{'='*80}
USER REQUEST / USER PROMPT:
{'='*80}
{sanitized_user_prompt}
{'='*80}
END OF USER REQUEST / USER PROMPT
{'='*80}
You are a document processing assistant that extracts content from documents. Your task is to analyze the provided ContentPart(s) and extract their content into a structured JSON format.
TASK: Extract content from the provided ContentPart(s). Extract all tables, text, headings, lists, and other content types accurately. The system processes ContentParts individually and merges results automatically.
LANGUAGE REQUIREMENT: All extracted content must be in the language '{userLanguage}'. Extract and preserve content in this language.
{extraction_intent}
{structure_instruction}
OUTPUT FORMAT: Return only valid JSON in this exact structure:
{json.dumps(json_example, indent=2)}
CRITICAL EXTRACTION RULES:
- Extract only content that is ACTUALLY PRESENT in the ContentPart - never create fake or placeholder data
- Return empty arrays [] or empty strings "" when content is missing - this is normal and expected
- Extract all tables, text, headings, lists accurately with proper structure
- Preserve all original data - do not summarize or interpret
- Return ONE JSON object per ContentPart (the system merges multiple ContentParts automatically)
Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements
Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses
Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.
Extract only actual content from the ContentPart. Return empty arrays/strings when content is missing - never create fake data.
""".strip()
# Add renderer-specific guidelines if provided
if renderer:
try:
if hasattr(renderer, 'getExtractionGuidelines'):
formatGuidelines = renderer.getExtractionGuidelines()
adaptive_prompt = f"{adaptive_prompt}\n\n{formatGuidelines}".strip()
except Exception:
pass
# Save extraction prompt to debug file - only if debug enabled
from modules.shared.debugLogger import writeDebugFile
writeDebugFile(adaptive_prompt, "extraction_prompt")
return adaptive_prompt
async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None, services=None) -> str:
"""
Parse user prompt to extract the core extraction intent.
"""
if not aiService:
return f"Extract content from the provided documents and create a {outputFormat} report."
try:
analysis_prompt = f"""
Analyze this user request and extract the core extraction intent:
User request: "{userPrompt}"
Target format: {outputFormat}
Extract the main intent and requirements for document processing. Focus on:
1. What content needs to be extracted
2. How it should be organized
3. Any specific requirements or preferences
Respond with a clear, concise statement of the extraction intent.
"""
request_options = AiCallOptions()
request_options.operationType = OperationTypeEnum.DATA_GENERATE
request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
response = await aiService.aiObjects.call(request)
if response and response.content:
return response.content.strip()
else:
return f"Extract content from the provided documents and create a {outputFormat} report."
except Exception as e:
if services:
services.utils.debugLogToFile(f"Extraction intent analysis failed: {str(e)}", "PROMPT_BUILDER")
else:
logger.warning(f"Extraction intent analysis failed: {str(e)}")
return f"Extract content from the provided documents and create a {outputFormat} report."

@@ -1,208 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, Optional
import logging
from modules.datamodels.datamodelExtraction import ContentPart
logger = logging.getLogger(__name__)
class Extractor:
"""
Base class for all document extractors.
Each extractor should implement:
- detect(): Check if this extractor can handle the given file
- extract(): Extract content from the file
- getSupportedExtensions(): Return supported file extensions
- getSupportedMimeTypes(): Return supported MIME types
"""
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
"""Check if this extractor can handle the given file."""
return False
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> list[ContentPart]:
"""Extract content from the file bytes."""
raise NotImplementedError
def getSupportedExtensions(self) -> list[str]:
"""Return list of supported file extensions (including dots)."""
return []
def getSupportedMimeTypes(self) -> list[str]:
"""Return list of supported MIME types."""
return []
class Chunker:
def chunk(self, part: ContentPart, options: Dict[str, Any]) -> list[Dict[str, Any]]:
return []
class ExtractorRegistry:
def __init__(self):
self._map: Dict[str, Extractor] = {}
self._fallback: Optional[Extractor] = None
self._auto_discover_extractors()
def _auto_discover_extractors(self):
"""Auto-discover and register all extractors from the extractors directory."""
try:
import os
import importlib
from pathlib import Path
# Get the extractors directory
current_dir = Path(__file__).parent
extractors_dir = current_dir / "extractors"
if not extractors_dir.exists():
logger.error(f"Extractors directory not found: {extractors_dir}")
return
# Import all extractor modules
extractor_modules = []
for file_path in extractors_dir.glob("extractor*.py"):
if file_path.name == "__init__.py":
continue
module_name = file_path.stem
try:
# Import the module
module = importlib.import_module(f".{module_name}", package="modules.services.serviceExtraction.extractors")
# Find all extractor classes in the module
for attr_name in dir(module):
attr = getattr(module, attr_name)
if (isinstance(attr, type) and
issubclass(attr, Extractor) and
attr != Extractor and
not attr_name.startswith('_')):
# Create instance and auto-register
extractor_instance = attr()
self._auto_register_extractor(extractor_instance)
extractor_modules.append(attr_name)
except Exception as e:
logger.warning(f"Failed to import {module_name}: {str(e)}")
continue
# Set fallback extractor
try:
from .extractors.extractorBinary import BinaryExtractor
self.setFallback(BinaryExtractor())
except Exception as e:
logger.warning(f"Failed to set fallback extractor: {str(e)}")
logger.info(f"ExtractorRegistry: Auto-discovered and registered {len(extractor_modules)} extractor classes: {', '.join(extractor_modules)}")
logger.info(f"ExtractorRegistry: Total registered formats: {len(self._map)}")
except Exception as e:
logger.error(f"ExtractorRegistry: Failed to auto-discover extractors: {str(e)}")
import traceback
traceback.print_exc()
def _auto_register_extractor(self, extractor: Extractor):
"""Auto-register an extractor based on its declared supported formats."""
try:
# Register MIME types
mime_types = extractor.getSupportedMimeTypes()
for mime_type in mime_types:
self.register(mime_type, extractor)
# Register file extensions
extensions = extractor.getSupportedExtensions()
for ext in extensions:
# Remove leading dot for registry key
ext_key = ext.lstrip('.')
self.register(ext_key, extractor)
except Exception as e:
logger.error(f"Failed to auto-register {extractor.__class__.__name__}: {str(e)}")
def register(self, key: str, extractor: Extractor):
self._map[key] = extractor
def setFallback(self, extractor: Extractor):
self._fallback = extractor
def resolve(self, mimeType: str, fileName: str) -> Optional[Extractor]:
if mimeType in self._map:
return self._map[mimeType]
# simple extension fallback
if "." in fileName:
ext = fileName.lower().rsplit(".", 1)[-1]
if ext in self._map:
return self._map[ext]
return self._fallback
def getAllSupportedFormats(self) -> Dict[str, Dict[str, list[str]]]:
"""
Get all supported formats from all registered extractors.
Returns:
Dictionary with format information:
{
"extensions": {
"extractor_name": [".ext1", ".ext2", ...]
},
"mime_types": {
"extractor_name": ["mime/type1", "mime/type2", ...]
}
}
"""
formats = {"extensions": {}, "mime_types": {}}
# Get formats from registered extractors
for key, extractor in self._map.items():
if hasattr(extractor, 'getSupportedExtensions'):
extensions = extractor.getSupportedExtensions()
if extensions:
formats["extensions"][key] = extensions
if hasattr(extractor, 'getSupportedMimeTypes'):
mime_types = extractor.getSupportedMimeTypes()
if mime_types:
formats["mime_types"][key] = mime_types
# Add fallback extractor info
if self._fallback and hasattr(self._fallback, 'getSupportedExtensions'):
formats["extensions"]["fallback"] = self._fallback.getSupportedExtensions()
if self._fallback and hasattr(self._fallback, 'getSupportedMimeTypes'):
formats["mime_types"]["fallback"] = self._fallback.getSupportedMimeTypes()
return formats
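# Example return shape (hypothetical keys; each registry key maps to the
# formats its extractor declares):
# {
#   "extensions": {"application/pdf": [".pdf"], "pdf": [".pdf"], "fallback": []},
#   "mime_types": {"application/pdf": ["application/pdf"], "fallback": ["application/octet-stream"]}
# }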
class ChunkerRegistry:
def __init__(self):
self._map: Dict[str, Chunker] = {}
self._noop = Chunker()
# Register default chunkers
try:
from .chunking.chunkerText import TextChunker
from .chunking.chunkerTable import TableChunker
from .chunking.chunkerStructure import StructureChunker
from .chunking.chunkerImage import ImageChunker
self.register("text", TextChunker())
self.register("table", TableChunker())
self.register("structure", StructureChunker())
self.register("image", ImageChunker())
# Use text chunker for container and binary content
self.register("container", TextChunker())
self.register("binary", TextChunker())
except Exception as e:
logger.error(f"ChunkerRegistry: Failed to register chunkers: {str(e)}")
import traceback
traceback.print_exc()
def register(self, typeGroup: str, chunker: Chunker):
self._map[typeGroup] = chunker
def resolve(self, typeGroup: str) -> Chunker:
return self._map.get(typeGroup, self._noop)
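A minimal resolution-order sketch (illustrative; assumes auto-discovery registered a PDF extractor):

registry = ExtractorRegistry()
# 1) exact MIME-type match, 2) lowercased file-extension key, 3) fallback extractor
byMime = registry.resolve("application/pdf", "report.pdf")
byExt = registry.resolve("application/octet-stream", "report.pdf")
fallback = registry.resolve("application/octet-stream", "blob")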


@@ -1,7 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
import uuid
def makeId() -> str:
return str(uuid.uuid4())


@@ -1,587 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
import logging
import uuid
import base64
import traceback
from typing import Any, Dict, List, Optional, Callable
from modules.datamodels.datamodelDocument import RenderedDocument
from modules.datamodels.datamodelChat import ChatDocument
from modules.services.serviceGeneration.subDocumentUtility import (
getFileExtension,
getMimeTypeFromExtension,
detectMimeTypeFromContent,
detectMimeTypeFromData,
convertDocumentDataToString
)
logger = logging.getLogger(__name__)
class GenerationService:
def __init__(self, serviceCenter=None):
# Directly use interfaces from the provided service center (no self.service calls)
self.services = serviceCenter
self.interfaceDbComponent = serviceCenter.interfaceDbComponent
self.interfaceDbChat = serviceCenter.interfaceDbChat
def processActionResultDocuments(self, actionResult, action) -> List[Dict[str, Any]]:
"""
Process documents produced by AI actions and convert them to ChatDocument format.
This function handles AI-generated document data, not document references.
Returns a list of processed document dictionaries.
"""
try:
# Read documents from the standard documents field (not data.documents)
documents = actionResult.documents if actionResult and hasattr(actionResult, 'documents') else []
if not documents:
return []
# Process each document from the AI action result
processedDocuments = []
for doc in documents:
processedDoc = self.processSingleDocument(doc, action)
if processedDoc:
processedDocuments.append(processedDoc)
return processedDocuments
except Exception as e:
logger.error(f"Error processing action result documents: {str(e)}")
return []
def processSingleDocument(self, doc: Any, action) -> Optional[Dict[str, Any]]:
"""Process a single document from action result with simplified logic"""
try:
# ActionDocument objects have documentName, documentData, and mimeType
mime_type = doc.mimeType
if mime_type == "application/octet-stream":
content = doc.documentData
# Detect MIME without relying on a service center
mime_type = detectMimeTypeFromContent(content, doc.documentName)
# IMPORTANT: for ActionDocuments with validationMetadata (e.g. context.extractContent)
# we must serialize the entire ActionDocument, not just documentData
document_data = doc.documentData
if hasattr(doc, 'validationMetadata') and doc.validationMetadata:
# If validationMetadata is present, serialize the full ActionDocument format
if mime_type == "application/json":
# Build the ActionDocument format with validationMetadata and documentData
if hasattr(document_data, 'model_dump'):
# Pydantic v2
document_data_dict = document_data.model_dump()
elif hasattr(document_data, 'dict'):
# Pydantic v1
document_data_dict = document_data.dict()
elif isinstance(document_data, dict):
document_data_dict = document_data
elif isinstance(document_data, str):
# JSON string: parse and store as a dict (e.g. from outlook.composeAndDraftEmailWithContext)
import json
try:
document_data_dict = json.loads(document_data)
except json.JSONDecodeError:
# Not valid JSON - store as plain text
document_data_dict = {"data": document_data}
else:
document_data_dict = {"data": str(document_data)}
# Build the ActionDocument format
document_data = {
"validationMetadata": doc.validationMetadata,
"documentData": document_data_dict
}
return {
'fileName': doc.documentName,
'fileSize': len(str(document_data)),
'mimeType': mime_type,
'content': document_data,
'document': doc
}
except Exception as e:
logger.error(f"Error processing single document: {str(e)}")
return None
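# For documents carrying validationMetadata, the serialized content ends up
# shaped roughly like this (hypothetical values):
# {
#   "validationMetadata": {"schema": "email.draft", "validated": true},
#   "documentData": {"subject": "...", "body": "..."}
# }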
def createDocumentsFromActionResult(self, actionResult, action, workflow, message_id=None) -> List[Any]:
"""
Create actual document objects from action result and store them in the system.
Returns a list of created document objects with proper workflow context.
"""
try:
processed_docs = self.processActionResultDocuments(actionResult, action)
createdDocuments = []
for i, doc_data in enumerate(processed_docs):
try:
documentName = doc_data['fileName']
documentData = doc_data['content']
mimeType = doc_data['mimeType']
# Handle binary data (images, PDFs, Office docs) differently from text
# Check if this is a binary MIME type
binaryMimeTypes = {
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
"application/pdf",
"image/png", "image/jpeg", "image/jpg", "image/gif", "image/webp", "image/bmp", "image/svg+xml",
}
isBinaryMimeType = mimeType in binaryMimeTypes
base64encoded = False
content = None
if isBinaryMimeType:
# For binary data, handle bytes vs base64 string vs regular string
if isinstance(documentData, bytes):
# Already bytes - encode to base64 string for storage
# base64 is already imported at module level
content = base64.b64encode(documentData).decode('utf-8')
base64encoded = True
elif isinstance(documentData, str):
# Check if it's already valid base64
# base64 is already imported at module level
try:
# Try to decode to verify it's base64
base64.b64decode(documentData, validate=True)
# Valid base64 - use as is
content = documentData
base64encoded = True
except Exception:
# Not valid base64 - might be raw string, try encoding
try:
content = base64.b64encode(documentData.encode('utf-8')).decode('utf-8')
base64encoded = True
except Exception:
logger.warning(f"Could not process binary data for {documentName}, skipping")
continue
else:
# Other types - convert to string then base64
# base64 is already imported at module level
try:
content = base64.b64encode(str(documentData).encode('utf-8')).decode('utf-8')
base64encoded = True
except Exception:
logger.warning(f"Could not encode binary data for {documentName}, skipping")
continue
else:
# Text data - convert to string
content = convertDocumentDataToString(documentData, getFileExtension(documentName))
# Skip empty or minimal content
minimalContentPatterns = ['{}', '[]', 'null', '""', "''"]
if not content or content.strip() == "" or content.strip() in minimalContentPatterns:
logger.warning(f"Empty or minimal content for document {documentName}, skipping")
continue
# Normalize file extension based on mime type if missing or incorrect
try:
mime_to_ext = {
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
"application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
"application/pdf": ".pdf",
"text/html": ".html",
"text/markdown": ".md",
"text/plain": ".txt",
"application/json": ".json",
"image/png": ".png",
"image/jpeg": ".jpg",
"image/jpg": ".jpg",
"image/gif": ".gif",
"image/webp": ".webp",
"image/bmp": ".bmp",
"image/svg+xml": ".svg",
}
expectedExt = mime_to_ext.get(mimeType)
if expectedExt:
if not documentName.lower().endswith(expectedExt):
# Append/replace extension to match mime type
if "." in documentName:
documentName = documentName.rsplit(".", 1)[0] + expectedExt
else:
documentName = documentName + expectedExt
except Exception:
pass
# Create document with file in one step using interfaces directly
document = self._createDocument(
fileName=documentName,
mimeType=mimeType,
content=content,
base64encoded=base64encoded,
messageId=message_id
)
if document:
# Set workflow context on the document if possible
self._setDocumentWorkflowContext(document, action, workflow)
createdDocuments.append(document)
else:
logger.error(f"Failed to create ChatDocument object for {documentName}")
except Exception as e:
logger.error(f"Error creating document {doc_data.get('fileName', 'unknown')}: {str(e)}")
continue
return createdDocuments
except Exception as e:
logger.error(f"Error creating documents from action result: {str(e)}")
return []
def _setDocumentWorkflowContext(self, document, action, workflow):
"""Set workflow context on a document for proper routing and labeling"""
try:
# Get current workflow context directly from workflow object
workflowContext = self._getWorkflowContext(workflow)
workflowStats = self._getWorkflowStats(workflow)
currentRound = workflowContext.get('currentRound', 0)
currentTask = workflowContext.get('currentTask', 0)
currentAction = workflowContext.get('currentAction', 0)
# Try to set workflow context attributes if they exist
if hasattr(document, 'roundNumber'):
document.roundNumber = currentRound
if hasattr(document, 'taskNumber'):
document.taskNumber = currentTask
if hasattr(document, 'actionNumber'):
document.actionNumber = currentAction
if hasattr(document, 'actionId'):
document.actionId = action.id if hasattr(action, 'id') else None
# Set additional workflow metadata if available
if hasattr(document, 'workflowId'):
document.workflowId = workflowStats.get('workflowId', workflow.id if hasattr(workflow, 'id') else None)
if hasattr(document, 'workflowStatus'):
document.workflowStatus = workflowStats.get('workflowStatus', workflow.status if hasattr(workflow, 'status') else 'unknown')
except Exception as e:
logger.warning(f"Could not set workflow context on document: {str(e)}")
def _createDocument(self, fileName: str, mimeType: str, content: str, base64encoded: bool = True, messageId: str = None) -> Optional[ChatDocument]:
"""Create file and ChatDocument using interfaces without service indirection."""
try:
if not self.interfaceDbComponent:
logger.error("Component interface not available for document creation")
return None
# Convert content to bytes
if base64encoded:
# base64 is already imported at module level
content_bytes = base64.b64decode(content)
else:
content_bytes = content.encode('utf-8')
# Create file and store data
file_item = self.interfaceDbComponent.createFile(
name=fileName,
mimeType=mimeType,
content=content_bytes
)
self.interfaceDbComponent.createFileData(file_item.id, content_bytes)
# Collect file info
file_info = self._getFileInfo(file_item.id)
if not file_info:
logger.error(f"Could not get file info for fileId: {file_item.id}")
return None
# Build ChatDocument
document = ChatDocument(
id=str(uuid.uuid4()),
messageId=messageId or "",
fileId=file_item.id,
fileName=file_info.get("fileName", fileName),
fileSize=file_info.get("size", 0),
mimeType=file_info.get("mimeType", mimeType)
)
# Ensure document can access component interface later
if hasattr(document, 'setComponentInterface') and self.interfaceDbComponent:
try:
document.setComponentInterface(self.interfaceDbComponent)
except Exception:
pass
return document
except Exception as e:
logger.error(f"Error creating document: {str(e)}")
return None
def _getFileInfo(self, fileId: str) -> Optional[Dict[str, Any]]:
try:
if not self.interfaceDbComponent:
return None
file_item = self.interfaceDbComponent.getFile(fileId)
if file_item:
return {
"id": file_item.id,
"fileName": file_item.fileName,
"size": file_item.fileSize,
"mimeType": file_item.mimeType,
"fileHash": getattr(file_item, 'fileHash', None),
"creationDate": getattr(file_item, 'creationDate', None)
}
return None
except Exception as e:
logger.error(f"Error getting file info for {fileId}: {str(e)}")
return None
def _getWorkflowContext(self, workflow) -> Dict[str, int]:
try:
return {
'currentRound': getattr(workflow, 'currentRound', 0),
'currentTask': getattr(workflow, 'currentTask', 0),
'currentAction': getattr(workflow, 'currentAction', 0)
}
except Exception:
return {'currentRound': 0, 'currentTask': 0, 'currentAction': 0}
def _getWorkflowStats(self, workflow) -> Dict[str, Any]:
try:
context = self._getWorkflowContext(workflow)
return {
'currentRound': context['currentRound'],
'currentTask': context['currentTask'],
'currentAction': context['currentAction'],
'totalTasks': getattr(workflow, 'totalTasks', 0),
'totalActions': getattr(workflow, 'totalActions', 0),
'workflowStatus': getattr(workflow, 'status', 'unknown'),
'workflowId': getattr(workflow, 'id', 'unknown')
}
except Exception:
return {
'currentRound': 0,
'currentTask': 0,
'currentAction': 0,
'totalTasks': 0,
'totalActions': 0,
'workflowStatus': 'unknown',
'workflowId': 'unknown'
}
async def renderReport(self, extractedContent: Dict[str, Any], outputFormat: str, language: str, title: str, userPrompt: str = None, aiService=None, parentOperationId: Optional[str] = None) -> List[RenderedDocument]:
"""
Render extracted JSON content to the specified output format.
Processes EACH document separately and calls renderer for each.
Each renderer can return 1..n documents (e.g., HTML + images).
Per-document format and language are extracted from structure (validated in State 3).
Multiple documents can have different formats and languages.
Args:
extractedContent: Structured JSON document with documents array
outputFormat: Target format (html, pdf, docx, txt, md, json, csv, xlsx) - Global fallback
language: Language (global fallback) - Per-document language extracted from structure
title: Report title
userPrompt: User's original prompt for report generation
aiService: AI service instance for generation prompt creation
parentOperationId: Optional parent operation ID for hierarchical logging
Returns:
List of RenderedDocument objects.
Each RenderedDocument represents one rendered file (main document or supporting file)
"""
try:
# Validate JSON input
if not isinstance(extractedContent, dict):
raise ValueError("extractedContent must be a JSON dictionary")
# Unified approach: Always expect "documents" array
if "documents" not in extractedContent:
raise ValueError("extractedContent must contain 'documents' array")
documents = extractedContent["documents"]
if len(documents) == 0:
raise ValueError("No documents found in 'documents' array")
metadata = extractedContent.get("metadata", {})
allRenderedDocuments = []
# Process EACH document separately
for docIndex, doc in enumerate(documents):
if not isinstance(doc, dict):
logger.warning(f"Skipping invalid document at index {docIndex}")
continue
if "sections" not in doc:
logger.warning(f"Document {doc.get('id', docIndex)} has no sections, skipping")
continue
# Determine format for this document
# Check outputFormat field first (per-document), then format field (legacy), then global fallback
docFormat = doc.get("outputFormat") or doc.get("format") or outputFormat
# Determine language for this document
# Extract per-document language from structure (validated in State 3), fallback to global
docLanguage = doc.get("language") or language
# Validate language format (should be 2-character ISO code, validated in State 3)
if not isinstance(docLanguage, str) or len(docLanguage) != 2:
logger.warning(f"Document {doc.get('id')} has invalid language format: {docLanguage}, using fallback")
docLanguage = language # Use global fallback
# Get renderer for this document's format
renderer = self._getFormatRenderer(docFormat)
if not renderer:
logger.warning(f"Unsupported format '{docFormat}' for document {doc.get('id', docIndex)}, skipping")
continue
# Check output style classification (code/document/image/etc.) from renderer
from modules.services.serviceGeneration.renderers.registry import getOutputStyle
outputStyle = getOutputStyle(docFormat)
if outputStyle:
logger.debug(f"Document {doc.get('id', docIndex)} format '{docFormat}' classified as '{outputStyle}' style")
# Store style in document metadata for potential use in processing paths
if "metadata" not in doc:
doc["metadata"] = {}
doc["metadata"]["outputStyle"] = outputStyle
# Create JSON structure with single document (preserving metadata)
singleDocContent = {
"metadata": {**metadata, "language": docLanguage}, # Add per-document language to metadata
"documents": [doc] # Only this document
}
# Use document title or fallback to provided title
docTitle = doc.get("title", title)
# Render this document (can return multiple files, e.g., HTML + images)
renderedDocs = await renderer.render(singleDocContent, docTitle, userPrompt, aiService)
allRenderedDocuments.extend(renderedDocs)
logger.info(f"Rendered {len(documents)} document(s) into {len(allRenderedDocuments)} file(s)")
return allRenderedDocuments
except Exception as e:
logger.error(f"Error rendering JSON report to {outputFormat}: {str(e)}")
raise
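# Expected input shape for renderReport, sketched with hypothetical values:
# {
#   "metadata": {"language": "en"},
#   "documents": [
#     {"id": "doc_1", "title": "Summary", "outputFormat": "html",
#      "language": "de", "sections": [...]}
#   ]
# }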
async def generateDocumentWithTwoPhases(
self,
userPrompt: str,
cachedContent: Optional[Dict[str, Any]] = None,
contentParts: Optional[List[Any]] = None,
maxSectionLength: int = 500,
parallelGeneration: bool = True,
progressCallback: Optional[Callable] = None
) -> Dict[str, Any]:
"""
Generate document using two-phase approach:
1. Generate structure skeleton with empty sections
2. Generate content for each section iteratively
This is the core logic for document generation in AI calls.
Args:
userPrompt: User's original prompt
cachedContent: Optional extracted content cache (from extraction phase)
contentParts: Optional list of ContentParts to use for structure generation
maxSectionLength: Maximum words for simple sections
parallelGeneration: Enable parallel section generation
progressCallback: Optional callback function(progress, total, message) for progress updates
Returns:
Complete document structure with populated elements ready for rendering
"""
try:
from modules.services.serviceGeneration.subStructureGenerator import StructureGenerator
from modules.services.serviceGeneration.subContentGenerator import ContentGenerator
# Phase 1: Generate structure skeleton
if progressCallback:
progressCallback(0, 100, "Generating document structure...")
structureGenerator = StructureGenerator(self.services)
# Extract imageDocuments from cachedContent if available
existingImages = None
if cachedContent and cachedContent.get("imageDocuments"):
existingImages = cachedContent.get("imageDocuments")
structure = await structureGenerator.generateStructure(
userPrompt=userPrompt,
documentList=None, # Not used in current implementation
cachedContent=cachedContent,
contentParts=contentParts, # Pass ContentParts for structure generation
maxSectionLength=maxSectionLength,
existingImages=existingImages
)
if progressCallback:
progressCallback(30, 100, "Structure generated, starting content generation...")
# Phase 2: Generate content for each section
contentGenerator = ContentGenerator(self.services)
# Create progress callback wrapper for content generation phase (30-90%)
def contentProgressCallback(sectionIndex: int, totalSections: int, message: str):
if progressCallback:
# Map section progress to overall progress (30% to 90%)
if totalSections > 0:
overallProgress = 30 + int(60 * (sectionIndex / totalSections))
else:
overallProgress = 30
progressCallback(overallProgress, 100, f"Section {sectionIndex}/{totalSections}: {message}")
completeStructure = await contentGenerator.generateContent(
structure=structure,
cachedContent=cachedContent,
userPrompt=userPrompt,
contentParts=contentParts, # Pass ContentParts for content generation
progressCallback=contentProgressCallback,
parallelGeneration=parallelGeneration
)
if progressCallback:
progressCallback(100, 100, "Document generation complete")
return completeStructure
except Exception as e:
logger.error(f"Error in two-phase document generation: {str(e)}")
logger.debug(traceback.format_exc())
raise
async def getAdaptiveExtractionPrompt(
self,
outputFormat: str,
userPrompt: str,
title: str,
aiService=None
) -> str:
"""Get adaptive extraction prompt."""
from modules.services.serviceExtraction.subPromptBuilderExtraction import buildExtractionPrompt
return await buildExtractionPrompt(
outputFormat=outputFormat,
userPrompt=userPrompt,
title=title,
aiService=aiService,
services=self.services
)
def _getFormatRenderer(self, output_format: str):
"""Get the appropriate document renderer for the specified format."""
try:
from .renderers.registry import getRenderer, getSupportedFormats
renderer = getRenderer(output_format, services=self.services, outputStyle='document')
if renderer:
return renderer
# Log available formats for debugging
availableFormats = getSupportedFormats()
logger.error(
f"No renderer found for format '{output_format}'. "
f"Available formats: {availableFormats}"
)
# Fallback to text renderer if no specific renderer found
logger.warning(f"Falling back to text renderer for format {output_format}")
fallbackRenderer = getRenderer('text', services=self.services, outputStyle='document')
if fallbackRenderer:
return fallbackRenderer
logger.error("Even text renderer fallback failed")
return None
except Exception as e:
logger.error(f"Error getting renderer for {output_format}: {str(e)}")
# traceback is already imported at module level
logger.debug(traceback.format_exc())
return None
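A hedged end-to-end sketch of the two-phase flow (the serviceCenter wiring and the prompt are assumptions; the populated structure is assumed to carry the documents array renderReport expects):

async def demoTwoPhase(serviceCenter):
    gen = GenerationService(serviceCenter)

    def onProgress(done, total, message):
        print(f"{done}/{total}: {message}")

    # Phase 1: structure skeleton, Phase 2: section content
    structure = await gen.generateDocumentWithTwoPhases(
        userPrompt="Summarize the attached quarterly reports",
        progressCallback=onProgress,
    )
    # Render the populated structure into one or more files
    return await gen.renderReport(structure, "html", "en", "Quarterly Summary")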


@ -1,939 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Code Generation Path
Handles code generation with multi-file project support, dependency handling,
and proper cross-file references.
"""
import json
import logging
import time
import re
from typing import Dict, Any, List, Optional
from modules.datamodels.datamodelWorkflow import AiResponse, AiResponseMetadata, DocumentData
from modules.datamodels.datamodelExtraction import ContentPart
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum
from modules.shared.jsonUtils import extractJsonString
logger = logging.getLogger(__name__)
class CodeGenerationPath:
"""Code generation path."""
def __init__(self, services):
self.services = services
async def generateCode(
self,
userPrompt: str,
outputFormat: str = None,
contentParts: Optional[List[ContentPart]] = None,
title: str = "Generated Code",
parentOperationId: Optional[str] = None
) -> AiResponse:
"""
Generate code files with multi-file project support.
Returns: AiResponse with code files as documents
"""
# Create operation ID
workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
codeOperationId = f"code_gen_{workflowId}_{int(time.time())}"
# Start progress tracking
self.services.chat.progressLogStart(
codeOperationId,
"Code Generation",
"Code Generation",
f"Format: {outputFormat or 'txt'}",
parentOperationId=parentOperationId
)
try:
# Detect language and project type from prompt or outputFormat
language, projectType = self._detectLanguageAndProjectType(userPrompt, outputFormat)
# Phase 1: Code structure generation (with looping)
self.services.chat.progressLogUpdate(codeOperationId, 0.2, "Generating code structure")
codeStructure = await self._generateCodeStructure(
userPrompt=userPrompt,
language=language,
outputFormat=outputFormat,
contentParts=contentParts
)
# Phase 2: Code content generation (with dependency handling)
self.services.chat.progressLogUpdate(codeOperationId, 0.5, "Generating code content")
codeFiles = await self._generateCodeContent(
codeStructure,
codeOperationId,
userPrompt=userPrompt,
contentParts=contentParts
)
# Phase 3: Code formatting & validation
self.services.chat.progressLogUpdate(codeOperationId, 0.8, "Formatting code files")
formattedFiles = await self._formatAndValidateCode(codeFiles)
# Phase 4: Code Rendering (Renderer-Based)
self.services.chat.progressLogUpdate(codeOperationId, 0.9, "Rendering code files")
# Group files by format
filesByFormat = {}
for file in formattedFiles:
fileType = file.get("fileType", outputFormat or "txt")
if fileType not in filesByFormat:
filesByFormat[fileType] = []
filesByFormat[fileType].append(file)
# Render each format group using appropriate renderer
allRenderedDocuments = []
for fileType, files in filesByFormat.items():
# Get renderer for this format
renderer = self._getCodeRenderer(fileType)
if renderer:
# Use code renderer
renderedDocs = await renderer.renderCodeFiles(
codeFiles=files,
metadata=codeStructure.get("metadata", {}),
userPrompt=userPrompt
)
allRenderedDocuments.extend(renderedDocs)
else:
# Fallback: output directly (for formats without renderers)
for file in files:
mimeType = self._getMimeType(file.get("fileType", "txt"))
content = file.get("content", "")
contentBytes = content.encode('utf-8') if isinstance(content, str) else content
from modules.datamodels.datamodelDocument import RenderedDocument
allRenderedDocuments.append(
RenderedDocument(
documentData=contentBytes,
mimeType=mimeType,
filename=file.get("filename", "generated.txt"),
metadata=codeStructure.get("metadata", {})
)
)
# Convert RenderedDocument to DocumentData
documents = []
for renderedDoc in allRenderedDocuments:
documents.append(DocumentData(
documentName=renderedDoc.filename,
documentData=renderedDoc.documentData,
mimeType=renderedDoc.mimeType,
sourceJson=renderedDoc.metadata if hasattr(renderedDoc, 'metadata') else None
))
metadata = AiResponseMetadata(
title=title,
operationType=OperationTypeEnum.DATA_GENERATE.value
)
# Create summary JSON for content field
summaryContent = {
"type": "code_generation",
"metadata": codeStructure.get("metadata", {}),
"files": [
{
"filename": doc.documentName,
"mimeType": doc.mimeType
}
for doc in documents
],
"fileCount": len(documents)
}
self.services.chat.progressLogFinish(codeOperationId, True)
return AiResponse(
documents=documents,
content=json.dumps(summaryContent, ensure_ascii=False),
metadata=metadata
)
except Exception as e:
logger.error(f"Error in code generation: {str(e)}")
self.services.chat.progressLogFinish(codeOperationId, False)
raise
def _detectLanguageAndProjectType(self, userPrompt: str, outputFormat: Optional[str]) -> tuple:
"""Detect programming language and project type from prompt or format."""
promptLower = userPrompt.lower()
# Detect language
language = None
if outputFormat:
if outputFormat == "py":
language = "python"
elif outputFormat in ["js", "ts"]:
language = outputFormat
elif outputFormat == "html":
language = "html"
if not language:
if "python" in promptLower or ".py" in promptLower:
language = "python"
elif "javascript" in promptLower or ".js" in promptLower:
language = "javascript"
elif "typescript" in promptLower or ".ts" in promptLower:
language = "typescript"
elif "html" in promptLower:
language = "html"
else:
language = "python" # Default
# Detect project type
projectType = "single_file"
if "multi" in promptLower or "multiple files" in promptLower or "project" in promptLower:
projectType = "multi_file"
return language, projectType
async def _generateCodeStructure(
self,
userPrompt: str,
language: str,
outputFormat: Optional[str],
contentParts: Optional[List[ContentPart]]
) -> Dict[str, Any]:
"""Generate code structure using looping system."""
# Build content parts index (similar to document generation)
contentPartsIndex = ""
if contentParts:
validParts = []
for part in contentParts:
contentFormat = part.metadata.get("contentFormat", "unknown")
originalFileName = part.metadata.get('originalFileName', 'N/A')
# Include reference parts and parts with data
if contentFormat == "reference" or (part.data and len(str(part.data).strip()) > 0):
validParts.append(part)
if validParts:
contentPartsIndex = "\n## AVAILABLE CONTENT PARTS\n"
for i, part in enumerate(validParts, 1):
contentFormat = part.metadata.get("contentFormat", "unknown")
originalFileName = part.metadata.get('originalFileName', 'N/A')
contentPartsIndex += f"\n{i}. ContentPart ID: {part.id}\n"
contentPartsIndex += f" Format: {contentFormat}\n"
contentPartsIndex += f" Type: {part.typeGroup}\n"
contentPartsIndex += f" MIME Type: {part.mimeType or 'N/A'}\n"
contentPartsIndex += f" Source: {part.metadata.get('documentId', 'unknown')}\n"
contentPartsIndex += f" Original file name: {originalFileName}\n"
contentPartsIndex += f" Usage hint: {part.metadata.get('usageHint', 'N/A')}\n"
if not contentPartsIndex:
contentPartsIndex = "\n(No content parts available)"
# Create template structure explicitly (not extracted from prompt)
templateStructure = f"""{{
"metadata": {{
"language": "{language}",
"projectType": "single_file|multi_file",
"projectName": ""
}},
"files": [
{{
"id": "",
"filename": "",
"fileType": "",
"dependencies": [],
"imports": [],
"functions": [],
"classes": []
}}
]
}}"""
# Build structure generation prompt
structurePrompt = f"""# TASK: Generate Code Project Structure
This is a PLANNING task. Return EXACTLY ONE complete JSON object. Do not generate multiple JSON objects, alternatives, or variations. Do not use separators like "---" between JSON objects.
## USER REQUEST (for context)
```
{userPrompt}
```
{contentPartsIndex}
## LANGUAGE
{language}
## TASK DESCRIPTION
Analyze the USER REQUEST above and create a project structure that fulfills ALL requirements mentioned in the request.
IMPORTANT: If the request mentions multiple files (e.g., "3 files", "config.json and customers.json", etc.), you MUST include ALL requested files in the files array. Set projectType to "multi_file" when multiple files are requested.
## CONTENT PARTS USAGE (if available)
If AVAILABLE CONTENT PARTS are listed above, use them to inform the file structure:
**Analyzing Content Parts:**
- Review each ContentPart's format, type, original file name, and usage hint
- Content parts with "reference" format = documents/images that will be processed/extracted
- Content parts with "extracted" format = pre-processed data ready to use
- Content parts with "object" format = images/documents to be displayed or processed
**Mapping Content Parts to Files:**
- If content parts contain data (e.g., expense receipts, customer lists), create data files (JSON/CSV) that will store/represent that data
- If content parts are documents to be processed (e.g., PDFs), you may need code files that parse/process them
- Use the original file names and usage hints to determine appropriate filenames and file types
**Populating File Structure Fields:**
- **dependencies**: List file IDs that this file depends on (e.g., if a Python script reads a JSON config file, the script depends on the config file)
- **imports**: For code files, list imports needed based on content parts (e.g., if processing PDFs: ["import PyPDF2"], if processing CSV: ["import csv"], if processing JSON: ["import json"])
- **functions**: For CODE files only - list function signatures if the USER REQUEST specifies functionality (e.g., {{"name": "parseReceipt", "signature": "def parseReceipt(pdf_path: str) -> dict"}})
- **classes**: For CODE files only - list class definitions if the USER REQUEST specifies OOP structure
- **functions/classes for DATA files**: Leave as empty arrays [] - data files (JSON/CSV/XML) don't contain executable code
## FILE STRUCTURE REQUIREMENTS
Create a JSON structure with:
1. metadata: {{"language": "{language}", "projectType": "single_file|multi_file", "projectName": "..."}}
- projectName: Derive from USER REQUEST or content parts (e.g., "expense-tracker", "customer-manager")
2. files: Array of file structures, each with:
- id: Unique identifier (e.g., "file_1", "file_2")
- filename: File name matching USER REQUEST requirements (e.g., "config.json", "customers.json", "expenses.csv")
- fileType: File extension matching the requested format (e.g., "json", "py", "js", "csv", "xml")
- dependencies: List of file IDs this file depends on (for multi-file projects where files reference each other)
- imports: List of import statements that this file will need (e.g., ["import json", "import csv"] for Python files processing JSON/CSV)
- functions: Array of function signatures {{"name": "...", "signature": "..."}} - ONLY if the file will contain executable code (not for pure data files like JSON/CSV)
- classes: Array of class definitions {{"name": "...", "signature": "..."}} - ONLY if the file will contain executable code (not for pure data files like JSON/CSV)
IMPORTANT FOR DATA FILES (JSON, CSV, XML):
- For pure data files (config.json, customers.json, expenses.csv), leave functions and classes as empty arrays []
- These files contain structured data, not executable code
- Use imports only if the file will be processed by code (e.g., a Python script that reads the CSV)
IMPORTANT FOR CODE FILES (Python, JavaScript, etc.):
- Include functions/classes if the USER REQUEST specifies functionality
- Use dependencies to indicate which data files this code file reads/processes
- Use imports to specify what libraries/modules are needed
For single-file projects, return one file. For multi-file projects, include ALL requested files in the files array.
Return ONLY valid JSON matching the request above.
"""
# Build continuation prompt builder
async def buildCodeStructurePromptWithContinuation(
continuationContext: Any,
templateStructure: str,
basePrompt: str
) -> str:
"""Build code structure prompt with continuation context. Uses unified signature.
Note: All initial context (userPrompt, contentParts, etc.) is already
contained in basePrompt. This function only adds continuation-specific instructions.
"""
# Extract continuation context fields (only what's needed for continuation)
incompletePart = continuationContext.incomplete_part
lastRawJson = continuationContext.last_raw_json
# Generate both overlap context and hierarchy context using jsonContinuation
overlapContext = ""
unifiedContext = ""
if lastRawJson:
# Get contexts directly from jsonContinuation
from modules.shared.jsonContinuation import getContexts
contexts = getContexts(lastRawJson)
overlapContext = contexts.overlapContext
unifiedContext = contexts.hierarchyContextForPrompt
elif incompletePart:
unifiedContext = incompletePart
else:
unifiedContext = "Unable to extract context - response was completely broken"
# Build unified continuation prompt format
continuationPrompt = f"""{basePrompt}
--- CONTINUATION REQUEST ---
The previous JSON response was incomplete. Continue from where it stopped.
Context showing structure hierarchy with cut point:
```
{unifiedContext}
```
Overlap Requirement:
To ensure proper merging, your response MUST start EXACTLY with the overlap context shown below, then continue with new content.
Overlap context (start your response with this exact text):
```json
{overlapContext if overlapContext else "No overlap context available"}
```
TASK:
1. Start your response EXACTLY with the overlap context shown above (character by character)
2. Continue seamlessly from where the overlap context ends
3. Complete the remaining content following the JSON structure template above
4. Return ONLY valid JSON following the structure template - no overlap/continuation wrapper objects
CRITICAL:
- Your response MUST begin with the exact overlap context text (this enables automatic merging)
- Continue seamlessly after the overlap context with new content
- Your response must be valid JSON matching the structure template above"""
return continuationPrompt
# Use generic looping system with code_structure use case
options = AiCallOptions(
operationType=OperationTypeEnum.DATA_GENERATE,
resultFormat="json"
)
structureJson = await self.services.ai.callAiWithLooping(
prompt=structurePrompt,
options=options,
promptBuilder=buildCodeStructurePromptWithContinuation,
promptArgs={
"userPrompt": userPrompt,
"contentParts": contentParts,
"templateStructure": templateStructure,
"basePrompt": structurePrompt
},
useCaseId="code_structure",
debugPrefix="code_structure_generation",
contentParts=contentParts
)
# Extract JSON from markdown fences if present
extractedJson = extractJsonString(structureJson)
parsed = json.loads(extractedJson)
return parsed
async def _generateCodeContent(
self,
codeStructure: Dict[str, Any],
parentOperationId: str,
userPrompt: str = None,
contentParts: Optional[List[ContentPart]] = None
) -> List[Dict[str, Any]]:
"""Generate code content for each file with dependency handling."""
files = codeStructure.get("files", [])
metadata = codeStructure.get("metadata", {})
if not files:
raise ValueError("No files found in code structure")
# Step 1: Resolve dependency order
orderedFiles = self._resolveDependencyOrder(files)
# Step 2: Generate dependency files first (requirements.txt, package.json, etc.)
dependencyFiles = await self._generateDependencyFiles(metadata, orderedFiles)
# Step 3: Generate code files in dependency order (not fully parallel)
codeFiles = []
generatedFileContext = {} # Track what's been generated for cross-file references
for idx, fileStructure in enumerate(orderedFiles):
# Update progress
progress = 0.5 + (0.4 * (idx / len(orderedFiles)))
self.services.chat.progressLogUpdate(
parentOperationId,
progress,
f"Generating {fileStructure.get('filename', 'file')}"
)
# Provide context about already-generated files for proper imports
fileContext = self._buildFileContext(generatedFileContext, fileStructure)
# Generate this file with context
fileContent = await self._generateSingleFileContent(
fileStructure,
fileContext=fileContext,
allFilesStructure=orderedFiles,
metadata=metadata,
userPrompt=userPrompt,
contentParts=contentParts
)
codeFiles.append(fileContent)
# Update context with generated file info (for next files)
generatedFileContext[fileStructure["id"]] = {
"filename": fileContent.get("filename", fileStructure.get("filename")),
"functions": fileContent.get("functions", []),
"classes": fileContent.get("classes", []),
"exports": fileContent.get("exports", [])
}
# Combine dependency files and code files
return dependencyFiles + codeFiles
def _resolveDependencyOrder(self, files: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Resolve file generation order based on dependencies using topological sort."""
# Build dependency graph
fileMap = {f["id"]: f for f in files}
dependencies = {}
for file in files:
fileId = file["id"]
deps = file.get("dependencies", []) # List of file IDs this file depends on
dependencies[fileId] = deps
# Topological sort
ordered = []
visited = set()
tempMark = set()
def visit(fileId: str):
if fileId in tempMark:
# Circular dependency detected - break it
logger.warning(f"Circular dependency detected involving {fileId}")
return
if fileId in visited:
return
tempMark.add(fileId)
for depId in dependencies.get(fileId, []):
if depId in fileMap:
visit(depId)
tempMark.remove(fileId)
visited.add(fileId)
ordered.append(fileMap[fileId])
for file in files:
if file["id"] not in visited:
visit(file["id"])
return ordered
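# Worked example (hypothetical IDs): with
#   files = [{"id": "app", "dependencies": ["config"]},
#            {"id": "config", "dependencies": []}]
# the sort emits config before app, so later files can import symbols that
# were already generated; circular references are logged and broken.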
async def _generateDependencyFiles(
self,
metadata: Dict[str, Any],
files: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
"""Generate dependency files (requirements.txt, package.json, etc.)."""
language = metadata.get("language", "").lower()
dependencyFiles = []
# Generate requirements.txt for Python
if language in ["python", "py"]:
requirementsContent = await self._generateRequirementsTxt(files)
if requirementsContent:
dependencyFiles.append({
"filename": "requirements.txt",
"content": requirementsContent,
"fileType": "txt",
"id": "requirements_txt"
})
# Generate package.json for JavaScript/TypeScript
elif language in ["javascript", "typescript", "js", "ts"]:
packageJson = await self._generatePackageJson(files, metadata)
if packageJson:
dependencyFiles.append({
"filename": "package.json",
"content": json.dumps(packageJson, indent=2),
"fileType": "json",
"id": "package_json"
})
return dependencyFiles
async def _generateRequirementsTxt(
self,
files: List[Dict[str, Any]]
) -> Optional[str]:
"""Generate requirements.txt content from Python imports."""
pythonPackages = set()
for file in files:
imports = file.get("imports", [])
if isinstance(imports, list):
for imp in imports:
if isinstance(imp, str):
# Extract package name from import
# Handle: "from flask import", "import flask", "from flask import Flask"
imp = imp.strip()
if "import" in imp:
if "from" in imp:
# "from package import ..."
parts = imp.split("from")
if len(parts) > 1:
package = parts[1].split("import")[0].strip()
if package and not package.startswith("."):
pythonPackages.add(package.split(".")[0]) # Get root package
else:
# "import package" or "import package.module"
parts = imp.split("import")
if len(parts) > 1:
package = parts[1].strip().split(".")[0].strip()
if package and not package.startswith("."):
pythonPackages.add(package)
if pythonPackages:
return "\n".join(sorted(pythonPackages))
return None
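# Parsing examples for the heuristic above:
#   "from flask import Flask"  -> "flask"
#   "import pandas.core.frame" -> "pandas"
#   "from .helpers import x"   -> skipped (relative import)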
async def _generatePackageJson(
self,
files: List[Dict[str, Any]],
metadata: Dict[str, Any]
) -> Optional[Dict[str, Any]]:
"""Generate package.json content from JavaScript/TypeScript imports."""
npmPackages = {}
for file in files:
imports = file.get("imports", [])
if isinstance(imports, list):
for imp in imports:
if isinstance(imp, str):
# Extract npm package from import
# Handle: "import express from 'express'", "const express = require('express')"
imp = imp.strip()
if "from" in imp:
# ES6 import: "import ... from 'package'"
parts = imp.split("from")
if len(parts) > 1:
package = parts[1].strip().strip("'\"")
if package and not package.startswith(".") and not package.startswith("/"):
npmPackages[package] = "*"
elif "require" in imp:
# CommonJS: "require('package')"
match = re.search(r"require\(['\"]([^'\"]+)['\"]\)", imp)
if match:
package = match.group(1)
if not package.startswith(".") and not package.startswith("/"):
npmPackages[package] = "*"
if npmPackages:
return {
"name": metadata.get("projectName", "generated-project"),
"version": "1.0.0",
"dependencies": npmPackages
}
return None
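# Parsing examples for the heuristic above:
#   "import express from 'express'" -> {"express": "*"}
#   "const fs = require('fs')"      -> {"fs": "*"}
#   "import util from './util'"     -> skipped (relative path)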
def _buildFileContext(
self,
generatedFileContext: Dict[str, Dict[str, Any]],
currentFile: Dict[str, Any]
) -> Dict[str, Any]:
"""Build context about other files for proper imports/references."""
context = {
"availableFiles": [],
"availableFunctions": {},
"availableClasses": {}
}
# Add info about already-generated files
for fileId, fileInfo in generatedFileContext.items():
context["availableFiles"].append({
"id": fileId,
"filename": fileInfo["filename"],
"functions": fileInfo.get("functions", []),
"classes": fileInfo.get("classes", []),
"exports": fileInfo.get("exports", [])
})
# Build function/class maps for easy lookup
for func in fileInfo.get("functions", []):
funcName = func.get("name", "")
if funcName:
context["availableFunctions"][funcName] = {
"file": fileInfo["filename"],
"signature": func.get("signature", "")
}
for cls in fileInfo.get("classes", []):
className = cls.get("name", "")
if className:
context["availableClasses"][className] = {
"file": fileInfo["filename"]
}
return context
async def _generateSingleFileContent(
self,
fileStructure: Dict[str, Any],
fileContext: Dict[str, Any] = None,
allFilesStructure: List[Dict[str, Any]] = None,
metadata: Dict[str, Any] = None,
userPrompt: str = None,
contentParts: Optional[List[ContentPart]] = None
) -> Dict[str, Any]:
"""Generate code content for a single file with context about other files."""
# Build prompt with context about other files for proper imports
filename = fileStructure.get("filename", "generated.py")
fileType = fileStructure.get("fileType", "py")
dependencies = fileStructure.get("dependencies", [])
functions = fileStructure.get("functions", [])
classes = fileStructure.get("classes", [])
contextInfo = ""
if fileContext and fileContext.get("availableFiles"):
contextInfo = "\n\nAvailable files and their exports:\n"
for fileInfo in fileContext["availableFiles"]:
contextInfo += f"- {fileInfo['filename']}: "
funcs = [f.get("name", "") for f in fileInfo.get("functions", [])]
cls = [c.get("name", "") for c in fileInfo.get("classes", [])]
exports = []
if funcs:
exports.extend(funcs)
if cls:
exports.extend(cls)
if exports:
contextInfo += ", ".join(exports)
contextInfo += "\n"
# Build content parts section if available
contentPartsSection = ""
if contentParts:
relevantParts = []
for part in contentParts:
# Include parts that might be relevant to this file
usageHint = part.metadata.get('usageHint', '').lower()
originalFileName = part.metadata.get('originalFileName', '').lower()
filenameLower = filename.lower()
# Check if this content part is relevant to this file
if (filenameLower in usageHint or
filenameLower in originalFileName or
part.metadata.get('contentFormat') == 'reference' or
(part.data and len(str(part.data).strip()) > 0)):
relevantParts.append(part)
if relevantParts:
contentPartsSection = "\n## AVAILABLE CONTENT PARTS\n"
for i, part in enumerate(relevantParts, 1):
contentFormat = part.metadata.get("contentFormat", "unknown")
originalFileName = part.metadata.get('originalFileName', 'N/A')
contentPartsSection += f"\n{i}. ContentPart ID: {part.id}\n"
contentPartsSection += f" Format: {contentFormat}\n"
contentPartsSection += f" Type: {part.typeGroup}\n"
contentPartsSection += f" Original file name: {originalFileName}\n"
contentPartsSection += f" Usage hint: {part.metadata.get('usageHint', 'N/A')}\n"
# Include actual content if it's small enough (for data files like CSV, JSON)
if part.data and isinstance(part.data, str) and len(part.data) < 2000:
contentPartsSection += f" Content preview: {part.data[:500]}...\n"
# Build user request section
userRequestSection = ""
if userPrompt:
userRequestSection = f"""
## ORIGINAL USER REQUEST
```
{userPrompt}
```
"""
# Create template structure explicitly (not extracted from prompt)
templateStructure = f"""{{
"files": [
{{
"filename": "{filename}",
"content": "// Complete code here",
"functions": {json.dumps(functions, indent=2) if functions else '[]'},
"classes": {json.dumps(classes, indent=2) if classes else '[]'}
}}
]
}}"""
# Build base prompt
contentPrompt = f"""# TASK: Generate Code File Content
Generate complete, executable code for the file: {filename}
{userRequestSection}## FILE SPECIFICATIONS
File Type: {fileType}
Language: {metadata.get('language', 'python') if metadata else 'python'}
{contentPartsSection}
Required functions:
{json.dumps(functions, indent=2) if functions else 'None specified'}
Required classes:
{json.dumps(classes, indent=2) if classes else 'None specified'}
Dependencies on other files: {', '.join(dependencies) if dependencies else 'None'}
{contextInfo}
Generate complete, production-ready code with:
1. Proper imports (including imports from other files in the project if dependencies exist)
2. All required functions and classes
3. Error handling
4. Documentation/docstrings
5. Type hints where appropriate
Return ONLY valid JSON in this format:
{templateStructure}
"""
# Build continuation prompt builder
async def buildCodeContentPromptWithContinuation(
continuationContext: Any,
templateStructure: str,
basePrompt: str
) -> str:
"""Build code content prompt with continuation context. Uses unified signature.
Note: All initial context (filename, fileType, functions, etc.) is already
contained in basePrompt. This function only adds continuation-specific instructions.
"""
# Extract continuation context fields (only what's needed for continuation)
incompletePart = continuationContext.incomplete_part
lastRawJson = continuationContext.last_raw_json
# Generate both overlap context and hierarchy context using jsonContinuation
overlapContext = ""
unifiedContext = ""
if lastRawJson:
# Get contexts directly from jsonContinuation
from modules.shared.jsonContinuation import getContexts
contexts = getContexts(lastRawJson)
overlapContext = contexts.overlapContext
unifiedContext = contexts.hierarchyContextForPrompt
elif incompletePart:
unifiedContext = incompletePart
else:
unifiedContext = "Unable to extract context - response was completely broken"
# Build unified continuation prompt format
continuationPrompt = f"""{basePrompt}
--- CONTINUATION REQUEST ---
The previous JSON response was incomplete. Continue from where it stopped.
Context showing structure hierarchy with cut point:
```
{unifiedContext}
```
Overlap Requirement:
To ensure proper merging, your response MUST start EXACTLY with the overlap context shown below, then continue with new content.
Overlap context (start your response with this exact text):
```json
{overlapContext if overlapContext else "No overlap context available"}
```
TASK:
1. Start your response EXACTLY with the overlap context shown above (character by character)
2. Continue seamlessly from where the overlap context ends
3. Complete the remaining content following the JSON structure template above
4. Return ONLY valid JSON following the structure template - no overlap/continuation wrapper objects
CRITICAL:
- Your response MUST begin with the exact overlap context text (this enables automatic merging)
- Continue seamlessly after the overlap context with new content
- Your response must be valid JSON matching the structure template above"""
return continuationPrompt
# Use generic looping system with code_content use case
options = AiCallOptions(
operationType=OperationTypeEnum.DATA_GENERATE,
resultFormat="json"
)
contentJson = await self.services.ai.callAiWithLooping(
prompt=contentPrompt,
options=options,
promptBuilder=buildCodeContentPromptWithContinuation,
promptArgs={
"filename": filename,
"fileType": fileType,
"functions": functions,
"classes": classes,
"dependencies": dependencies,
"metadata": metadata,
"userPrompt": userPrompt,
"contentParts": contentParts,
"contextInfo": contextInfo,
"templateStructure": templateStructure,
"basePrompt": contentPrompt
},
useCaseId="code_content",
debugPrefix=f"code_content_{fileStructure.get('id', 'file')}",
)
# Extract JSON from markdown fences if present
extractedJson = extractJsonString(contentJson)
parsed = json.loads(extractedJson)
# Extract file content and metadata
files = parsed.get("files", [])
if files and len(files) > 0:
fileData = files[0]
return {
"filename": fileData.get("filename", filename),
"content": fileData.get("content", ""),
"fileType": fileType,
"functions": fileData.get("functions", functions),
"classes": fileData.get("classes", classes),
"id": fileStructure.get("id")
}
# Fallback if structure is different
return {
"filename": filename,
"content": parsed.get("content", ""),
"fileType": fileType,
"functions": functions,
"classes": classes,
"id": fileStructure.get("id")
}
async def _formatAndValidateCode(self, codeFiles: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Format and validate generated code files."""
# For now, just return files as-is
# TODO: Add code formatting (black, prettier, etc.) and validation
formatted = []
for file in codeFiles:
content = file.get("content", "")
# Basic cleanup: remove markdown code fences if present
if isinstance(content, str):
content = re.sub(r'^```[\w]*\n', '', content, flags=re.MULTILINE)
content = re.sub(r'\n```$', '', content, flags=re.MULTILINE)
file["content"] = content.strip()
formatted.append(file)
return formatted
def _getMimeType(self, fileType: str) -> str:
"""Get MIME type for file type."""
mimeTypes = {
"py": "text/x-python",
"js": "application/javascript",
"ts": "application/typescript",
"html": "text/html",
"css": "text/css",
"json": "application/json",
"txt": "text/plain",
"md": "text/markdown",
"java": "text/x-java-source",
"cpp": "text/x-c++src",
"c": "text/x-csrc",
"csv": "text/csv",
"xml": "application/xml"
}
return mimeTypes.get(fileType.lower(), "text/plain")
def _getCodeRenderer(self, fileType: str):
"""Get code renderer for file type."""
from modules.services.serviceGeneration.renderers.registry import getRenderer
# Map file types to renderer formats (code path)
formatMap = {
'json': 'json',
'csv': 'csv',
'xml': 'xml'
}
rendererFormat = formatMap.get(fileType.lower())
if rendererFormat:
renderer = getRenderer(rendererFormat, self.services, outputStyle='code')
# Check if renderer supports code rendering
if renderer and hasattr(renderer, 'renderCodeFiles'):
return renderer
return None
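A hedged usage sketch (the services wiring - progress logging, AI looping, renderer registry - is assumed to be configured):

async def demoCodeGen(services):
    path = CodeGenerationPath(services)
    response = await path.generateCode(
        userPrompt="Create a Python script plus a config.json it reads",
        outputFormat="py",
        title="Demo Project",
    )
    for doc in response.documents:
        print(doc.documentName, doc.mimeType)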


@@ -1,214 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Document Generation Path
Handles document generation using existing chapter/section model.
"""
import json
import logging
import time
import copy
from typing import Dict, Any, List, Optional
from modules.datamodels.datamodelWorkflow import AiResponse, AiResponseMetadata, DocumentData
from modules.datamodels.datamodelExtraction import ContentPart, DocumentIntent
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum
from modules.datamodels.datamodelDocument import RenderedDocument
from modules.workflows.processing.shared.stateTools import checkWorkflowStopped
logger = logging.getLogger(__name__)
class DocumentGenerationPath:
"""Document generation path (existing functionality, refactored)."""
def __init__(self, services):
self.services = services
async def generateDocument(
self,
userPrompt: str,
documentList: Optional[Any] = None, # DocumentReferenceList
documentIntents: Optional[List[DocumentIntent]] = None,
contentParts: Optional[List[ContentPart]] = None,
outputFormat: str = "txt",
title: Optional[str] = None,
parentOperationId: Optional[str] = None
) -> AiResponse:
"""
Generate document using existing chapter/section model.
Returns: AiResponse with documents list
"""
# Create operation ID
workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
docOperationId = f"doc_gen_{workflowId}_{int(time.time())}"
# Start progress tracking
self.services.chat.progressLogStart(
docOperationId,
"Document Generation",
"Document Generation",
f"Format: {outputFormat}",
parentOperationId=parentOperationId
)
try:
# Step 5A: Clarify document intents
documents = []
if documentList:
documents = self.services.chat.getChatDocumentsFromDocumentList(documentList)
# Filter: remove original documents when pre-extracted JSONs already exist
# (avoids duplicates - the pre-extracted JSONs already contain the ContentParts)
# Step 1: identify all original document IDs that are covered by pre-extracted JSONs
originalDocIdsCoveredByPreExtracted = set()
for doc in documents:
preExtracted = self.services.ai.intentAnalyzer.resolvePreExtractedDocument(doc)
if preExtracted:
originalDocId = preExtracted["originalDocument"]["id"]
originalDocIdsCoveredByPreExtracted.add(originalDocId)
logger.debug(f"Found pre-extracted JSON {doc.id} covering original document {originalDocId}")
# Step 2: filter documents - drop originals that are already covered by pre-extracted JSONs
filteredDocuments = []
for doc in documents:
preExtracted = self.services.ai.intentAnalyzer.resolvePreExtractedDocument(doc)
if preExtracted:
# Keep the pre-extracted JSON
filteredDocuments.append(doc)
elif doc.id in originalDocIdsCoveredByPreExtracted:
# Original document already covered by a pre-extracted JSON - drop it
logger.info(f"Skipping original document {doc.id} ({doc.fileName}) - already covered by pre-extracted JSON")
else:
# Regular document without a pre-extracted JSON - keep it
filteredDocuments.append(doc)
documents = filteredDocuments
checkWorkflowStopped(self.services)
if not documentIntents and documents:
documentIntents = await self.services.ai.clarifyDocumentIntents(
documents,
userPrompt,
{"outputFormat": outputFormat},
docOperationId
)
checkWorkflowStopped(self.services)
# Step 5B: Extract and prepare content
if documents:
preparedContentParts = await self.services.ai.extractAndPrepareContent(
documents,
documentIntents or [],
docOperationId
)
# Merge with provided contentParts (if any)
if contentParts:
# Check for pre-extracted content
for part in contentParts:
if part.metadata.get("skipExtraction", False):
# Already extracted - use as-is, make sure metadata is complete
part.metadata.setdefault("contentFormat", "extracted")
part.metadata.setdefault("isPreExtracted", True)
preparedContentParts.extend(contentParts)
contentParts = preparedContentParts
# Step 5B.5: Documents are converted to contentParts (like pre-processed JSON files)
# No AI extraction here - AI extraction happens during section generation
if contentParts:
logger.info(f"Using {len(contentParts)} content parts for generation (no AI extraction at this stage)")
checkWorkflowStopped(self.services)
# Step 5C: Generate structure
structure = await self.services.ai.generateStructure(
userPrompt,
contentParts or [],
outputFormat,
docOperationId
)
checkWorkflowStopped(self.services)
# Step 5D: Fill structure
# Language will be extracted from services (user intention analysis) in fillStructure
filledStructure = await self.services.ai.fillStructure(
structure,
contentParts or [],
userPrompt,
docOperationId
)
checkWorkflowStopped(self.services)
# Step 5E: Render result
# Each document is rendered individually and may return 1..n files (e.g. HTML + images)
# Language is already validated in structure (State 3) and preserved in filled structure (State 4)
# Per-document language will be extracted in renderReport() from filledStructure
# Use validated currentUserLanguage as global fallback (always valid infrastructure)
language = self.services.currentUserLanguage if hasattr(self.services, 'currentUserLanguage') and self.services.currentUserLanguage else "en"
# IMPORTANT: Create deep copy BEFORE renderResult to preserve filledStructure with elements
# renderResult might modify the structure, so we need to preserve the original for sourceJson
# This ensures sourceJson contains the complete structure with elements for validation
filledStructureForSourceJson = copy.deepcopy(filledStructure) if filledStructure else None
renderedDocuments = await self.services.ai.renderResult(
filledStructure,
outputFormat,
language, # Global fallback (per-document language extracted from structure in renderReport)
title or "Generated Document",
userPrompt,
docOperationId
)
# Build response: convert all rendered documents to DocumentData
documentDataList = []
for renderedDoc in renderedDocuments:
try:
# Create DocumentData for each rendered document
# Use the preserved filledStructureForSourceJson (with elements) for sourceJson
docDataObj = DocumentData(
documentName=renderedDoc.filename,
documentData=renderedDoc.documentData,
mimeType=renderedDoc.mimeType,
sourceJson=filledStructureForSourceJson if len(documentDataList) == 0 else None # Only for the first document
)
documentDataList.append(docDataObj)
logger.debug(f"Added rendered document: {renderedDoc.filename} ({len(renderedDoc.documentData)} bytes, {renderedDoc.mimeType})")
except Exception as e:
logger.warning(f"Error creating document {renderedDoc.filename}: {str(e)}")
if not documentDataList:
raise ValueError("No documents were rendered")
metadata = AiResponseMetadata(
title=title or filledStructure.get("metadata", {}).get("title", "Generated Document"),
operationType=OperationTypeEnum.DATA_GENERATE.value
)
# Debug log (harmonized)
self.services.utils.writeDebugFile(
json.dumps(filledStructure, indent=2, ensure_ascii=False, default=str),
"document_generation_response"
)
self.services.chat.progressLogFinish(docOperationId, True)
return AiResponse(
content=json.dumps(filledStructure),
metadata=metadata,
documents=documentDataList
)
except Exception as e:
logger.error(f"Error in document generation: {str(e)}")
self.services.chat.progressLogFinish(docOperationId, False)
raise
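
A hedged sketch of how a workflow step might have driven this path end to end; the module path and the origin of documentList are assumptions, not confirmed by this diff:

from modules.workflows.processing.generation.documentGenerationPath import DocumentGenerationPath  # assumed path

async def runDocGeneration(services, documentList):
    path = DocumentGenerationPath(services)
    response = await path.generateDocument(
        userPrompt="Summarize the attached reports",
        documentList=documentList,
        outputFormat="pdf",
        title="Quarterly Summary"
    )
    # response.documents holds one DocumentData per rendered file
    return [(doc.documentName, doc.mimeType) for doc in response.documents]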


@ -1,128 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Image Generation Path
Handles image generation with support for single and batch generation.
"""
import logging
import time
from typing import List, Optional
from modules.datamodels.datamodelWorkflow import AiResponse, AiResponseMetadata, DocumentData
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum, AiCallRequest
logger = logging.getLogger(__name__)
class ImageGenerationPath:
"""Image generation path."""
def __init__(self, services):
self.services = services
async def generateImages(
self,
userPrompt: str,
count: int = 1,
style: Optional[str] = None,
format: str = "png",
title: Optional[str] = None,
parentOperationId: Optional[str] = None
) -> AiResponse:
"""
Generate image files.
Returns: AiResponse with image files as documents
"""
# Create operation ID
workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
imageOperationId = f"image_gen_{workflowId}_{int(time.time())}"
# Start progress tracking
self.services.chat.progressLogStart(
imageOperationId,
"Image Generation",
"Image Generation",
f"Format: {format}",
parentOperationId=parentOperationId
)
try:
self.services.chat.progressLogUpdate(imageOperationId, 0.4, "Calling AI for image generation")
# Build prompt with style if provided
imagePrompt = userPrompt
if style:
imagePrompt = f"{userPrompt}\n\nStyle: {style}"
# Use IMAGE_GENERATE operation
options = AiCallOptions(
operationType=OperationTypeEnum.IMAGE_GENERATE,
resultFormat=format
)
request = AiCallRequest(
prompt=imagePrompt,
context="",
options=options
)
response = await self.services.ai.callAi(request)
if not response.content:
errorMsg = f"No image data returned: {response.content}"
logger.error(f"Error in AI image generation: {errorMsg}")
self.services.chat.progressLogFinish(imageOperationId, False)
raise ValueError(errorMsg)
# Handle response content (could be base64 string or bytes)
imageData = response.content
if isinstance(imageData, str):
# Assume base64 encoded string
import base64
try:
imageData = base64.b64decode(imageData)
except Exception:
# If not base64, try encoding as bytes
imageData = imageData.encode('utf-8')
elif not isinstance(imageData, bytes):
imageData = bytes(imageData)
# Create document
imageDoc = DocumentData(
documentName=f"generated_image.{format}",
documentData=imageData,
mimeType=f"image/{format}"
)
metadata = AiResponseMetadata(
title=title or "Generated Image",
operationType=OperationTypeEnum.IMAGE_GENERATE.value
)
# Note: Stats are now stored centrally in callAi() - no need to duplicate here
self.services.chat.progressLogUpdate(imageOperationId, 0.9, "Image generated")
self.services.chat.progressLogFinish(imageOperationId, True)
# Create content string describing the image generation
import json
contentJson = json.dumps({
"type": "image",
"format": format,
"prompt": userPrompt,
"filename": imageDoc.documentName
}, ensure_ascii=False)
return AiResponse(
content=contentJson, # JSON string describing the image generation
metadata=metadata,
documents=[imageDoc]
)
except Exception as e:
logger.error(f"Error in image generation: {str(e)}")
self.services.chat.progressLogFinish(imageOperationId, False)
raise
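
The str/bytes normalization above is self-contained enough to lift out; a standalone sketch of the same decision (behavior mirrored for illustration, not the original helper):

import base64

def normalizeImagePayload(content):
    # Base64 text is decoded; non-base64 text is utf-8 encoded; bytes pass through
    if isinstance(content, str):
        try:
            return base64.b64decode(content)
        except Exception:
            return content.encode('utf-8')
    if isinstance(content, bytes):
        return content
    return bytes(content)

assert normalizeImagePayload(base64.b64encode(b"png-bytes").decode()) == b"png-bytes"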


@ -1,45 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Base renderer class for code format renderers.
"""
from abc import abstractmethod
from .documentRendererBaseTemplate import BaseRenderer
from modules.datamodels.datamodelDocument import RenderedDocument
from typing import Dict, Any, List, Optional
import logging
logger = logging.getLogger(__name__)
class BaseCodeRenderer(BaseRenderer):
"""Base class for code format renderers."""
@abstractmethod
async def renderCodeFiles(
self,
codeFiles: List[Dict[str, Any]],
metadata: Dict[str, Any],
userPrompt: str = None
) -> List[RenderedDocument]:
"""
Render code files to format-specific output.
Args:
codeFiles: List of file dictionaries with:
- filename: str
- fileType: str (json, csv, xml, etc.)
- content: str (generated code)
- id: str (optional)
metadata: Project metadata (language, projectType, etc.)
userPrompt: Original user prompt
Returns:
List of RenderedDocument objects (can be 1..n files)
"""
pass
def _validateCodeFile(self, codeFile: Dict[str, Any]) -> bool:
"""Validate code file structure."""
required = ['filename', 'fileType', 'content']
return all(key in codeFile for key in required)
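
A minimal hypothetical subclass, to show the contract this base class imposes (RendererCodePlainText is illustrative and did not exist in the codebase):

from modules.datamodels.datamodelDocument import RenderedDocument

class RendererCodePlainText(BaseCodeRenderer):
    """Hypothetical subclass: passes validated files through unchanged."""
    @classmethod
    def getSupportedFormats(cls):
        return ['txt']
    @classmethod
    def getOutputStyle(cls, formatName=None):
        return 'code'
    async def renderCodeFiles(self, codeFiles, metadata, userPrompt=None):
        docs = []
        for codeFile in codeFiles:
            # _validateCodeFile requires filename, fileType and content keys
            if not self._validateCodeFile(codeFile):
                continue
            docs.append(RenderedDocument(
                documentData=codeFile['content'].encode('utf-8'),
                mimeType='text/plain',
                filename=codeFile['filename'],
                metadata=dict(metadata or {})
            ))
        return docs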


@ -1,484 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Base renderer class for all format renderers.
"""
from abc import ABC, abstractmethod
from typing import Dict, Any, List, Tuple, Optional
from modules.datamodels.datamodelJson import supportedSectionTypes
from modules.datamodels.datamodelDocument import RenderedDocument
import json
import logging
import re
from datetime import datetime, UTC
import base64
import io
from PIL import Image
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum
logger = logging.getLogger(__name__)
class BaseRenderer(ABC):
"""Base class for all format renderers."""
def __init__(self, services=None):
self.logger = logger
self.services = services # Add services attribute
@classmethod
def getSupportedFormats(cls) -> List[str]:
"""
Return list of supported format names for this renderer.
Override this method in subclasses to specify supported formats.
"""
return []
@classmethod
def getFormatAliases(cls) -> List[str]:
"""
Return list of format aliases for this renderer.
Override this method in subclasses to specify format aliases.
"""
return []
@classmethod
def getPriority(cls) -> int:
"""
Return priority for this renderer (higher number = higher priority).
Used when multiple renderers support the same format.
"""
return 0
@classmethod
def getOutputStyle(cls, formatName: Optional[str] = None) -> str:
"""
Return the output style classification for this renderer.
Returns: 'code', 'document', 'image', or other (e.g., 'video' for future use)
Override this method in subclasses to specify the output style.
Args:
formatName: Optional format name (e.g., 'txt', 'js', 'csv') - useful for renderers
that handle multiple formats with different styles (e.g., RendererText)
"""
return 'document' # Default to document style
@classmethod
def getAcceptedSectionTypes(cls, formatName: Optional[str] = None) -> List[str]:
"""
Return list of section content types that this renderer accepts.
This allows renderers to declare which section types they can process.
Default implementation returns all supported section types.
Override this method in subclasses to restrict accepted types.
Args:
formatName: Optional format name (e.g., 'txt', 'js', 'csv') - useful for renderers
that handle multiple formats with different accepted types (e.g., RendererText)
Returns:
List of accepted section content types (e.g., ["table", "paragraph", "heading"])
Valid types: "table", "bullet_list", "heading", "paragraph", "code_block", "image"
"""
# Default: accept all section types
return list(supportedSectionTypes)
@abstractmethod
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
"""
Render extracted JSON content to multiple documents.
Each renderer must implement this method.
Can return 1..n documents (e.g., HTML + images).
Args:
extractedContent: Structured JSON content with sections and metadata (contains single document)
title: Report title
userPrompt: Original user prompt for context
aiService: AI service instance for additional processing
Returns:
List of RenderedDocument objects.
First document is the main document, additional documents are supporting files (e.g., images).
Even if only one document is returned, it must be wrapped in a list.
"""
pass
def _determineFilename(self, title: str, mimeType: str) -> str:
"""Determine filename from title and mimeType."""
import re
# Get extension from mimeType
extensionMap = {
"text/html": "html",
"application/pdf": "pdf",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
"text/plain": "txt",
"text/markdown": "md",
"application/json": "json",
"text/csv": "csv"
}
extension = extensionMap.get(mimeType, "txt")
# Sanitize title for filename
sanitized = re.sub(r"[^a-zA-Z0-9._-]", "_", title)
sanitized = re.sub(r"_+", "_", sanitized).strip("_")
if not sanitized:
sanitized = "document"
return f"{sanitized}.{extension}"
def _extractSections(self, reportData: Dict[str, Any]) -> List[Dict[str, Any]]:
"""
Extract sections from standardized schema: {metadata: {...}, documents: [{sections: [...]}]}
Phase 5: Supports multiple documents - extracts all sections from all documents.
"""
if "documents" not in reportData:
raise ValueError("Report data must follow standardized schema with 'documents' array")
documents = reportData.get("documents", [])
if not isinstance(documents, list) or len(documents) == 0:
raise ValueError("Standardized schema must contain at least one document in 'documents' array")
# Phase 5: Extract sections from ALL documents
all_sections = []
for doc in documents:
if isinstance(doc, dict) and "sections" in doc:
sections = doc.get("sections", [])
if isinstance(sections, list):
all_sections.extend(sections)
if not all_sections:
raise ValueError("No sections found in any document")
return all_sections
def _extractMetadata(self, reportData: Dict[str, Any]) -> Dict[str, Any]:
"""
Extract metadata from standardized schema: {metadata: {...}, documents: [{sections: [...]}]}
"""
if "metadata" not in reportData:
raise ValueError("Report data must follow standardized schema with 'metadata' field")
metadata = reportData.get("metadata", {})
if not isinstance(metadata, dict):
raise ValueError("Metadata in standardized schema must be a dictionary")
return metadata
def _getTitle(self, reportData: Dict[str, Any], fallbackTitle: str) -> str:
"""Get title from report data or use fallback."""
metadata = reportData.get('metadata', {})
return metadata.get('title', fallbackTitle)
def _validateJsonStructure(self, jsonContent: Dict[str, Any]) -> bool:
"""
Validate that JSON content follows standardized schema: {metadata: {...}, documents: [{sections: [...]}]}
"""
if not isinstance(jsonContent, dict):
return False
# Validate metadata field exists
if "metadata" not in jsonContent:
return False
if not isinstance(jsonContent.get("metadata"), dict):
return False
# Validate documents array exists and is not empty
if "documents" not in jsonContent:
return False
documents = jsonContent.get("documents", [])
if not isinstance(documents, list) or len(documents) == 0:
return False
# Validate first document has sections
firstDoc = documents[0]
if not isinstance(firstDoc, dict) or "sections" not in firstDoc:
return False
sections = firstDoc.get("sections", [])
if not isinstance(sections, list):
return False
# Validate each section has content_type and elements
for section in sections:
if not isinstance(section, dict):
return False
if "content_type" not in section or "elements" not in section:
return False
return True
def _getSectionType(self, section: Dict[str, Any]) -> str:
"""Get the type of a section; default to 'paragraph' for non-dict inputs."""
if isinstance(section, dict):
return section.get("content_type", "paragraph")
# If section is a list or any other type, treat as paragraph elements
return "paragraph"
def _getSectionData(self, section: Dict[str, Any]) -> List[Dict[str, Any]]:
"""Get the elements of a section; if a list is provided directly, return it."""
if isinstance(section, dict):
return section.get("elements", [])
if isinstance(section, list):
return section
return []
def _getSectionId(self, section: Dict[str, Any]) -> str:
"""Get the ID of a section (if available)."""
if isinstance(section, dict):
return section.get("id", "unknown")
return "unknown"
def _validateImageData(self, base64Data: str, altText: str) -> bool:
"""Validate image data."""
if not base64Data:
self.logger.warning("Image section has no base64 data")
return False
if not altText:
self.logger.warning("Image section has no alt text")
return False
# Basic base64 validation
try:
base64.b64decode(base64Data, validate=True)
return True
except Exception as e:
self.logger.warning(f"Invalid base64 image data: {str(e)}")
return False
def _getImageDimensions(self, base64Data: str) -> Tuple[int, int]:
"""
Get image dimensions from base64 data.
This is a helper method that format-specific renderers can use.
"""
try:
# Decode base64 data
imageData = base64.b64decode(base64Data)
image = Image.open(io.BytesIO(imageData))
return image.size # Returns (width, height)
except Exception as e:
self.logger.warning(f"Could not determine image dimensions: {str(e)}")
return (0, 0)
def _resizeImageIfNeeded(self, base64Data: str, maxWidth: int = 800, maxHeight: int = 600) -> str:
"""
Resize image if it exceeds maximum dimensions.
Returns the resized image as base64 string.
"""
try:
# Decode base64 data
imageData = base64.b64decode(base64Data)
image = Image.open(io.BytesIO(imageData))
# Check if resizing is needed
width, height = image.size
if width <= maxWidth and height <= maxHeight:
return base64Data # No resizing needed
# Calculate new dimensions maintaining aspect ratio
ratio = min(maxWidth / width, maxHeight / height)
newWidth = int(width * ratio)
newHeight = int(height * ratio)
# Resize image
resizedImage = image.resize((newWidth, newHeight), Image.Resampling.LANCZOS)
# Convert back to base64
buffer = io.BytesIO()
resizedImage.save(buffer, format=image.format or 'PNG')
resizedData = buffer.getvalue()
return base64.b64encode(resizedData).decode('utf-8')
except Exception as e:
self.logger.warning(f"Could not resize image: {str(e)}")
return base64Data # Return original if resize fails
def _getSupportedSectionTypes(self) -> List[str]:
"""Return list of supported section types (from unified schema)."""
return supportedSectionTypes
def _isValidSectionType(self, sectionType: str) -> bool:
"""Check if a section type is valid."""
return sectionType in self._getSupportedSectionTypes()
def _formatTimestamp(self, timestamp: str = None) -> str:
"""Format timestamp for display."""
if timestamp:
return timestamp
return datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S UTC")
# ===== GENERIC AI STYLING HELPERS =====
async def _getAiStyles(self, aiService, styleTemplate: str, defaultStyles: Dict[str, Any]) -> Dict[str, Any]:
"""
Generic AI styling method that can be used by all renderers.
Args:
aiService: AI service instance
styleTemplate: Format-specific style template
defaultStyles: Default styles to fall back to
Returns:
Dict with styling definitions
"""
if not aiService:
return defaultStyles
try:
requestOptions = AiCallOptions()
requestOptions.operationType = OperationTypeEnum.DATA_GENERATE
request = AiCallRequest(prompt=styleTemplate, context="", options=requestOptions)
# DEBUG: Show the actual prompt being sent to AI
self.logger.debug(f"AI Style Template Prompt:")
self.logger.debug(f"{styleTemplate}")
response = await aiService.callAi(request)
# Save styling prompt and response to debug (fire and forget - don't block on slow file I/O)
# The writeDebugFile calls os.listdir() which can be slow with many files
# Run in background thread to avoid blocking rendering
import threading
def _writeDebugFiles():
try:
self.services.utils.writeDebugFile(styleTemplate, "renderer_styling_prompt")
self.services.utils.writeDebugFile(response.content or '', "renderer_styling_response")
except Exception:
pass # Silently fail - debug writing should never block rendering
threading.Thread(target=_writeDebugFiles, daemon=True).start()
# Clean and parse JSON
result = response.content.strip() if response and response.content else ""
# Check if result is empty
if not result:
self.logger.warning("AI styling returned empty response, using defaults")
return defaultStyles
# Extract JSON from markdown if present
jsonMatch = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL)
if jsonMatch:
result = jsonMatch.group(1).strip()
elif result.startswith('```json'):
result = re.sub(r'^```json\s*', '', result)
result = re.sub(r'\s*```$', '', result)
elif result.startswith('```'):
result = re.sub(r'^```\s*', '', result)
result = re.sub(r'\s*```$', '', result)
# Try to parse JSON
try:
styles = json.loads(result)
except json.JSONDecodeError as jsonError:
self.logger.warning(f"AI styling returned invalid JSON: {jsonError}")
# Log the full response to a file to avoid logger truncation
self.services.utils.debugLogToFile(f"FULL AI RESPONSE THAT FAILED TO PARSE: {result}", "RENDERER")
self.services.utils.debugLogToFile(f"RESPONSE LENGTH: {len(result)} characters", "RENDERER")
self.logger.warning(f"Raw content that failed to parse: {result}")
# Try to fix incomplete JSON by adding missing closing braces
openBraces = result.count('{')
closeBraces = result.count('}')
if openBraces > closeBraces:
# JSON is incomplete, add missing closing braces
missingBraces = openBraces - closeBraces
result = result + '}' * missingBraces
self.logger.info(f"Added {missingBraces} missing closing brace(s)")
self.logger.debug(f"Fixed JSON: {result}")
# Try parsing the fixed JSON
try:
styles = json.loads(result)
self.logger.info("Successfully fixed incomplete JSON")
except json.JSONDecodeError as fixError:
self.logger.warning(f"Fixed JSON still invalid: {fixError}")
self.logger.warning(f"Fixed JSON content: {result}")
# Try to extract just the JSON part if it's embedded in text
jsonStart = result.find('{')
jsonEnd = result.rfind('}')
if jsonStart != -1 and jsonEnd != -1 and jsonEnd > jsonStart:
jsonPart = result[jsonStart:jsonEnd+1]
try:
styles = json.loads(jsonPart)
self.logger.info("Successfully extracted JSON from explanatory text")
except json.JSONDecodeError:
self.logger.warning("Could not extract valid JSON from response, using defaults")
return defaultStyles
else:
return defaultStyles
else:
# Try to extract just the JSON part if it's embedded in text
jsonStart = result.find('{')
jsonEnd = result.rfind('}')
if jsonStart != -1 and jsonEnd != -1 and jsonEnd > jsonStart:
jsonPart = result[jsonStart:jsonEnd+1]
try:
styles = json.loads(jsonPart)
self.logger.info("Successfully extracted JSON from explanatory text")
except json.JSONDecodeError:
self.logger.warning("Could not extract valid JSON from response, using defaults")
return defaultStyles
else:
return defaultStyles
# Convert colors to appropriate format
styles = self._convertColorsFormat(styles)
return styles
except Exception as e:
self.logger.warning(f"AI styling failed: {str(e)}, using defaults")
return defaultStyles
def _convertColorsFormat(self, styles: Dict[str, Any]) -> Dict[str, Any]:
"""
Convert colors to appropriate format based on renderer type.
Override this method in subclasses for format-specific color handling.
"""
return styles
def _createAiStyleTemplate(self, formatName: str, userPrompt: str, styleSchema: Dict[str, Any]) -> str:
"""
Create a standardized AI style template for any format.
Args:
formatName: Name of the format (e.g., "docx", "xlsx", "pptx")
userPrompt: User's original prompt
styleSchema: Format-specific style schema
Returns:
Formatted prompt string
"""
schemaJson = json.dumps(styleSchema, indent=4)
return f"""You are a professional document styling expert. Generate a complete JSON styling configuration for {formatName.upper()} documents.
User request: {userPrompt}
Use this schema as a template:
{schemaJson}
Requirements:
- Return ONLY the complete JSON object (no markdown, no explanations)
- If the user request contains style/formatting/design instructions (in any language), customize the styling accordingly (adapt styles and add styles if needed)
- If the user request has NO style instructions, return the default schema values unchanged
- Ensure all objects are properly closed with closing braces
- Only modify styles if style instructions are present in the user request
Return the complete JSON:"""
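
The JSON recovery in _getAiStyles (fence stripping, brace balancing, outermost-slice extraction) reads as one small routine; a best-effort sketch of the same steps, not a general JSON repairer:

import json

def parseAiJson(raw, fallback):
    # Strip markdown fences, then try: as-is, brace-balanced, outermost {...} slice
    text = raw.strip()
    if text.startswith('```'):
        text = text.strip('`')
        if text.startswith('json'):
            text = text[4:]
        text = text.strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass
    balanced = text + '}' * max(0, text.count('{') - text.count('}'))
    for candidate in (balanced, text[text.find('{'):text.rfind('}') + 1]):
        try:
            return json.loads(candidate)
        except (json.JSONDecodeError, ValueError):
            continue
    return fallback

print(parseAiJson('```json\n{"fontSize": 11\n```', fallback={}))  # {'fontSize': 11}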


@ -1,238 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Renderer registry for automatic discovery and registration of renderers.
Renderers are indexed by (format, outputStyle) so that document generation
and code generation each get the correct renderer for the same format.
"""
import logging
import importlib
from typing import Dict, Type, List, Optional, Tuple
from .documentRendererBaseTemplate import BaseRenderer
logger = logging.getLogger(__name__)
class RendererRegistry:
"""Registry for automatic renderer discovery and management.
Maintains separate renderer mappings per outputStyle ('document', 'code', etc.)
so that document-generation and code-generation paths each resolve to the
correct renderer, even when both support the same format (e.g. 'csv').
"""
def __init__(self):
# Key: (formatName, outputStyle) -> rendererClass
self._renderers: Dict[Tuple[str, str], Type[BaseRenderer]] = {}
self._format_mappings: Dict[str, str] = {}
self._discovered = False
def discoverRenderers(self) -> None:
"""Automatically discover and register all renderers by scanning files."""
if self._discovered:
return
try:
from pathlib import Path
currentDir = Path(__file__).parent
packageName = __name__.rsplit('.', 1)[0]
for filePath in currentDir.glob("*.py"):
if filePath.name in ['registry.py', 'documentRendererBaseTemplate.py', 'codeRendererBaseTemplate.py', '__init__.py']:
continue
moduleName = filePath.stem
try:
fullModuleName = f"{packageName}.{moduleName}"
module = importlib.import_module(fullModuleName)
for attrName in dir(module):
attr = getattr(module, attrName)
if (isinstance(attr, type) and
issubclass(attr, BaseRenderer) and
attr != BaseRenderer and
hasattr(attr, 'getSupportedFormats')):
self._registerRendererClass(attr)
except Exception as e:
logger.warning(f"Could not load renderer from {moduleName}: {str(e)}")
continue
self._discovered = True
except Exception as e:
logger.error(f"Error during renderer discovery: {str(e)}")
self._discovered = True
def _registerRendererClass(self, rendererClass: Type[BaseRenderer]) -> None:
"""Register a renderer class keyed by (format, outputStyle)."""
try:
supportedFormats = rendererClass.getSupportedFormats()
outputStyle = rendererClass.getOutputStyle() if hasattr(rendererClass, 'getOutputStyle') else 'document'
priority = rendererClass.getPriority() if hasattr(rendererClass, 'getPriority') else 0
for formatName in supportedFormats:
formatKey = formatName.lower()
registryKey = (formatKey, outputStyle)
if registryKey in self._renderers:
existingRenderer = self._renderers[registryKey]
existingPriority = existingRenderer.getPriority() if hasattr(existingRenderer, 'getPriority') else 0
if priority > existingPriority:
logger.debug(f"Replacing {existingRenderer.__name__} with {rendererClass.__name__} for ({formatKey}, {outputStyle}) (priority {priority} > {existingPriority})")
self._renderers[registryKey] = rendererClass
else:
logger.debug(f"Keeping {existingRenderer.__name__} for ({formatKey}, {outputStyle}) (priority {existingPriority} >= {priority})")
else:
self._renderers[registryKey] = rendererClass
# Register aliases
if hasattr(rendererClass, 'getFormatAliases'):
aliases = rendererClass.getFormatAliases()
for alias in aliases:
self._format_mappings[alias.lower()] = formatKey
logger.debug(f"Registered {rendererClass.__name__} for formats={supportedFormats}, style={outputStyle}, priority={priority}")
except Exception as e:
logger.error(f"Error registering renderer {rendererClass.__name__}: {str(e)}")
def getRenderer(self, outputFormat: str, services=None, outputStyle: str = None) -> Optional[BaseRenderer]:
"""Get a renderer instance for the specified format and style.
Args:
outputFormat: Format name (e.g. 'csv', 'json', 'pdf')
services: Services instance passed to renderer constructor
outputStyle: 'document' or 'code'. If None, returns the first match
with preference: document > code (most callers are document path).
"""
if not self._discovered:
self.discoverRenderers()
formatName = outputFormat.lower().strip()
if formatName in self._format_mappings:
formatName = self._format_mappings[formatName]
rendererClass = None
if outputStyle:
# Exact match by style
rendererClass = self._renderers.get((formatName, outputStyle))
else:
# No style specified — prefer 'document', then 'code', then any
for style in ['document', 'code']:
rendererClass = self._renderers.get((formatName, style))
if rendererClass:
break
# Fallback: check any registered style
if not rendererClass:
for key, cls in self._renderers.items():
if key[0] == formatName:
rendererClass = cls
break
if rendererClass:
try:
return rendererClass(services=services)
except Exception as e:
logger.error(f"Error creating renderer instance for {formatName}: {str(e)}")
return None
logger.warning(f"No renderer found for format={outputFormat}, style={outputStyle}")
return None
def getSupportedFormats(self) -> List[str]:
"""Get list of all supported formats."""
if not self._discovered:
self.discoverRenderers()
formats = set()
for (fmt, _style) in self._renderers.keys():
formats.add(fmt)
formats.update(self._format_mappings.keys())
return sorted(formats)
def getRendererInfo(self) -> Dict[str, Dict[str, str]]:
"""Get information about all registered renderers."""
if not self._discovered:
self.discoverRenderers()
info = {}
for (formatName, style), rendererClass in self._renderers.items():
key = f"{formatName}:{style}"
info[key] = {
'class_name': rendererClass.__name__,
'module': rendererClass.__module__,
'outputStyle': style,
'description': getattr(rendererClass, '__doc__', 'No description').strip().split('\n')[0] if rendererClass.__doc__ else 'No description'
}
return info
def getOutputStyle(self, outputFormat: str) -> Optional[str]:
"""
Get the output style classification for a given format.
When both 'document' and 'code' renderers exist for a format,
returns the default ('document') since this is called during document generation.
"""
if not self._discovered:
self.discoverRenderers()
formatName = outputFormat.lower().strip()
if formatName in self._format_mappings:
formatName = self._format_mappings[formatName]
# Check document first, then code
for style in ['document', 'code']:
rendererClass = self._renderers.get((formatName, style))
if rendererClass:
try:
return rendererClass.getOutputStyle(formatName)
except Exception:
pass
# Fallback: any style
for key, rendererClass in self._renderers.items():
if key[0] == formatName:
try:
return rendererClass.getOutputStyle(formatName)
except Exception:
pass
logger.warning(f"No renderer found for format: {outputFormat}, cannot determine output style")
return None
# Global registry instance
_registry = RendererRegistry()
def getRenderer(outputFormat: str, services=None, outputStyle: str = None) -> Optional[BaseRenderer]:
"""Get a renderer instance for the specified format and style.
Args:
outputFormat: Format name (e.g. 'csv', 'json', 'pdf')
services: Services instance
outputStyle: 'document' or 'code'. If None, prefers document renderer.
"""
return _registry.getRenderer(outputFormat, services, outputStyle=outputStyle)
def getSupportedFormats() -> List[str]:
"""Get list of all supported formats."""
return _registry.getSupportedFormats()
def getRendererInfo() -> Dict[str, Dict[str, str]]:
"""Get information about all registered renderers."""
return _registry.getRendererInfo()
def getOutputStyle(outputFormat: str) -> Optional[str]:
"""Get the output style classification for a given format."""
return _registry.getOutputStyle(outputFormat)
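
The module-level helpers made the registry a one-liner for callers; a usage sketch against the renderers in this diff (pre-refactor import path):

from modules.services.serviceGeneration.renderers.registry import (
    getRenderer, getSupportedFormats, getOutputStyle
)

# Same format, two resolutions: document path vs. code path
docRenderer = getRenderer('csv', services=None, outputStyle='document')   # RendererCsv
codeRenderer = getRenderer('csv', services=None, outputStyle='code')      # RendererCodeCsv
print(getOutputStyle('csv'))     # 'document' (document preferred when both styles exist)
print(getSupportedFormats())     # includes aliases such as 'spreadsheet' and 'table'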


@ -1,159 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
CSV code renderer for code generation.
"""
from .codeRendererBaseTemplate import BaseCodeRenderer
from modules.datamodels.datamodelDocument import RenderedDocument
from typing import Dict, Any, List, Optional
import csv
import io
class RendererCodeCsv(BaseCodeRenderer):
"""Renders CSV code files."""
@classmethod
def getSupportedFormats(cls) -> List[str]:
"""Return supported CSV formats."""
return ['csv']
@classmethod
def getFormatAliases(cls) -> List[str]:
"""Return format aliases."""
return []
@classmethod
def getPriority(cls) -> int:
"""Return priority for CSV code renderer."""
return 75 # Higher than document renderer (70) for code generation
@classmethod
def getOutputStyle(cls, formatName: Optional[str] = None) -> str:
"""Return output style classification: CSV requires specific structure."""
return 'code'
async def renderCodeFiles(
self,
codeFiles: List[Dict[str, Any]],
metadata: Dict[str, Any],
userPrompt: str = None
) -> List[RenderedDocument]:
"""
Render CSV code files.
For single file: output as-is (validate structure)
For multiple files: output separately (each is independent CSV)
"""
renderedDocs = []
for codeFile in codeFiles:
if not self._validateCodeFile(codeFile):
self.logger.warning(f"Invalid code file: {codeFile.get('filename', 'unknown')}")
continue
filename = codeFile['filename']
content = codeFile['content']
# Validate CSV structure (header row, consistent columns)
validatedContent = self._validateAndFixCsv(content)
# Extract CSV statistics for validation
csvStats = self._extractCsvStatistics(validatedContent)
# Merge file-specific metadata with project metadata
fileMetadata = dict(metadata) if metadata else {}
fileMetadata.update({
"filename": filename,
"fileType": "csv",
"statistics": csvStats
})
renderedDocs.append(
RenderedDocument(
documentData=validatedContent.encode('utf-8'),
mimeType="text/csv",
filename=filename,
metadata=fileMetadata
)
)
return renderedDocs
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
"""
Render method for document generation compatibility.
Delegates to document renderer if needed, or handles code files directly.
"""
# Check if this is code generation (has files array) or document generation (has documents array)
if "files" in extractedContent:
# Code generation path - use renderCodeFiles
files = extractedContent.get("files", [])
metadata = extractedContent.get("metadata", {})
return await self.renderCodeFiles(files, metadata, userPrompt)
else:
# Document generation path - delegate to document renderer
from .rendererCsv import RendererCsv
documentRenderer = RendererCsv(self.services)
return await documentRenderer.render(extractedContent, title, userPrompt, aiService)
def _validateAndFixCsv(self, content: str) -> str:
"""Validate CSV structure and fix common issues."""
try:
# Parse CSV to validate structure
reader = csv.reader(io.StringIO(content))
rows = list(reader)
if not rows:
return content # Empty CSV
# Check header row exists
headerRow = rows[0]
headerCount = len(headerRow)
# Validate all rows have same column count
fixedRows = [headerRow] # Start with header
for i, row in enumerate(rows[1:], 1):
if len(row) != headerCount:
self.logger.debug(f"Row {i} has {len(row)} columns, expected {headerCount}. Auto-fixing...")
# Pad or truncate to match header
if len(row) < headerCount:
row.extend([''] * (headerCount - len(row)))
else:
row = row[:headerCount]
fixedRows.append(row)
# Convert back to CSV string
output = io.StringIO()
writer = csv.writer(output)
for row in fixedRows:
writer.writerow(row)
return output.getvalue()
except Exception as e:
self.logger.warning(f"CSV validation failed: {e}, returning original content")
return content
def _extractCsvStatistics(self, content: str) -> Dict[str, Any]:
"""Extract CSV statistics for validation (row count, column count, headers)."""
try:
reader = csv.reader(io.StringIO(content))
rows = list(reader)
if not rows:
return {"rowCount": 0, "columnCount": 0, "headerRow": []}
headerRow = rows[0]
columnCount = len(headerRow)
rowCount = len(rows) - 1 # Exclude header
return {
"rowCount": rowCount,
"columnCount": columnCount,
"headerRow": headerRow,
"dataRowCount": rowCount
}
except Exception as e:
self.logger.warning(f"CSV statistics extraction failed: {e}")
return {}
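
The row-width repair above pads short rows and truncates long ones against the header; the same rule as a standalone sketch:

import csv
import io

def fixRowWidths(csvText):
    rows = list(csv.reader(io.StringIO(csvText)))
    if not rows:
        return csvText
    width = len(rows[0])
    # Pad with '' or truncate so every data row matches the header width
    fixed = [rows[0]] + [(r + [''] * (width - len(r)))[:width] for r in rows[1:]]
    out = io.StringIO()
    csv.writer(out).writerows(fixed)
    return out.getvalue()

print(fixRowWidths("a,b,c\n1,2\n1,2,3,4"))  # row 1 padded, row 2 truncated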


@ -1,141 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
JSON code renderer for code generation.
"""
from .codeRendererBaseTemplate import BaseCodeRenderer
from modules.datamodels.datamodelDocument import RenderedDocument
from typing import Dict, Any, List, Optional
import json
class RendererCodeJson(BaseCodeRenderer):
"""Renders JSON code files."""
@classmethod
def getSupportedFormats(cls) -> List[str]:
"""Return supported JSON formats."""
return ['json']
@classmethod
def getFormatAliases(cls) -> List[str]:
"""Return format aliases."""
return []
@classmethod
def getPriority(cls) -> int:
"""Return priority for JSON code renderer."""
return 85 # Higher than document renderer (80) for code generation
@classmethod
def getOutputStyle(cls, formatName: Optional[str] = None) -> str:
"""Return output style classification: JSON is structured data format."""
return 'code'
async def renderCodeFiles(
self,
codeFiles: List[Dict[str, Any]],
metadata: Dict[str, Any],
userPrompt: str = None
) -> List[RenderedDocument]:
"""
Render JSON code files.
For single file: output as-is
For multiple files: output separately (each file is independent JSON)
"""
renderedDocs = []
for codeFile in codeFiles:
if not self._validateCodeFile(codeFile):
self.logger.warning(f"Invalid code file: {codeFile.get('filename', 'unknown')}")
continue
filename = codeFile['filename']
content = codeFile['content']
# Validate JSON syntax and extract statistics
parsed = None
try:
parsed = json.loads(content) # Validate JSON
except json.JSONDecodeError as e:
self.logger.warning(f"Invalid JSON in {filename}: {e}")
# Could fix/format JSON here if needed
# Format JSON (pretty print)
try:
if parsed is None:
parsed = json.loads(content)
formattedContent = json.dumps(parsed, indent=2, ensure_ascii=False)
except Exception:
formattedContent = content # Use original if formatting fails
# Extract JSON statistics for validation
jsonStats = self._extractJsonStatistics(parsed) if parsed else {}
# Merge file-specific metadata with project metadata
fileMetadata = dict(metadata) if metadata else {}
fileMetadata.update({
"filename": filename,
"fileType": "json",
"statistics": jsonStats
})
renderedDocs.append(
RenderedDocument(
documentData=formattedContent.encode('utf-8'),
mimeType="application/json",
filename=filename,
metadata=fileMetadata
)
)
return renderedDocs
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
"""
Render method for document generation compatibility.
Delegates to document renderer if needed, or handles code files directly.
"""
# Check if this is code generation (has files array) or document generation (has documents array)
if "files" in extractedContent:
# Code generation path - use renderCodeFiles
files = extractedContent.get("files", [])
metadata = extractedContent.get("metadata", {})
return await self.renderCodeFiles(files, metadata, userPrompt)
else:
# Document generation path - delegate to document renderer
# Import here to avoid circular dependency
from .rendererJson import RendererJson
documentRenderer = RendererJson(self.services)
return await documentRenderer.render(extractedContent, title, userPrompt, aiService)
def _extractJsonStatistics(self, parsed: Any) -> Dict[str, Any]:
"""Extract JSON statistics for validation (object count, array count, key count)."""
try:
stats = {
"isArray": isinstance(parsed, list),
"isObject": isinstance(parsed, dict),
"itemCount": 0,
"keyCount": 0
}
if isinstance(parsed, list):
stats["itemCount"] = len(parsed)
# Count nested objects/arrays
objectCount = sum(1 for item in parsed if isinstance(item, dict))
arrayCount = sum(1 for item in parsed if isinstance(item, list))
stats["objectCount"] = objectCount
stats["arrayCount"] = arrayCount
elif isinstance(parsed, dict):
stats["keyCount"] = len(parsed)
stats["keys"] = list(parsed.keys())
# Count nested objects/arrays
objectCount = sum(1 for v in parsed.values() if isinstance(v, dict))
arrayCount = sum(1 for v in parsed.values() if isinstance(v, list))
stats["objectCount"] = objectCount
stats["arrayCount"] = arrayCount
return stats
except Exception as e:
self.logger.warning(f"JSON statistics extraction failed: {e}")
return {}
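
The statistics block is what downstream validation compares against; an example of its shape for a small object (sample keys are illustrative only):

import json

sample = json.loads('{"users": [{"id": 1}], "active": true}')
# Mirrors the dict branch of _extractJsonStatistics above
stats = {
    "isArray": isinstance(sample, list),
    "isObject": isinstance(sample, dict),
    "keyCount": len(sample),
    "keys": list(sample.keys()),
    "objectCount": sum(1 for v in sample.values() if isinstance(v, dict)),
    "arrayCount": sum(1 for v in sample.values() if isinstance(v, list)),
}
print(stats)  # isObject=True, keyCount=2, objectCount=0, arrayCount=1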


@ -1,148 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
XML code renderer for code generation.
"""
from .codeRendererBaseTemplate import BaseCodeRenderer
from modules.datamodels.datamodelDocument import RenderedDocument
from typing import Dict, Any, List, Optional
import xml.etree.ElementTree as ET
from xml.dom import minidom
class RendererCodeXml(BaseCodeRenderer):
"""Renders XML code files."""
@classmethod
def getSupportedFormats(cls) -> List[str]:
"""Return supported XML formats."""
return ['xml']
@classmethod
def getFormatAliases(cls) -> List[str]:
"""Return format aliases."""
return []
@classmethod
def getPriority(cls) -> int:
"""Return priority for XML code renderer."""
return 80
@classmethod
def getOutputStyle(cls, formatName: Optional[str] = None) -> str:
"""Return output style classification: XML is structured data format."""
return 'code'
async def renderCodeFiles(
self,
codeFiles: List[Dict[str, Any]],
metadata: Dict[str, Any],
userPrompt: str = None
) -> List[RenderedDocument]:
"""
Render XML code files.
Validates XML syntax and formats (pretty print).
"""
renderedDocs = []
for codeFile in codeFiles:
if not self._validateCodeFile(codeFile):
self.logger.warning(f"Invalid code file: {codeFile.get('filename', 'unknown')}")
continue
filename = codeFile['filename']
content = codeFile['content']
# Validate and format XML
formattedContent = self._validateAndFormatXml(content)
# Extract XML statistics for validation
xmlStats = self._extractXmlStatistics(formattedContent)
# Merge file-specific metadata with project metadata
fileMetadata = dict(metadata) if metadata else {}
fileMetadata.update({
"filename": filename,
"fileType": "xml",
"statistics": xmlStats
})
renderedDocs.append(
RenderedDocument(
documentData=formattedContent.encode('utf-8'),
mimeType="application/xml",
filename=filename,
metadata=fileMetadata
)
)
return renderedDocs
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
"""
Render method for document generation compatibility.
For XML, we only support code generation (no document renderer exists yet).
"""
# Check if this is code generation (has files array)
if "files" in extractedContent:
# Code generation path - use renderCodeFiles
files = extractedContent.get("files", [])
metadata = extractedContent.get("metadata", {})
return await self.renderCodeFiles(files, metadata, userPrompt)
else:
# Document generation path - not supported yet, return error
self.logger.warning("XML document generation not supported, only code generation")
return [
RenderedDocument(
documentData=f"XML document generation not yet supported".encode('utf-8'),
mimeType="text/plain",
filename="error.txt",
metadata={}
)
]
def _validateAndFormatXml(self, content: str) -> str:
"""Validate XML syntax and format (pretty print)."""
try:
# Parse XML to validate
root = ET.fromstring(content)
# Format XML (pretty print)
rough_string = ET.tostring(root, encoding='unicode')
reparsed = minidom.parseString(rough_string)
formatted = reparsed.toprettyxml(indent=" ")
# Remove extra blank lines
lines = [line for line in formatted.split('\n') if line.strip()]
return '\n'.join(lines)
except ET.ParseError as e:
self.logger.warning(f"Invalid XML: {e}, returning original content")
return content
except Exception as e:
self.logger.warning(f"XML formatting failed: {e}, returning original content")
return content
def _extractXmlStatistics(self, content: str) -> Dict[str, Any]:
"""Extract XML statistics for validation (element count, attribute count, root element)."""
try:
root = ET.fromstring(content)
# Count all elements recursively
elementCount = len(list(root.iter()))
# Count attributes
attributeCount = sum(len(elem.attrib) for elem in root.iter())
# Get root element name
rootElement = root.tag
return {
"elementCount": elementCount,
"attributeCount": attributeCount,
"rootElement": rootElement,
"hasRoot": True
}
except Exception as e:
self.logger.warning(f"XML statistics extraction failed: {e}")
return {}
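
The validate-then-pretty-print round trip uses only the standard library; a compact sketch of the same steps:

import xml.etree.ElementTree as ET
from xml.dom import minidom

def prettyXml(xmlText):
    root = ET.fromstring(xmlText)  # raises ET.ParseError on invalid XML
    rough = ET.tostring(root, encoding='unicode')
    formatted = minidom.parseString(rough).toprettyxml(indent="  ")
    # Drop the blank lines minidom likes to insert
    return '\n'.join(line for line in formatted.split('\n') if line.strip())

print(prettyXml('<config><db host="localhost"/></config>'))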


@ -1,415 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
CSV renderer for report generation.
"""
from .documentRendererBaseTemplate import BaseRenderer
from modules.datamodels.datamodelDocument import RenderedDocument
from typing import Dict, Any, List, Optional
class RendererCsv(BaseRenderer):
"""Renders content to CSV format with format-specific extraction."""
@classmethod
def getSupportedFormats(cls) -> List[str]:
"""Return supported CSV formats."""
return ['csv']
@classmethod
def getFormatAliases(cls) -> List[str]:
"""Return format aliases."""
return ['spreadsheet', 'table']
@classmethod
def getPriority(cls) -> int:
"""Return priority for CSV renderer."""
return 70
@classmethod
def getOutputStyle(cls, formatName: Optional[str] = None) -> str:
"""Return output style classification: CSV document renderer converts structured document content to CSV."""
return 'document'
@classmethod
def getAcceptedSectionTypes(cls, formatName: Optional[str] = None) -> List[str]:
"""
Return list of section content types that CSV renderer accepts.
CSV renderer accepts table sections and code_block sections (for raw CSV content).
"""
return ["table", "code_block"]
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
"""Render extracted JSON content to CSV format. Produces one CSV file per table section."""
try:
# Validate JSON structure
if not self._validateJsonStructure(extractedContent):
raise ValueError("JSON content must follow standardized schema: {metadata: {...}, documents: [{sections: [...]}]}")
# Extract sections and metadata
sections = self._extractSections(extractedContent)
metadata = self._extractMetadata(extractedContent)
# Determine base filename from document or title
documents = extractedContent.get("documents", [])
baseFilename = None
if documents and isinstance(documents[0], dict):
baseFilename = documents[0].get("filename")
if not baseFilename:
baseFilename = self._determineFilename(title, "text/csv")
# Remove extension from base filename if present
if baseFilename.endswith('.csv'):
baseFilename = baseFilename[:-4]
# Collect CSV-producing sections: table sections AND code_block sections with CSV language
tableSections = []
codeBlockCsvSections = []
for section in sections:
sectionType = section.get("content_type", "paragraph")
if sectionType == "table":
tableSections.append(section)
elif sectionType == "code_block":
# Check if any element is a code_block with language "csv"
for element in section.get("elements", []):
content = element.get("content", {})
if isinstance(content, dict) and content.get("language", "").lower() == "csv":
codeBlockCsvSections.append(section)
break
# If no usable sections found, return empty CSV
if not tableSections and not codeBlockCsvSections:
self.logger.warning("No table or CSV code_block sections found in CSV document - returning empty CSV")
emptyCsv = self._convertRowsToCsv([["No table data available"]])
return [
RenderedDocument(
documentData=emptyCsv.encode('utf-8'),
mimeType="text/csv",
filename=self._determineFilename(title, "text/csv"),
documentType=metadata.get("documentType") if isinstance(metadata, dict) else None,
metadata=metadata if isinstance(metadata, dict) else None
)
]
allCsvSections = tableSections + codeBlockCsvSections
# Generate one CSV file per section
renderedDocuments = []
for i, csvSection in enumerate(allCsvSections):
sectionType = csvSection.get("content_type", "paragraph")
sectionTitle = csvSection.get("title")
csvContent = ""
if sectionType == "code_block":
# Extract raw CSV content directly from code_block elements
rawCsvParts = []
for element in csvSection.get("elements", []):
content = element.get("content", {})
if isinstance(content, dict) and content.get("language", "").lower() == "csv":
code = content.get("code", "")
if code:
rawCsvParts.append(code)
csvContent = "\n".join(rawCsvParts)
else:
# Table section — render via table logic
csvRows = []
if sectionTitle:
csvRows.append([sectionTitle])
csvRows.append([]) # Empty row after title
elements = csvSection.get("elements", [])
for element in elements:
tableRows = self._renderJsonTableToCsv(element)
if tableRows:
csvRows.extend(tableRows)
csvContent = self._convertRowsToCsv(csvRows)
# Determine filename
if len(allCsvSections) == 1:
filename = f"{baseFilename}.csv"
else:
sectionId = csvSection.get("id", f"csv_{i+1}")
if sectionTitle:
safeTitle = "".join(c for c in sectionTitle if c.isalnum() or c in (' ', '-', '_')).strip()
safeTitle = safeTitle.replace(' ', '_')[:30]
filename = f"{baseFilename}_{safeTitle}.csv"
else:
filename = f"{baseFilename}_{sectionId}.csv"
documentType = metadata.get("documentType") if isinstance(metadata, dict) else None
renderedDocuments.append(
RenderedDocument(
documentData=csvContent.encode('utf-8'),
mimeType="text/csv",
filename=filename,
documentType=documentType,
metadata=metadata if isinstance(metadata, dict) else None
)
)
return renderedDocuments
except Exception as e:
self.logger.error(f"Error rendering CSV: {str(e)}")
# Return minimal CSV fallback
fallbackCsv = self._convertRowsToCsv([["Title", "Content"], [title, f"Error rendering report: {str(e)}"]])
return [
RenderedDocument(
documentData=fallbackCsv.encode('utf-8'),
mimeType="text/csv",
filename=self._determineFilename(title, "text/csv"),
metadata=extractedContent.get("metadata", {}) if extractedContent else None
)
]
async def _generateCsvFromJson(self, jsonContent: Dict[str, Any], title: str) -> str:
"""Generate CSV content from structured JSON document. DEPRECATED: Use render() method instead."""
# This method is kept for backward compatibility but is no longer used
# The render() method now handles CSV generation directly
try:
# Validate JSON structure (standardized schema: {metadata: {...}, documents: [{sections: [...]}]})
if not self._validateJsonStructure(jsonContent):
raise ValueError("JSON content must follow standardized schema: {metadata: {...}, documents: [{sections: [...]}]}")
# Extract sections and metadata from standardized schema
sections = self._extractSections(jsonContent)
metadata = self._extractMetadata(jsonContent)
# Use provided title (which comes from documents[].title) as primary source
# Fallback to metadata.title only if title parameter is empty
documentTitle = title if title else metadata.get("title", "Generated Document")
# Generate CSV content
csvRows = []
# Add title row
if documentTitle:
csvRows.append([documentTitle])
csvRows.append([]) # Empty row
# Process each section in order - only table sections
for section in sections:
sectionType = section.get("content_type", "paragraph")
if sectionType == "table":
sectionCsv = self._renderJsonSectionToCsv(section)
if sectionCsv:
csvRows.extend(sectionCsv)
csvRows.append([]) # Empty row between sections
# Convert to CSV string
csvContent = self._convertRowsToCsv(csvRows)
return csvContent
except Exception as e:
self.logger.error(f"Error generating CSV from JSON: {str(e)}")
raise Exception(f"CSV generation failed: {str(e)}")
def _renderJsonSectionToCsv(self, section: Dict[str, Any]) -> List[List[str]]:
"""Render a single JSON section to CSV rows."""
try:
sectionType = section.get("content_type", "paragraph")
elements = section.get("elements", [])
csvRows = []
# Add section title if available
sectionTitle = section.get("title")
if sectionTitle:
csvRows.append([f"# {sectionTitle}"])
# Process each element in the section
for element in elements:
if sectionType == "table":
csvRows.extend(self._renderJsonTableToCsv(element))
elif sectionType == "list":
csvRows.extend(self._renderJsonListToCsv(element))
elif sectionType == "heading":
csvRows.extend(self._renderJsonHeadingToCsv(element))
elif sectionType == "paragraph":
csvRows.extend(self._renderJsonParagraphToCsv(element))
elif sectionType == "code":
csvRows.extend(self._renderJsonCodeToCsv(element))
else:
# Fallback to paragraph for unknown types
csvRows.extend(self._renderJsonParagraphToCsv(element))
return csvRows
except Exception as e:
self.logger.warning(f"Error rendering section {section.get('id', 'unknown')}: {str(e)}")
return [["[Error rendering section]"]]
def _renderJsonTableToCsv(self, tableData: Dict[str, Any]) -> List[List[str]]:
"""Render a JSON table to CSV rows."""
try:
# Extract from nested content structure
content = tableData.get("content", {})
if not isinstance(content, dict):
return []
headers = content.get("headers", [])
rows = content.get("rows", [])
csvRows = []
if headers:
csvRows.append(headers)
if rows:
csvRows.extend(rows)
return csvRows
except Exception as e:
self.logger.warning(f"Error rendering table: {str(e)}")
return [["[Error rendering table]"]]
def _renderJsonListToCsv(self, listData: Dict[str, Any]) -> List[List[str]]:
"""Render a JSON list to CSV rows."""
try:
# Extract from nested content structure
content = listData.get("content", {})
if not isinstance(content, dict):
return []
items = content.get("items", [])
csvRows = []
for item in items:
if isinstance(item, dict):
text = item.get("text", "")
subitems = item.get("subitems", [])
csvRows.append([text])
# Add subitems as indented rows
for subitem in subitems:
if isinstance(subitem, dict):
csvRows.append([f" - {subitem.get('text', '')}"])
else:
csvRows.append([f" - {subitem}"])
else:
csvRows.append([str(item)])
return csvRows
except Exception as e:
self.logger.warning(f"Error rendering list: {str(e)}")
return [["[Error rendering list]"]]
def _renderJsonHeadingToCsv(self, headingData: Dict[str, Any]) -> List[List[str]]:
"""Render a JSON heading to CSV rows."""
try:
# Extract from nested content structure
content = headingData.get("content", {})
if not isinstance(content, dict):
return []
text = content.get("text", "")
level = content.get("level", 1)
if text:
# Use # symbols for heading levels
headingText = f"{'#' * level} {text}"
return [[headingText]]
return []
except Exception as e:
self.logger.warning(f"Error rendering heading: {str(e)}")
return [["[Error rendering heading]"]]
def _renderJsonParagraphToCsv(self, paragraphData: Dict[str, Any]) -> List[List[str]]:
"""Render a JSON paragraph to CSV rows."""
try:
# Extract from nested content structure
content = paragraphData.get("content", {})
if isinstance(content, dict):
text = content.get("text", "")
elif isinstance(content, str):
text = content
else:
text = ""
if text:
# Split long paragraphs into multiple rows if needed
if len(text) > 100:
words = text.split()
rows = []
currentRow = []
currentLength = 0
for word in words:
if currentLength + len(word) > 100 and currentRow:
rows.append([" ".join(currentRow)])
currentRow = [word]
currentLength = len(word)
else:
currentRow.append(word)
currentLength += len(word) + 1
if currentRow:
rows.append([" ".join(currentRow)])
return rows
else:
return [[text]]
return []
except Exception as e:
self.logger.warning(f"Error rendering paragraph: {str(e)}")
return [["[Error rendering paragraph]"]]
def _renderJsonCodeToCsv(self, codeData: Dict[str, Any]) -> List[List[str]]:
"""Render a JSON code block to CSV rows."""
try:
# Extract from nested content structure
content = codeData.get("content", {})
if not isinstance(content, dict):
return []
code = content.get("code", "")
language = content.get("language", "")
csvRows = []
if language:
csvRows.append([f"Code ({language}):"])
if code:
# Split code into lines
codeLines = code.split('\n')
for line in codeLines:
csvRows.append([f" {line}"])
return csvRows
except Exception as e:
self.logger.warning(f"Error rendering code block: {str(e)}")
return [["[Error rendering code block]"]]
def _convertRowsToCsv(self, rows: List[List[str]]) -> str:
"""Convert rows to CSV string."""
import csv
import io
output = io.StringIO()
writer = csv.writer(output)
for row in rows:
if row: # Only write non-empty rows
writer.writerow(row)
return output.getvalue()
def _cleanCsvContent(self, content: str, title: str) -> str:
"""Clean and validate CSV content from AI."""
content = content.strip()
# Remove markdown code blocks if present
if content.startswith("```") and content.endswith("```"):
lines = content.split('\n')
if len(lines) > 2:
content = '\n'.join(lines[1:-1]).strip()
return content
