refactor: modules/services/ replaced by serviceCenter + serviceHub

serviceCenter = DI container (resolver, registry, context) for service instantiation
serviceHub = consumer-facing aggregation (DB interfaces, runtime state, lazy service resolution via serviceCenter)

- modules/serviceHub/ created: ServiceHub, PublicService, getInterface()
- 22 consumer files migrated (routes, features, tests): imports switched from modules.services to serviceHub or serviceCenter
- resolver.py: legacy fallback to the old services/ removed
- modules/services/ deleted entirely (83 files, incl. dead code mainAiChat.py)
- pre-extraction: progress callback propagated through the chunk pipeline; operationType DATA_EXTRACT -> DATA_ANALYSE to use a cheaper model
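For orientation, a minimal sketch of the two entry points this refactor leaves in place, pieced together from the call sites changed below (routes use getInterface from modules.serviceHub; services are built via getService plus a ServiceCenterContext). The parameter names of getInterface beyond the user argument and the lazily resolved hub attributes are assumptions, not taken from this diff.

    # Sketch only - reconstructed from call sites in this commit, not the canonical API.
    from modules.serviceHub import getInterface as getServices      # consumer-facing hub
    from modules.serviceCenter import getService                     # DI container
    from modules.serviceCenter.context import ServiceCenterContext

    def resolveServices(user, mandateId, featureInstanceId=None, workflow=None):
        # Consumer-facing path: one hub object; individual services are resolved
        # lazily through serviceCenter the first time they are accessed.
        # (Keyword names after `user` are assumed from the old Services signature.)
        services = getServices(user, workflow, mandateId=mandateId,
                               featureInstanceId=featureInstanceId)

        # Direct DI path: build a context and ask the container for a single service.
        ctx = ServiceCenterContext(
            user=user,
            mandate_id=mandateId,
            feature_instance_id=featureInstanceId,
            workflow=workflow,
        )
        aiService = getService("ai", ctx)   # no legacy_hub fallback anymore
        return services, aiService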
parent 6919a23d4f
commit c8b7517209
144 changed files with 1050 additions and 38313 deletions
@ -44,7 +44,7 @@ APP_FRONTEND_URL = http://localhost:5176
|
|||
# AI configuration
|
||||
Connector_AiOpenai_API_SECRET = DEV_ENC:Z0FBQUFBQnBaSnM4TWFRRmxVQmNQblVIYmc1Y0Q3aW9zZUtDWlNWdGZjbFpncGp2NHN2QjkxMWxibUJnZDBId252MWk5TXN3Yk14ajFIdi1CTkx2ZWx2QzF5OFR6LUx5azQ3dnNLaXJBOHNxc0tlWmtZcTFVelF4eXBSM2JkbHd2eTM0VHNXdHNtVUprZWtPVzctNlJsZHNmM20tU1N6Q1Q2cHFYSi1tNlhZNDNabTVuaEVGWmIydEhadTcyMlBURmw2aUJxOF9GTzR0dTZiNGZfOFlHaVpPZ1A1LXhhOEFtN1J5TEVNNWtMcGpyNkMzSl8xRnZsaTF1WTZrOUZmb0cxVURjSGFLS2dIYTQyZEJtTm90bEYxVWxNNXVPdTVjaVhYbXhxT3JsVDM5VjZMVFZKSE1tZnM9
|
||||
Connector_AiAnthropic_API_SECRET = DEV_ENC:Z0FBQUFBQm8xSUpENmFBWG16STFQUVZxNzZZRzRLYTA4X3lRanF1VkF4cU45OExNMzlsQmdISGFxTUxud1dXODBKcFhMVG9KNjdWVnlTTFFROVc3NDlsdlNHLUJXeG41NDBHaXhHR0VHVWl5UW9RNkVWbmlhakRKVW5pM0R4VHk0LUw0TV9LdkljNHdBLXJua21NQkl2b3l4UkVkMGN1YjBrMmJEeWtMay1jbmxrYWJNbUV0aktCXzU1djR2d2RSQXZORTNwcG92ZUVvVGMtQzQzTTVncEZTRGRtZUFIZWQ0dz09
|
||||
Connector_AiPerplexity_API_SECRET = DEV_ENC:Z0FBQUFBQm82Mzk2Q1MwZ0dNcUVBcUtuRDJIcTZkMXVvYnpjM3JEMzJiT1NKSHljX282ZDIyZTJYc09VSTdVNXAtOWU2UXp5S193NTk5dHJsWlFjRjhWektFOG1DVGY4ZUhHTXMzS0RPN1lNcF9nSlVWbW5BZ1hkZDVTejl6bVZNRFVvX29xamJidWRFMmtjQmkyRUQ2RUh6UTN1aWNPSUJBPT0=
|
||||
Connector_AiPerplexity_API_SECRET = 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
|
||||
Connector_AiTavily_API_SECRET = DEV_ENC:Z0FBQUFBQm8xSUpEQTdnUHMwd2pIaXNtMmtCTFREd0pyQXRKb1F5eGtHSnkyOGZiUnlBOFc0b3Vzcndrc3ViRm1nMDJIOEZKYWxqdWNkZGh5N0Z4R0JlQmxXSG5pVnJUR2VYckZhMWNMZ1FNeXJ3enJLVlpiblhOZTNleUg3ZzZyUzRZanFSeDlVMkI=
|
||||
Connector_AiPrivateLlm_API_SECRET = DEV_ENC:Z0FBQUFBQnBudkpGRHM5eFdUVmVZU1R1cHBwN1RlMUx4T0NlLTJLUFFVX3J2OElDWFpuZmJHVmp4Z3BNNWMwZUVVZUd2TFhRSjVmVkVlcFlVRWtybXh0ZHloZ01ZcnVvX195YjdlWVdEcjZSWFFTTlNBWUlaTlNoLWhqVFBIb0thVlBiaWhjYjFQOFY=
|
||||
Connector_AiMistral_API_SECRET = DEV_ENC:Z0FBQUFBQnBudkpGeEQxYUIxOHhia0JlQWpWQ2dWQWZzY3l6SWwyUnJoR1hRQWloX2lxb2lGNkc4UnA4U2tWNjJaYzB1d1hvNG9fWUp1N3V4OW9FMGhaWVhjSlVwWEc1X2loVDBSZDEtdHdfcTA5QkcxQTR4OHc4RkRzclJrU2d1RFZpNDJkRDRURlE=
|
||||
@ -44,7 +44,7 @@ APP_FRONTEND_URL = https://nyla-int.poweron-center.net
|
|||
# AI configuration
|
||||
Connector_AiOpenai_API_SECRET = INT_ENC:Z0FBQUFBQnBaSnM4MENkQ2xJVmE5WFZKUkh2SHJFby1YVXN3ZmVxRkptS3ZWRmlwdU93ZEJjSjlMV2NGbU5mS3NCdmFfcmFYTEJNZXFIQ3ozTWE4ZC1pemlQNk9wbjU1d3BPS0ZCTTZfOF8yWmVXMWx0TU1DamlJLVFhSTJXclZsY3hMVWlPcXVqQWtMdER4T252NHZUWEhUOTdIN1VGR3ltazEweXFqQ0lvb0hYWmxQQnpxb0JwcFNhRDNGWXdoRTVJWm9FalZpTUF5b1RqZlRaYnVKYkp0NWR5Vko1WWJ0Wmg2VWJzYXZ0Z3Q4UkpsTldDX2dsekhKMmM4YjRoa2RwemMwYVQwM2cyMFlvaU5mOTVTWGlROU8xY2ZVRXlxZzJqWkxURWlGZGI2STZNb0NpdEtWUnM9
|
||||
Connector_AiAnthropic_API_SECRET = INT_ENC:Z0FBQUFBQm8xSVRjT1ZlRWVJdVZMT3ljSFJDcFdxRFBRVkZhS204NnN5RDBlQ0tpenhTM0FFVktuWW9mWHNwRWx2dHB0eDBSZ0JFQnZKWlp6c01pVGREWHd1eGpERnU0Q2xhaks1clQ1ZXVsdnd2ZzhpNXNQS1BhY3FjSkdkVEhHalNaRGR4emhpakZncnpDQUVxOHVXQzVUWmtQc0FsYmFwTF9TSG5FOUFtWk5Ick1NcHFvY2s1T1c2WXlRUFFJZnh6TWhuaVpMYmppcDR0QUx0a0R6RXlwbGRYb1R4dzJkUT09
|
||||
Connector_AiPerplexity_API_SECRET = INT_ENC:Z0FBQUFBQm82Mzk2UWZJdUFhSW8yc3RKc0tKRXphd0xWMkZOVlFpSGZ4SGhFWnk0cTF5VjlKQVZjdS1QSWdkS0pUSWw4OFU5MjUxdTVQel9aeWVIZTZ5TXRuVmFkZG0zWEdTOGdHMHpsTzI0TGlWYURKU1Q0VVpKTlhxUk5FTmN6SUJScDZ3ZldIaUJZcWpaQVRiSEpyQm9tRTNDWk9KTnZBPT0=
|
||||
Connector_AiPerplexity_API_SECRET = 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
|
||||
Connector_AiTavily_API_SECRET = INT_ENC:Z0FBQUFBQm8xSVRkdkJMTDY0akhXNzZDWHVYSEt1cDZoOWEzSktneHZEV2JndTNmWlNSMV9KbFNIZmQzeVlrNE5qUEIwcUlBSGM1a0hOZ3J6djIyOVhnZzI3M1dIUkdicl9FVXF3RGktMmlEYmhnaHJfWTdGUkktSXVUSGdQMC1vSEV6VE8zR2F1SVk=
|
||||
Connector_AiPrivateLlm_API_SECRET = INT_ENC:Z0FBQUFBQnBudkpGSjZ1NWh0aWc1R3Z4MHNaeS1HamtUbndhcUZFZDlqUDhjSmg5eHFfdlVkU0RsVkJ2UVRaMWs3aWhraG5jSlc0YkxNWHVmR2JoSW5ENFFCdkJBM0VienlKSnhzNnBKbTJOUTFKczRfWlQ3bWpmUkRTT1I1OGNUSTlQdExacGRpeXg=
|
||||
Connector_AiMistral_API_SECRET = INT_ENC:Z0FBQUFBQnBudkpGZTNtZ1E4TWIxSEU1OUlreUpxZkJIR0Vxcm9xRHRUbnBxbTQ1cXlkbnltWkJVdTdMYWZ4c3Fsam42TERWUTVhNzZFMU9xVjdyRGFCYml6bmZsZFd2YmJzemlrSWN6Q3o3X0NXX2xXNUQteTNONHdKYzJ5YVpLLWdhU2JhSTJQZnI=
|
||||
@ -44,7 +44,7 @@ APP_FRONTEND_URL = https://nyla.poweron-center.net
|
|||
# AI configuration
|
||||
Connector_AiOpenai_API_SECRET = PROD_ENC:Z0FBQUFBQnBaSnM4TWJOVm4xVkx6azRlNDdxN3UxLUdwY2hhdGYxRGp4VFJqYXZIcmkxM1ZyOWV2M0Z4MHdFNkVYQ0ROb1d6LUZFUEdvMHhLMEtXYVBCRzM5TlYyY3ROYWtJRk41cDZxd0tYYi00MjVqMTh4QVcyTXl0bmVocEFHbXQwREpwNi1vODdBNmwzazE5bkpNelE2WXpvblIzWlQwbGdEelI2WXFqT1RibXVHcjNWbVhwYzBOM25XTzNmTDAwUjRvYk4yNjIyZHc5c2RSZzREQUFCdUwyb0ZuOXN1dzI2c2FKdXI4NGxEbk92czZWamJXU3ZSbUlLejZjRklRRk4tLV9aVUFZekI2bTU4OHYxNTUybDg3RVo0ZTh6dXNKRW5GNXVackZvcm9laGI0X3R6V3M9
|
||||
Connector_AiAnthropic_API_SECRET = PROD_ENC:Z0FBQUFBQnBDM1Z3TnhYdlhSLW5RbXJyMHFXX0V0bHhuTDlTaFJsRDl2dTdIUTFtVFAwTE8tY3hLbzNSMnVTLXd3RUZualN3MGNzc1kwOTIxVUN2WW1rYi1TendFRVVBSVNqRFVjckEzNExyTGNaUkJLMmozazUwemI1cnhrcEtZVXJrWkdaVFFramp3MWZ6RmY2aGlRMXVEYjM2M3ZlbmxMdnNCRDM1QWR0Wmd6MWVnS1I1c01nV3hRLXg3d2NTZXVfTi1Wdm16UnRyNGsyRTZ0bG9TQ1g1OFB5Z002bmQ3QT09
|
||||
Connector_AiPerplexity_API_SECRET = PROD_ENC:Z0FBQUFBQm82Mzk2Q1FGRkJEUkI4LXlQbHYzT2RkdVJEcmM4WGdZTWpJTEhoeUF1NW5LUVpJdDBYN3k1WFN4a2FQSWJSQmd0U0xJbzZDTmFFN05FcXl0Z3V1OEpsZjYydV94TXVjVjVXRTRYSWdLMkd5XzZIbFV6emRCZHpuOUpQeThadE5xcDNDVGV1RHJrUEN0c1BBYXctZFNWcFRuVXhRPT0=
|
||||
Connector_AiPerplexity_API_SECRET = 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
|
||||
Connector_AiTavily_API_SECRET = PROD_ENC:Z0FBQUFBQnBDM1Z3NmItcDh6V0JpcE5Jc0NlUWZqcmllRHB5eDlNZmVnUlNVenhNTm5xWExzbjJqdE1GZ0hTSUYtb2dvdWNhTnlQNmVWQ2NGVDgwZ0MwMWZBMlNKWEhzdlF3TlZzTXhCZWM4Z1Uwb18tSTRoU1JBVTVkSkJHOTJwX291b3dPaVphVFg=
|
||||
Connector_AiPrivateLlm_API_SECRET = PROD_ENC:Z0FBQUFBQnBudkpGanZ6U3pzZWkwXzVPWGtIQ040XzFrTXc5QWRnazdEeEktaUJ0akJmNnEzbWUzNHczLTJfc2dIdzBDY0FTaXZYcDhxNFdNbTNtbEJTb2VRZ0ZYd05hdlNLR1h6SUFzVml2Z1FLY1BjTl90UWozUGxtak1URnhhZmNDRWFTb0dKVUo=
|
||||
Connector_AiMistral_API_SECRET = PROD_ENC:Z0FBQUFBQnBudkpGc2tQc2lvMk1YZk01Q1dob1U5cnR0dG03WWE3WkpoOWo0SEpvLU9Rc2lCNDExdy1wZExaN3lpT2FEQkxnaHRmWmZUUUZUUUJmblZreGlpaFpOdnFhbzlEd1RsVVJtX216cmhxTm5BcTN2eUZ2T054cDE5bmlEamJ3NGR6MVpFQnA=
|
||||
@@ -113,7 +113,7 @@ class AiAnthropic(BaseConnectorAi):
             processingMode=ProcessingModeEnum.DETAILED,
             operationTypes=createOperationTypeRatings(
                 (OperationTypeEnum.PLAN, 10),
-                (OperationTypeEnum.DATA_ANALYSE, 10),
+                (OperationTypeEnum.DATA_ANALYSE, 8),
                 (OperationTypeEnum.DATA_GENERATE, 10),
                 (OperationTypeEnum.DATA_EXTRACT, 9)
             ),
@@ -288,7 +288,16 @@ class AiTavily(BaseConnectorAi):
         if maxResults < minResults or maxResults > maxAllowedResults:
             raise ValueError(f"maxResults must be between {minResults} and {maxAllowedResults}")

         # Perform actual API call
+        # Tavily enforces a 400-character query limit
+        TAVILY_MAX_QUERY_LENGTH = 400
+        if len(query) > TAVILY_MAX_QUERY_LENGTH:
+            truncated = query[:TAVILY_MAX_QUERY_LENGTH]
+            lastSpace = truncated.rfind(' ')
+            if lastSpace > TAVILY_MAX_QUERY_LENGTH // 2:
+                truncated = truncated[:lastSpace]
+            logger.warning(f"Tavily query truncated from {len(query)} to {len(truncated)} chars")
+            query = truncated
+
         # Build kwargs only for provided options to avoid API rejections
         kwargs: dict = {"query": query, "max_results": maxResults}
         if searchDepth is not None:
@@ -123,6 +123,12 @@ class BillingTransaction(BaseModel):
     aicoreProvider: Optional[str] = Field(None, description="AICore provider (anthropic, openai, etc.)")
     aicoreModel: Optional[str] = Field(None, description="AICore model name (e.g., claude-4-sonnet, gpt-4o)")
     createdByUserId: Optional[str] = Field(None, description="User who created/caused this transaction")
+
+    # AI call metadata (for per-call analytics)
+    processingTime: Optional[float] = Field(None, description="Processing time in seconds")
+    bytesSent: Optional[int] = Field(None, description="Bytes sent to AI model")
+    bytesReceived: Optional[int] = Field(None, description="Bytes received from AI model")
+    errorCount: Optional[int] = Field(None, description="Number of errors in this call")


 registerModelLabels(
@@ -1,6 +1,6 @@
 # Copyright (c) 2025 Patrick Motsch
 # All rights reserved.
-"""Chat models: ChatWorkflow, ChatMessage, ChatLog, ChatStat, ChatDocument."""
+"""Chat models: ChatWorkflow, ChatMessage, ChatLog, ChatDocument."""

 from typing import List, Dict, Any, Optional
 from enum import Enum
@ -10,44 +10,6 @@ from modules.shared.timeUtils import getUtcTimestamp
|
|||
import uuid
|
||||
|
||||
|
||||
class ChatStat(BaseModel):
|
||||
"""Statistics for chat operations. User-owned, no mandate context."""
|
||||
model_config = {"populate_by_name": True, "extra": "allow"} # Allow DB system fields
|
||||
|
||||
id: str = Field(
|
||||
default_factory=lambda: str(uuid.uuid4()), description="Primary key"
|
||||
)
|
||||
workflowId: Optional[str] = Field(
|
||||
None, description="Foreign key to workflow (for workflow stats)"
|
||||
)
|
||||
processingTime: Optional[float] = Field(
|
||||
None, description="Processing time in seconds"
|
||||
)
|
||||
bytesSent: Optional[int] = Field(None, description="Number of bytes sent")
|
||||
bytesReceived: Optional[int] = Field(None, description="Number of bytes received")
|
||||
errorCount: Optional[int] = Field(None, description="Number of errors encountered")
|
||||
process: Optional[str] = Field(None, description="The process that delivers the stats data (e.g. 'action.outlook.readMails', 'ai.process.document.name')")
|
||||
engine: Optional[str] = Field(None, description="The engine used (e.g. 'ai.anthropic.35', 'ai.tavily.basic', 'renderer.docx')")
|
||||
priceCHF: Optional[float] = Field(None, description="Calculated price in USD for the operation")
|
||||
|
||||
|
||||
registerModelLabels(
|
||||
"ChatStat",
|
||||
{"en": "Chat Statistics", "fr": "Statistiques de chat"},
|
||||
{
|
||||
"id": {"en": "ID", "fr": "ID"},
|
||||
"workflowId": {"en": "Workflow ID", "fr": "ID du workflow"},
|
||||
"processingTime": {"en": "Processing Time", "fr": "Temps de traitement"},
|
||||
"bytesSent": {"en": "Bytes Sent", "fr": "Octets envoyés"},
|
||||
"bytesReceived": {"en": "Bytes Received", "fr": "Octets reçus"},
|
||||
"errorCount": {"en": "Error Count", "fr": "Nombre d'erreurs"},
|
||||
"process": {"en": "Process", "fr": "Processus"},
|
||||
"engine": {"en": "Engine", "fr": "Moteur"},
|
||||
"priceCHF": {"en": "Price CHF", "fr": "Prix CHF"},
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
class ChatLog(BaseModel):
|
||||
"""Log entries for chat workflows. User-owned, no mandate context."""
|
||||
id: str = Field(
|
||||
|
|
@ -322,7 +284,6 @@ class ChatWorkflow(BaseModel):
|
|||
startedAt: float = Field(default_factory=getUtcTimestamp, description="When the workflow started (UTC timestamp in seconds)", json_schema_extra={"frontend_type": "timestamp", "frontend_readonly": True, "frontend_required": False})
|
||||
logs: List[ChatLog] = Field(default_factory=list, description="Workflow logs", json_schema_extra={"frontend_type": "text", "frontend_readonly": True, "frontend_required": False})
|
||||
messages: List[ChatMessage] = Field(default_factory=list, description="Messages in the workflow", json_schema_extra={"frontend_type": "text", "frontend_readonly": True, "frontend_required": False})
|
||||
stats: List[ChatStat] = Field(default_factory=list, description="Workflow statistics list", json_schema_extra={"frontend_type": "text", "frontend_readonly": True, "frontend_required": False})
|
||||
tasks: list = Field(default_factory=list, description="List of tasks in the workflow", json_schema_extra={"frontend_type": "text", "frontend_readonly": True, "frontend_required": False})
|
||||
workflowMode: WorkflowModeEnum = Field(default=WorkflowModeEnum.WORKFLOW_DYNAMIC, description="Workflow mode selector", json_schema_extra={"frontend_type": "select", "frontend_readonly": False, "frontend_required": False, "frontend_options": [
|
||||
{
|
||||
|
|
|
|||
|
|
@@ -21,6 +21,7 @@ from modules.datamodels.datamodelChat import ChatWorkflow, ChatMessage, ChatLog
 from modules.datamodels.datamodelPagination import PaginationParams, PaginatedResponse, PaginationMetadata, normalize_pagination_dict
 from modules.shared.attributeUtils import getModelAttributeDefinitions
 from modules.interfaces import interfaceDbChat
+from modules.interfaces.interfaceDbBilling import getInterface as _getBillingInterface
 # Configure logger
 logger = logging.getLogger(__name__)

@@ -682,7 +683,9 @@ def get_automation_workflow_chat_data(
         workflow = chatInterface.getWorkflow(workflowId)
         if not workflow:
             raise HTTPException(status_code=404, detail=f"Workflow {workflowId} not found")
-        return chatInterface.getUnifiedChatData(workflowId, afterTimestamp)
+        billingInterface = _getBillingInterface(context.user, context.mandateId)
+        workflowCost = billingInterface.getWorkflowCost(workflowId)
+        return chatInterface.getUnifiedChatData(workflowId, afterTimestamp, workflowCost=workflowCost)
     except HTTPException:
         raise
     except Exception as e:
@ -1291,17 +1291,6 @@ class ChatObjects:
|
|||
logger.error(f"Error updating message {messageId}: {str(e)}", exc_info=True)
|
||||
raise ValueError(f"Error updating message {messageId}: {str(e)}")
|
||||
|
||||
def createStat(self, statData: Dict[str, Any]):
|
||||
"""Create stat record. Compatibility with ChatService; stats may not be persisted in chatbot schema."""
|
||||
from modules.datamodels.datamodelChat import ChatStat
|
||||
stat = ChatStat(**statData)
|
||||
try:
|
||||
created = self.db.recordCreate(ChatStat, statData)
|
||||
return ChatStat(**created)
|
||||
except Exception as e:
|
||||
logger.debug(f"createStat: not persisting (chatbot schema): {e}")
|
||||
return stat
|
||||
|
||||
def deleteMessage(self, conversationId: str, messageId: str) -> bool:
|
||||
"""Deletes a conversation message and related data if user has access."""
|
||||
try:
|
||||
|
|
|
|||
|
|
@@ -306,12 +306,12 @@ def getChatbotServices(
     Uses interfaceFeatureChatbot (ChatObjects) for interfaceDbChat to avoid
     duplicate DB init - chatProcess reuses hub.interfaceDbChat.
     """
-    from modules.services import PublicService
+    from modules.serviceHub import PublicService
     from modules.interfaces.interfaceDbApp import getInterface as getAppInterface
     from modules.features.chatbot.interfaceFeatureChatbot import getInterface as getChatbotInterface
-    from modules.services.serviceChat.mainServiceChat import ChatService
-    from modules.services.serviceAi.mainServiceAi import AiService
-    from modules.services.serviceStreaming.mainServiceStreaming import StreamingService
+    from modules.serviceCenter.services.serviceChat.mainServiceChat import ChatService
+    from modules.serviceCenter.services.serviceAi.mainServiceAi import AiService
+    from modules.serviceCenter.core.serviceStreaming.mainServiceStreaming import StreamingService

     hub = _ChatbotServiceHub()
     hub.user = user
@ -135,11 +135,3 @@ class ChatPlaygroundObjects:
|
|||
def createLog(self, log) -> Dict[str, Any]:
|
||||
"""Create a new log entry."""
|
||||
return self._chatInterface.createLog(log)
|
||||
|
||||
def getStats(self, workflowId: str) -> List[Dict[str, Any]]:
|
||||
"""Get stats for a workflow."""
|
||||
return self._chatInterface.getStats(workflowId)
|
||||
|
||||
def createStat(self, stat) -> Dict[str, Any]:
|
||||
"""Create a new stat entry."""
|
||||
return self._chatInterface.createStat(stat)
|
||||
|
|
|
|||
|
|
@@ -15,6 +15,7 @@ from modules.auth import limiter, getRequestContext, RequestContext

 # Import interfaces
 from modules.interfaces import interfaceDbChat
+from modules.interfaces.interfaceDbBilling import getInterface as _getBillingInterface

 # Import models
 from modules.datamodels.datamodelChat import (
@@ -220,9 +221,11 @@ def get_workflow_chat_data(
                 detail=f"Workflow with ID {workflowId} not found"
             )

-        # Get unified chat data
-        chatData = chatInterface.getUnifiedChatData(workflowId, afterTimestamp)
+        # Get workflow cost from billing transactions (single source of truth)
+        billingInterface = _getBillingInterface(context.user, context.mandateId)
+        workflowCost = billingInterface.getWorkflowCost(workflowId)
+
+        chatData = chatInterface.getUnifiedChatData(workflowId, afterTimestamp, workflowCost=workflowCost)
         return chatData

     except HTTPException:
@@ -17,7 +17,7 @@ from modules.auth import limiter, getRequestContext, RequestContext
 from modules.interfaces import interfaceDbChat, interfaceDbManagement
 from modules.interfaces.interfaceAiObjects import AiObjects
 from modules.datamodels.datamodelChat import UserInputRequest
-from modules.services.serviceStreaming import get_event_manager
+from modules.serviceCenter.core.serviceStreaming import get_event_manager
 from modules.features.codeeditor import codeEditorProcessor, fileContextManager
 from modules.features.codeeditor.datamodelCodeeditor import FileEditProposal, EditStatusEnum

@ -1011,7 +1011,7 @@ class CommcoachService:
|
|||
|
||||
async def _callAi(self, systemPrompt: str, userPrompt: str):
|
||||
"""Call the AI service with the given prompts."""
|
||||
from modules.services.serviceAi.mainServiceAi import AiService
|
||||
from modules.serviceCenter.services.serviceAi.mainServiceAi import AiService
|
||||
|
||||
serviceContext = type('Ctx', (), {
|
||||
'user': self.currentUser,
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ from urllib.parse import urlparse, unquote
|
|||
|
||||
from modules.datamodels.datamodelUam import User
|
||||
from .datamodelFeatureNeutralizer import DataNeutralizerAttributes, DataNeutraliserConfig
|
||||
from modules.services import getInterface as getServices
|
||||
from modules.serviceHub import getInterface as getServices
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
@ -205,7 +205,7 @@ class NeutralizationPlayground:
|
|||
|
||||
async def processSharepointFiles(self, sourcePath: str, targetPath: str) -> Dict[str, Any]:
|
||||
"""Process files from SharePoint source path and store neutralized files in target path"""
|
||||
from modules.services.serviceSharepoint.mainServiceSharepoint import SharepointService
|
||||
from modules.serviceCenter.services.serviceSharepoint.mainServiceSharepoint import SharepointService
|
||||
processor = SharepointProcessor(self.currentUser, self.services)
|
||||
return await processor.processSharepointFiles(sourcePath, targetPath)
|
||||
|
||||
|
|
|
|||
|
|
@ -262,8 +262,8 @@ class NeutralizationService:
|
|||
fileId: Optional[str]
|
||||
) -> Dict[str, Any]:
|
||||
"""Extract -> neutralize -> adapt -> generate for PDF/DOCX/XLSX/PPTX."""
|
||||
from modules.services.serviceExtraction.mainServiceExtraction import ExtractionService
|
||||
from modules.services.serviceExtraction.subPipeline import runExtraction
|
||||
from modules.serviceCenter.services.serviceExtraction.mainServiceExtraction import ExtractionService
|
||||
from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
|
||||
from modules.datamodels.datamodelExtraction import ExtractionOptions, MergeStrategy
|
||||
|
||||
# Ensure registries exist
|
||||
|
|
@ -405,10 +405,10 @@ class NeutralizationService:
|
|||
|
||||
def _getRendererForMime(self, mimeType: str):
|
||||
"""Get renderer instance and output mime for the given input MIME type."""
|
||||
from modules.services.serviceGeneration.renderers.rendererPdf import RendererPdf
|
||||
from modules.services.serviceGeneration.renderers.rendererDocx import RendererDocx
|
||||
from modules.services.serviceGeneration.renderers.rendererXlsx import RendererXlsx
|
||||
from modules.services.serviceGeneration.renderers.rendererPptx import RendererPptx
|
||||
from modules.serviceCenter.services.serviceGeneration.renderers.rendererPdf import RendererPdf
|
||||
from modules.serviceCenter.services.serviceGeneration.renderers.rendererDocx import RendererDocx
|
||||
from modules.serviceCenter.services.serviceGeneration.renderers.rendererXlsx import RendererXlsx
|
||||
from modules.serviceCenter.services.serviceGeneration.renderers.rendererPptx import RendererPptx
|
||||
|
||||
mime_map = {
|
||||
"application/pdf": (RendererPdf, "application/pdf"),
|
||||
|
|
|
|||
|
|
@ -284,7 +284,7 @@ from .datamodelFeatureRealEstate import (
|
|||
Land,
|
||||
DokumentTyp,
|
||||
)
|
||||
from modules.services import getInterface as getServices
|
||||
from modules.serviceHub import getInterface as getServices
|
||||
from .interfaceFeatureRealEstate import getInterface as getRealEstateInterface
|
||||
from modules.interfaces.interfaceDbManagement import getInterface as getComponentInterface
|
||||
from modules.connectors.connectorSwissTopoMapServer import SwissTopoMapServerConnector
|
||||
|
|
|
|||
|
|
@ -843,7 +843,7 @@ async def testVoice(
|
|||
):
|
||||
"""Test TTS voice with AI-generated sample text in the correct language."""
|
||||
from modules.interfaces.interfaceVoiceObjects import getVoiceInterface
|
||||
from modules.services.serviceAi.mainServiceAi import AiService
|
||||
from modules.serviceCenter.services.serviceAi.mainServiceAi import AiService
|
||||
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum
|
||||
|
||||
mandateId = _validateInstanceAccess(instanceId, context)
|
||||
|
|
|
|||
|
|
@ -1062,7 +1062,7 @@ class TeamsbotService:
|
|||
|
||||
# Call SPEECH_TEAMS
|
||||
try:
|
||||
from modules.services.serviceAi.mainServiceAi import AiService
|
||||
from modules.serviceCenter.services.serviceAi.mainServiceAi import AiService
|
||||
|
||||
# Create minimal service context for AI billing
|
||||
serviceContext = _ServiceContext(self.currentUser, self.mandateId, self.instanceId)
|
||||
|
|
@ -1684,7 +1684,7 @@ class TeamsbotService:
|
|||
"""Summarize a long user-provided session context to its essential points.
|
||||
This reduces token usage in every subsequent AI call."""
|
||||
try:
|
||||
from modules.services.serviceAi.mainServiceAi import AiService
|
||||
from modules.serviceCenter.services.serviceAi.mainServiceAi import AiService
|
||||
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum
|
||||
|
||||
serviceContext = _ServiceContext(self.currentUser, self.mandateId, self.instanceId)
|
||||
|
|
@ -1738,7 +1738,7 @@ class TeamsbotService:
|
|||
lines.append(f"[{speaker}]: {text}")
|
||||
textToSummarize = "\n".join(lines)
|
||||
|
||||
from modules.services.serviceAi.mainServiceAi import AiService
|
||||
from modules.serviceCenter.services.serviceAi.mainServiceAi import AiService
|
||||
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum
|
||||
|
||||
serviceContext = _ServiceContext(self.currentUser, self.mandateId, self.instanceId)
|
||||
|
|
@ -1783,7 +1783,7 @@ class TeamsbotService:
|
|||
for t in transcripts
|
||||
)
|
||||
|
||||
from modules.services.serviceAi.mainServiceAi import AiService
|
||||
from modules.serviceCenter.services.serviceAi.mainServiceAi import AiService
|
||||
|
||||
serviceContext = _ServiceContext(self.currentUser, self.mandateId, self.instanceId)
|
||||
aiService = AiService(serviceCenter=serviceContext)
|
||||
|
|
|
|||
|
|
@ -188,7 +188,7 @@ def get_mime_type_options(
|
|||
"""Get supported MIME types from the document extraction service.
|
||||
Returns: [{ value: "mime/type", label: "Description" }]
|
||||
"""
|
||||
from modules.services.serviceExtraction.subRegistry import ExtractorRegistry
|
||||
from modules.serviceCenter.services.serviceExtraction.subRegistry import ExtractorRegistry
|
||||
|
||||
registry = ExtractorRegistry()
|
||||
formats = registry.getSupportedFormats()
|
||||
|
|
|
|||
|
|
@ -764,7 +764,11 @@ class BillingObjects:
|
|||
featureCode: str = None,
|
||||
aicoreProvider: str = None,
|
||||
aicoreModel: str = None,
|
||||
description: str = "AI Usage"
|
||||
description: str = "AI Usage",
|
||||
processingTime: float = None,
|
||||
bytesSent: int = None,
|
||||
bytesReceived: int = None,
|
||||
errorCount: int = None
|
||||
) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Record usage cost as a billing transaction.
|
||||
|
|
@ -774,20 +778,6 @@ class BillingObjects:
|
|||
- PREPAY_USER: deduct from user's own balance
|
||||
- PREPAY_MANDATE: deduct from mandate pool balance
|
||||
- CREDIT_POSTPAY: deduct from mandate pool balance
|
||||
|
||||
Args:
|
||||
mandateId: Mandate ID
|
||||
userId: User ID
|
||||
priceCHF: Cost in CHF
|
||||
workflowId: Optional workflow ID
|
||||
featureInstanceId: Optional feature instance ID
|
||||
featureCode: Optional feature code
|
||||
aicoreProvider: AICore provider name (e.g., 'anthropic', 'openai')
|
||||
aicoreModel: AICore model name (e.g., 'claude-4-sonnet', 'gpt-4o')
|
||||
description: Transaction description
|
||||
|
||||
Returns:
|
||||
Created transaction dict or None
|
||||
"""
|
||||
if priceCHF <= 0:
|
||||
return None
|
||||
|
|
@ -816,7 +806,11 @@ class BillingObjects:
|
|||
featureCode=featureCode,
|
||||
aicoreProvider=aicoreProvider,
|
||||
aicoreModel=aicoreModel,
|
||||
createdByUserId=userId
|
||||
createdByUserId=userId,
|
||||
processingTime=processingTime,
|
||||
bytesSent=bytesSent,
|
||||
bytesReceived=bytesReceived,
|
||||
errorCount=errorCount
|
||||
)
|
||||
|
||||
# Determine where to deduct balance
|
||||
|
|
@ -828,6 +822,20 @@ class BillingObjects:
|
|||
poolAccount = self.getOrCreateMandateAccount(mandateId)
|
||||
return self.createTransaction(transaction, balanceAccountId=poolAccount["id"])
|
||||
|
||||
# =========================================================================
|
||||
# Workflow Cost Query
|
||||
# =========================================================================
|
||||
|
||||
def getWorkflowCost(self, workflowId: str) -> float:
|
||||
"""Sum of all transaction amounts for a workflow."""
|
||||
if not workflowId:
|
||||
return 0.0
|
||||
transactions = self.db.getRecordset(
|
||||
BillingTransaction,
|
||||
recordFilter={"workflowId": workflowId}
|
||||
)
|
||||
return sum(t.get("amount", 0.0) for t in transactions)
|
||||
|
||||
# =========================================================================
|
||||
# Billing Model Switch Operations
|
||||
# =========================================================================
|
||||
|
|
|
|||
|
|
@ -18,7 +18,6 @@ from modules.datamodels.datamodelUam import AccessLevel
|
|||
|
||||
from modules.datamodels.datamodelChat import (
|
||||
ChatDocument,
|
||||
ChatStat,
|
||||
ChatLog,
|
||||
ChatMessage,
|
||||
ChatWorkflow,
|
||||
|
|
@ -663,10 +662,8 @@ class ChatObjects:
|
|||
|
||||
workflow = workflows[0]
|
||||
try:
|
||||
# Load related data from normalized tables
|
||||
logs = self.getLogs(workflowId)
|
||||
messages = self.getMessages(workflowId)
|
||||
stats = self.getStats(workflowId)
|
||||
|
||||
# Validate workflow data against ChatWorkflow model
|
||||
# Explicit type coercion: DB may store numeric fields as TEXT on some platforms
|
||||
|
|
@ -694,8 +691,7 @@ class ChatObjects:
|
|||
lastActivity=_toFloat(workflow.get("lastActivity")),
|
||||
startedAt=_toFloat(workflow.get("startedAt")),
|
||||
logs=logs,
|
||||
messages=messages,
|
||||
stats=stats
|
||||
messages=messages
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Error validating workflow data: {str(e)}")
|
||||
|
|
@ -731,7 +727,7 @@ class ChatObjects:
|
|||
except Exception as e:
|
||||
logger.warning(f"Could not get Root mandate: {e}")
|
||||
# Note: ChatWorkflow has featureInstanceId for multi-tenancy isolation.
|
||||
# Child tables (ChatMessage, ChatLog, ChatStat, ChatDocument) are user-owned
|
||||
# Child tables (ChatMessage, ChatLog, ChatDocument) are user-owned
|
||||
# and do NOT store featureInstanceId - they inherit isolation from ChatWorkflow.
|
||||
# Ensure featureInstanceId is set from context if not already in workflowData
|
||||
if "featureInstanceId" not in workflowData or not workflowData.get("featureInstanceId"):
|
||||
|
|
@ -789,9 +785,7 @@ class ChatObjects:
|
|||
# Load fresh data from normalized tables
|
||||
logs = self.getLogs(workflowId)
|
||||
messages = self.getMessages(workflowId)
|
||||
stats = self.getStats(workflowId)
|
||||
|
||||
# Convert to ChatWorkflow model
|
||||
return ChatWorkflow(
|
||||
id=updated["id"],
|
||||
status=updated.get("status", workflow.status),
|
||||
|
|
@ -804,8 +798,7 @@ class ChatObjects:
|
|||
lastActivity=updated.get("lastActivity", workflow.lastActivity),
|
||||
startedAt=updated.get("startedAt", workflow.startedAt),
|
||||
logs=logs,
|
||||
messages=messages,
|
||||
stats=stats
|
||||
messages=messages
|
||||
)
|
||||
|
||||
def deleteWorkflow(self, workflowId: str) -> bool:
|
||||
|
|
@ -827,7 +820,6 @@ class ChatObjects:
|
|||
messageId = message.id
|
||||
if messageId:
|
||||
# Delete message documents (but NOT the files!)
|
||||
# Note: ChatStat does NOT have messageId - stats are only at workflow level
|
||||
try:
|
||||
existing_docs = self._getRecordset(ChatDocument, recordFilter={"messageId": messageId})
|
||||
for doc in existing_docs:
|
||||
|
|
@ -839,11 +831,7 @@ class ChatObjects:
|
|||
self.db.recordDelete(ChatMessage, messageId)
|
||||
|
||||
# 2. Delete workflow stats
|
||||
existing_stats = self._getRecordset(ChatStat, recordFilter={"workflowId": workflowId})
|
||||
for stat in existing_stats:
|
||||
self.db.recordDelete(ChatStat, stat["id"])
|
||||
|
||||
# 3. Delete workflow logs
|
||||
# 2. Delete workflow logs
|
||||
existing_logs = self._getRecordset(ChatLog, recordFilter={"workflowId": workflowId})
|
||||
for log in existing_logs:
|
||||
self.db.recordDelete(ChatLog, log["id"])
|
||||
|
|
@ -1270,7 +1258,6 @@ class ChatObjects:
|
|||
self.db.recordDelete(ChatDocument, doc["id"])
|
||||
|
||||
# 2. Finally delete the message itself
|
||||
# Note: ChatStat has no messageId field -- stats are workflow-level, not message-level
|
||||
success = self.db.recordDelete(ChatMessage, messageId)
|
||||
|
||||
return success
|
||||
|
|
@ -1517,74 +1504,10 @@ class ChatObjects:
|
|||
# Return validated ChatLog instance
|
||||
return ChatLog(**createdLog)
|
||||
|
||||
# Stats methods
|
||||
|
||||
def getStats(self, workflowId: str) -> List[ChatStat]:
|
||||
"""Returns list of statistics for a workflow if user has access."""
|
||||
# Check workflow access first (without calling getWorkflow to avoid circular reference)
|
||||
# Use RBAC filtering
|
||||
workflows = self._getRecordset(ChatWorkflow, recordFilter={"id": workflowId})
|
||||
|
||||
if not workflows:
|
||||
return []
|
||||
|
||||
# Get stats for this workflow from normalized table
|
||||
stats = self._getRecordset(ChatStat, recordFilter={"workflowId": workflowId})
|
||||
|
||||
if not stats:
|
||||
return []
|
||||
|
||||
# Return all stats records sorted by creation time.
|
||||
# Use parseTimestamp to tolerate mixed DB types (float/string) on INT.
|
||||
# DB uses _createdAt (camelCase system field).
|
||||
stats.sort(key=lambda x: parseTimestamp(x.get("_createdAt"), default=0))
|
||||
|
||||
# Convert to ChatStat objects, preserving _createdAt via extra="allow"
|
||||
result = []
|
||||
for stat in stats:
|
||||
chat_stat = ChatStat(**stat)
|
||||
# Explicitly preserve _createdAt from raw DB record
|
||||
if "_createdAt" in stat:
|
||||
setattr(chat_stat, '_createdAt', stat["_createdAt"])
|
||||
result.append(chat_stat)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def createStat(self, statData: Dict[str, Any]) -> ChatStat:
|
||||
"""Creates a new stats record and returns it."""
|
||||
try:
|
||||
# Ensure workflowId is present in statData
|
||||
if "workflowId" not in statData:
|
||||
raise ValueError("workflowId is required in statData")
|
||||
|
||||
# Note: Chat data is user-owned, no mandate/featureInstance context stored
|
||||
# mandateId/featureInstanceId removed from ChatStat model
|
||||
|
||||
# Validate the stat data against ChatStat model
|
||||
stat = ChatStat(**statData)
|
||||
|
||||
logger.debug(f"Creating stat for workflow {statData.get('workflowId')}: "
|
||||
f"process={statData.get('process')}, "
|
||||
f"priceCHF={statData.get('priceCHF', 0):.4f}, "
|
||||
f"processingTime={statData.get('processingTime', 0):.2f}s")
|
||||
|
||||
# Create the stat record in the database
|
||||
created = self.db.recordCreate(ChatStat, stat)
|
||||
|
||||
logger.info(f"Created stat {created.get('id')} for workflow {statData.get('workflowId')}")
|
||||
|
||||
# Return the created ChatStat
|
||||
return ChatStat(**created)
|
||||
except Exception as e:
|
||||
logger.error(f"Error creating workflow stat: {str(e)}")
|
||||
raise
|
||||
|
||||
|
||||
def getUnifiedChatData(self, workflowId: str, afterTimestamp: Optional[float] = None) -> Dict[str, Any]:
|
||||
def getUnifiedChatData(self, workflowId: str, afterTimestamp: Optional[float] = None, workflowCost: float = 0.0) -> Dict[str, Any]:
|
||||
"""
|
||||
Returns unified chat data (messages, logs, stats) for a workflow in chronological order.
|
||||
Uses timestamp-based selective data transfer for efficient polling.
|
||||
Returns unified chat data (messages, logs) for a workflow in chronological order,
|
||||
plus workflowCost from billing transactions (single source of truth).
|
||||
"""
|
||||
# Check workflow access first
|
||||
# Use RBAC filtering
|
||||
|
|
@ -1652,29 +1575,10 @@ class ChatObjects:
|
|||
"item": chatLog
|
||||
})
|
||||
|
||||
# Get stats - ChatStat model supports _createdAt via model_config extra="allow"
|
||||
stats = self.getStats(workflowId)
|
||||
for stat in stats:
|
||||
# Apply timestamp filtering in Python
|
||||
# Use _createdAt (system field from DB, preserved via model_config extra="allow")
|
||||
stat_timestamp = getattr(stat, '_createdAt', None) or getUtcTimestamp()
|
||||
if afterTimestamp is not None and stat_timestamp <= afterTimestamp:
|
||||
continue
|
||||
|
||||
# Convert to dict and include _createdAt for frontend
|
||||
stat_dict = stat.model_dump() if hasattr(stat, 'model_dump') else stat.dict()
|
||||
stat_dict['_createdAt'] = stat_timestamp
|
||||
|
||||
items.append({
|
||||
"type": "stat",
|
||||
"createdAt": stat_timestamp,
|
||||
"item": stat_dict
|
||||
})
|
||||
|
||||
# Sort all items by createdAt timestamp for chronological order
|
||||
items.sort(key=lambda x: parseTimestamp(x.get("createdAt"), default=0))
|
||||
|
||||
return {"items": items}
|
||||
return {"items": items, "workflowCost": workflowCost}
|
||||
|
||||
|
||||
def getInterface(currentUser: Optional[User] = None, mandateId: Optional[str] = None, featureInstanceId: Optional[str] = None) -> 'ChatObjects':
|
||||
|
|
|
|||
|
|
@ -58,7 +58,6 @@ TABLE_NAMESPACE = {
|
|||
"ChatWorkflow": "chat",
|
||||
"ChatMessage": "chat",
|
||||
"ChatLog": "chat",
|
||||
"ChatStat": "chat",
|
||||
"ChatDocument": "chat",
|
||||
"Prompt": "chat",
|
||||
# Chatbot (poweron_chatbot) - per feature-instance isolation
|
||||
|
|
@ -175,7 +174,7 @@ def getRecordsetWithRBAC(
|
|||
whereValues = []
|
||||
|
||||
# CRITICAL: Only pass featureInstanceId to WHERE clause if the model actually has
|
||||
# this column. Chat child tables (ChatMessage, ChatLog, ChatStat, ChatDocument)
|
||||
# this column. Chat child tables (ChatMessage, ChatLog, ChatDocument)
|
||||
# are user-owned and do NOT have featureInstanceId - only ChatWorkflow does.
|
||||
# Without this check, the SQL query would reference a non-existent column,
|
||||
# causing a silent error that returns empty results.
|
||||
|
|
|
|||
|
|
@ -21,7 +21,7 @@ from modules.auth import limiter, requireSysAdminRole, getRequestContext, Reques
|
|||
|
||||
# Import billing components
|
||||
from modules.interfaces.interfaceDbBilling import getInterface as getBillingInterface, _getRootInterface
|
||||
from modules.services.serviceBilling.mainServiceBilling import getService as getBillingService
|
||||
from modules.serviceCenter.services.serviceBilling.mainServiceBilling import getService as getBillingService
|
||||
from modules.datamodels.datamodelPagination import PaginationParams, PaginatedResponse, PaginationMetadata, normalize_pagination_dict
|
||||
from modules.routes.routeDataUsers import _applyFiltersAndSort
|
||||
from modules.datamodels.datamodelBilling import (
|
||||
|
|
@ -162,6 +162,23 @@ def _isAdminOfMandate(ctx: RequestContext, targetMandateId: str) -> bool:
|
|||
return False
|
||||
|
||||
|
||||
def _isMemberOfMandate(ctx: RequestContext, targetMandateId: str) -> bool:
|
||||
"""Check if user has any enabled membership in the specified mandate."""
|
||||
try:
|
||||
from modules.interfaces.interfaceDbApp import getRootInterface
|
||||
rootInterface = getRootInterface()
|
||||
userMandates = rootInterface.getUserMandates(str(ctx.user.id))
|
||||
for um in userMandates:
|
||||
if str(getattr(um, 'mandateId', None)) != str(targetMandateId):
|
||||
continue
|
||||
if not getattr(um, 'enabled', True):
|
||||
continue
|
||||
return True
|
||||
return False
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def _filterTransactionsByScope(transactions: list, scope: BillingDataScope) -> list:
|
||||
"""
|
||||
Filter a list of transaction dicts based on the user's BillingDataScope.
|
||||
|
|
@ -720,11 +737,11 @@ def createCheckoutSession(
|
|||
targetMandateId: str = Path(..., description="Mandate ID"),
|
||||
checkoutRequest: CheckoutCreateRequest = Body(...),
|
||||
ctx: RequestContext = Depends(getRequestContext),
|
||||
_admin = Depends(requireSysAdminRole)
|
||||
):
|
||||
"""
|
||||
Create Stripe Checkout Session for credit top-up. Returns redirect URL.
|
||||
SysAdmin only. Amount is validated server-side against allowed presets.
|
||||
RBAC: PREPAY_USER requires mandate membership (user loads own account),
|
||||
PREPAY_MANDATE requires mandate admin role.
|
||||
"""
|
||||
try:
|
||||
billingInterface = getBillingInterface(ctx.user, targetMandateId)
|
||||
|
|
@ -738,10 +755,17 @@ def createCheckoutSession(
|
|||
if billingModel == BillingModelEnum.PREPAY_USER:
|
||||
if not checkoutRequest.userId:
|
||||
raise HTTPException(status_code=400, detail="userId is required for PREPAY_USER model")
|
||||
elif billingModel not in [BillingModelEnum.PREPAY_MANDATE, BillingModelEnum.CREDIT_POSTPAY]:
|
||||
if str(checkoutRequest.userId) != str(ctx.user.id):
|
||||
raise HTTPException(status_code=403, detail="Users can only load credit to their own account")
|
||||
if not _isMemberOfMandate(ctx, targetMandateId):
|
||||
raise HTTPException(status_code=403, detail="User is not a member of this mandate")
|
||||
elif billingModel in [BillingModelEnum.PREPAY_MANDATE, BillingModelEnum.CREDIT_POSTPAY]:
|
||||
if not _isAdminOfMandate(ctx, targetMandateId):
|
||||
raise HTTPException(status_code=403, detail="Mandate admin role required to load mandate credit")
|
||||
else:
|
||||
raise HTTPException(status_code=400, detail=f"Cannot add credit to {billingModel.value} billing model")
|
||||
|
||||
from modules.services.serviceBilling.stripeCheckout import create_checkout_session
|
||||
from modules.serviceCenter.services.serviceBilling.stripeCheckout import create_checkout_session
|
||||
redirect_url = create_checkout_session(
|
||||
mandate_id=targetMandateId,
|
||||
user_id=checkoutRequest.userId,
|
||||
|
|
@ -768,7 +792,7 @@ async def stripeWebhook(
|
|||
No JWT auth - Stripe authenticates via Stripe-Signature header.
|
||||
"""
|
||||
from modules.shared.configuration import APP_CONFIG
|
||||
from modules.services.serviceBilling.stripeCheckout import ALLOWED_AMOUNTS_CHF
|
||||
from modules.serviceCenter.services.serviceBilling.stripeCheckout import ALLOWED_AMOUNTS_CHF
|
||||
|
||||
webhook_secret = APP_CONFIG.get("STRIPE_WEBHOOK_SECRET")
|
||||
if not webhook_secret:
|
||||
|
|
|
|||
|
|
@@ -764,7 +764,7 @@ def send_password_link(
     expiryHours = int(APP_CONFIG.get("Auth_RESET_TOKEN_EXPIRY_HOURS", "24"))

     try:
-        from modules.services import Services
+        from modules.serviceHub import Services
         services = Services(targetUser)

         emailSubject = "PowerOn - Passwort setzen"
@@ -395,7 +395,7 @@ def trigger_subscription(
         )

         # Get messaging service from request app state
-        from modules.services import getInterface as getServicesInterface
+        from modules.serviceHub import getInterface as getServicesInterface
         services = getServicesInterface(context.user, None, mandateId=str(context.mandateId))

         # Konvertiere Dict zu Pydantic Model
@@ -12,7 +12,7 @@ from fastapi import APIRouter, HTTPException, Depends, Path, Query, Request, sta
 from modules.auth import limiter, getCurrentUser
 from modules.datamodels.datamodelUam import User, UserConnection
 from modules.interfaces.interfaceDbApp import getInterface
-from modules.services import getInterface as getServices
+from modules.serviceHub import getInterface as getServices

 logger = logging.getLogger(__name__)

@@ -26,7 +26,6 @@ logger = logging.getLogger(__name__)
 def getService(
     key: str,
     context: ServiceCenterContext,
-    legacy_hub: Optional[Any] = None,
 ) -> Any:
     """
     Get a service instance by key for the given context.
@@ -34,14 +33,13 @@ def getService(
     Args:
         key: Service key (e.g., "web", "extraction", "utils")
         context: ServiceCenterContext with user, mandate_id, feature_instance_id, workflow
-        legacy_hub: Optional legacy Services instance for fallback when service not yet migrated

     Returns:
         Service instance
     """
     cache = get_resolution_cache()
     resolving = set()
-    return resolve(key, context, cache, resolving, legacy_hub=legacy_hub)
+    return resolve(key, context, cache, resolving)


 def preWarm(service_keys: Optional[List[str]] = None) -> None:
@@ -2,7 +2,7 @@
 # All rights reserved.
 """
 Service Center Resolver.
-Resolution logic, dependency injection, and optional legacy fallback.
+Resolution logic and dependency injection for service instantiation.
 """

 import importlib
@ -14,7 +14,6 @@ from modules.serviceCenter.registry import CORE_SERVICES, IMPORTABLE_SERVICES
|
|||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Type for get_service callable passed to services
|
||||
GetServiceFunc = Callable[[str], Any]
|
||||
|
||||
|
||||
|
|
@ -29,50 +28,15 @@ def _load_service_class(module_path: str, class_name: str):
|
|||
return getattr(module, class_name)
|
||||
|
||||
|
||||
def _create_legacy_hub(ctx: ServiceCenterContext) -> Any:
|
||||
"""Create legacy Services instance for fallback when service not yet migrated."""
|
||||
from modules.services import getInterface
|
||||
return getInterface(
|
||||
ctx.user,
|
||||
workflow=ctx.workflow,
|
||||
mandateId=ctx.mandate_id,
|
||||
featureInstanceId=ctx.feature_instance_id,
|
||||
)
|
||||
|
||||
|
||||
def _get_from_legacy(legacy_hub: Any, key: str) -> Any:
|
||||
"""Map service key to legacy hub attribute (for fallback when service center module fails)."""
|
||||
key_to_attr = {
|
||||
"utils": "utils",
|
||||
"security": "security",
|
||||
"streaming": "streaming",
|
||||
"ticket": "ticket",
|
||||
"messaging": "messaging",
|
||||
"billing": "billing",
|
||||
"sharepoint": "sharepoint",
|
||||
"chat": "chat",
|
||||
"extraction": "extraction",
|
||||
"generation": "generation",
|
||||
"ai": "ai",
|
||||
"web": "web",
|
||||
"neutralization": "neutralization",
|
||||
}
|
||||
attr = key_to_attr.get(key)
|
||||
if attr and hasattr(legacy_hub, attr):
|
||||
return getattr(legacy_hub, attr)
|
||||
return None
|
||||
|
||||
|
||||
def resolve(
|
||||
key: str,
|
||||
context: ServiceCenterContext,
|
||||
cache: Dict[str, Any],
|
||||
resolving: Set[str],
|
||||
legacy_hub: Optional[Any] = None,
|
||||
) -> Any:
|
||||
"""
|
||||
Resolve a service by key. Uses cache, resolves dependencies recursively.
|
||||
Falls back to legacy_hub if service module cannot be loaded.
|
||||
Raises KeyError if the service is not registered.
|
||||
"""
|
||||
cache_key = f"{_make_context_id(context)}_{key}"
|
||||
if cache_key in cache:
|
||||
|
|
@ -82,59 +46,20 @@ def resolve(
|
|||
raise RuntimeError(f"Circular dependency detected for service: {key}")
|
||||
|
||||
def get_service(dep_key: str) -> Any:
|
||||
return resolve(dep_key, context, cache, resolving, legacy_hub)
|
||||
return resolve(dep_key, context, cache, resolving)
|
||||
|
||||
# Try core first
|
||||
if key in CORE_SERVICES:
|
||||
spec = CORE_SERVICES[key]
|
||||
spec = CORE_SERVICES.get(key) or IMPORTABLE_SERVICES.get(key)
|
||||
if spec:
|
||||
cls = _load_service_class(spec["module"], spec["class"])
|
||||
resolving.add(key)
|
||||
try:
|
||||
cls = _load_service_class(spec["module"], spec["class"])
|
||||
resolving.add(key)
|
||||
try:
|
||||
for dep in spec.get("dependencies", []):
|
||||
get_service(dep)
|
||||
finally:
|
||||
resolving.discard(key)
|
||||
instance = cls(context, get_service)
|
||||
cache[cache_key] = instance
|
||||
return instance
|
||||
except (ImportError, ModuleNotFoundError, AttributeError) as e:
|
||||
logger.debug(f"Could not load core service '{key}' from service center: {e}")
|
||||
if legacy_hub:
|
||||
fallback = _get_from_legacy(legacy_hub, key)
|
||||
if fallback is not None:
|
||||
cache[cache_key] = fallback
|
||||
return fallback
|
||||
raise
|
||||
|
||||
# Try importable
|
||||
if key in IMPORTABLE_SERVICES:
|
||||
spec = IMPORTABLE_SERVICES[key]
|
||||
try:
|
||||
cls = _load_service_class(spec["module"], spec["class"])
|
||||
resolving.add(key)
|
||||
try:
|
||||
for dep in spec.get("dependencies", []):
|
||||
get_service(dep)
|
||||
finally:
|
||||
resolving.discard(key)
|
||||
instance = cls(context, get_service)
|
||||
cache[cache_key] = instance
|
||||
return instance
|
||||
except (ImportError, ModuleNotFoundError, AttributeError) as e:
|
||||
logger.debug(f"Could not load importable service '{key}' from service center: {e}")
|
||||
if legacy_hub:
|
||||
fallback = _get_from_legacy(legacy_hub, key)
|
||||
if fallback is not None:
|
||||
cache[cache_key] = fallback
|
||||
return fallback
|
||||
raise
|
||||
|
||||
if legacy_hub:
|
||||
fallback = _get_from_legacy(legacy_hub, key)
|
||||
if fallback is not None:
|
||||
cache[cache_key] = fallback
|
||||
return fallback
|
||||
for dep in spec.get("dependencies", []):
|
||||
get_service(dep)
|
||||
finally:
|
||||
resolving.discard(key)
|
||||
instance = cls(context, get_service)
|
||||
cache[cache_key] = instance
|
||||
return instance
|
||||
|
||||
raise KeyError(f"Unknown service: {key}")
|
||||
|
||||
|
|
|
|||
|
|
@ -64,6 +64,10 @@ class _ServicesAdapter:
|
|||
def interfaceDbChat(self):
|
||||
return self._get_service("chat").interfaceDbChat
|
||||
|
||||
@property
|
||||
def interfaceDbComponent(self):
|
||||
return self._get_service("chat").interfaceDbComponent
|
||||
|
||||
@property
|
||||
def featureCode(self) -> Optional[str]:
|
||||
w = self.workflow
|
||||
|
|
@ -171,12 +175,8 @@ class AiService:
|
|||
else:
|
||||
response = await self.aiObjects.callWithTextContext(request)
|
||||
finally:
|
||||
# Clear callback after call completes
|
||||
self.aiObjects.billingCallback = None
|
||||
|
||||
# Store workflow stats for analytics
|
||||
self._storeAiCallStats(response, request)
|
||||
|
||||
return response
|
||||
|
||||
# =========================================================================
|
||||
|
|
@ -295,9 +295,6 @@ class AiService:
|
|||
except Exception as e:
|
||||
logger.error(f"BILLING: Failed to record billing for SPEECH_TEAMS: {e}")
|
||||
|
||||
# Store stats
|
||||
self._storeAiCallStats(response, request)
|
||||
|
||||
logger.info(f"SPEECH_TEAMS call completed: model={model.name}, time={processingTime:.2f}s, cost={priceCHF:.4f} CHF")
|
||||
return response
|
||||
|
||||
|
|
@ -644,12 +641,12 @@ detectedIntent-Werte:
|
|||
billingService = getBillingService(user, mandateId, featureInstanceId, featureCode)
|
||||
|
||||
def _billingCallback(response) -> None:
|
||||
"""Record billing for a single AI model call."""
|
||||
"""Record billing transaction with full AI call metadata."""
|
||||
if not response or getattr(response, 'errorCount', 0) > 0:
|
||||
return
|
||||
|
||||
priceCHF = getattr(response, 'priceCHF', 0.0)
|
||||
if not priceCHF or priceCHF <= 0:
|
||||
basePriceCHF = getattr(response, 'priceCHF', 0.0)
|
||||
if not basePriceCHF or basePriceCHF <= 0:
|
||||
return
|
||||
|
||||
provider = getattr(response, 'provider', None) or 'unknown'
|
||||
|
|
@ -657,20 +654,24 @@ detectedIntent-Werte:
|
|||
|
||||
try:
|
||||
billingService.recordUsage(
|
||||
priceCHF=priceCHF,
|
||||
priceCHF=basePriceCHF,
|
||||
workflowId=workflowId,
|
||||
aicoreProvider=provider,
|
||||
aicoreModel=modelName,
|
||||
description=f"AI: {modelName}"
|
||||
description=f"AI: {modelName}",
|
||||
processingTime=getattr(response, 'processingTime', None),
|
||||
bytesSent=getattr(response, 'bytesSent', None),
|
||||
bytesReceived=getattr(response, 'bytesReceived', None),
|
||||
errorCount=getattr(response, 'errorCount', None)
|
||||
)
|
||||
logger.debug(
|
||||
f"Billed model call: {priceCHF:.4f} CHF, "
|
||||
f"Billed model call: {basePriceCHF:.4f} CHF, "
|
||||
f"provider={provider}, model={modelName}, mandate={mandateId}"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"BILLING: Failed to record transaction! "
|
||||
f"Cost={priceCHF:.4f} CHF, user={user.id}, mandate={mandateId}, "
|
||||
f"Cost={basePriceCHF:.4f} CHF, user={user.id}, mandate={mandateId}, "
|
||||
f"provider={provider}, model={modelName}, error={e}"
|
||||
)
|
||||
|
||||
|
|
@ -723,40 +724,6 @@ detectedIntent-Werte:
|
|||
logger.warning(f"Error calculating effective providers: {e}")
|
||||
return None
|
||||
|
||||
def _storeAiCallStats(self, response, request: AiCallRequest) -> None:
|
||||
"""Store workflow stats after an AI call.
|
||||
|
||||
This method stores the AI call statistics (cost, processing time, bytes)
|
||||
to the workflow stats collection for tracking and billing purposes.
|
||||
|
||||
Args:
|
||||
response: AiCallResponse with cost/timing data
|
||||
request: Original AiCallRequest for context
|
||||
"""
|
||||
try:
|
||||
# Skip if no workflow context
|
||||
workflow = getattr(self.services, 'workflow', None)
|
||||
if not workflow or not hasattr(workflow, 'id') or not workflow.id:
|
||||
logger.debug("No workflow context - skipping stats storage")
|
||||
return
|
||||
|
||||
# Skip if response is an error
|
||||
if not response or getattr(response, 'errorCount', 0) > 0:
|
||||
logger.debug("Error response - skipping stats storage")
|
||||
return
|
||||
|
||||
# Determine process name from operation type
|
||||
opType = getattr(request.options, 'operationType', 'unknown') if request.options else 'unknown'
|
||||
process = f"ai.call.{opType}"
|
||||
|
||||
# Store the stat
|
||||
self.services.chat.storeWorkflowStat(workflow, response, process)
|
||||
logger.debug(f"Stored AI call stat: {process}, cost={getattr(response, 'priceCHF', 0):.4f} CHF")
|
||||
|
||||
except Exception as e:
|
||||
# Log but don't fail - stats storage is not critical
|
||||
logger.debug(f"Could not store AI call stat: {str(e)}")
|
||||
|
||||
async def ensureAiObjectsInitialized(self):
|
||||
"""Ensure aiObjects is initialized and submodules are ready."""
|
||||
if self.aiObjects is None:
|
||||
|
|
@@ -766,17 +733,17 @@ detectedIntent-Werte:
             self._initializeSubmodules()

     @classmethod
-    async def create(cls, legacy_services) -> "AiService":
-        """Create AiService from legacy Services hub. For backward compatibility with tests."""
+    async def create(cls, servicesHub) -> "AiService":
+        """Create AiService from a ServiceHub instance."""
         from modules.serviceCenter import getService
         from modules.serviceCenter.context import ServiceCenterContext
         ctx = ServiceCenterContext(
-            user=legacy_services.user,
-            mandate_id=legacy_services.mandateId,
-            feature_instance_id=legacy_services.featureInstanceId,
-            workflow=getattr(legacy_services, "workflow", None),
+            user=servicesHub.user,
+            mandate_id=servicesHub.mandateId,
+            feature_instance_id=servicesHub.featureInstanceId,
+            workflow=getattr(servicesHub, "workflow", None),
         )
-        return getService("ai", ctx, legacy_hub=legacy_services)
+        return getService("ai", ctx)

     # Helper methods

@@ -125,10 +125,11 @@ class AiCallLooper:
             logger.error(errorMsg)
             raise ValueError(errorMsg)

-        maxIterations = 50  # Prevent infinite loops
+        maxIterations = 10
         iteration = 0
-        allSections = []  # Accumulate all sections across iterations
-        lastRawResponse = None  # Store last raw JSON response for continuation
+        result = ""
+        allSections = []
+        lastRawResponse = None

         # JSON Base Iteration System:
         # - jsonBase: the merged JSON string (replaces accumulatedDirectJson array)
@ -261,35 +261,34 @@ class ContentExtractor:
|
|||
|
||||
# Check if it's standardized JSON format (has "documents" or "sections")
|
||||
if document.mimeType == "application/json":
|
||||
try:
|
||||
docBytes = self.services.interfaceDbComponent.getFileData(document.fileId)
|
||||
if docBytes:
|
||||
docBytes = self.services.interfaceDbComponent.getFileData(document.fileId)
|
||||
if docBytes:
|
||||
try:
|
||||
docData = docBytes.decode('utf-8')
|
||||
jsonData = json.loads(docData)
|
||||
|
||||
if isinstance(jsonData, dict) and ("documents" in jsonData or "sections" in jsonData):
|
||||
logger.info(f"Document is already in standardized JSON format, using as reference")
|
||||
# Create reference ContentPart for structured JSON
|
||||
contentPart = ContentPart(
|
||||
id=f"ref_{document.id}",
|
||||
label=f"Reference: {document.fileName}",
|
||||
typeGroup="structure",
|
||||
mimeType="application/json",
|
||||
data=docData,
|
||||
metadata={
|
||||
"contentFormat": "reference",
|
||||
"documentId": document.id,
|
||||
"documentReference": f"docItem:{document.id}:{document.fileName}",
|
||||
"skipExtraction": True,
|
||||
"intent": "reference"
|
||||
}
|
||||
)
|
||||
allContentParts.append(contentPart)
|
||||
logger.info(f"✅ Using JSON document directly without extraction")
|
||||
continue # Skip normal extraction for this document
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not parse JSON document {document.fileName}, will extract normally: {str(e)}")
|
||||
# Continue with normal extraction
|
||||
except (json.JSONDecodeError, UnicodeDecodeError) as e:
|
||||
logger.warning(f"Could not parse JSON document {document.fileName}: {str(e)}")
|
||||
jsonData = None
|
||||
|
||||
if isinstance(jsonData, dict) and ("documents" in jsonData or "sections" in jsonData):
|
||||
logger.info(f"Document is already in standardized JSON format, using as reference")
|
||||
contentPart = ContentPart(
|
||||
id=f"ref_{document.id}",
|
||||
label=f"Reference: {document.fileName}",
|
||||
typeGroup="structure",
|
||||
mimeType="application/json",
|
||||
data=docData,
|
||||
metadata={
|
||||
"contentFormat": "reference",
|
||||
"documentId": document.id,
|
||||
"documentReference": f"docItem:{document.id}:{document.fileName}",
|
||||
"skipExtraction": True,
|
||||
"intent": "reference"
|
||||
}
|
||||
)
|
||||
allContentParts.append(contentPart)
|
||||
logger.info(f"✅ Using JSON document directly without extraction")
|
||||
continue
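For orientation, the standardized format detected above only requires a top-level "documents" or "sections" key; a minimal illustrative shape (the nested fields are placeholders, not a schema taken from the code) would be:

standardizedDoc = {
    "sections": [
        {"id": "section_1", "title": "Overview", "elements": []}
    ]
}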
|
||||
|
||||
# Normal extraction path
|
||||
intent = getIntentForDocument(document.id, documentIntents)
@ -230,9 +230,12 @@ class DocumentIntentAnalyzer:
|
|||
else:
|
||||
logger.debug(f"JSON document {document.id} has no documentData (actionType={actionType})")
|
||||
|
||||
return None
|
||||
except (json.JSONDecodeError, UnicodeDecodeError) as e:
|
||||
logger.debug(f"Error parsing document {document.fileName}: {str(e)}")
|
||||
return None
|
||||
except Exception as e:
|
||||
logger.debug(f"Error resolving pre-extracted document {document.fileName}: {str(e)}")
|
||||
logger.error(f"Error resolving pre-extracted document {document.fileName}: {str(e)}")
|
||||
return None
|
||||
|
||||
def _buildIntentAnalysisPrompt(
@ -330,17 +330,7 @@ class JsonMergeLogger:
|
|||
except Exception as e:
|
||||
logger.error(f"Failed to write merge log file: {e}")
|
||||
else:
|
||||
# No log file set - write individual file (fallback)
|
||||
currentFileDir = os.path.dirname(os.path.abspath(__file__))
|
||||
logDir = currentFileDir
|
||||
os.makedirs(logDir, exist_ok=True)
|
||||
logFilePath = os.path.join(logDir, f"{mergeId}.txt")
|
||||
try:
|
||||
with open(logFilePath, 'w', encoding='utf-8') as f:
|
||||
f.write(logContent)
|
||||
logger.info(f"JSON merge log written to: {logFilePath}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to write merge log file: {e}")
|
||||
logger.debug(f"JSON merge {mergeId} completed ({len(logContent)} chars log). Use initializeLogFile() to persist merge logs.")
|
||||
|
||||
# Clear buffer for next merge
|
||||
JsonMergeLogger._logBuffer = []
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ class StructureFiller:
|
|||
"""Handles filling document structure with content."""
|
||||
|
||||
# Default concurrency limit for parallel generation (chapters/sections)
|
||||
DEFAULT_MAX_CONCURRENT_GENERATION = 16
|
||||
DEFAULT_MAX_CONCURRENT_GENERATION = 5
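For scale, and assuming sections take roughly similar time: with the hard limits used further down (maximum 5 sections per chapter, 7 chapters per document, 35 sections total), a concurrency limit of 5 means section generation runs in at most ceil(35 / 5) = 7 parallel waves of AI calls.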
|
||||
|
||||
def __init__(self, services, aiService):
|
||||
"""Initialize StructureFiller with service center and AI service access."""
|
||||
|
|
@ -568,11 +568,16 @@ class StructureFiller:
|
|||
all_sections_list: List[Dict[str, Any]],
|
||||
language: str,
|
||||
outputFormat: str = "txt",
|
||||
calculateOverallProgress: callable = None
|
||||
calculateOverallProgress: callable = None,
|
||||
preExtractedText: Optional[str] = None
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Process a single section and return its elements.
|
||||
Used for parallel processing of sections within a chapter.
|
||||
|
||||
When preExtractedText is provided, the section uses the pre-extracted
|
||||
content directly in its prompt instead of sending raw content parts
|
||||
through the heavy extraction pipeline (avoids chunking + N*M AI calls).
|
||||
"""
|
||||
sectionId = section.get("id")
|
||||
sectionTitle = section.get("title", sectionId)
|
||||
|
|
@ -600,6 +605,149 @@ class StructureFiller:
|
|||
|
||||
elements = []
|
||||
|
||||
# --- Fast path: use pre-extracted text instead of raw content parts ---
|
||||
if preExtractedText and useAiCall and generationHint:
|
||||
logger.info(
|
||||
f"Section {sectionId}: Using pre-extracted text "
|
||||
f"({len(preExtractedText):,} chars) - lightweight AI path"
|
||||
)
|
||||
|
||||
for partId in contentPartIds:
|
||||
part = self._findContentPartById(partId, contentParts)
|
||||
if not part:
|
||||
continue
|
||||
cf = contentFormats.get(partId, part.metadata.get("contentFormat"))
|
||||
if cf == "reference":
|
||||
elements.append({
|
||||
"type": "reference",
|
||||
"documentReference": part.metadata.get("documentReference"),
|
||||
"label": part.metadata.get("usageHint", part.label)
|
||||
})
|
||||
elif cf == "object":
|
||||
if part.typeGroup == "image" and part.data:
|
||||
caption = (
|
||||
section.get("caption")
|
||||
or section.get("metadata", {}).get("caption")
|
||||
or part.metadata.get("caption", "")
|
||||
)
|
||||
elements.append({
|
||||
"type": "image",
|
||||
"content": {
|
||||
"base64Data": part.data,
|
||||
"altText": part.metadata.get("usageHint", part.label),
|
||||
"caption": caption
|
||||
},
|
||||
"caption": caption
|
||||
})
|
||||
|
||||
generationPrompt, templateStructure = self._buildSectionGenerationPrompt(
|
||||
section=section,
|
||||
contentParts=[],
|
||||
userPrompt=userPrompt,
|
||||
generationHint=generationHint,
|
||||
allSections=all_sections_list,
|
||||
sectionIndex=sectionIndex,
|
||||
isAggregation=False,
|
||||
language=language,
|
||||
outputFormat=outputFormat,
|
||||
preExtractedText=preExtractedText
|
||||
)
|
||||
|
||||
sectionOperationId = f"{fillOperationId}_section_{sectionId}"
|
||||
self.services.chat.progressLogStart(
|
||||
sectionOperationId,
|
||||
"Section Generation (Pre-extracted)",
|
||||
f"Section {sectionIndex + 1}/{totalSections}",
|
||||
f"{sectionTitle} (pre-extracted)",
|
||||
parentOperationId=chapterOperationId
|
||||
)
|
||||
|
||||
try:
|
||||
self.services.chat.progressLogUpdate(sectionOperationId, 0.4, "Calling AI for content generation")
|
||||
|
||||
operationType = OperationTypeEnum.DATA_ANALYSE
|
||||
options = AiCallOptions(
|
||||
operationType=operationType,
|
||||
priority=PriorityEnum.BALANCED,
|
||||
processingMode=ProcessingModeEnum.DETAILED
|
||||
)
|
||||
|
||||
checkWorkflowStopped(self.services)
|
||||
aiResponseJson = await self.aiService.callAiWithLooping(
|
||||
prompt=generationPrompt,
|
||||
options=options,
|
||||
debugPrefix=f"{chapterId}_section_{sectionId}",
|
||||
promptBuilder=self.buildSectionPromptWithContinuation,
|
||||
promptArgs={
|
||||
"section": section,
|
||||
"contentParts": [],
|
||||
"userPrompt": userPrompt,
|
||||
"generationHint": generationHint,
|
||||
"allSections": all_sections_list,
|
||||
"sectionIndex": sectionIndex,
|
||||
"isAggregation": False,
|
||||
"templateStructure": templateStructure,
|
||||
"basePrompt": generationPrompt,
|
||||
"language": language
|
||||
},
|
||||
operationId=sectionOperationId,
|
||||
userPrompt=userPrompt,
|
||||
contentParts=None,
|
||||
useCaseId="section_content"
|
||||
)
|
||||
|
||||
try:
|
||||
from modules.shared.jsonUtils import tryParseJson, repairBrokenJson
|
||||
if isinstance(aiResponseJson, str) and ("---" in aiResponseJson or aiResponseJson.count("```json") > 1):
|
||||
generatedElements = self._extractAndMergeMultipleJsonBlocks(aiResponseJson, contentType, sectionId)
|
||||
else:
|
||||
parsedResponse, parseError, cleanedStr = tryParseJson(aiResponseJson)
|
||||
if parsedResponse is None:
|
||||
logger.warning(f"Section {sectionId}: tryParseJson failed, attempting repair")
|
||||
repairedStr = repairBrokenJson(aiResponseJson)
|
||||
parsedResponse, parseError2, _ = tryParseJson(repairedStr)
|
||||
|
||||
if parsedResponse and isinstance(parsedResponse, dict):
|
||||
generatedElements = parsedResponse.get("elements", [])
|
||||
elif parsedResponse and isinstance(parsedResponse, list):
|
||||
generatedElements = parsedResponse
|
||||
else:
|
||||
generatedElements = []
|
||||
except Exception as parseErr:
|
||||
logger.error(f"Section {sectionId}: JSON parse error: {parseErr}")
|
||||
generatedElements = []
|
||||
|
||||
self.services.chat.progressLogUpdate(sectionOperationId, 0.8, "Validating generated content")
|
||||
|
||||
class _AiResponse:
|
||||
def __init__(self, content):
|
||||
self.content = content
|
||||
|
||||
responseElements = await self._processAiResponseForSection(
|
||||
aiResponse=_AiResponse(aiResponseJson),
|
||||
contentType=contentType,
|
||||
operationType=operationType,
|
||||
sectionId=sectionId,
|
||||
generationHint=generationHint,
|
||||
generatedElements=generatedElements,
|
||||
section=section
|
||||
)
|
||||
elements.extend(responseElements)
|
||||
self.services.chat.progressLogFinish(sectionOperationId, True)
|
||||
|
||||
except Exception as e:
|
||||
self.services.chat.progressLogFinish(sectionOperationId, False)
|
||||
logger.error(f"Error in pre-extracted section {sectionId}: {e}")
|
||||
elements.append({
|
||||
"type": "error",
|
||||
"message": f"Error processing section {sectionId}: {str(e)}",
|
||||
"sectionId": sectionId
|
||||
})
|
||||
|
||||
return elements
|
||||
|
||||
# --- Standard path: process content parts directly ---
|
||||
|
||||
# Check whether aggregation is needed
|
||||
needsAggregation = self._needsAggregation(
|
||||
contentType=contentType,
@ -1507,6 +1655,156 @@ class StructureFiller:
|
|||
|
||||
return elements
|
||||
|
||||
async def _preExtractSharedContent(
|
||||
self,
|
||||
contentParts: List[ContentPart],
|
||||
allSectionTasks: List[Dict[str, Any]],
|
||||
userPrompt: str,
|
||||
parentOperationId: str
|
||||
) -> Dict[str, str]:
|
||||
"""
|
||||
Pre-extract content from large/shared content parts ONCE before parallel
|
||||
section filling. Returns dict mapping sectionId -> pre-extracted text.
|
||||
|
||||
Extracts a comprehensive plain-text summary per content part, then gives
|
||||
ALL sections referencing that part the SAME summary. Each section's own
|
||||
generationHint focuses the AI on the relevant aspect during generation.
|
||||
|
||||
This eliminates the N x M AI-call explosion in which each of N sections independently
chunks the same large content part into M chunks and pushes every chunk through the extraction pipeline.
|
||||
"""
|
||||
SIZE_THRESHOLD = 100_000
|
||||
MIN_SHARED_SECTIONS = 2
|
||||
|
||||
partToSections: Dict[str, List[Dict[str, Any]]] = {}
|
||||
for task in allSectionTasks:
|
||||
section = task["section"]
|
||||
for partId in section.get("contentPartIds", []):
|
||||
if partId not in partToSections:
|
||||
partToSections[partId] = []
|
||||
partToSections[partId].append(section)
|
||||
|
||||
if not partToSections:
|
||||
return {}
|
||||
|
||||
preExtractedCache: Dict[str, str] = {}
|
||||
|
||||
for partId, sections in partToSections.items():
|
||||
part = self._findContentPartById(partId, contentParts)
|
||||
if not part:
|
||||
continue
|
||||
|
||||
contentFormat = part.metadata.get("contentFormat", "unknown")
|
||||
if contentFormat != "extracted":
|
||||
continue
|
||||
|
||||
if part.typeGroup in ("image", "binary"):
|
||||
continue
|
||||
if part.mimeType and (
|
||||
part.mimeType.startswith("image/")
|
||||
or part.mimeType.startswith("video/")
|
||||
or part.mimeType.startswith("audio/")
|
||||
):
|
||||
continue
|
||||
|
||||
partSize = len(part.data) if part.data else 0
|
||||
numSections = len(sections)
|
||||
|
||||
if numSections < MIN_SHARED_SECTIONS and partSize < SIZE_THRESHOLD:
|
||||
continue
|
||||
|
||||
fileName = part.metadata.get("originalFileName", partId)
|
||||
logger.info(
|
||||
f"Pre-extracting content part {partId} "
|
||||
f"({partSize:,} bytes, referenced by {numSections} sections)"
|
||||
)
|
||||
|
||||
topicLines = []
|
||||
for section in sections:
|
||||
hint = (
|
||||
section.get("generationHint")
|
||||
or section.get("generation_hint")
|
||||
or section.get("title", "")
|
||||
)
|
||||
topicLines.append(f"- {hint}")
|
||||
topicsText = "\n".join(topicLines)
|
||||
|
||||
extractionPrompt = (
|
||||
"# TASK: Extract key information from this document\n\n"
|
||||
"Extract ALL relevant information from the provided content as "
|
||||
"plain text. The extracted content will be used to generate a report "
|
||||
"covering the topics listed below.\n\n"
|
||||
f"## User Request\n{userPrompt}\n\n"
|
||||
f"## Report topics that need data\n{topicsText}\n\n"
|
||||
"## Instructions\n"
|
||||
"- Extract key facts, data points, timestamps, error messages, "
|
||||
"statistics, and specific findings\n"
|
||||
"- Organize by theme but output as PLAIN TEXT (not JSON)\n"
|
||||
"- Be comprehensive but concise - include specific data, "
|
||||
"skip generic filler\n"
|
||||
"- Include concrete examples with exact values from the source\n"
|
||||
"- Do NOT add commentary or analysis - just extract the raw data\n"
|
||||
)
|
||||
|
||||
try:
|
||||
self.services.chat.progressLogUpdate(
|
||||
parentOperationId, 0.05,
|
||||
f"Pre-extracting content from {fileName} ({partSize:,} bytes)..."
|
||||
)
|
||||
|
||||
def _preExtractionProgress(chunkProgress, message):
|
||||
mapped = 0.05 + chunkProgress * 0.05
|
||||
self.services.chat.progressLogUpdate(
|
||||
parentOperationId, mapped,
|
||||
f"Pre-extraction: {message}"
|
||||
)
|
||||
|
||||
request = AiCallRequest(
|
||||
prompt=extractionPrompt,
|
||||
contentParts=[part],
|
||||
options=AiCallOptions(
|
||||
operationType=OperationTypeEnum.DATA_ANALYSE,
|
||||
priority=PriorityEnum.BALANCED,
|
||||
processingMode=ProcessingModeEnum.DETAILED
|
||||
)
|
||||
)
|
||||
|
||||
checkWorkflowStopped(self.services)
|
||||
response = await self.aiService.callAi(request, progressCallback=_preExtractionProgress)
|
||||
responseText = response.content if hasattr(response, "content") else str(response)
|
||||
|
||||
if responseText and len(responseText.strip()) > 50:
|
||||
for section in sections:
|
||||
sId = section.get("id", "unknown")
|
||||
preExtractedCache[sId] = responseText
|
||||
logger.info(
|
||||
f"Pre-extraction of {partId} successful: "
|
||||
f"{len(responseText):,} chars summary for {numSections} sections"
|
||||
)
|
||||
self.services.chat.progressLogUpdate(
|
||||
parentOperationId, 0.10,
|
||||
f"Pre-extraction complete ({len(responseText):,} chars). Starting section generation..."
|
||||
)
|
||||
else:
|
||||
logger.warning(
|
||||
f"Pre-extraction of {partId} returned empty/short response "
|
||||
f"({len(responseText) if responseText else 0} chars), "
|
||||
"sections will fall back to direct extraction"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Pre-extraction of {partId} failed: {e}. "
|
||||
"Sections will fall back to direct extraction."
|
||||
)
|
||||
|
||||
if preExtractedCache:
|
||||
logger.info(
|
||||
f"Pre-extraction complete: {len(preExtractedCache)} sections "
|
||||
"have pre-extracted content (will use lightweight AI path)"
|
||||
)
|
||||
|
||||
return preExtractedCache
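To make the saving concrete with illustrative numbers: if one large content part is referenced by 10 sections and the extraction pipeline would split it into 12 chunks, the direct path costs roughly 10 x 12 = 120 extraction calls plus the generation calls, while the pre-extracted path processes those 12 chunks once and then makes 10 lightweight generation calls - on the order of 22 calls instead of 130.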
|
||||
|
||||
async def _fillChapterSections(
|
||||
self,
|
||||
chapterStructure: Dict[str, Any],
@ -1564,27 +1862,42 @@ class StructureFiller:
|
|||
"docFormat": docFormat # Include output format
|
||||
})
|
||||
|
||||
MAX_TOTAL_SECTIONS = 35
|
||||
if totalSections > MAX_TOTAL_SECTIONS:
|
||||
logger.warning(
|
||||
f"Structure has {totalSections} sections (limit {MAX_TOTAL_SECTIONS}). "
|
||||
"Truncating to stay within budget."
|
||||
)
|
||||
allSectionTasks = allSectionTasks[:MAX_TOTAL_SECTIONS]
|
||||
totalSections = len(allSectionTasks)
|
||||
|
||||
preExtractedCache = await self._preExtractSharedContent(
|
||||
contentParts, allSectionTasks, userPrompt, fillOperationId
|
||||
)
|
||||
|
||||
logger.info(f"Starting FULLY PARALLEL section generation: {totalSections} sections across {totalChapters} chapters")
|
||||
|
||||
# Create task wrapper for each section with progress tracking
|
||||
async def processSectionWithSemaphore(taskInfo):
|
||||
checkWorkflowStopped(self.services)
|
||||
sectionId = taskInfo["section"].get("id", "unknown")
|
||||
async with sectionSemaphore:
|
||||
result = await self._processSingleSection(
|
||||
section=taskInfo["section"],
|
||||
sectionIndex=taskInfo["sectionIndex"],
|
||||
totalSections=taskInfo["chapterSectionCount"],
|
||||
chapterIndex=0, # Not used for sequential logic anymore
|
||||
chapterIndex=0,
|
||||
totalChapters=totalChapters,
|
||||
chapterId=taskInfo["chapterId"],
|
||||
chapterOperationId=fillOperationId, # Use fillOperationId as parent (no chapter-level ops in parallel mode)
|
||||
chapterOperationId=fillOperationId,
|
||||
fillOperationId=fillOperationId,
|
||||
contentParts=contentParts,
|
||||
userPrompt=userPrompt,
|
||||
all_sections_list=all_sections_list,
|
||||
language=taskInfo["docLanguage"],
|
||||
outputFormat=taskInfo.get("docFormat", "txt"), # Pass output format
|
||||
calculateOverallProgress=lambda *args: completedSections[0] / totalSections if totalSections > 0 else 1.0
|
||||
outputFormat=taskInfo.get("docFormat", "txt"),
|
||||
calculateOverallProgress=lambda *args: completedSections[0] / totalSections if totalSections > 0 else 1.0,
|
||||
preExtractedText=preExtractedCache.get(sectionId)
|
||||
)
|
||||
|
||||
# Update progress after each section completes
|
||||
|
|
@ -1810,6 +2123,7 @@ GENERATION HINT: {generationHint}
|
|||
- Each section should serve a clear purpose with meaningful data
|
||||
- If no relevant data exists for a topic, do NOT create a section for it
|
||||
- Prefer ONE comprehensive section over multiple sparse sections
|
||||
- HARD LIMIT: Maximum 5 sections per chapter. Combine related subtopics into single sections to stay within this limit.
|
||||
|
||||
**CRITICAL**: The chapter's generationHint above describes what content this chapter should generate. If the generationHint references documents/images/data, then EACH section that generates content for this chapter MUST assign the relevant ContentParts from AVAILABLE CONTENT PARTS below.
|
||||
|
||||
|
|
@ -1893,7 +2207,8 @@ Return only valid JSON. Do not include any explanatory text outside the JSON.
|
|||
sectionIndex: Optional[int] = None,
|
||||
isAggregation: bool = False,
|
||||
language: str = "en",
|
||||
outputFormat: str = "txt"
|
||||
outputFormat: str = "txt",
|
||||
preExtractedText: Optional[str] = None
|
||||
) -> tuple[str, str]:
"""Build the section-generation prompt with full context."""
|
||||
# Filter out None values
|
||||
|
|
@ -2057,7 +2372,7 @@ LANGUAGE: Generate all content in {language.upper()} language. All text, titles,
|
|||
5. For table: Extract all rows from the context. Return {{"headers": [...], "rows": []}} only if no data exists.
|
||||
6. Format based on content_type ({effectiveContentType}).
|
||||
7. No HTML/styling: Plain text only, no markup.
|
||||
8. CONTINUE UNTIL COMPLETE: Extract ALL data from the provided context. Do NOT stop early because you think the response might be too long. Do NOT truncate or abbreviate. Do not impose artificial limits on yourself.
|
||||
8. Focus on the MOST RELEVANT information for this section's topic. Extract key facts, data, and findings. Omit redundant, repetitive, or tangential content.
|
||||
|
||||
|
||||
## OUTPUT FORMAT
|
||||
|
|
@ -2083,6 +2398,62 @@ Output requirements:
|
|||
{userPrompt}
|
||||
```
|
||||
|
||||
## CONTEXT
|
||||
{contextText if contextText else ""}
|
||||
"""
|
||||
elif preExtractedText:
|
||||
prompt = f"""# TASK: Generate Section Content from Pre-Extracted Data
|
||||
|
||||
LANGUAGE: Generate all content in {language.upper()} language. All text, titles, headings, paragraphs, and content must be written in {language.upper()}.
|
||||
|
||||
## SECTION METADATA
|
||||
- Section ID: {sectionId}
|
||||
- Content Type: {effectiveContentType}
|
||||
- Generation Hint: {generationHint}{formatNoteAggr}
|
||||
|
||||
## CONTENT EFFICIENCY PRINCIPLES
|
||||
- Generate COMPACT content: Focus on essential facts only
|
||||
- AVOID verbose text, filler phrases, or redundant explanations
|
||||
- Be CONCISE and direct - every word should add value
|
||||
- NO introductory phrases like "This section describes..." or "Here we present..."
|
||||
- Minimize output size for efficient processing
|
||||
|
||||
## PRE-EXTRACTED CONTENT FOR THIS SECTION
|
||||
```
|
||||
{preExtractedText}
|
||||
```
|
||||
|
||||
## INSTRUCTIONS
|
||||
1. Use ONLY the pre-extracted content above. Never invent or generate data not present in it.
|
||||
2. If the pre-extracted content is empty, return empty structures.
|
||||
3. Format based on content_type ({effectiveContentType}).
|
||||
4. Return only valid JSON with "elements" array.
|
||||
5. No HTML/styling: Plain text only, no markup.
|
||||
6. Focus on the MOST RELEVANT information. Be concise.
|
||||
|
||||
## OUTPUT FORMAT
|
||||
Return a JSON object with this structure:
|
||||
|
||||
{{
|
||||
"elements": [
|
||||
{{
|
||||
"type": "{effectiveContentType}",
|
||||
"content": {contentStructureExample}
|
||||
}}
|
||||
]
|
||||
}}
|
||||
|
||||
Output requirements:
|
||||
- "content" must be an object (never a string)
|
||||
- Return only valid JSON - no text before, no text after, no comments, no explanations, no markdown code fences
|
||||
- Start with {{ and end with }} - return ONLY the JSON object itself
|
||||
- No invented data: Return empty structures if pre-extracted content is empty
|
||||
|
||||
## USER REQUEST
|
||||
```
|
||||
{userPrompt}
|
||||
```
|
||||
|
||||
## CONTEXT
|
||||
{contextText if contextText else ""}
|
||||
"""
|
||||
|
|
@ -2117,7 +2488,7 @@ LANGUAGE: Generate all content in {language.upper()} language. All text, titles,
|
|||
3. Format based on content_type ({effectiveContentType}).
|
||||
4. Return only valid JSON with "elements" array.
|
||||
5. No HTML/styling: Plain text only, no markup.
|
||||
6. CONTINUE UNTIL COMPLETE: Extract ALL data from the provided context. Do NOT stop early because you think the response might be too long. Do NOT truncate or abbreviate. Do not impose artificial limits on yourself.
|
||||
6. Focus on the MOST RELEVANT information for this section's topic. Extract key facts, data, and findings. Omit redundant, repetitive, or tangential content.
|
||||
|
||||
## OUTPUT FORMAT
|
||||
Return a JSON object with this structure:
|
||||
|
|
|
|||
|
|
@ -430,6 +430,7 @@ Then chapters that generate those generic content types MUST assign the relevant
|
|||
## CHAPTER STRUCTURE REQUIREMENTS
|
||||
- Generate chapters based on USER REQUEST - analyze what structure the user wants
|
||||
- Create ONLY the minimum chapters needed to cover the user's request - avoid over-structuring
|
||||
- HARD LIMIT: Maximum 7 chapters per document. If the topic can be covered in fewer, prefer fewer. Combine related topics into single chapters rather than creating many small ones.
|
||||
- IMPORTANT: Each chapter MUST have ALL these fields:
|
||||
- id: Unique identifier (e.g., "chapter_1")
|
||||
- level: Heading level (1, 2, 3, etc.)
|
||||
|
|
|
|||
|
|
@ -205,36 +205,20 @@ class BillingService:
|
|||
workflowId: str = None,
|
||||
aicoreProvider: str = None,
|
||||
aicoreModel: str = None,
|
||||
description: str = None
|
||||
description: str = None,
|
||||
processingTime: float = None,
|
||||
bytesSent: int = None,
|
||||
bytesReceived: int = None,
|
||||
errorCount: int = None
|
||||
) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Record AI usage cost as a billing transaction.
|
||||
|
||||
This method:
|
||||
1. Applies the pricing markup
|
||||
2. Creates a DEBIT transaction
|
||||
3. Updates the account balance
|
||||
|
||||
Args:
|
||||
priceCHF: Base price from AI model (before markup)
|
||||
workflowId: Optional workflow ID
|
||||
aicoreProvider: AICore provider name (e.g., 'anthropic', 'openai')
|
||||
aicoreModel: AICore model name (e.g., 'claude-4-sonnet', 'gpt-4o')
|
||||
description: Optional description
|
||||
|
||||
Returns:
|
||||
Created transaction dict or None if not recorded
|
||||
"""
|
||||
"""Record AI usage cost as a billing transaction with markup applied."""
|
||||
if priceCHF <= 0:
|
||||
return None
|
||||
|
||||
# Apply markup
|
||||
finalPrice = self.calculatePriceWithMarkup(priceCHF)
|
||||
|
||||
if finalPrice <= 0:
|
||||
return None
|
||||
|
||||
# Build description
|
||||
if not description:
|
||||
description = f"AI Usage: {aicoreModel or aicoreProvider or 'unknown'}"
|
||||
|
||||
|
|
@ -247,9 +231,17 @@ class BillingService:
|
|||
featureCode=self.featureCode,
|
||||
aicoreProvider=aicoreProvider,
|
||||
aicoreModel=aicoreModel,
|
||||
description=description
|
||||
description=description,
|
||||
processingTime=processingTime,
|
||||
bytesSent=bytesSent,
|
||||
bytesReceived=bytesReceived,
|
||||
errorCount=errorCount
|
||||
)
|
||||
|
||||
def getWorkflowCost(self, workflowId: str) -> float:
|
||||
"""Get total cost for a workflow from billing transactions."""
|
||||
return self._billingInterface.getWorkflowCost(workflowId)
|
||||
|
||||
# =========================================================================
|
||||
# Provider Permission Check (via RBAC)
|
||||
# =========================================================================
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@
|
|||
import logging
|
||||
from typing import Dict, Any, List, Optional, Callable
|
||||
from modules.datamodels.datamodelUam import User, UserConnection
|
||||
from modules.datamodels.datamodelChat import ChatDocument, ChatMessage, ChatStat, ChatLog
|
||||
from modules.datamodels.datamodelChat import ChatDocument, ChatMessage, ChatLog
|
||||
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum, PriorityEnum, ProcessingModeEnum
|
||||
from modules.shared.progressLogger import ProgressLogger
|
||||
|
||||
|
|
@ -688,35 +688,6 @@ class ChatService:
|
|||
workflow.logs.append(chatLog)
|
||||
return chatLog
|
||||
|
||||
def storeWorkflowStat(self, workflow: Any, aiResponse: Any, process: str) -> ChatStat:
|
||||
"""Persist workflow-level ChatStat from AiCallResponse and append to workflow stats list.
|
||||
|
||||
Billing is handled at the AI call source (interfaceAiObjects._callWithModel)
|
||||
via billingCallback - not here. This method only handles workflow stats.
|
||||
"""
|
||||
try:
|
||||
statData = {
|
||||
"workflowId": workflow.id,
|
||||
"process": process,
|
||||
"engine": aiResponse.modelName,
|
||||
"priceCHF": aiResponse.priceCHF,
|
||||
"processingTime": aiResponse.processingTime,
|
||||
"bytesSent": aiResponse.bytesSent,
|
||||
"bytesReceived": aiResponse.bytesReceived,
|
||||
"errorCount": aiResponse.errorCount
|
||||
}
|
||||
|
||||
stat = self.interfaceDbChat.createStat(statData)
|
||||
|
||||
if not hasattr(workflow, 'stats') or workflow.stats is None:
|
||||
workflow.stats = []
|
||||
workflow.stats.append(stat)
|
||||
|
||||
return stat
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to store workflow stat: {e}")
|
||||
raise
|
||||
|
||||
def updateMessage(self, messageId: str, messageData: Dict[str, Any]):
|
||||
"""Update message by delegating to the chat interface"""
|
||||
try:
|
||||
|
|
|
|||
|
|
@ -2,90 +2,147 @@
|
|||
# All rights reserved.
|
||||
from typing import Any, Dict, List
|
||||
import json
|
||||
import logging
|
||||
|
||||
from modules.datamodels.datamodelExtraction import ContentPart
|
||||
from ..subRegistry import Chunker
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class StructureChunker(Chunker):
|
||||
def chunk(self, part: ContentPart, options: Dict[str, Any]) -> list[Dict[str, Any]]:
|
||||
maxBytes = int(options.get("structureChunkSize", 40000))
|
||||
data = part.data or ""
|
||||
# best-effort: try JSON list/object bucketing; else fallback to line-based
|
||||
chunks: List[Dict[str, Any]] = []
|
||||
|
||||
try:
|
||||
obj = json.loads(data)
|
||||
def emit(bucket: Any):
|
||||
text = json.dumps(bucket, ensure_ascii=False)
|
||||
chunks.append({"data": text, "size": len(text.encode('utf-8')), "order": len(chunks)})
|
||||
if isinstance(obj, list):
|
||||
bucket: list[Any] = []
|
||||
size = 0
|
||||
for item in obj:
|
||||
text = json.dumps(item, ensure_ascii=False)
|
||||
s = len(text.encode('utf-8'))
|
||||
if size + s > maxBytes and bucket:
|
||||
emit(bucket)
|
||||
bucket = [item]
|
||||
size = s
|
||||
else:
|
||||
bucket.append(item)
|
||||
size += s
|
||||
if bucket:
|
||||
emit(bucket)
|
||||
else:
|
||||
# JSON object (dict) - check if it fits
|
||||
text = json.dumps(obj, ensure_ascii=False)
|
||||
textSize = len(text.encode('utf-8'))
|
||||
if textSize <= maxBytes:
|
||||
emit(obj)
|
||||
else:
|
||||
# Object too large - try to split by keys if possible
|
||||
# For large objects, we need to chunk by character boundaries
|
||||
# since we can't split JSON objects arbitrarily
|
||||
if isinstance(obj, dict) and len(obj) > 1:
|
||||
# Try to split object into multiple chunks by keys
|
||||
# This preserves JSON structure better than line-based chunking
|
||||
currentChunk: Dict[str, Any] = {}
|
||||
currentSize = 2 # Start with "{}" overhead
|
||||
for key, value in obj.items():
|
||||
itemText = json.dumps({key: value}, ensure_ascii=False)
|
||||
itemSize = len(itemText.encode('utf-8'))
|
||||
# Account for comma and spacing between items
|
||||
if currentChunk:
|
||||
itemSize += 2 # ", " separator
|
||||
|
||||
if currentSize + itemSize > maxBytes and currentChunk:
|
||||
# Current chunk is full, emit it
|
||||
emit(currentChunk)
|
||||
currentChunk = {key: value}
|
||||
currentSize = len(itemText.encode('utf-8'))
|
||||
else:
|
||||
currentChunk[key] = value
|
||||
currentSize += itemSize
|
||||
|
||||
# Emit remaining chunk
|
||||
if currentChunk:
|
||||
emit(currentChunk)
|
||||
else:
|
||||
# Single large value or can't split - fallback to line chunking
|
||||
raise ValueError("too large")
|
||||
except Exception:
|
||||
current: List[str] = []
|
||||
size = 0
|
||||
for line in data.split('\n'):
|
||||
s = len(line.encode('utf-8')) + 1
|
||||
if size + s > maxBytes and current:
|
||||
text = '\n'.join(current)
|
||||
chunks.append({"data": text, "size": len(text.encode('utf-8')), "order": len(chunks)})
|
||||
current = [line]
|
||||
size = s
|
||||
else:
|
||||
current.append(line)
|
||||
size += s
|
||||
if current:
|
||||
text = '\n'.join(current)
|
||||
chunks.append({"data": text, "size": len(text.encode('utf-8')), "order": len(chunks)})
|
||||
self._chunkValue(obj, maxBytes, chunks)
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
self._chunkByLines(data, maxBytes, chunks)
|
||||
|
||||
return chunks
|
||||
|
||||
def _chunkValue(self, obj: Any, maxBytes: int, chunks: List[Dict[str, Any]]):
|
||||
"""Recursively chunk a JSON value (list or dict) into pieces <= maxBytes."""
|
||||
text = json.dumps(obj, ensure_ascii=False)
|
||||
if len(text.encode('utf-8')) <= maxBytes:
|
||||
self._emit(obj, chunks)
|
||||
return
|
||||
|
||||
if isinstance(obj, list):
|
||||
self._chunkList(obj, maxBytes, chunks)
|
||||
elif isinstance(obj, dict):
|
||||
self._chunkDict(obj, maxBytes, chunks)
|
||||
else:
|
||||
self._chunkByLines(text, maxBytes, chunks)
|
||||
|
||||
def _chunkList(self, items: list, maxBytes: int, chunks: List[Dict[str, Any]]):
|
||||
"""Split a JSON array into sub-arrays that each fit within maxBytes."""
|
||||
bucket: list = []
|
||||
bucketSize = 2 # "[]" overhead
|
||||
|
||||
for item in items:
|
||||
itemText = json.dumps(item, ensure_ascii=False)
|
||||
itemSize = len(itemText.encode('utf-8'))
|
||||
separator = 2 if bucket else 0 # ", "
|
||||
|
||||
if bucketSize + itemSize + separator > maxBytes and bucket:
|
||||
self._emit(bucket, chunks)
|
||||
bucket = []
|
||||
bucketSize = 2
|
||||
separator = 0
|
||||
|
||||
if itemSize + 2 > maxBytes:
|
||||
if bucket:
|
||||
self._emit(bucket, chunks)
|
||||
bucket = []
|
||||
bucketSize = 2
|
||||
self._chunkValue(item, maxBytes, chunks)
|
||||
else:
|
||||
bucket.append(item)
|
||||
bucketSize += itemSize + separator
|
||||
|
||||
if bucket:
|
||||
self._emit(bucket, chunks)
|
||||
|
||||
def _chunkDict(self, obj: dict, maxBytes: int, chunks: List[Dict[str, Any]]):
|
||||
"""Split a JSON object by keys. If a single key's value exceeds maxBytes, recurse into it."""
|
||||
if len(obj) <= 1:
|
||||
key, value = next(iter(obj.items()))
|
||||
if isinstance(value, (list, dict)):
|
||||
self._chunkSingleKeyValue(key, value, maxBytes, chunks)
|
||||
else:
|
||||
text = json.dumps(obj, ensure_ascii=False)
|
||||
self._chunkByLines(text, maxBytes, chunks)
|
||||
return
|
||||
|
||||
currentChunk: Dict[str, Any] = {}
|
||||
currentSize = 2 # "{}" overhead
|
||||
|
||||
for key, value in obj.items():
|
||||
itemText = json.dumps({key: value}, ensure_ascii=False)
|
||||
itemSize = len(itemText.encode('utf-8'))
|
||||
separator = 2 if currentChunk else 0
|
||||
|
||||
if currentSize + itemSize + separator > maxBytes and currentChunk:
|
||||
self._emit(currentChunk, chunks)
|
||||
currentChunk = {}
|
||||
currentSize = 2
|
||||
separator = 0
|
||||
|
||||
if itemSize + 2 > maxBytes:
|
||||
if currentChunk:
|
||||
self._emit(currentChunk, chunks)
|
||||
currentChunk = {}
|
||||
currentSize = 2
|
||||
if isinstance(value, (list, dict)):
|
||||
self._chunkSingleKeyValue(key, value, maxBytes, chunks)
|
||||
else:
|
||||
self._chunkByLines(itemText, maxBytes, chunks)
|
||||
else:
|
||||
currentChunk[key] = value
|
||||
currentSize += itemSize + separator
|
||||
|
||||
if currentChunk:
|
||||
self._emit(currentChunk, chunks)
|
||||
|
||||
def _chunkSingleKeyValue(self, key: str, value: Any, maxBytes: int, chunks: List[Dict[str, Any]]):
|
||||
"""Handle a single dict key whose value is too large. Wraps sub-chunks back in {key: subChunk}."""
|
||||
subChunks: List[Dict[str, Any]] = []
|
||||
self._chunkValue(value, maxBytes, subChunks)
|
||||
|
||||
for sub in subChunks:
|
||||
subData = json.loads(sub["data"])
|
||||
wrapped = {key: subData}
|
||||
wrappedText = json.dumps(wrapped, ensure_ascii=False)
|
||||
wrappedSize = len(wrappedText.encode('utf-8'))
|
||||
if wrappedSize <= maxBytes:
|
||||
self._emit(wrapped, chunks)
|
||||
else:
|
||||
self._chunkByLines(wrappedText, maxBytes, chunks)
|
||||
|
||||
def _emit(self, bucket: Any, chunks: List[Dict[str, Any]]):
|
||||
text = json.dumps(bucket, ensure_ascii=False)
|
||||
chunks.append({"data": text, "size": len(text.encode('utf-8')), "order": len(chunks)})
|
||||
|
||||
def _chunkByLines(self, data: str, maxBytes: int, chunks: List[Dict[str, Any]]):
|
||||
"""Line-based fallback for content that cannot be split structurally."""
|
||||
current: List[str] = []
|
||||
size = 0
|
||||
for line in data.split('\n'):
|
||||
s = len(line.encode('utf-8')) + 1
|
||||
if size + s > maxBytes and current:
|
||||
text = '\n'.join(current)
|
||||
chunks.append({"data": text, "size": len(text.encode('utf-8')), "order": len(chunks)})
|
||||
current = [line]
|
||||
size = s
|
||||
else:
|
||||
current.append(line)
|
||||
size += s
|
||||
if current:
|
||||
text = '\n'.join(current)
|
||||
chunks.append({"data": text, "size": len(text.encode('utf-8')), "order": len(chunks)})
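A minimal usage sketch of the chunker above. The sample payload, the 8000-byte limit, and the bare StructureChunker() construction are assumptions for illustration; ContentPart and the "structureChunkSize" option are the ones used in the code.

import json
from modules.datamodels.datamodelExtraction import ContentPart

# A structured part whose JSON body is far larger than the target chunk size.
payload = {"records": [{"index": i, "text": "x" * 200} for i in range(500)]}
part = ContentPart(
    id="demo_part",
    label="demo",
    typeGroup="structure",
    mimeType="application/json",
    data=json.dumps(payload, ensure_ascii=False),
)

chunks = StructureChunker().chunk(part, {"structureChunkSize": 8000})
# Each entry is {"data": <json string>, "size": <bytes>, "order": <int>};
# list items are bucketed so chunks stay close to the configured limit and,
# for list-shaped payloads like this one, each chunk parses as JSON on its own.
for c in chunks[:3]:
    print(c["order"], c["size"])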
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -243,11 +243,7 @@ class ExtractionService:
|
|||
errorCount=0
|
||||
)
|
||||
|
||||
self._get_service("chat").storeWorkflowStat(
|
||||
self._context.workflow,
|
||||
aiResponse,
|
||||
f"extraction.process.{doc.mimeType}"
|
||||
)
|
||||
# Cost is recorded via billingCallback in _callWithModel
|
||||
|
||||
# Write extraction results to debug file
|
||||
try:
|
||||
|
|
@ -1230,15 +1226,52 @@ class ExtractionService:
|
|||
logger.info(f"Chunking {contentPart.typeGroup} part: contentSize={contentSize} bytes, textChunkSize={textChunkSize} bytes, structureChunkSize={structureChunkSize} bytes")
|
||||
chunks = chunker.chunk(contentPart, chunkingOptions)
|
||||
logger.info(f"Created {len(chunks)} chunks for {contentPart.typeGroup} part (contentSize={contentSize} bytes)")
|
||||
if chunks:
|
||||
for i, chunk in enumerate(chunks):
|
||||
chunkSize = len(chunk.get('data', '').encode('utf-8')) if chunk.get('data') else 0
|
||||
logger.info(f" Chunk {i+1}/{len(chunks)}: {chunkSize} bytes")
|
||||
return chunks
|
||||
|
||||
# Post-chunking validation: force line-based split on any chunk still exceeding target
|
||||
validatedChunks = []
|
||||
for i, chunk in enumerate(chunks):
|
||||
chunkData = chunk.get('data', '')
|
||||
chunkSize = len(chunkData.encode('utf-8')) if chunkData else 0
|
||||
if chunkSize > availableContentBytes and chunkData:
|
||||
logger.warning(f" Chunk {i+1}/{len(chunks)}: {chunkSize} bytes exceeds target {availableContentBytes} bytes, force-splitting by lines")
|
||||
subChunks = self._forceLineSplit(chunkData, availableContentBytes, len(validatedChunks))
|
||||
validatedChunks.extend(subChunks)
|
||||
else:
|
||||
chunk["order"] = len(validatedChunks)
|
||||
validatedChunks.append(chunk)
|
||||
|
||||
if len(validatedChunks) != len(chunks):
|
||||
logger.info(f"Post-chunking validation: {len(chunks)} -> {len(validatedChunks)} chunks after force-splitting oversized chunks")
|
||||
|
||||
for i, chunk in enumerate(validatedChunks):
|
||||
chunkSize = len(chunk.get('data', '').encode('utf-8')) if chunk.get('data') else 0
|
||||
logger.info(f" Chunk {i+1}/{len(validatedChunks)}: {chunkSize} bytes")
|
||||
|
||||
return validatedChunks
|
||||
except Exception as e:
|
||||
logger.error(f"Chunking failed for {contentPart.typeGroup}: {str(e)}")
|
||||
return []
|
||||
|
||||
def _forceLineSplit(self, data: str, maxBytes: int, startOrder: int) -> List[Dict[str, Any]]:
|
||||
"""Line-based safety-net split for chunks that still exceed maxBytes after structured chunking."""
|
||||
chunks: List[Dict[str, Any]] = []
|
||||
current: List[str] = []
|
||||
size = 0
|
||||
for line in data.split('\n'):
|
||||
s = len(line.encode('utf-8')) + 1
|
||||
if size + s > maxBytes and current:
|
||||
text = '\n'.join(current)
|
||||
chunks.append({"data": text, "size": len(text.encode('utf-8')), "order": startOrder + len(chunks)})
|
||||
current = [line]
|
||||
size = s
|
||||
else:
|
||||
current.append(line)
|
||||
size += s
|
||||
if current:
|
||||
text = '\n'.join(current)
|
||||
chunks.append({"data": text, "size": len(text.encode('utf-8')), "order": startOrder + len(chunks)})
|
||||
return chunks
|
||||
|
||||
async def processContentPartWithFallback(self, contentPart, prompt: str, options, failoverModelList, aiObjects, progressCallback=None) -> AiCallResponse:
|
||||
"""Process a single content part with model-aware chunking and fallback.
|
||||
|
||||
|
|
@ -1386,73 +1419,210 @@ class ExtractionService:
|
|||
logger.warning(f"⚠️ Content part ({contentTokens:.0f} tokens est.) exceeds available space ({availableContentBytes/TOKEN_SAFETY_FACTOR:.0f} tokens est.), chunking required")
|
||||
|
||||
# If either condition fails, chunk the content
|
||||
# CRITICAL: IMAGE_GENERATE operations should NOT use chunking - they generate images from prompts, not process content chunks
|
||||
# CRITICAL: IMAGE_GENERATE operations should NOT use chunking
|
||||
if (totalTokens > maxTotalTokens or partSize > availableContentBytes) and options.operationType != OperationTypeEnum.IMAGE_GENERATE:
|
||||
# Part too large or total exceeds limit - chunk it (but not for image generation)
|
||||
chunks = await self.chunkContentPartForAi(contentPart, model, options, prompt)
|
||||
if not chunks:
|
||||
raise ValueError(f"Failed to chunk content part for model {model.name}")
|
||||
|
||||
logger.info(f"Starting to process {len(chunks)} chunks with model {model.name}")
|
||||
|
||||
if progressCallback:
|
||||
progressCallback(0.0, f"Starting to process {len(chunks)} chunks")
|
||||
|
||||
chunkResults = []
|
||||
for idx, chunk in enumerate(chunks):
|
||||
chunkNum = idx + 1
|
||||
chunkData = chunk.get('data', '')
|
||||
logger.info(f"Processing chunk {chunkNum}/{len(chunks)} with model {model.name}")
|
||||
|
||||
if progressCallback:
|
||||
progressCallback(chunkNum / len(chunks), f"Processing chunk {chunkNum}/{len(chunks)}")
|
||||
|
||||
try:
|
||||
chunkResponse = await aiObjects._callWithModel(model, prompt, chunkData, options)
|
||||
chunkResults.append(chunkResponse)
|
||||
except Exception as chunkError:
|
||||
logger.error(f"Error processing chunk {chunkNum}/{len(chunks)}: {str(chunkError)}")
|
||||
# Continue with other chunks even if one fails
|
||||
continue
|
||||
|
||||
# Merge chunk results
|
||||
if not chunkResults:
|
||||
raise ValueError(f"All chunks failed for content part")
|
||||
|
||||
# Pass original contentPart to preserve typeGroup for all chunks (one-to-many: 1 part -> N chunks)
|
||||
mergedContent = self.mergePartResults(chunkResults, options, [contentPart])
|
||||
|
||||
# Parallel chunk processing with per-chunk failover
|
||||
remainingModels = failoverModelList[attempt:]
|
||||
allChunkResults, allResponses = await self._processChunksParallel(
|
||||
chunks, prompt, options, remainingModels, aiObjects, progressCallback
|
||||
)
|
||||
|
||||
if not allResponses:
|
||||
raise ValueError("All chunks failed for content part")
|
||||
|
||||
mergedContent = self.mergePartResults(allResponses, options, [contentPart])
|
||||
|
||||
# Stitch pass: reconcile cross-chunk artifacts when multiple chunks were processed
|
||||
if len(allResponses) > 1:
|
||||
mergedContent = await self._stitchChunkResults(
|
||||
mergedContent, len(allResponses), prompt, options, aiObjects
|
||||
)
|
||||
|
||||
return AiCallResponse(
|
||||
content=mergedContent,
|
||||
modelName=model.name,
|
||||
provider=model.connectorType,
|
||||
priceCHF=sum(r.priceCHF for r in chunkResults),
|
||||
processingTime=sum(r.processingTime for r in chunkResults),
|
||||
bytesSent=sum(r.bytesSent for r in chunkResults),
|
||||
bytesReceived=sum(r.bytesReceived for r in chunkResults),
|
||||
errorCount=sum(r.errorCount for r in chunkResults)
|
||||
priceCHF=sum(r.priceCHF for r in allResponses),
|
||||
processingTime=sum(r.processingTime for r in allResponses),
|
||||
bytesSent=sum(r.bytesSent for r in allResponses),
|
||||
bytesReceived=sum(r.bytesReceived for r in allResponses),
|
||||
errorCount=sum(r.errorCount for r in allResponses)
|
||||
)
|
||||
else:
|
||||
# Part fits - call AI directly via aiObjects interface
|
||||
logger.info(f"✅ Content part fits within model limits, processing directly")
|
||||
# Part fits - call AI directly
|
||||
logger.info(f"Content part fits within model limits, processing directly")
|
||||
response = await aiObjects._callWithModel(model, prompt, contentPart.data, options)
|
||||
logger.info(f"✅ Content part processed successfully with model: {model.name}")
|
||||
logger.info(f"Content part processed successfully with model: {model.name}")
|
||||
return response
|
||||
|
||||
except Exception as e:
|
||||
lastError = e
|
||||
error_msg = str(e) if str(e) else f"{type(e).__name__}"
|
||||
logger.warning(f"❌ Model {model.name} failed for content part: {error_msg}", exc_info=True)
|
||||
logger.warning(f"Model {model.name} failed for content part: {error_msg}", exc_info=True)
|
||||
|
||||
if attempt < len(failoverModelList) - 1:
|
||||
logger.info(f"🔄 Trying next failover model...")
|
||||
logger.info(f"Trying next failover model...")
|
||||
continue
|
||||
else:
|
||||
logger.error(f"💥 All {len(failoverModelList)} models failed for content part")
|
||||
logger.error(f"All {len(failoverModelList)} models failed for content part")
|
||||
break
|
||||
|
||||
# All models failed
|
||||
return self._createErrorResponse(f"All models failed: {str(lastError)}", 0, 0)
|
||||
|
||||
async def _processChunksParallel(
|
||||
self,
|
||||
chunks: List[Dict[str, Any]],
|
||||
prompt: str,
|
||||
options,
|
||||
failoverModels: list,
|
||||
aiObjects,
|
||||
progressCallback=None,
|
||||
maxRetries: int = 3
|
||||
) -> tuple:
|
||||
"""Process chunks in parallel. On failure, re-chunk only the failed chunks for the next model.
|
||||
|
||||
Returns (orderedResponses, allResponses) where orderedResponses is a list of
successful AiCallResponse objects sorted by original chunk order and allResponses holds the same responses in completion order.
|
||||
"""
|
||||
if not failoverModels:
|
||||
return {}, []
|
||||
|
||||
pendingChunks = [(chunk.get("order", i), chunk) for i, chunk in enumerate(chunks)]
|
||||
completedResults: Dict[float, AiCallResponse] = {}
|
||||
allResponses: List[AiCallResponse] = []
|
||||
retryCount = 0
|
||||
modelIdx = 0
|
||||
currentModel = failoverModels[modelIdx]
|
||||
|
||||
maxConcurrent = 3
|
||||
semaphore = asyncio.Semaphore(maxConcurrent)
|
||||
|
||||
logger.info(f"Starting parallel chunk processing: {len(pendingChunks)} chunks with model {currentModel.name}")
|
||||
|
||||
while pendingChunks and retryCount <= maxRetries and currentModel:
|
||||
modelForRound = currentModel
|
||||
totalInRound = len(pendingChunks)
|
||||
completedInRound = [0]
|
||||
|
||||
async def _processOneChunk(order: float, chunkData: str, model=modelForRound):
|
||||
async with semaphore:
|
||||
result = await aiObjects._callWithModel(model, prompt, chunkData, options)
|
||||
completedInRound[0] += 1
|
||||
if progressCallback:
|
||||
progressCallback(completedInRound[0] / totalInRound, f"Chunk {completedInRound[0]}/{totalInRound} completed")
|
||||
return result
|
||||
|
||||
tasks = {}
|
||||
for order, chunk in pendingChunks:
|
||||
chunkData = chunk.get('data', '')
|
||||
tasks[order] = asyncio.create_task(_processOneChunk(order, chunkData))
|
||||
|
||||
if progressCallback:
|
||||
progressCallback(0.0, f"Processing {len(tasks)} chunks in parallel with {currentModel.name}")
|
||||
|
||||
results = await asyncio.gather(*tasks.values(), return_exceptions=True)
|
||||
|
||||
failedChunks = []
|
||||
for (order, chunk), result in zip(pendingChunks, results):
|
||||
if isinstance(result, Exception):
|
||||
logger.warning(f"Chunk order={order} failed with {currentModel.name}: {result}")
|
||||
failedChunks.append((order, chunk))
|
||||
else:
|
||||
completedResults[order] = result
|
||||
allResponses.append(result)
|
||||
|
||||
logger.info(f"Round {retryCount}: {len(pendingChunks) - len(failedChunks)}/{len(pendingChunks)} chunks succeeded with {currentModel.name}")
|
||||
|
||||
if not failedChunks:
|
||||
break
|
||||
|
||||
retryCount += 1
|
||||
modelIdx += 1
|
||||
if modelIdx >= len(failoverModels):
|
||||
logger.error(f"No more failover models available, {len(failedChunks)} chunks remain failed")
|
||||
break
|
||||
|
||||
currentModel = failoverModels[modelIdx]
|
||||
logger.info(f"Failover: re-chunking {len(failedChunks)} failed chunks for model {currentModel.name}")
|
||||
|
||||
newPending = []
|
||||
for order, failedChunk in failedChunks:
|
||||
reChunked = await self._reChunkForModel(failedChunk, currentModel, prompt, options)
|
||||
for i, subChunk in enumerate(reChunked):
|
||||
subOrder = order + i * 0.001
|
||||
newPending.append((subOrder, subChunk))
|
||||
|
||||
pendingChunks = newPending
|
||||
|
||||
orderedResponses = [completedResults[k] for k in sorted(completedResults.keys())]
|
||||
return orderedResponses, allResponses
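A worked example of the fractional ordering used above (the numbers are illustrative): if the original chunk orders are 0, 1 and 2 and chunk 1 fails and is re-chunked into three sub-chunks, those sub-chunks get orders 1.0, 1.001 and 1.002, so sorting the completed keys restores the original document order:

sorted([0, 2, 1.0, 1.001, 1.002])   # -> [0, 1.0, 1.001, 1.002, 2]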
|
||||
|
||||
async def _reChunkForModel(self, chunk: Dict[str, Any], model, prompt: str, options) -> List[Dict[str, Any]]:
|
||||
"""Re-chunk a single failed chunk according to the new model's context limits."""
|
||||
chunkData = chunk.get('data', '')
|
||||
tempPart = ContentPart(
|
||||
id=f"rechunk_{uuid.uuid4().hex[:8]}",
|
||||
label="re-chunk",
|
||||
typeGroup="structure" if chunkData.strip().startswith(('{', '[')) else "text",
|
||||
mimeType="application/json" if chunkData.strip().startswith(('{', '[')) else "text/plain",
|
||||
data=chunkData
|
||||
)
|
||||
reChunked = await self.chunkContentPartForAi(tempPart, model, options, prompt)
|
||||
if not reChunked:
|
||||
return [chunk]
|
||||
return reChunked
|
||||
|
||||
async def _stitchChunkResults(
|
||||
self,
|
||||
mergedContent: str,
|
||||
chunkCount: int,
|
||||
originalPrompt: str,
|
||||
options,
|
||||
aiObjects
|
||||
) -> str:
|
||||
"""Reconcile cross-chunk artifacts in merged content.
|
||||
|
||||
Only called when chunkCount > 1. Delegates to aiObjects.callWithTextContext
|
||||
which handles model selection, failover, and billing.
|
||||
"""
|
||||
mergedSize = len(mergedContent.encode('utf-8')) if mergedContent else 0
|
||||
|
||||
stitchPrompt = (
|
||||
"The following content was assembled from multiple independently processed "
|
||||
f"chunks ({chunkCount} chunks) of the same document. "
|
||||
"Review and fix ONLY these issues, preserving all content:\n"
|
||||
"1. Cross-references that point to content from other chunks\n"
|
||||
"2. Duplicate text at chunk boundaries (remove duplicates)\n"
|
||||
"3. Sentences or paragraphs split mid-thought (reconnect them)\n"
|
||||
"4. Inconsistent terminology for the same entity\n\n"
|
||||
"Do NOT add, remove, or rephrase content beyond these fixes. "
|
||||
"Return the corrected content in the same format.\n\n"
|
||||
f"Original processing instruction (truncated): {originalPrompt[:500]}"
|
||||
)
|
||||
|
||||
try:
|
||||
logger.info(f"Running stitch pass on {mergedSize} bytes")
|
||||
request = AiCallRequest(
|
||||
prompt=stitchPrompt,
|
||||
context=mergedContent,
|
||||
options=AiCallOptions(operationType=OperationTypeEnum.DATA_ANALYSE)
|
||||
)
|
||||
response = await aiObjects.callWithTextContext(request)
|
||||
if hasattr(response, 'errorCount') and response.errorCount > 0:
|
||||
logger.warning(f"Stitch pass returned error: {response.content[:200] if response.content else 'empty'}")
|
||||
return mergedContent
|
||||
resultSize = len(response.content.encode('utf-8')) if response.content else 0
|
||||
logger.info(f"Stitch pass completed: {mergedSize} -> {resultSize} bytes")
|
||||
return response.content
|
||||
except Exception as e:
|
||||
logger.warning(f"Stitch pass failed (non-fatal), returning unstitched content: {e}")
|
||||
return mergedContent
|
||||
|
||||
def _createErrorResponse(self, errorMsg: str, inputBytes: int, outputBytes: int) -> AiCallResponse:
|
||||
"""Create an error response."""
|
||||
return AiCallResponse(
|
||||
|
|
@ -1521,9 +1691,18 @@ class ExtractionService:
|
|||
progressCallback(0.1 + (partIndex / totalParts) * 0.8, f"Processing {partLabel} ({partType}) - {partIndex+1}/{totalParts}")
|
||||
|
||||
try:
|
||||
# Process the part
|
||||
partProgressCb = None
|
||||
if progressCallback:
|
||||
partStart = 0.1 + (partIndex / totalParts) * 0.8
|
||||
partRange = 0.8 / totalParts
|
||||
def _makePartProgressCb(start, rangeSize):
|
||||
def _cb(chunkProgress, message):
|
||||
progressCallback(start + chunkProgress * rangeSize, message)
|
||||
return _cb
|
||||
partProgressCb = _makePartProgressCb(partStart, partRange)
|
||||
|
||||
partResult = await self.processContentPartWithFallback(
|
||||
contentPart, prompt, options, failoverModelList, aiObjects, None # Don't pass progressCallback to avoid double logging
|
||||
contentPart, prompt, options, failoverModelList, aiObjects, partProgressCb
|
||||
)
|
||||
|
||||
# Write debug files for generation phase (section content generation)
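A worked example of the per-part progress mapping above (illustrative numbers): with totalParts = 4 and partIndex = 2, the part owns the overall band [0.1 + 2/4 * 0.8, 0.1 + 3/4 * 0.8] = [0.5, 0.7], so a chunk progress of 0.5 inside that part is reported as 0.5 + 0.5 * (0.8 / 4) = 0.6 overall.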
|
||||
|
|
|
|||
|
|
@ -375,7 +375,7 @@ USER PROVIDED:
|
|||
- Language: {language or "Not specified"}
|
||||
|
||||
Extract and provide a JSON response with:
|
||||
1. instruction: Formulate directly, WHAT you want to find on the web. Do not include URLs in the instruction. Good example: "What is the company Xyz doing?". Bad example: "Conduct web research on the company Xyz"
|
||||
1. instruction: Formulate a concise search query (MAXIMUM 400 characters) stating WHAT you want to find on the web. Do not include URLs in the instruction. Keep it focused on the core question. Good example: "What is the company Xyz doing?". Bad example: "Conduct web research on the company Xyz and find all information about..."
|
||||
2. urls: List the URLs found in the prompt text, plus any URLs you already know that are relevant to the research
|
||||
3. needsSearch: true if a web search is needed to identify URLs to crawl, false if only the provided URLs should be crawled
|
||||
4. maxNumberPages: Recommended number of URLs to crawl (based on research scope, typical: 2-20)
|
||||
|
|
|
|||
|
|
@ -1,13 +1,18 @@
|
|||
# Copyright (c) 2025 Patrick Motsch
|
||||
# All rights reserved.
|
||||
"""
|
||||
Services Module.
|
||||
Central service registry that provides access to shared services.
|
||||
Service Hub.
|
||||
Consumer-facing aggregation layer for services, DB interfaces, and runtime state.
|
||||
|
||||
IMPORTANT: Import rules
|
||||
- Central modules (like this one) must NOT import feature containers
|
||||
Architecture:
|
||||
- serviceHub delegates service resolution to serviceCenter (DI container)
|
||||
- serviceHub owns DB interface initialization and runtime state
|
||||
- serviceCenter knows nothing about serviceHub (one-way dependency)
|
||||
|
||||
Import rules:
|
||||
- Central modules (like this one) must NOT import feature containers
|
||||
- Feature-specific services are loaded dynamically
|
||||
- Only shared services are loaded directly
|
||||
- Shared services are resolved via serviceCenter
|
||||
"""
|
||||
|
||||
import os
|
||||
|
|
@ -23,7 +28,6 @@ if TYPE_CHECKING:
|
|||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Path to feature containers
|
||||
_FEATURES_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "features")
|
||||
|
||||
|
||||
|
|
@ -54,15 +58,19 @@ class PublicService:
|
|||
])
|
||||
|
||||
|
||||
class Services:
|
||||
class ServiceHub:
|
||||
"""
|
||||
Central Services class providing access to all services.
|
||||
|
||||
Import-Regelwerk:
|
||||
- Shared Services are loaded directly (from modules/services/)
|
||||
- Feature-specific Services are loaded dynamically via filename discovery
|
||||
Consumer-facing aggregation of services, DB interfaces, and runtime state.
|
||||
|
||||
Services are lazy-resolved via serviceCenter on first access.
|
||||
DB interfaces and runtime state are initialized eagerly.
|
||||
Feature services/interfaces are discovered dynamically from features/.
|
||||
"""
|
||||
|
||||
_SERVICE_CENTER_WRAPPING = {
|
||||
"ai": {"functionsOnly": False},
|
||||
}
|
||||
|
||||
def __init__(self, user: User, workflow: "ChatWorkflow" = None, mandateId: Optional[str] = None, featureInstanceId: Optional[str] = None):
|
||||
self.user: User = user
|
||||
self.workflow = workflow
|
||||
|
|
@ -71,123 +79,89 @@ class Services:
|
|||
self.currentUserPrompt: str = ""
|
||||
self.rawUserPrompt: str = ""
|
||||
|
||||
# Initialize central interfaces
|
||||
from modules.serviceCenter.context import ServiceCenterContext
|
||||
self._serviceCenterContext = ServiceCenterContext(
|
||||
user=user,
|
||||
workflow=workflow,
|
||||
mandate_id=mandateId,
|
||||
feature_instance_id=featureInstanceId,
|
||||
)
|
||||
|
||||
from modules.interfaces.interfaceDbApp import getInterface as getAppInterface
|
||||
self.interfaceDbApp = getAppInterface(user, mandateId=mandateId)
|
||||
|
||||
|
||||
from modules.interfaces.interfaceDbManagement import getInterface as getComponentInterface
|
||||
self.interfaceDbComponent = getComponentInterface(user, mandateId=mandateId)
|
||||
|
||||
|
||||
self.rbac = self.interfaceDbApp.rbac if self.interfaceDbApp else None
|
||||
|
||||
# ============================================================
|
||||
# CENTRAL INTERFACE (Chat/Workflow)
|
||||
# ============================================================
|
||||
|
||||
from modules.interfaces.interfaceDbChat import getInterface as getChatInterface
|
||||
self.interfaceDbChat = getChatInterface(user, mandateId=mandateId, featureInstanceId=featureInstanceId)
|
||||
|
||||
# ============================================================
|
||||
# SHARED SERVICES (from modules/services/)
|
||||
# ============================================================
|
||||
from .serviceSharepoint.mainServiceSharepoint import SharepointService
|
||||
self.sharepoint = PublicService(SharepointService(self))
|
||||
|
||||
from .serviceTicket.mainServiceTicket import TicketService
|
||||
self.ticket = PublicService(TicketService(self))
|
||||
|
||||
from .serviceChat.mainServiceChat import ChatService
|
||||
self.chat = PublicService(ChatService(self))
|
||||
|
||||
from .serviceUtils.mainServiceUtils import UtilsService
|
||||
self.utils = PublicService(UtilsService(self))
|
||||
|
||||
from .serviceSecurity.mainServiceSecurity import SecurityService
|
||||
self.security = PublicService(SecurityService(self))
|
||||
|
||||
from .serviceMessaging.mainServiceMessaging import MessagingService
|
||||
self.messaging = PublicService(MessagingService(self))
|
||||
|
||||
from .serviceStreaming.mainServiceStreaming import StreamingService
|
||||
self.streaming = PublicService(StreamingService(self))
|
||||
|
||||
# ============================================================
|
||||
# AI SERVICES (from modules/services/)
|
||||
# ============================================================
|
||||
from .serviceAi.mainServiceAi import AiService
|
||||
self.ai = PublicService(AiService(self), functionsOnly=False)
|
||||
|
||||
from .serviceExtraction.mainServiceExtraction import ExtractionService
|
||||
self.extraction = PublicService(ExtractionService(self))
|
||||
|
||||
from .serviceGeneration.mainServiceGeneration import GenerationService
|
||||
self.generation = PublicService(GenerationService(self))
|
||||
|
||||
from .serviceWeb.mainServiceWeb import WebService
|
||||
self.web = PublicService(WebService(self))
|
||||
|
||||
# ============================================================
|
||||
# FEATURE INTERFACES (dynamically loaded)
|
||||
# ============================================================
|
||||
|
||||
self._loadFeatureInterfaces()
|
||||
self._loadFeatureServices()
|
||||
|
||||
|
||||
def __getattr__(self, name: str):
|
||||
"""Lazy-resolve services via serviceCenter on first access."""
|
||||
if name.startswith('_'):
|
||||
raise AttributeError(name)
|
||||
try:
|
||||
from modules.serviceCenter import getService
|
||||
service = getService(name, self._serviceCenterContext)
|
||||
wrapping = self._SERVICE_CENTER_WRAPPING.get(name, {})
|
||||
functionsOnly = wrapping.get("functionsOnly", True)
|
||||
wrapped = PublicService(service, functionsOnly=functionsOnly)
|
||||
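# Cache the wrapped service on the instance so later lookups bypass __getattr__
|
||||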
setattr(self, name, wrapped)
|
||||
return wrapped
|
||||
except KeyError:
|
||||
raise AttributeError(f"'{type(self).__name__}' has no attribute '{name}'")
|
||||
|
||||
def _loadFeatureInterfaces(self):
|
||||
"""Dynamically load interfaces from feature containers by filename pattern."""
|
||||
# Find all interfaceFeature*.py files
|
||||
pattern = os.path.join(_FEATURES_DIR, "*", "interfaceFeature*.py")
|
||||
for filepath in glob.glob(pattern):
|
||||
try:
|
||||
# Extract feature name and interface name
|
||||
featureDir = os.path.basename(os.path.dirname(filepath))
|
||||
filename = os.path.basename(filepath)[:-3] # Remove .py
|
||||
|
||||
# Build module path: modules.features.<feature>.<filename>
|
||||
|
||||
|
||||
modulePath = f"modules.features.{featureDir}.{filename}"
|
||||
module = importlib.import_module(modulePath)
|
||||
|
||||
# Get interface via getInterface()
|
||||
|
||||
if hasattr(module, "getInterface"):
|
||||
interface = module.getInterface(self.user, mandateId=self.mandateId, featureInstanceId=self.featureInstanceId)
|
||||
# Derive attribute name: interfaceFeatureAiChat -> interfaceDbAiChat
|
||||
attrName = filename.replace("interfaceFeature", "interfaceDb")
|
||||
setattr(self, attrName, interface)
|
||||
logger.debug(f"Loaded interface: {attrName} from {modulePath}")
|
||||
except Exception as e:
|
||||
logger.debug(f"Could not load interface from {filepath}: {e}")
|
||||
|
||||
|
||||
def _loadFeatureServices(self):
|
||||
"""Dynamically load services from feature containers by filename pattern."""
|
||||
# Find all service*/mainService*.py files in feature containers
|
||||
pattern = os.path.join(_FEATURES_DIR, "*", "service*", "mainService*.py")
|
||||
for filepath in glob.glob(pattern):
|
||||
try:
|
||||
# Extract paths
|
||||
serviceDir = os.path.basename(os.path.dirname(filepath))
|
||||
featureDir = os.path.basename(os.path.dirname(os.path.dirname(filepath)))
|
||||
filename = os.path.basename(filepath)[:-3] # Remove .py
|
||||
|
||||
# Build module path: modules.features.<feature>.<serviceDir>.<filename>
|
||||
|
||||
|
||||
modulePath = f"modules.features.{featureDir}.{serviceDir}.{filename}"
|
||||
module = importlib.import_module(modulePath)
|
||||
|
||||
# Find service class (ends with "Service")
|
||||
|
||||
serviceClass = None
|
||||
for attrName in dir(module):
|
||||
if attrName.endswith("Service") and not attrName.startswith("_"):
|
||||
cls = getattr(module, attrName)
|
||||
if isinstance(cls, type):
|
||||
serviceClass = cls
|
||||
break
|
||||
|
||||
|
||||
if serviceClass:
|
||||
# Derive attribute name: serviceAi -> ai, serviceExtraction -> extraction
|
||||
attrName = serviceDir.replace("service", "").lower()
|
||||
if not attrName:
|
||||
attrName = serviceDir.lower()
|
||||
|
||||
# Check if it needs functionsOnly=False (for AI service)
|
||||
|
||||
functionsOnly = attrName != "ai"
|
||||
|
||||
|
||||
serviceInstance = serviceClass(self)
|
||||
setattr(self, attrName, PublicService(serviceInstance, functionsOnly=functionsOnly))
|
||||
logger.debug(f"Loaded service: {attrName} from {modulePath}")
|
||||
|
|
@ -195,6 +169,10 @@ class Services:
|
|||
logger.debug(f"Could not load service from {filepath}: {e}")
|
||||
|
||||
|
||||
def getInterface(user: User, workflow: "ChatWorkflow" = None, mandateId: Optional[str] = None, featureInstanceId: Optional[str] = None) -> Services:
|
||||
"""Get Services instance for the given user, mandate, and feature instance context."""
|
||||
return Services(user, workflow, mandateId=mandateId, featureInstanceId=featureInstanceId)
|
||||
# Backward-compatible alias
|
||||
Services = ServiceHub
|
||||
|
||||
|
||||
def getInterface(user: User, workflow: "ChatWorkflow" = None, mandateId: Optional[str] = None, featureInstanceId: Optional[str] = None) -> ServiceHub:
|
||||
"""Get ServiceHub instance for the given user, mandate, and feature instance context."""
|
||||
return ServiceHub(user, workflow, mandateId=mandateId, featureInstanceId=featureInstanceId)
|
||||
|
|
@ -1,166 +0,0 @@
|
|||
# Copyright (c) 2025 Patrick Motsch
|
||||
# All rights reserved.
|
||||
"""
|
||||
AIChat Feature Container - Main Module.
|
||||
Handles feature initialization and RBAC catalog registration.
|
||||
|
||||
AIChat is the dynamic chat workflow feature that handles:
|
||||
- AI-powered document processing
|
||||
- Dynamic workflow execution
|
||||
- Automation definitions
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, List, Any
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Feature metadata
|
||||
FEATURE_CODE = "chatworkflow"
|
||||
FEATURE_LABEL = {"en": "Chat Workflow", "de": "Chat-Workflow", "fr": "Workflow de Chat"}
|
||||
FEATURE_ICON = "mdi-message-cog"
|
||||
|
||||
# UI Objects for RBAC catalog
|
||||
UI_OBJECTS = [
|
||||
{
|
||||
"objectKey": "ui.feature.aichat.workflows",
|
||||
"label": {"en": "Workflows", "de": "Workflows", "fr": "Workflows"},
|
||||
"meta": {"area": "workflows"}
|
||||
},
|
||||
{
|
||||
"objectKey": "ui.feature.aichat.automations",
|
||||
"label": {"en": "Automations", "de": "Automatisierungen", "fr": "Automatisations"},
|
||||
"meta": {"area": "automations"}
|
||||
},
|
||||
{
|
||||
"objectKey": "ui.feature.aichat.logs",
|
||||
"label": {"en": "Logs", "de": "Logs", "fr": "Journaux"},
|
||||
"meta": {"area": "logs"}
|
||||
},
|
||||
]
|
||||
|
||||
# Resource Objects for RBAC catalog
|
||||
RESOURCE_OBJECTS = [
|
||||
{
|
||||
"objectKey": "resource.feature.aichat.workflow.start",
|
||||
"label": {"en": "Start Workflow", "de": "Workflow starten", "fr": "Démarrer workflow"},
|
||||
"meta": {"endpoint": "/api/chat/playground/start", "method": "POST"}
|
||||
},
|
||||
{
|
||||
"objectKey": "resource.feature.aichat.workflow.stop",
|
||||
"label": {"en": "Stop Workflow", "de": "Workflow stoppen", "fr": "Arrêter workflow"},
|
||||
"meta": {"endpoint": "/api/chat/playground/stop/{workflowId}", "method": "POST"}
|
||||
},
|
||||
{
|
||||
"objectKey": "resource.feature.aichat.workflow.delete",
|
||||
"label": {"en": "Delete Workflow", "de": "Workflow löschen", "fr": "Supprimer workflow"},
|
||||
"meta": {"endpoint": "/api/chat/playground/workflow/{workflowId}", "method": "DELETE"}
|
||||
},
|
||||
]
|
||||
|
||||
# Template roles for this feature
|
||||
TEMPLATE_ROLES = [
|
||||
{
|
||||
"roleLabel": "workflow-admin",
|
||||
"description": {
|
||||
"en": "Workflow Administrator - Full access to workflow configuration and execution",
|
||||
"de": "Workflow-Administrator - Vollzugriff auf Workflow-Konfiguration und Ausführung",
|
||||
"fr": "Administrateur workflow - Accès complet à la configuration et exécution"
|
||||
}
|
||||
},
|
||||
{
|
||||
"roleLabel": "workflow-editor",
|
||||
"description": {
|
||||
"en": "Workflow Editor - Create and modify workflows",
|
||||
"de": "Workflow-Editor - Workflows erstellen und bearbeiten",
|
||||
"fr": "Éditeur workflow - Créer et modifier les workflows"
|
||||
}
|
||||
},
|
||||
{
|
||||
"roleLabel": "workflow-viewer",
|
||||
"description": {
|
||||
"en": "Workflow Viewer - View workflows and execution results",
|
||||
"de": "Workflow-Betrachter - Workflows und Ausführungsergebnisse einsehen",
|
||||
"fr": "Visualiseur workflow - Consulter les workflows et résultats"
|
||||
}
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
def getFeatureDefinition() -> Dict[str, Any]:
|
||||
"""Return the feature definition for registration."""
|
||||
return {
|
||||
"code": FEATURE_CODE,
|
||||
"label": FEATURE_LABEL,
|
||||
"icon": FEATURE_ICON
|
||||
}
|
||||
|
||||
|
||||
def getUiObjects() -> List[Dict[str, Any]]:
|
||||
"""Return UI objects for RBAC catalog registration."""
|
||||
return UI_OBJECTS
|
||||
|
||||
|
||||
def getResourceObjects() -> List[Dict[str, Any]]:
|
||||
"""Return resource objects for RBAC catalog registration."""
|
||||
return RESOURCE_OBJECTS
|
||||
|
||||
|
||||
def getTemplateRoles() -> List[Dict[str, Any]]:
|
||||
"""Return template roles for this feature."""
|
||||
return TEMPLATE_ROLES
|
||||
|
||||
|
||||
def registerFeature(catalogService) -> bool:
|
||||
"""
|
||||
Register this feature's RBAC objects in the catalog.
|
||||
|
||||
Args:
|
||||
catalogService: The RBAC catalog service instance
|
||||
|
||||
Returns:
|
||||
True if registration was successful
|
||||
"""
|
||||
try:
|
||||
# Register UI objects
|
||||
for uiObj in UI_OBJECTS:
|
||||
catalogService.registerUiObject(
|
||||
featureCode=FEATURE_CODE,
|
||||
objectKey=uiObj["objectKey"],
|
||||
label=uiObj["label"],
|
||||
meta=uiObj.get("meta")
|
||||
)
|
||||
|
||||
# Register Resource objects
|
||||
for resObj in RESOURCE_OBJECTS:
|
||||
catalogService.registerResourceObject(
|
||||
featureCode=FEATURE_CODE,
|
||||
objectKey=resObj["objectKey"],
|
||||
label=resObj["label"],
|
||||
meta=resObj.get("meta")
|
||||
)
|
||||
|
||||
logger.info(f"Feature '{FEATURE_CODE}' registered {len(UI_OBJECTS)} UI objects and {len(RESOURCE_OBJECTS)} resource objects")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to register feature '{FEATURE_CODE}': {e}")
|
||||
return False
|
||||
|
||||
|
||||
async def onStart(eventUser) -> None:
|
||||
"""
|
||||
Called when the feature container starts.
|
||||
Initializes AI connectors for model registry.
|
||||
"""
|
||||
try:
|
||||
from modules.aicore.aicoreModelRegistry import modelRegistry
|
||||
modelRegistry.ensureConnectorsRegistered()
|
||||
logger.info(f"Feature '{FEATURE_CODE}' started - AI connectors initialized")
|
||||
except Exception as e:
|
||||
logger.error(f"Feature '{FEATURE_CODE}' failed to initialize AI connectors: {e}")
|
||||
|
||||
|
||||
async def onStop(eventUser) -> None:
|
||||
"""Called when the feature container stops."""
|
||||
logger.info(f"Feature '{FEATURE_CODE}' stopped")
|
||||
File diff suppressed because it is too large
|
|
@ -1,513 +0,0 @@
|
|||
================================================================================
|
||||
JSON MERGE OPERATION #1
|
||||
================================================================================
|
||||
Timestamp: 2026-01-06T22:24:33.405726
|
||||
|
||||
INPUT:
|
||||
Accumulated length: 40250 chars
|
||||
New Fragment length: 2471 chars
|
||||
Accumulated: 373 lines (showing first 5 and last 5)
|
||||
{
|
||||
"elements": [
|
||||
{
|
||||
"type": "table",
|
||||
"content": {
|
||||
... (363 lines omitted) ...
|
||||
["04.12.25", "05.12.25", "TICKETCORNER CH, RUEMLANG", "CH", "285.70", "", "**** **** **** 1234"],
|
||||
["05.12.25", "05.12.25", "NOII.CH DATING, WINTERTHUR", "CH", "74.10", "", "**** **** **** 1234"],
|
||||
["06.12.25", "08.12.25", "TEMU.COM, BASEL", "CH", "72.50", "", "**** **** **** 1234"],
|
||||
["06.12.25", "08.12.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "326.85", "", "**** **** **** 1234"],
|
||||
["06.12.25", "08.12.25", "Decathlon, Hin
|
||||
New Fragment: 33 lines (showing first 5 and last 5)
|
||||
```json
|
||||
["06.12.25", "08.12.25", "Decathlon, Hinwil", "CH", "150.00", "", "**** **** **** 1234"],
|
||||
["07.12.25", "09.12.25", "Migros-Genossenschafts-Bund, Zürich", "CH", "200.00", "", "**** **** **** 1234"],
|
||||
["08.12.25", "10.12.25", "Zürich HB, Zürich", "CH", "45.00", "", "**** **** **** 1234"],
|
||||
["09.12.25", "11.12.25", "Amazon Marketplace, amazon.de", "DE", "120.00", "", "**** **** **** 1234"],
|
||||
... (23 lines omitted) ...
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
Normalized Accumulated (40250 chars)
|
||||
(showing first 5 and last 5 of 373 lines)
|
||||
{
|
||||
"elements": [
|
||||
{
|
||||
"type": "table",
|
||||
"content": {
|
||||
... (363 lines omitted) ...
|
||||
["04.12.25", "05.12.25", "TICKETCORNER CH, RUEMLANG", "CH", "285.70", "", "**** **** **** 1234"],
|
||||
["05.12.25", "05.12.25", "NOII.CH DATING, WINTERTHUR", "CH", "74.10", "", "**** **** **** 1234"],
|
||||
["06.12.25", "08.12.25", "TEMU.COM, BASEL", "CH", "72.50", "", "**** **** **** 1234"],
|
||||
["06.12.25", "08.12.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "326.85", "", "**** **** **** 1234"],
|
||||
["06.12.25", "08.12.25", "Decathlon, Hin
|
||||
|
||||
Normalized New Fragment (2459 chars)
|
||||
(showing first 5 and last 5 of 31 lines)
|
||||
["06.12.25", "08.12.25", "Decathlon, Hinwil", "CH", "150.00", "", "**** **** **** 1234"],
|
||||
["07.12.25", "09.12.25", "Migros-Genossenschafts-Bund, Zürich", "CH", "200.00", "", "**** **** **** 1234"],
|
||||
["08.12.25", "10.12.25", "Zürich HB, Zürich", "CH", "45.00", "", "**** **** **** 1234"],
|
||||
["09.12.25", "11.12.25", "Amazon Marketplace, amazon.de", "DE", "120.00", "", "**** **** **** 1234"],
|
||||
["10.12.25", "12.12.25", "IKEA, Dietlikon", "CH", "350.00", "", "**** **** **** 1234"],
|
||||
... (21 lines omitted) ...
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
STEP: PHASE 1
|
||||
Description: Finding overlap between JSON strings
|
||||
⏳ In progress...
|
||||
|
||||
Overlap Detection (string (exact)):
|
||||
Overlap length: 40
|
||||
✅ Found overlap of 40 chars
|
||||
Accumulated suffix (COMPLETE, 40 chars):
|
||||
============================================================================
|
||||
["06.12.25", "08.12.25", "Decathlon, Hin
|
||||
============================================================================
|
||||
Fragment prefix (40 chars, 1 lines)
|
||||
["06.12.25", "08.12.25", "Decathlon, Hin
|
||||
|
||||
Overlap found (40 chars):
|
||||
Accumulated suffix: ["06.12.25", "08.12.25", "Decathlon, Hin
|
||||
Fragment prefix: ["06.12.25", "08.12.25", "Decathlon, Hin
|
||||
STEP: PHASE 2
|
||||
Description: Merging strings (overlap: 40 chars)
|
||||
⏳ In progress...
|
||||
|
||||
|
||||
Merged String (42669 chars)
|
||||
(showing first 5 and last 5 of 403 lines)
|
||||
{
|
||||
"elements": [
|
||||
{
|
||||
"type": "table",
|
||||
"content": {
|
||||
... (393 lines omitted) ...
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
STEP: PHASE 3
|
||||
Description: Returning merged string (may be unclosed)
|
||||
⏳ In progress...
|
||||
|
||||
|
||||
Returning merged string (preserving incomplete element at end for next iteration)
|
||||
|
||||
================================================================================
|
||||
MERGE RESULT: ✅ SUCCESS
|
||||
================================================================================
|
||||
Final result length: 42669 chars
|
||||
Final result (COMPLETE):
|
||||
================================================================================
|
||||
{
|
||||
"elements": [
|
||||
{
|
||||
"type": "table",
|
||||
"content": {
|
||||
"headers": [
|
||||
"Date",
|
||||
"Valuta",
|
||||
"Details",
|
||||
"Currency",
|
||||
"Amount",
|
||||
"Amount in CHF",
|
||||
"Maskierte Kreditkarte"
|
||||
],
|
||||
"rows": [
|
||||
["12.09.25", "15.09.25", "Coop-1911 Ruti, Ruti ZH", "CH", "102.05", "", "**** **** **** 1234"],
|
||||
["13.09.25", "15.09.25", "food & drive GmbH, Durnten", "CH", "26.20", "", "**** **** **** 1234"],
|
||||
["13.09.25", "15.09.25", "food & drive GmbH, Durnten", "CH", "4.50", "", "**** **** **** 1234"],
|
||||
["13.09.25", "15.09.25", "Gartencenter Meier, Durnten", "CH", "88.40", "", "**** **** **** 1234"],
|
||||
["13.09.25", "15.09.25", "Google PixVerse AI Vi, 650-2530000", "US", "5.20", "", "**** **** **** 1234"],
|
||||
["14.09.25", "15.09.25", "THE ARTISAN ZUERICH, ZUERICH", "CH", "15.00", "", "**** **** **** 1234"],
|
||||
["14.09.25", "15.09.25", "THE ARTISAN ZUERICH, ZUERICH", "CH", "15.00", "", "**** **** **** 1234"],
|
||||
["14.09.25", "15.09.25", "THE ARTISAN ZUERICH, ZUERICH", "CH", "18.00", "", "**** **** **** 1234"],
|
||||
["14.09.25", "15.09.25", "KONDITOREI VOLAND WALD, WALD ZH", "CH", "16.50", "", "**** **** **** 1234"],
|
||||
["14.09.25", "15.09.25", "GITHUB, INC., GITHUB.COM", "US", "USD 0.02", "0.00", "**** **** **** 1234"],
|
||||
["15.09.25", "16.09.25", "Coop-1252 Wald, Wald ZH", "CH", "50.80", "", "**** **** **** 1234"],
|
||||
["15.09.25", "16.09.25", "CLAUDE.AI SUBSCRIPTION, ANTHROPIC.COM", "US", "USD 108.10", "88.60", "**** **** **** 1234"],
|
||||
["16.09.25", "17.09.25", "Shell Waldhof, Wald ZH", "CH", "113.35", "", "**** **** **** 1234"],
|
||||
["16.09.25", "17.09.25", "Shell Waldhof, Wald ZH", "CH", "3.60", "", "**** **** **** 1234"],
|
||||
["18.09.25", "19.09.25", "Coop-4991 Fallanden, Fallanden", "CH", "116.00", "", "**** **** **** 1234"],
|
||||
["18.09.25", "19.09.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "5.95", "", "**** **** **** 1234"],
|
||||
["18.09.25", "19.09.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "7.00", "", "**** **** **** 1234"],
|
||||
["19.09.25", "22.09.25", "Migros M Dubendorf Stettb, Dubendorf", "CH", "32.10", "", "**** **** **** 1234"],
|
||||
["19.09.25", "22.09.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "14.80", "", "**** **** **** 1234"],
|
||||
["19.09.25", "22.09.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "370.65", "", "**** **** **** 1234"],
|
||||
["19.09.25", "22.09.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "11.50", "", "**** **** **** 1234"],
|
||||
["20.09.25", "22.09.25", "Google PixVerse AI Vi, 650-2530000", "US", "5.20", "", "**** **** **** 1234"],
|
||||
["21.09.25", "22.09.25", "Kreuzwirt, Weissensee", "AT", "EUR 278.00", "266.50", "**** **** **** 1234"],
|
||||
["23.09.25", "24.09.25", "FILIALE, WALD ZH", "CH", "EUR 500.00", "492.15", "**** **** **** 1234"],
|
||||
["24.09.25", "25.09.25", "P2 Parkhaus Ein- & Ausfah, Zurich", "CH", "5.00", "", "**** **** **** 1234"],
|
||||
["24.09.25", "25.09.25", "A.I.R. Bakery, Zurich", "CH", "18.60", "", "**** **** **** 1234"],
|
||||
["24.09.25", "25.09.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "23.35", "", "**** **** **** 1234"],
|
||||
["25.09.25", "26.09.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "203.20", "", "**** **** **** 1234"],
|
||||
["25.09.25", "26.09.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "44.10", "", "**** **** **** 1234"],
|
||||
["26.09.25", "29.09.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "95.25", "", "**** **** **** 1234"],
|
||||
["26.09.25", "29.09.25", "Puls Apotheke & Drogerie, Hinwil", "CH", "140.60", "", "**** **** **** 1234"],
|
||||
["26.09.25", "29.09.25", "FILIALE, WALD ZH", "CH", "CHF 280.00", "287.00", "**** **** **** 1234"],
|
||||
["27.09.25", "29.09.25", "NYX*LullySA, Lully", "CH", "1.00", "", "**** **** **** 1234"],
|
||||
["27.09.25", "29.09.25", "Kisoque de Lully, Lully", "CH", "5.70", "", "**** **** **** 1234"],
|
||||
["27.09.25", "29.09.25", "TOTAL MKT FR, NANTERRE", "FR", "EUR 79.95", "76.90", "**** **** **** 1234"],
|
||||
["27.09.25", "29.09.25", "AREA NFC 4261525, 69BRON CEDEX", "FR", "EUR 33.50", "32.20", "**** **** **** 1234"],
|
||||
["27.09.25", "29.09.25", "HOLIDAY APARTMENTS, PORT SAPLAYA", "ES", "EUR 1'118.15", "1'075.45", "**** **** **** 1234"],
|
||||
["27.09.25", "29.09.25", "LE BISTROT DEL M, MEZE", "FR", "EUR 210.20", "202.15", "**** **** **** 1234"],
|
||||
["27.09.25", "29.09.25", "Google PixVerse AI Vi, 650-2530000", "US", "5.20", "", "**** **** **** 1234"],
|
||||
["28.09.25", "29.09.25", "SELVA HOSTELERIA, MACANET DE LA", "ES", "EUR 2.40", "2.30", "**** **** **** 1234"],
|
||||
["28.09.25", "29.09.25", "E.S. LA SELVA, COMAJULIANA", "ES", "EUR 90.09", "86.65", "**** **** **** 1234"],
|
||||
["28.09.25", "29.09.25", "E.S. LA SELVA, COMAJULIANA", "ES", "EUR 4.70", "4.50", "**** **** **** 1234"],
|
||||
["28.09.25", "29.09.25", "E.S. LA SELVA, COMAJULIANA", "ES", "EUR 8.40", "8.10", "**** **** **** 1234"],
|
||||
["28.09.25", "29.09.25", "AUTOROUTES ASF, VEDENE CEDEX", "FR", "EUR 15.60", "15.00", "**** **** **** 1234"],
|
||||
["27.09.25", "29.09.25", "AUTOROUTES ASF, VEDENE CEDEX", "FR", "EUR 24.40", "23.45", "**** **** **** 1234"],
|
||||
["29.09.25", "30.09.25", "OROMARKET SUPERMERCADOS, OROPESA", "ES", "EUR 17.32", "16.65", "**** **** **** 1234"],
|
||||
["29.09.25", "30.09.25", "QUESADA CENTER SUPERMERCA, OROPESA", "ES", "EUR 40.40", "38.85", "**** **** **** 1234"],
|
||||
["29.09.25", "30.09.25", "QUESADA CENTER SUPERMERCA, OROPESA", "ES", "EUR 22.55", "21.70", "**** **** **** 1234"],
|
||||
["29.09.25", "30.09.25", "ALDI OROPESA, OROPESA", "ES", "EUR 129.39", "124.40", "**** **** **** 1234"],
|
||||
["30.09.25", "01.10.25", "QUESADA CENTER, OROPESA DEL M", "ES", "EUR 84.05", "80.95", "**** **** **** 1234"],
|
||||
["30.09.25", "01.10.25", "PASSION CREPES, OROPESA", "ES", "EUR 10.30", "9.90", "**** **** **** 1234"],
|
||||
["30.09.25", "01.10.25", "MERCADONA MARINA DOR, ORPESA DEL MA", "ES", "EUR 17.53", "16.90", "**** **** **** 1234"],
|
||||
["30.09.25", "01.10.25", "Restaurante DRAGON, OROPESA", "ES", "EUR 75.00", "72.25", "**** **** **** 1234"],
|
||||
["30.09.25", "01.10.25", "OPENAI *CHATGPT SUBSCR, OPENAI.COM", "US", "USD 216.20", "177.55", "**** **** **** 1234"],
|
||||
["01.10.25", "02.10.25", "GOOGLE *ADS5192965135, cc§google.com", "IE", "29.60", "", "**** **** **** 1234"],
|
||||
["01.10.25", "02.10.25", "RTE PUERTA DEL SOL, OROPESA DEL M", "ES", "EUR 169.20", "163.10", "**** **** **** 1234"],
|
||||
["01.10.25", "02.10.25", "TREN TURISTICO OROPESA, OROPESA DEL M", "ES", "EUR 15.00", "14.45", "**** **** **** 1234"],
|
||||
["01.10.25", "02.10.25", "LANGDOCK GMBH, BERLIN", "DE", "EUR 25.00", "24.10", "**** **** **** 1234"],
|
||||
["01.10.25", "02.10.25", "WWW.PERPLEXITY.AI, WWW.PERPLEXIT", "US", "USD 10.81", "8.90", "**** **** **** 1234"],
|
||||
["02.10.25", "06.10.25", "GOOGLE *YouTubePremium, g.co/helppay#", "GB", "33.90", "", "**** **** **** 1234"],
|
||||
["02.10.25", "06.10.25", "WILLY LA CONCHA, OROPESA DEL M", "ES", "EUR 98.93", "95.40", "**** **** **** 1234"],
|
||||
["03.10.25", "06.10.25", "Netflix.com, Los Gatos", "NL", "20.90", "", "**** **** **** 1234"],
|
||||
["03.10.25", "06.10.25", "COALIMENT LA CONCHA, OROPESA DEL M", "ES", "EUR 11.74", "11.30", "**** **** **** 1234"],
|
||||
["03.10.25", "06.10.25", "DONA RESU, OROPESA", "ES", "EUR 7.30", "7.05", "**** **** **** 1234"],
|
||||
["04.10.25", "06.10.25", "MERCADONA MARINA DOR, ORPESA DEL MA", "ES", "EUR 89.50", "86.30", "**** **** **** 1234"],
|
||||
["04.10.25", "06.10.25", "QUESADA CENTER SUPERMERCA, OROPESA", "ES", "EUR 8.45", "8.15", "**** **** **** 1234"],
|
||||
["04.10.25", "06.10.25", "HELADERIA LAS DELICIAS, OROPESA DEL M", "ES", "EUR 10.80", "10.40", "**** **** **** 1234"],
|
||||
["04.10.25", "06.10.25", "REST. BISTROT, OROPESA DEL M", "ES", "EUR 117.90", "113.70", "**** **** **** 1234"],
|
||||
["04.10.25", "06.10.25", "Google PixVerse AI Vi, 650-2530000", "US", "5.20", "", "**** **** **** 1234"],
|
||||
["04.10.25", "06.10.25", "Google Duolingo Langu, 650-2530000", "US", "10.00", "", "**** **** **** 1234"],
|
||||
["05.10.25", "06.10.25", "HABANA, OROPESA", "ES", "EUR 3.00", "2.90", "**** **** **** 1234"],
|
||||
["05.10.25", "06.10.25", "HABANA, OROPESA", "ES", "EUR 9.00", "8.70", "**** **** **** 1234"],
|
||||
["05.10.25", "06.10.25", "RESTAURANTE, ORPESA", "ES", "EUR 87.75", "84.60", "**** **** **** 1234"],
|
||||
["05.10.25", "06.10.25", "HABANA, OROPESA", "ES", "EUR 15.50", "14.95", "**** **** **** 1234"],
|
||||
["06.10.25", "07.10.25", "HABANA, OROPESA", "ES", "EUR 25.00", "24.05", "**** **** **** 1234"],
|
||||
["06.10.25", "07.10.25", "QUESADA CENTER SUPERMERCA, OROPESA", "ES", "EUR 3.95", "3.80", "**** **** **** 1234"],
|
||||
["06.10.25", "07.10.25", "QUESADA CENTER SUPERMERCA, OROPESA", "ES", "EUR 47.75", "45.95", "**** **** **** 1234"],
|
||||
["07.10.25", "08.10.25", "MAGIC SPORT HALL OLYMPICS, OROPESA DEL M", "ES", "EUR 183.75", "176.70", "**** **** **** 1234"],
|
||||
["07.10.25", "08.10.25", "BKG*HOTEL AT BOOKING.C, (888)850-3958", "NL", "EUR 172.55", "165.90", "**** **** **** 1234"],
|
||||
["07.10.25", "08.10.25", "Wondershare, Hong Kong", "HK", "25.95", "", "**** **** **** 1234"],
|
||||
["07.10.25", "08.10.25", "MERCADONA MARINA DOR, ORPESA DEL MA", "ES", "EUR 99.13", "95.30", "**** **** **** 1234"],
|
||||
["07.10.25", "08.10.25", "RECEP HOTEL MAGIC SPORTS, OROPESA DEL M", "ES", "EUR 10.00", "9.60", "**** **** **** 1234"],
|
||||
["07.10.25", "08.10.25", "Google One, 650-2530000", "US", "10.00", "", "**** **** **** 1234"],
|
||||
["08.10.25", "09.10.25", "E.S.LA CLARIANA, MADRID", "ES", "EUR 98.07", "94.00", "**** **** **** 1234"],
|
||||
["08.10.25", "09.10.25", "AUTOROUTES ASF, VEDENE CEDEX", "FR", "EUR 44.20", "42.35", "**** **** **** 1234"],
|
||||
["08.10.25", "09.10.25", "A.R.E.A., 69671", "FR", "EUR 11.20", "10.75", "**** **** **** 1234"],
|
||||
["09.10.25", "10.10.25", "SOCAR station-service, Bursins", "CH", "113.10", "", "**** **** **** 1234"],
|
||||
["09.10.25", "10.10.25", "SOCAR station-service, Bursins", "CH", "6.80", "", "**** **** **** 1234"],
|
||||
["09.10.25", "10.10.25", "A.R.E.A., 69671", "FR", "EUR 15.00", "14.40", "**** **** **** 1234"],
|
||||
["08.10.25", "10.10.25", "DOMAINE DE ROZAN, LA TRONCHE", "FR", "EUR 110.00", "105.45", "**** **** **** 1234"],
|
||||
["09.10.25", "10.10.25", "DOMAINE DE ROZAN, LA TRONCHE", "FR", "EUR 40.00", "38.35", "**** **** **** 1234"],
|
||||
["10.10.25", "13.10.25", "Coop-1252 Wald, Wald ZH", "CH", "164.85", "", "**** **** **** 1234"],
|
||||
["10.10.25", "13.10.25", "CURSOR, AI POWERED IDE, CURSOR.COM", "US", "USD 20.00", "16.60", "**** **** **** 1234"],
|
||||
["11.10.25", "13.10.25", "Google PixVerse AI Vi, 650-2530000", "US", "5.20", "", "**** **** **** 1234"],
|
||||
["12.10.25", "13.10.25", "Cafe Konditorei Voland, Laupen ZH", "CH", "37.70", "", "**** **** **** 1234"],
|
||||
["12.10.25", "13.10.25", "KONDITOREI VOLAND LAUP, LAUPEN ZH", "CH", "17.35", "", "**** **** **** 1234"],
|
||||
["12.10.25", "13.10.25", "KONDITOREI VOLAND LAUP, LAUPEN ZH", "CH", "5.40", "", "**** **** **** 1234"],
|
||||
["12.09.25", "15.09.25", "SBB Bahnhof Ruti ZH, Ruti ZH", "CH", "54.00", "", "**** **** **** 1234"],
|
||||
["12.09.25", "15.09.25", "Rest Volkshaus, Zurich", "CH", "18.00", "", "**** **** **** 1234"],
|
||||
["12.09.25", "15.09.25", "Sora Sushi - HB Zurich, Zurich", "CH", "74.00", "", "**** **** **** 1234"],
|
||||
["12.09.25", "15.09.25", "176.32 avec, Ruti ZH", "CH", "2.45", "", "**** **** **** 1234"],
|
||||
["12.09.25", "15.09.25", "Baradox AG, Zurich", "CH", "15.00", "", "**** **** **** 1234"],
|
||||
["12.09.25", "15.09.25", "Volkshausstiftung Zurich, Zurich", "CH", "3.00", "", "**** **** **** 1234"],
|
||||
["13.09.25", "15.09.25", "SBB Bahnhof Ruti ZH, Ruti ZH", "CH", "9.20", "", "**** **** **** 1234"],
|
||||
["13.09.25", "15.09.25", "SBB Bahnhof Wald, Wald ZH", "CH", "27.00", "", "**** **** **** 1234"],
|
||||
["13.09.25", "15.09.25", "EXIL GMBH, ZUERICH", "CH", "14.00", "", "**** **** **** 1234"],
|
||||
["13.09.25", "15.09.25", "EXIL GMBH, ZUERICH", "CH", "14.00", "", "**** **** **** 1234"],
|
||||
["13.09.25", "15.09.25", "URBAN FOOD CLUTURE GMB, ZURICH", "CH", "135.00", "", "**** **** **** 1234"],
|
||||
["14.09.25", "15.09.25", "Google One, 650-2530000", "US", "100.00", "", "**** **** **** 1234"],
|
||||
["15.09.25", "16.09.25", "Ex Libris AG, Dietikon", "CH", "13.00", "", "**** **** **** 1234"],
|
||||
["15.09.25", "16.09.25", "Coop-1252 Wald, Wald ZH", "CH", "51.45", "", "**** **** **** 1234"],
|
||||
["16.09.25", "17.09.25", "Shell Waldhof, Wald ZH", "CH", "5.80", "", "**** **** **** 1234"],
|
||||
["19.09.25", "22.09.25", "LS Pirates AG, Hinwil", "CH", "16.05", "", "**** **** **** 1234"],
|
||||
["20.09.25", "22.09.25", "LS Pirates AG, Hinwil", "CH", "14.60", "", "**** **** **** 1234"],
|
||||
["20.09.25", "22.09.25", "LS Pirates AG, Hinwil", "CH", "13.55", "", "**** **** **** 1234"],
|
||||
["20.09.25", "22.09.25", "LS Pirates AG, Hinwil", "CH", "13.90", "", "**** **** **** 1234"],
|
||||
["20.09.25", "22.09.25", "Coop-1252 Wald, Wald ZH", "CH", "60.75", "", "**** **** **** 1234"],
|
||||
["20.09.25", "22.09.25", "MORE BAR GMBH, BUBIKON", "CH", "70.00", "", "**** **** **** 1234"],
|
||||
["21.09.25", "22.09.25", "LS Pirates AG, Hinwil", "CH", "6.40", "", "**** **** **** 1234"],
|
||||
["21.09.25", "22.09.25", "LS Pirates AG, Hinwil", "CH", "4.20", "", "**** **** **** 1234"],
|
||||
["21.09.25", "22.09.25", "LS Pirates AG, Hinwil", "CH", "13.45", "", "**** **** **** 1234"],
|
||||
["22.09.25", "23.09.25", "Migros M Wald, Wald ZH", "CH", "16.80", "", "**** **** **** 1234"],
|
||||
["22.09.25", "23.09.25", "BLEICHI + HOTEL, WALD", "CH", "43.00", "", "**** **** **** 1234"],
|
||||
["23.09.25", "24.09.25", "Coop-1252 Wald, Wald ZH", "CH", "155.75", "", "**** **** **** 1234"],
|
||||
["24.09.25", "25.09.25", "BKG*HOTEL AT BOOKING.C, (888)850-3958", "NL", "EUR 177.35", "170.35", "**** **** **** 1234"],
|
||||
["27.09.25", "29.09.25", "APODRO APOTHEKE WALD, WALD ZH", "CH", "21.50", "", "**** **** **** 1234"],
|
||||
["28.09.25", "29.09.25", "SELVA HOSTELERIA, MACANET DE LA", "ES", "15.75", "", "**** **** **** 1234"],
|
||||
["28.09.25", "29.09.25", "AREAS LA SELVA, BARCELONA", "ES", "EUR 19.11", "18.40", "**** **** **** 1234"],
|
||||
["02.10.25", "06.10.25", "GOOGLE *YouTube Member, g.co/helppay#", "GB", "15.00", "", "**** **** **** 1234"],
|
||||
["01.10.25", "06.10.25", "Eventfrog.c 737909203525, Olten", "CH", "114.95", "", "**** **** **** 1234"],
|
||||
["06.10.25", "07.10.25", "digitec Galaxus (Online), Zurich", "CH", "23.80", "", "**** **** **** 1234"],
|
||||
["08.10.25", "09.10.25", "E.S.LA CLARIANA, MADRID", "ES", "EUR 29.58", "28.35", "**** **** **** 1234"],
|
||||
["10.10.25", "13.10.25", "B2BAND.COM, BRATISLAVA", "SK", "72.45", "", "**** **** **** 1234"],
|
||||
["10.10.25", "13.10.25", "Ticketcorner*89987227, 410900800800", "CH", "199.80", "", "**** **** **** 1234"],
|
||||
["10.10.25", "13.10.25", "SP NORAYA, RUMISBERG", "CH", "79.90", "", "**** **** **** 1234"],
|
||||
["11.10.25", "13.10.25", "B2BAND.COM, BRATISLAVA", "SK", "139.95", "", "**** **** **** 1234"],
|
||||
["11.10.25", "13.10.25", "TEMU.COM, BASEL", "CH", "81.20", "", "**** **** **** 1234"],
|
||||
["11.10.25", "13.10.25", "NOII.CH DATING, WINTERTHUR", "CH", "74.10", "", "**** **** **** 1234"],
|
||||
["12.10.25", "13.10.25", "Rest Volkshaus, Zurich", "CH", "9.00", "", "**** **** **** 1234"],
|
||||
["12.10.25", "13.10.25", "Shell Heuberg, Forch", "CH", "100.10", "", "**** **** **** 1234"],
|
||||
["12.10.25", "13.10.25", "Parkhaus Helvetiaplatz, Zurich", "CH", "8.00", "", "**** **** **** 1234"],
|
||||
["14.10.25", "15.10.25", "P2 Parkhaus Ein- & Ausfah, Zurich CH", "CHF", "5.00", "", "**** **** **** 1234"],
|
||||
["14.10.25", "15.10.25", "Migros Zurich Airport, Zurich CH", "CHF", "16.35", "", "**** **** **** 1234"],
|
||||
["14.10.25", "15.10.25", "GITHUB, INC., GITHUB.COM US", "USD", "0.30", "0.25", "**** **** **** 1234"],
|
||||
["15.10.25", "16.10.25", "Dosenbach Schuhe & Sport, Hinwil CH", "CHF", "50.00", "", "**** **** **** 1234"],
|
||||
["15.10.25", "16.10.25", "Coop-4253 Hinwil Wasseri, Hinwil CH", "CHF", "257.20", "", "**** **** **** 1234"],
|
||||
["15.10.25", "16.10.25", "Landi, Wald CH", "CHF", "67.85", "", "**** **** **** 1234"],
|
||||
["15.10.25", "16.10.25", "Puls Apotheke & Drogerie, Hinwil CH", "CHF", "9.20", "", "**** **** **** 1234"],
|
||||
["15.10.25", "16.10.25", "CLAUDE.AI SUBSCRIPTION, ANTHROPIC.COM US", "USD", "108.10", "89.50", "**** **** **** 1234"],
|
||||
["17.10.25", "20.10.25", "SV (Schweiz) AG, 27960, Zurich ETH-Ze CH", "CHF", "7.80", "", "**** **** **** 1234"],
|
||||
["17.10.25", "20.10.25", "SV (Schweiz) AG, 27960, Zurich ETH-Ze CH", "CHF", "14.50", "", "**** **** **** 1234"],
|
||||
["17.10.25", "20.10.25", "SV (Schweiz) AG, 27960, Zurich ETH-Ze CH", "CHF", "4.20", "", "**** **** **** 1234"],
|
||||
["17.10.25", "20.10.25", "Universitatsspital Zurich, Zurich CH", "CHF", "30.00", "", "**** **** **** 1234"],
|
||||
["18.10.25", "20.10.25", "HubSpot Germany GmbH, Berlin DE", "EUR", "267.55", "256.05", "**** **** **** 1234"],
|
||||
["18.10.25", "20.10.25", "Google PixVerse AI Vi, 650-2530000 US", "USD", "5.20", "", "**** **** **** 1234"],
|
||||
["19.10.25", "20.10.25", "Shell Waldhof, Wald ZH CH", "CHF", "7.20", "", "**** **** **** 1234"],
|
||||
["19.10.25", "20.10.25", "KONDITOREI VOLAND WALD, WALD ZH CH", "CHF", "20.30", "", "**** **** **** 1234"],
|
||||
["19.10.25", "20.10.25", "KONDITOREI VOLAND WALD, WALD ZH CH", "CHF", "11.10", "", "**** **** **** 1234"],
|
||||
["18.10.25", "20.10.25", "ANTHROPIC, ANTHROPIC.COM US", "USD", "108.10", "88.75", "**** **** **** 1234"],
|
||||
["20.10.25", "21.10.25", "APCOA, Dubendorf CH", "CHF", "20.00", "", "**** **** **** 1234"],
|
||||
["20.10.25", "21.10.25", "STWEG Ambassador House, Glattbrugg CH", "CHF", "5.00", "", "**** **** **** 1234"],
|
||||
["23.10.25", "24.10.25", "Coop-1252 Wald, Wald ZH CH", "CHF", "199.85", "", "**** **** **** 1234"],
|
||||
["24.10.25", "24.10.25", "Ticketcorner*90004263, 410900800800 CH", "CHF", "159.75", "", "**** **** **** 1234"],
|
||||
["25.10.25", "27.10.25", "Google Duolingo Langu, 650-2530000 US", "USD", "10.00", "", "**** **** **** 1234"],
|
||||
["25.10.25", "27.10.25", "Hornbach Baumarkt Galgene, Galgenen CH", "CHF", "1.50", "", "**** **** **** 1234"],
|
||||
["25.10.25", "27.10.25", "Hornbach Baumarkt Galgene, Galgenen CH", "CHF", "814.10", "", "**** **** **** 1234"],
|
||||
["25.10.25", "27.10.25", "REMO WUEST BACK. KOND., GALGENEN CH", "CHF", "20.00", "", "**** **** **** 1234"],
|
||||
["25.10.25", "27.10.25", "Google PixVerse AI Vi, 650-2530000 US", "USD", "5.20", "", "**** **** **** 1234"],
|
||||
["27.10.25", "28.10.25", "Coop-1252 Wald, Wald ZH CH", "CHF", "12.90", "", "**** **** **** 1234"],
|
||||
["29.10.25", "30.10.25", "WAL*WESTHIVE, DUEBENDORF CH", "CHF", "15.30", "", "**** **** **** 1234"],
|
||||
["29.10.25", "30.10.25", "WAL*WESTHIVE, DUEBENDORF CH", "CHF", "6.50", "", "**** **** **** 1234"],
|
||||
["30.10.25", "31.10.25", "Coop-4253 Hinwil Wasseri, Hinwil CH", "CHF", "139.85", "", "**** **** **** 1234"],
|
||||
["30.10.25", "31.10.25", "Coop-4054 Hinwil Restaura, Hinwil CH", "CHF", "34.95", "", "**** **** **** 1234"],
|
||||
["31.10.25", "03.11.25", "Coop-1911 Ruti, Ruti ZH CH", "CHF", "66.50", "", "**** **** **** 1234"],
|
||||
["31.10.25", "03.11.25", "OPENAI *CHATGPT SUBSCR, OPENAI.COM US", "USD", "216.20", "178.70", "**** **** **** 1234"],
|
||||
["01.11.25", "03.11.25", "GOOGLE *ADS5192965135, cc§google.com IE", "", "79.15", "", "**** **** **** 1234"],
|
||||
["01.11.25", "03.11.25", "KONDITOREI VOLAND WALD, WALD ZH CH", "CHF", "99.60", "", "**** **** **** 1234"],
|
||||
["01.11.25", "03.11.25", "LANGDOCK GMBH, BERLIN DE", "EUR", "25.00", "23.90", "**** **** **** 1234"],
|
||||
["01.11.25", "03.11.25", "Google PixVerse AI Vi, 650-2530000 US", "USD", "5.20", "", "**** **** **** 1234"],
|
||||
["02.11.25", "03.11.25", "GOOGLE *YouTubePremium, g.co/helppay# GB", "", "33.90", "", "**** **** **** 1234"],
|
||||
["02.11.25", "03.11.25", "Agrola TopShop Wald, Wald ZH CH", "CHF", "119.45", "", "**** **** **** 1234"],
|
||||
["03.11.25", "03.11.25", "Netflix.com, Los Gatos NL", "", "20.90", "", "**** **** **** 1234"],
|
||||
["03.11.25", "04.11.25", "www.fust.ch, Oberburen CH", "CHF", "1'560.90", "", "**** **** **** 1234"],
|
||||
["06.11.25", "07.11.25", "Grand Casino Luzern AG, Luzern CH", "CHF", "100.00", "108.00", "**** **** **** 1234"],
|
||||
["07.11.25", "10.11.25", "Migros M Mutschellen, Berikon CH", "CHF", "0.40", "", "**** **** **** 1234"],
|
||||
["07.11.25", "10.11.25", "Migros M Mutschellen, Berikon CH", "CHF", "15.90", "", "**** **** **** 1234"],
|
||||
["08.11.25", "10.11.25", "wondershare.com, Hong Kong HK", "", "25.95", "", "**** **** **** 1234"],
|
||||
["07.11.25", "10.11.25", "WAL*WESTHIVE, DUEBENDORF CH", "CHF", "9.85", "", "**** **** **** 1234"],
|
||||
["07.11.25", "10.11.25", "Google One, 650-2530000 US", "USD", "10.00", "", "**** **** **** 1234"],
|
||||
["08.11.25", "10.11.25", "Steiner-Beck AG, Wald ZH CH", "CHF", "32.20", "", "**** **** **** 1234"],
|
||||
["08.11.25", "10.11.25", "Google PixVerse AI Vi, 650-2530000 US", "USD", "5.20", "", "**** **** **** 1234"],
|
||||
["09.11.25", "10.11.25", "KONDITOREI VOLAND WALD, WALD ZH CH", "CHF", "25.80", "", "**** **** **** 1234"],
|
||||
["09.11.25", "11.11.25", "Starfood AG (Schweiz), Rothenburg CH", "CHF", "16.00", "", "**** **** **** 1234"],
|
||||
["09.11.25", "11.11.25", "Starfood AG (Schweiz), Rothenburg CH", "CHF", "16.00", "", "**** **** **** 1234"],
|
||||
["10.11.25", "11.11.25", "Coop-2253 Jona Eisenhof, Jona CH", "CHF", "161.25", "", "**** **** **** 1234"],
|
||||
["12.11.25", "13.11.25", "Hess AG Erdbau + Recy, Laupen ZH CH", "CHF", "39.20", "", "**** **** **** 1234"],
|
||||
["12.11.25", "13.11.25", "Jumbo-6017 Hinwil, Hinwil CH", "CHF", "173.70", "", "**** **** **** 1234"],
|
||||
["13.11.25", "14.11.25", "Migros MM Ruti, Ruti ZH CH", "CHF", "57.90", "", "**** **** **** 1234"],
|
||||
["13.11.25", "14.11.25", "Migros MM Ruti, Ruti ZH CH", "CHF", "140.10", "", "**** **** **** 1234"],
|
||||
["13.11.25", "14.11.25", "WAL*WESTHIVE, DUEBENDORF CH", "CHF", "22.30", "", "**** **** **** 1234"],
|
||||
["13.11.25", "14.11.25", "UDIO.COM, UDIO.COM US", "EUR", "36.00", "34.35", "**** **** **** 1234"],
|
||||
["15.10.25", "16.10.25", "Coop-4958 Rapperswil, Rapperswil SG CH", "CHF", "4.95", "", "**** **** **** 1234"],
|
||||
["16.10.25", "17.10.25", "TEMU.COM, BASEL CH", "CHF", "61.50", "", "**** **** **** 1234"],
|
||||
["16.10.25", "17.10.25", "TEMU.COM, BASEL CH", "CHF", "12.95", "", "**** **** **** 1234"],
|
||||
["16.10.25", "17.10.25", "TEMU.COM, BASEL CH", "CHF", "32.30", "", "**** **** **** 1234"],
|
||||
["16.10.25", "17.10.25", "TEMU.COM, BASEL CH", "CHF", "17.95", "", "**** **** **** 1234"],
|
||||
["17.10.25", "20.10.25", "SBB Bahnhof Ruti ZH, Ruti ZH CH", "CHF", "54.00", "", "**** **** **** 1234"],
|
||||
["17.10.25", "20.10.25", "Candrian Catering AG 2, Zurich CH", "CHF", "15.50", "", "**** **** **** 1234"],
|
||||
["20.10.25", "21.10.25", "Coop-1252 Wald, Wald ZH CH", "CHF", "178.95", "", "**** **** **** 1234"],
|
||||
["21.10.25", "22.10.25", "Denner Ruti ZH, Ruti ZH CH", "CHF", "50.15", "", "**** **** **** 1234"],
|
||||
["24.10.25", "27.10.25", "TEMU.COM, BASEL CH", "CHF", "100.65", "", "**** **** **** 1234"],
|
||||
["24.10.25", "27.10.25", "WAL*HAAR SHOP CH AG, UETENDORF CH", "CHF", "70.35", "", "**** **** **** 1234"],
|
||||
["25.10.25", "27.10.25", "Coop-1252 Wald, Wald ZH CH", "CHF", "47.00", "", "**** **** **** 1234"],
|
||||
["25.10.25", "27.10.25", "Shell Waldhof, Wald ZH CH", "CHF", "3.20", "", "**** **** **** 1234"],
|
||||
["26.10.25", "27.10.25", "TEMU.COM, BASEL CH", "CHF", "63.10", "", "**** **** **** 1234"],
|
||||
["27.10.25", "28.10.25", "ONLY, Hinwil CH", "CHF", "222.60", "", "**** **** **** 1234"],
|
||||
["27.10.25", "28.10.25", "Coop-4253 Hinwil Wasseri, Hinwil CH", "CHF", "104.10", "", "**** **** **** 1234"],
|
||||
["27.10.25", "28.10.25", "Coop-4253 Hinwil Wasseri, Hinwil CH", "CHF", "24.95", "", "**** **** **** 1234"],
|
||||
["27.10.25", "28.10.25", "Manor AG, Hinwil CH", "CHF", "177.25", "", "**** **** **** 1234"],
|
||||
["27.10.25", "28.10.25", "H & M, Hinwil CH", "CHF", "43.85", "", "**** **** **** 1234"],
|
||||
["27.10.25", "28.10.25", "Coop-4253 Hinwil Wasseri, Hinwil CH", "CHF", "52.30", "", "**** **** **** 1234"],
|
||||
["27.10.25", "28.10.25", "Manor AG, Hinwil CH", "CHF", "59.05", "", "**** **** **** 1234"],
|
||||
["28.10.25", "29.10.25", "Migros MM Rapperswil, Rapperswil SG CH", "CHF", "23.35", "", "**** **** **** 1234"],
|
||||
["29.10.25", "30.10.25", "ROSSMANN Schweiz AG, Wallisellen CH", "CHF", "13.95", "", "**** **** **** 1234"],
|
||||
["29.10.25", "30.10.25", "Migros MR Glattzentrum, Glattzentrum CH", "CHF", "42.20", "", "**** **** **** 1234"],
|
||||
["29.10.25", "30.10.25", "Calzedonia, Wallisellen CH", "CHF", "178.25", "", "**** **** **** 1234"],
|
||||
["29.10.25", "30.10.25", "Intimissimi, Wallisellen CH", "CHF", "90.20", "", "**** **** **** 1234"],
|
||||
["29.10.25", "30.10.25", "New Yorker (Schweiz) GmbH, Wetzikon ZH CH", "CHF", "76.80", "", "**** **** **** 1234"],
|
||||
["29.10.25", "30.10.25", "New Yorker (Schweiz) GmbH, Wetzikon ZH CH", "CHF", "7.95", "", "**** **** **** 1234"],
|
||||
["29.10.25", "30.10.25", "Golden Bar GmbH, Wald ZH CH", "CHF", "40.00", "", "**** **** **** 1234"],
|
||||
["30.10.25", "31.10.25", "Frischer Max, Zurich CH", "CHF", "12.60", "", "**** **** **** 1234"],
|
||||
["30.10.25", "31.10.25", "Frischer Max, Zurich CH", "CHF", "4.20", "", "**** **** **** 1234"],
|
||||
["30.10.25", "31.10.25", "Halle 622, Zurich CH", "CHF", "15.75", "", "**** **** **** 1234"],
|
||||
["31.10.25", "03.11.25", "SBB Bahnhof Ruti ZH, Ruti ZH CH", "CHF", "27.00", "", "**** **** **** 1234"],
|
||||
["31.10.25", "03.11.25", "Eventfrog.c 739003945141, Olten CH", "CHF", "67.85", "", "**** **** **** 1234"],
|
||||
["01.11.25", "03.11.25", "SBB Bahnhof Ruti ZH, Ruti ZH CH", "CHF", "27.00", "", "**** **** **** 1234"],
|
||||
["01.11.25", "03.11.25", "AMERON ZUERICH, ZUERICH CH", "CHF", "30.00", "", "**** **** **** 1234"],
|
||||
["31.10.25", "03.11.25", "SKYLINE EVENTS, ZUERICH CH", "CHF", "13.50", "", "**** **** **** 1234"],
|
||||
["02.11.25", "03.11.25", "AURA Event Saal, Zuerich CH", "CHF", "15.75", "", "**** **** **** 1234"],
|
||||
["02.11.25", "03.11.25", "GOOGLE *YouTube Member, g.co/helppay# GB", "", "15.00", "", "**** **** **** 1234"],
|
||||
["01.11.25", "03.11.25", "VBZ Bellevue, Zurich CH", "CHF", "2.80", "", "**** **** **** 1234"],
|
||||
["01.11.25", "03.11.25", "WAL*CLUB BELLEVUE, HOERI CH", "CHF", "16.50", "", "**** **** **** 1234"],
|
||||
["02.11.25", "03.11.25", "MCDONALDS ZUERICH 2016, ZUERICH CH", "CHF", "10.50", "", "**** **** **** 1234"],
|
||||
["03.11.25", "04.11.25", "Coop-1252 Wald, Wald ZH CH", "CHF", "191.15", "", "**** **** **** 1234"],
|
||||
["05.11.25", "06.11.25", "Coop-1252 Wald, Wald ZH CH", "CHF", "51.35", "", "**** **** **** 1234"],
|
||||
["06.11.25", "07.11.25", "Ticketcorner*90024523, 410900800800 CH", "CHF", "158.75", "", "**** **** **** 1234"],
|
||||
["07.11.25", "10.11.25", "SUMUP *JW BROW&LASH, LACHEN CH", "CHF", "290.00", "", "**** **** **** 1234"],
|
||||
["07.11.25", "10.11.25", "Agrola TopShop Wald, Wald ZH CH", "CHF", "104.50", "", "**** **** **** 1234"],
|
||||
["07.11.25", "10.11.25", "Agrola TopShop Wald, Wald ZH CH", "CHF", "10.30", "", "**** **** **** 1234"],
|
||||
["08.11.25", "10.11.25", "Pizza Thal GmbH, Murgenthal CH", "CHF", "19.50", "", "**** **** **** 1234"],
|
||||
["09.11.25", "10.11.25", "TEMU.COM, BASEL CH", "CHF", "190.85", "", "**** **** **** 1234"],
|
||||
["10.11.25", "11.11.25", "Sinora GmbH, Bonstetten CH", "CHF", "115.20", "", "**** **** **** 1234"],
|
||||
["10.11.25", "11.11.25", "WAL*HAAR SHOP CH AG, UETENDORF CH", "CHF", "33.85", "", "**** **** **** 1234"],
|
||||
["11.11.25", "12.11.25", "Bleiche Fitness, Wald ZH CH", "CHF", "90.00", "", "**** **** **** 1234"],
|
||||
["11.11.25", "12.11.25", "Parkhaus Urania, Zurich CH", "CHF", "14.00", "", "**** **** **** 1234"],
|
||||
["12.11.25", "13.11.25", "Coop-4958 Rapperswil, Rapperswil SG CH", "CHF", "24.80", "", "**** **** **** 1234"],
|
||||
["13.11.25", "14.11.25", "TEMU.COM, BASEL CH", "CHF", "56.00", "", "**** **** **** 1234"],
|
||||
["13.11.25", "14.11.25", "TEMU.COM, BASEL CH", "CHF", "5.95", "", "**** **** **** 1234"],
|
||||
["13.11.25", "14.11.25", "TEMU.COM, BASEL CH", "CHF", "15.25", "", "**** **** **** 1234"],
|
||||
["14.11.25", "17.11.25", "Santa Lucia Altstetten, Zurich", "CH", "38.00", "", "**** **** **** 1234"],
|
||||
["14.11.25", "17.11.25", "Agrola TopShop Wald, Wald ZH", "CH", "126.80", "", "**** **** **** 1234"],
|
||||
["14.11.25", "17.11.25", "GITHUB, INC., GITHUB.COM", "US", "USD 0.70", "0.60", "**** **** **** 1234"],
|
||||
["15.11.25", "17.11.25", "Jumbo-6017 Hinwil, Hinwil", "CH", "53.85", "", "**** **** **** 1234"],
|
||||
["15.11.25", "17.11.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "57.00", "", "**** **** **** 1234"],
|
||||
["15.11.25", "17.11.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "13.95", "", "**** **** **** 1234"],
|
||||
["14.11.25", "17.11.25", "NEGISHI ALTSTETTEN BAH, ZUERICH", "CH", "31.90", "", "**** **** **** 1234"],
|
||||
["15.11.25", "17.11.25", "Google PixVerse AI Vi, 650-2530000", "US", "5.20", "", "**** **** **** 1234"],
|
||||
["15.11.25", "17.11.25", "CANVA* I04701-26464248, CANVA.COM", "US", "12.00", "", "**** **** **** 1234"],
|
||||
["17.11.25", "18.11.25", "ANTHROPIC, ANTHROPIC.COM", "US", "USD 270.25", "220.65", "**** **** **** 1234"],
|
||||
["18.11.25", "19.11.25", "Coop-1252 Wald, Wald ZH", "CH", "7.80", "", "**** **** **** 1234"],
|
||||
["18.11.25", "19.11.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "19.30", "", "**** **** **** 1234"],
|
||||
["18.11.25", "19.11.25", "TIERARZTPRAXIS BACHTEL, WALD ZH", "CH", "343.30", "", "**** **** **** 1234"],
|
||||
["18.11.25", "19.11.25", "ANTHROPIC, ANTHROPIC.COM", "US", "USD 5.41", "4.45", "**** **** **** 1234"],
|
||||
["18.11.25", "20.11.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "19.35", "", "**** **** **** 1234"],
|
||||
["19.11.25", "20.11.25", "Wuest Partner, Zurich", "CH", "324.30", "", "**** **** **** 1234"],
|
||||
["19.11.25", "21.11.25", "SHOP.ASFINAG.AT, WIEN", "AT", "EUR 12.40", "11.80", "**** **** **** 1234"],
|
||||
["20.11.25", "21.11.25", "Coop-1252 Wald, Wald ZH", "CH", "85.35", "", "**** **** **** 1234"],
|
||||
["20.11.25", "21.11.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "17.95", "", "**** **** **** 1234"],
|
||||
["20.11.25", "21.11.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "6.30", "", "**** **** **** 1234"],
|
||||
["21.11.25", "24.11.25", "STWEG Ambassador House, Glattbrugg", "CH", "7.50", "", "**** **** **** 1234"],
|
||||
["21.11.25", "24.11.25", "Migros M Dubendorf Stettb, Dubendorf", "CH", "16.95", "", "**** **** **** 1234"],
|
||||
["21.11.25", "24.11.25", "MCDONALDS RESTAURANT G, WALLISELLEN", "CH", "13.00", "", "**** **** **** 1234"],
|
||||
["22.11.25", "24.11.25", "Ski- und Snowboard-Center, Neuhaus SG", "CH", "128.00", "", "**** **** **** 1234"],
|
||||
["21.11.25", "24.11.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "408.25", "", "**** **** **** 1234"],
|
||||
["22.11.25", "24.11.25", "GOOGLE *Duolingo Langu, g.co/HelpPay#", "US", "9.20", "", "**** **** **** 1234"],
|
||||
["23.11.25", "24.11.25", "Migros Santispark Bad, Abtwil SG", "CH", "48.60", "", "**** **** **** 1234"],
|
||||
["23.11.25", "24.11.25", "Migros Santispark Bad, Abtwil SG", "CH", "8.50", "", "**** **** **** 1234"],
|
||||
["23.11.25", "24.11.25", "Migros ELS Santispark PH, Abtwil SG", "CH", "3.00", "", "**** **** **** 1234"],
|
||||
["23.11.25", "24.11.25", "AVIA Service Autogrill, St. Margrethe", "CH", "121.80", "", "**** **** **** 1234"],
|
||||
["23.11.25", "24.11.25", "AVIA Service Autogrill, St. Margrethe", "CH", "10.50", "", "**** **** **** 1234"],
|
||||
["23.11.25", "24.11.25", "KONDITOREI VOLAND LAUP, LAUPEN ZH", "CH", "62.80", "", "**** **** **** 1234"],
|
||||
["23.11.25", "25.11.25", "SHOP.ASFINAG.AT, WIEN", "AT", "EUR 9.30", "8.90", "**** **** **** 1234"],
|
||||
["24.11.25", "25.11.25", "Landi, Wald", "CH", "27.15", "", "**** **** **** 1234"],
|
||||
["24.11.25", "26.11.25", "SHOP.ASFINAG.AT, WIEN", "AT", "EUR 12.50", "11.95", "**** **** **** 1234"],
|
||||
["26.11.25", "27.11.25", "MyPlace, Affoltern am", "CH", "10.30", "", "**** **** **** 1234"],
|
||||
["27.11.25", "28.11.25", "Coop-1911 Ruti, Ruti ZH", "CH", "57.20", "", "**** **** **** 1234"],
|
||||
["28.11.25", "01.12.25", "Manor AG, Hinwil", "CH", "10.10", "", "**** **** **** 1234"],
|
||||
["28.11.25", "01.12.25", "Manor AG, Hinwil", "CH", "136.25", "", "**** **** **** 1234"],
|
||||
["28.11.25", "01.12.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "205.35", "", "**** **** **** 1234"],
|
||||
["01.12.25", "02.12.25", "GOOGLE *ADS5192965135, cc§google.com", "IE", "59.00", "", "**** **** **** 1234"],
|
||||
["02.12.25", "03.12.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "112.50", "", "**** **** **** 1234"],
|
||||
["02.12.25", "03.12.25", "Coop-1252 Wald, Wald ZH", "CH", "117.70", "", "**** **** **** 1234"],
|
||||
["03.12.25", "03.12.25", "Autodesk ADY, Dublin 2", "IE", "1'989.05", "", "**** **** **** 1234"],
|
||||
["03.12.25", "03.12.25", "NETFLIX.COM, Amsterdam", "NL", "22.90", "", "**** **** **** 1234"],
|
||||
["02.12.25", "03.12.25", "TAVILY AI, WWW.TAVILY.CO", "US", "USD 17.48", "14.50", "**** **** **** 1234"],
|
||||
["02.12.25", "03.12.25", "GOOGLE *YouTubePremium, g.co/HelpPay#", "US", "33.90", "", "**** **** **** 1234"],
|
||||
["04.12.25", "05.12.25", "Migros M Dubendorf Stettb, Dubendorf", "CH", "103.20", "", "**** **** **** 1234"],
|
||||
["04.12.25", "05.12.25", "WAL*WESTHIVE, DUEBENDORF", "CH", "19.80", "", "**** **** **** 1234"],
|
||||
["05.12.25", "08.12.25", "MICROSOFT#G127221615, MSBILL.INFO", "CH", "55.20", "", "**** **** **** 1234"],
|
||||
["04.12.25", "08.12.25", "Ristorante Amalfi AG, Zurich", "CH", "67.00", "", "**** **** **** 1234"],
|
||||
["05.12.25", "08.12.25", "Landi, Wald", "CH", "11.90", "", "**** **** **** 1234"],
|
||||
["05.12.25", "08.12.25", "Notariat Wald, Wald ZH", "CH", "40.00", "", "**** **** **** 1234"],
|
||||
["05.12.25", "08.12.25", "Coop-1252 Wald, Wald ZH", "CH", "149.75", "", "**** **** **** 1234"],
|
||||
["06.12.25", "08.12.25", "TIERARZTPRAXIS BACHTEL, WALD ZH", "CH", "80.30", "", "**** **** **** 1234"],
|
||||
["07.12.25", "08.12.25", "HERAHELP.COM, 0044330027088", "CY", "EUR 19.95", "19.25", "**** **** **** 1234"],
|
||||
["07.12.25", "08.12.25", "Google One, 650-2530000", "US", "10.00", "", "**** **** **** 1234"],
|
||||
["10.12.25", "11.12.25", "TAVILY AI, WWW.TAVILY.CO", "US", "USD 43.26", "35.95", "**** **** **** 1234"],
|
||||
["11.12.25", "12.12.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "247.40", "", "**** **** **** 1234"],
|
||||
["14.11.25", "17.11.25", "ONLY, Zurich", "CH", "101.75", "", "**** **** **** 1234"],
|
||||
["14.11.25", "17.11.25", "SUMUP *MARYS COSMETICS, USTER", "CH", "419.00", "", "**** **** **** 1234"],
|
||||
["14.11.25", "17.11.25", "S2P*Calzedonia, 0447554090", "IT", "86.75", "", "**** **** **** 1234"],
|
||||
["14.11.25", "17.11.25", "Parkhaus Urania, Zurich", "CH", "12.00", "", "**** **** **** 1234"],
|
||||
["15.11.25", "17.11.25", "JustEat, Zurich", "CH", "193.70", "", "**** **** **** 1234"],
|
||||
["15.11.25", "17.11.25", "ONLY, Hinwil", "CH", "126.10", "", "**** **** **** 1234"],
|
||||
["15.11.25", "17.11.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "242.70", "", "**** **** **** 1234"],
|
||||
["15.11.25", "17.11.25", "Manor AG, Hinwil", "CH", "35.35", "", "**** **** **** 1234"],
|
||||
["15.11.25", "17.11.25", "Valentyna Nails, R?ti", "CH", "160.00", "", "**** **** **** 1234"],
|
||||
["13.11.25", "17.11.25", "redcare-apotheke, Sevenum", "NL", "79.90", "", "**** **** **** 1234"],
|
||||
["16.11.25", "17.11.25", "NORDSTERN, Basel", "CH", "64.20", "", "**** **** **** 1234"],
|
||||
["18.11.25", "19.11.25", "La Makeup Sp. z. o.o., Warsaw", "PL", "104.85", "", "**** **** **** 1234"],
|
||||
["18.11.25", "19.11.25", "Bleiche Fitness, Wald ZH", "CH", "90.00", "", "**** **** **** 1234"],
|
||||
["20.11.25", "21.11.25", "Bleiche Fitness, Wald ZH", "CH", "90.00", "", "**** **** **** 1234"],
|
||||
["22.11.25", "24.11.25", "SHELL PETTNAU 5538, PETTNAU", "AT", "94.60", "", "**** **** **** 1234"],
|
||||
["22.11.25", "24.11.25", "SHELL PETTNAU 5538, PETTNAU", "AT", "EUR 7.39", "7.05", "**** **** **** 1234"],
|
||||
["22.11.25", "24.11.25", "SHELL PETTNAU 5538, PETTNAU", "AT", "EUR 4.39", "4.20", "**** **** **** 1234"],
|
||||
["22.11.25", "24.11.25", "Coop-1252 Wald, Wald ZH", "CH", "57.85", "", "**** **** **** 1234"],
|
||||
["22.11.25", "24.11.25", "ASFINAG S16 HMS ST JAKOB, ST.ANTON/ARLB", "AT", "EUR 12.50", "11.95", "**** **** **** 1234"],
|
||||
["24.11.25", "25.11.25", "Posthotel Achenkirc, Achenkirch", "AT", "EUR 1'211.80", "1'160.25", "**** **** **** 1234"],
|
||||
["24.11.25", "25.11.25", "MS* CITYPOP2NIGHTBASE, ZURICH", "CH", "15.00", "", "**** **** **** 1234"],
|
||||
["24.11.25", "25.11.25", "MS* CITYPOP2NIGHTBASE, ZURICH", "CH", "8.40", "", "**** **** **** 1234"],
|
||||
["24.11.25", "25.11.25", "BKG*BOOKING.COM HOTEL, (888)850-3958", "NL", "187.95", "", "**** **** **** 1234"],
|
||||
["25.11.25", "26.11.25", "Coop-1252 Wald, Wald ZH", "CH", "63.00", "", "**** **** **** 1234"],
|
||||
["25.11.25", "26.11.25", "Bleiche Fitness, Wald ZH", "CH", "90.00", "", "**** **** **** 1234"],
|
||||
["26.11.25", "27.11.25", "Hallenbad Wald, Wald ZH", "CH", "54.00", "", "**** **** **** 1234"],
|
||||
["27.11.25", "28.11.25", "Bestseller AS, Amsterdam", "NL", "35.90", "", "**** **** **** 1234"],
|
||||
["29.11.25", "01.12.25", "CLUB NORDSTERN, BASEL", "CH", "6.00", "", "**** **** **** 1234"],
|
||||
["29.11.25", "01.12.25", "CLUB NORDSTERN, BASEL", "CH", "6.00", "", "**** **** **** 1234"],
["02.12.25", "03.12.25", "TICKETCORNER CH, RUEMLANG", "CH", "84.90", "", "**** **** **** 1234"],
["02.12.25", "03.12.25", "Shell Waldhof, Wald ZH", "CH", "126.15", "", "**** **** **** 1234"],
["02.12.25", "03.12.25", "Shell Waldhof, Wald ZH", "CH", "3.70", "", "**** **** **** 1234"],
["02.12.25", "03.12.25", "Bleiche Fitness, Wald ZH", "CH", "90.00", "", "**** **** **** 1234"],
["02.12.25", "03.12.25", "GOOGLE *YouTube Member, g.co/HelpPay#", "US", "15.00", "", "**** **** **** 1234"],
["03.12.25", "04.12.25", "APODRO APOTHEKE WALD, WALD ZH", "CH", "54.90", "", "**** **** **** 1234"],
["04.12.25", "05.12.25", "TICKETCORNER CH, RUEMLANG", "CH", "285.70", "", "**** **** **** 1234"],
["05.12.25", "05.12.25", "NOII.CH DATING, WINTERTHUR", "CH", "74.10", "", "**** **** **** 1234"],
["06.12.25", "08.12.25", "TEMU.COM, BASEL", "CH", "72.50", "", "**** **** **** 1234"],
["06.12.25", "08.12.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "326.85", "", "**** **** **** 1234"],
["06.12.25", "08.12.25", "Decathlon, Hinwil", "CH", "150.00", "", "**** **** **** 1234"],
["07.12.25", "09.12.25", "Migros-Genossenschafts-Bund, Zürich", "CH", "200.00", "", "**** **** **** 1234"],
["08.12.25", "10.12.25", "Zürich HB, Zürich", "CH", "45.00", "", "**** **** **** 1234"],
["09.12.25", "11.12.25", "Amazon Marketplace, amazon.de", "DE", "120.00", "", "**** **** **** 1234"],
["10.12.25", "12.12.25", "IKEA, Dietlikon", "CH", "350.00", "", "**** **** **** 1234"],
["11.12.25", "13.12.25", "Manor, Zürich", "CH", "75.00", "", "**** **** **** 1234"],
["12.12.25", "14.12.25", "Zalando, zalando.ch", "CH", "90.00", "", "**** **** **** 1234"],
["13.12.25", "15.12.25", "SBB CFF FFS, Bern", "CH", "60.00", "", "**** **** **** 1234"],
["14.12.25", "16.12.25", "Apple Store, Zürich", "CH", "999.00", "", "**** **** **** 1234"],
["15.12.25", "17.12.25", "Migros-Genossenschafts-Bund, Zürich", "CH", "150.00", "", "**** **** **** 1234"],
["16.12.25", "18.12.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "250.00", "", "**** **** **** 1234"],
["17.12.25", "19.12.25", "Shell Waldhof, Wald ZH", "CH", "60.00", "", "**** **** **** 1234"],
["18.12.25", "20.12.25", "Zürich HB, Zürich", "CH", "30.00", "", "**** **** **** 1234"],
["19.12.25", "21.12.25", "Amazon Marketplace, amazon.de", "DE", "80.00", "", "**** **** **** 1234"],
["20.12.25", "22.12.25", "IKEA, Dietlikon", "CH", "400.00", "", "**** **** **** 1234"],
["21.12.25", "23.12.25", "Manor, Zürich", "CH", "100.00", "", "**** **** **** 1234"],
["22.12.25", "24.12.25", "Zalando, zalando.ch", "CH", "110.00", "", "**** **** **** 1234"],
["23.12.25", "25.12.25", "SBB CFF FFS, Bern", "CH", "70.00", "", "**** **** **** 1234"],
["24.12.25", "26.12.25", "Apple Store, Zürich", "CH", "1200.00", "", "**** **** **** 1234"],
["25.12.25", "27.12.25", "Migros-Genossenschafts-Bund, Zürich", "CH", "180.00", "", "**** **** **** 1234"],
["26.12.25", "28.12.25", "Coop-4253 Hinwil Wasseri, Hinwil", "CH", "300.00", "", "**** **** **** 1234"],
["27.12.25", "29.12.25", "Shell Waldhof, Wald ZH", "CH", "70.00", "", "**** **** **** 1234"],
["28.12.25", "30.12.25", "Zürich HB, Zürich", "CH", "40.00", "", "**** **** **** 1234"],
["29.12.25", "31.12.25", "Amazon Marketplace, amazon.de", "DE", "100.00", "", "**** **** **** 1234"],
["30.12.25", "01.01.26", "IKEA, Dietlikon", "CH", "450.00", "", "**** **** **** 1234"],
["31.12.25", "02.01.26", "Manor, Zürich", "CH", "125.00", "", "**** **** **** 1234"]
]
}
}
]
}
================================================================================
@@ -1,239 +0,0 @@
# AI Call Iteration Flow - JSON Merging System

This document describes the iteration flow for handling large JSON responses from AI that may be truncated and need to be merged across multiple iterations.

## Overview

When an AI response is too large, it may be truncated (cut) at an arbitrary point. The iteration system:
1. Detects incomplete JSON
2. Requests continuation from the AI
3. Merges the continuation with the existing JSON
4. Repeats until complete or max failures reached
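
The sketch below condenses this cycle into a minimal, self-contained loop. It is an illustration only, not the production code in `subAiCallLooping.py`: the callables `callAi`, `mergeWithOverlap`, and `getContexts` are stand-ins for the real services, `getContexts` is assumed to return a plain dict with the fields described under Step 5, and the continuation prompt is simplified to a single f-string.

```python
import json

MAX_MERGE_FAILS = 3

def iterateUntilComplete(callAi, mergeWithOverlap, getContexts, prompt):
    """Minimal sketch of the iteration cycle: call, merge, parse, decide."""
    jsonBase = None               # merged JSON so far (CUT version, for overlap matching)
    lastValidCompletePart = None  # fallback (CLOSED version)
    mergeFailCount = 0

    while mergeFailCount < MAX_MERGE_FAILS:  # the real loop also caps total iterations at 50
        fragment = callAi(prompt)                                             # Step 2: CALL AI
        if jsonBase is None:
            candidateJson = fragment                                          # first iteration
        else:
            candidateJson, hasOverlap = mergeWithOverlap(jsonBase, fragment)  # Step 4: MERGE
            if not hasOverlap:
                mergeFailCount += 1                                           # merge failed
                continue                                                      # retry, jsonBase unchanged
        try:
            return json.loads(candidateJson)                                  # direct parse -> FINISHED
        except json.JSONDecodeError:
            pass
        contexts = getContexts(candidateJson)                                 # Step 5: GET CONTEXTS
        if contexts["jsonParsingSuccess"] and contexts["overlapContext"] == "":
            return json.loads(contexts["completePart"])                       # Case A: complete
        if contexts["jsonParsingSuccess"]:
            jsonBase = contexts["hierarchyContext"]                           # Case B: CUT version
            lastValidCompletePart = contexts["completePart"]                  # CLOSED fallback
            mergeFailCount = 0
            prompt = f"Continue the JSON exactly after this cut:\n{contexts['hierarchyContextForPrompt']}"
        else:
            mergeFailCount += 1                                               # Case C: retry
    return json.loads(lastValidCompletePart) if lastValidCompletePart else None
```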
---

## Key Variables

| Variable | Type | Purpose |
|----------|------|---------|
| `jsonBase` | `str \| None` | The merged JSON string (CUT version for overlap matching) |
| `candidateJson` | `str` | Temporary holder for merged result until validated |
| `lastValidCompletePart` | `str \| None` | Fallback - last successfully parsed CLOSED JSON |
| `lastOverlapContext` | `str` | Context for retry/continuation prompts |
| `lastHierarchyContextForPrompt` | `str` | Context for retry/continuation prompts |
| `mergeFailCount` | `int` | Global counter (max 3 failures) |

---

## Key Distinction: hierarchyContext vs completePart

| Field | Description | Use Case |
|-------|-------------|----------|
| `hierarchyContext` | **CUT JSON** - truncated at cut point | Used as `jsonBase` for merging with next AI fragment |
| `completePart` | **CLOSED JSON** - all structures properly closed | Used for validation, parsing, and fallback |

**Why this matters:**
- The next AI fragment starts with an **overlap** that matches the CUT point
- If we used `completePart` (closed), the overlap detection would FAIL
- We must use `hierarchyContext` (cut) so overlap matching works correctly
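
A tiny example (made-up strings, not real data) of why overlap matching has to run against the CUT version:

```python
# A response that was truncated mid-array:
cutJson    = '{"rows": [["a", 1], ["b", 2'      # hierarchyContext (CUT at the break)
closedJson = '{"rows": [["a", 1], ["b", 2]]}'   # completePart (repaired and CLOSED)

# The next AI fragment repeats a short overlap before continuing:
nextFragment = '["b", 2], ["c", 3]]}'
overlap = '["b", 2'

print(cutJson.endswith(overlap))     # True  -> merge point found on the CUT version
print(closedJson.endswith(overlap))  # False -> overlap detection fails on the CLOSED version
```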
---

## Flow Steps

### Step 1: BUILD PROMPT

**Location:** `subAiCallLooping.py` lines 163-212
**Function:** `buildContinuationContext()` from `modules/shared/jsonUtils.py`

- **First iteration:** Use original prompt
- **Continuation:** `buildContinuationContext(allSections, lastRawResponse, ...)`
  - Internally calls `getContexts(lastRawResponse)` to get overlap/hierarchy
  - Builds continuation prompt with `overlapContext` + `hierarchyContextForPrompt`
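
The exact wording is produced by `buildContinuationContext()` in `jsonUtils.py`; the function below is only a rough, hypothetical illustration of how such a continuation prompt can combine `overlapContext` and `hierarchyContextForPrompt` with the base prompt:

```python
def buildContinuationPromptSketch(basePrompt: str, overlapContext: str,
                                  hierarchyContextForPrompt: str) -> str:
    """Illustrative continuation prompt; the real format lives in jsonUtils.py."""
    return (
        f"{basePrompt}\n\n"
        "Your previous answer was cut off. Continue the JSON output.\n"
        f"Begin your continuation exactly with this overlap:\n{overlapContext}\n\n"
        f"Structure generated so far (truncated at the cut point):\n{hierarchyContextForPrompt}\n"
    )
```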
### Step 2: CALL AI

**Location:** `subAiCallLooping.py` lines 214-299
**Function:** `self.aiService.callAi(request)`

- Returns `response.content` as `result`
- NOTE: Do NOT update `lastRawResponse` yet! (only after successful merge)

### Step 4: MERGE

**Location:** `subAiCallLooping.py` lines 338-396
**Function:** `JsonResponseHandler.mergeJsonStringsWithOverlap()` from `modules/services/serviceAi/subJsonResponseHandling.py`

```
IF first iteration (jsonBase is None):
    → candidateJson = result
ELSE:
    → mergedJsonString, hasOverlap = mergeJsonStringsWithOverlap(jsonBase, result)

IF hasOverlap = False (MERGE FAILED):
    → mergeFailCount++
    → If mergeFailCount >= 3: return lastValidCompletePart (fallback)
    → Else: continue (retry with unchanged jsonBase AND lastRawResponse!)
ELSE:
    → candidateJson = mergedJsonString (don't update jsonBase yet!)

→ lastRawResponse = candidateJson (ONLY after first iteration or successful merge!)

TRY DIRECT PARSE of candidateJson:
IF parse succeeds:
    → jsonBase = candidateJson (commit)
    → FINISHED! Return normalized result
ELSE:
    → Proceed to Step 5
```
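
The real merge is implemented by `JsonResponseHandler.mergeJsonStringsWithOverlap()` (backed by `ModularJsonMerger`); the helper below is only a naive sketch of the core idea, i.e. find the longest suffix of `jsonBase` that the new fragment starts with, then append the remainder:

```python
def mergeWithOverlapSketch(jsonBase: str, fragment: str, minOverlap: int = 4):
    """Naive overlap merge: returns (merged, hasOverlap). Illustration only."""
    maxLen = min(len(jsonBase), len(fragment))
    # Try the longest possible overlap first, down to a minimum length.
    for size in range(maxLen, minOverlap - 1, -1):
        if fragment.startswith(jsonBase[-size:]):
            return jsonBase + fragment[size:], True
    return jsonBase, False  # no overlap found -> caller retries


merged, ok = mergeWithOverlapSketch('{"rows": [["a", 1], ["b", 2', '["b", 2], ["c", 3]]}')
print(ok)      # True
print(merged)  # {"rows": [["a", 1], ["b", 2], ["c", 3]]}
```

A failed overlap (`hasOverlap=False`) is exactly what triggers the retry path described above.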
### Step 5: GET CONTEXTS

**Location:** `subAiCallLooping.py` lines 420-427
**Function:** `getContexts()` from `modules/shared/jsonContinuation.py`

```python
contexts = getContexts(candidateJson)
```

Returns `JsonContinuationContexts`:
- `overlapContext`: `""` if JSON is complete (no cut point)
- `hierarchyContext`: CUT JSON (for merging with next fragment)
- `hierarchyContextForPrompt`: CUT JSON with budget limits (for prompts)
- `completePart`: CLOSED JSON (repaired if needed)
- `jsonParsingSuccess`: `True` if completePart is valid JSON

**Enhancement:** If original JSON is already complete → `overlapContext = ""`
This signals "JSON is complete, no more continuation needed"
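
The real model is `JsonContinuationContexts` in `modules/datamodels/datamodelAi.py`; the dataclass below merely sketches its shape as described above (field names are taken from this document, everything else is assumed):

```python
from dataclasses import dataclass

@dataclass
class JsonContinuationContextsSketch:
    overlapContext: str             # "" when the JSON is complete (no cut point)
    hierarchyContext: str           # CUT JSON, used as jsonBase for the next merge
    hierarchyContextForPrompt: str  # CUT JSON trimmed to a prompt budget
    completePart: str               # CLOSED (repaired) JSON, for validation and fallback
    jsonParsingSuccess: bool        # True if completePart parses as valid JSON
```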
### Step 6: DECIDE

**Location:** `subAiCallLooping.py` lines 429-528

#### Case A: `jsonParsingSuccess=true` AND `overlapContext=""`
**→ FINISHED**
- JSON is complete (no cut point)
- `jsonBase = contexts.completePart` (use CLOSED version for final result)
- Return `completePart` as result

#### Case B: `jsonParsingSuccess=true` AND `overlapContext!=""`
**→ CONTINUE to next iteration**
- JSON parseable but has cut point
- `jsonBase = contexts.hierarchyContext` ← **CUT version for next merge!**
- `lastValidCompletePart = contexts.completePart` ← **CLOSED version for fallback**
- Store contexts for next prompt
- `mergeFailCount = 0` (reset on success)
- `lastRawResponse = jsonBase`
- Continue to next iteration

#### Case C: `jsonParsingSuccess=false`
**→ RETRY with same prompt**
- Do NOT update `jsonBase` (keep previous valid state)
- `mergeFailCount++`
- If `mergeFailCount >= 3`: return `lastValidCompletePart` (fallback)
- Else: continue (retry with unchanged jsonBase/lastRawResponse)
---

## Flow Diagram

```
ITERATION START
   │
   ▼
STEP 1: BUILD PROMPT
   - First: original prompt
   - Next:  buildContinuationContext(lastRawResponse)
   │
   ▼
STEP 2: CALL AI → result
   │
   ▼
STEP 4: MERGE jsonBase + result → candidateJson
   │
   ├─ Merge FAILED: fails++
   │     if >= 3 → RETURN fallback (lastValidCompletePart)
   │     else    → RETRY (continue, jsonBase unchanged)
   │
   └─ Merge OK: TRY DIRECT PARSE of candidateJson
         ├─ Parse OK → FINISHED ✓ (return normalized result)
         └─ Parse failed
               │
               ▼
STEP 5: getContexts(candidateJson) → jsonParsingSuccess, overlapContext
   │
   ▼
STEP 6: DECIDE
   ├─ success=true  AND overlap=""   → FINISHED ✓
   │     jsonBase = completePart (CLOSED), return result
   ├─ success=true  AND overlap!=""  → CONTINUE to next iteration
   │     jsonBase = hierarchyContext (CUT, for the next merge!)
   │     fallback = completePart (CLOSED), fails = 0
   └─ success=false                  → RETRY
         jsonBase unchanged, fails++ (if >= 3 → fallback)
```
---

## Files Involved

| File | Purpose |
|------|---------|
| `modules/services/serviceAi/subAiCallLooping.py` | Main iteration loop |
| `modules/shared/jsonContinuation.py` | `getContexts()` - context extraction & repair |
| `modules/shared/jsonUtils.py` | `buildContinuationContext()` - prompt building |
| `modules/services/serviceAi/subJsonResponseHandling.py` | `mergeJsonStringsWithOverlap()` |
| `modules/services/serviceAi/subJsonMerger.py` | `ModularJsonMerger` - actual merge logic |
| `modules/datamodels/datamodelAi.py` | `JsonContinuationContexts` model |

---

## Error Handling

### Merge Failures
- Max 3 consecutive failures allowed
- On failure: retry with unchanged `jsonBase` (previous valid state)
- After 3 failures: return `lastValidCompletePart` as fallback

### Parse Failures
- If `getContexts()` cannot produce valid JSON: increment fail counter
- Retry with same prompt (don't update jsonBase)
- After 3 failures: return `lastValidCompletePart` as fallback

### Fallback Strategy
- `lastValidCompletePart` stores the last successfully parsed CLOSED JSON
- Always available as fallback when things go wrong
- Ensures we return valid JSON even after multiple failures
@@ -1,665 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
|
||||
# All rights reserved.
|
||||
"""
|
||||
AI Call Looping Module
|
||||
|
||||
Handles AI calls with looping and repair logic, including:
|
||||
- Looping with JSON repair and continuation
|
||||
- KPI definition and tracking
|
||||
- Progress tracking and iteration management
|
||||
|
||||
FLOW LOGIC
|
||||
|
||||
VARIABLES:
|
||||
- jsonBase: str (merged JSON so far, starts empty)
|
||||
- lastValidCompletePart: str (fallback for failures)
|
||||
- mergeFailCount: int = 0 (max 3)
|
||||
|
||||
FLOW:
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ 1. BUILD PROMPT │
|
||||
│ - First: original prompt │
|
||||
│ - Next: buildContinuationContext(lastRawResponse) │
|
||||
├─────────────────────────────────────────────────────────────────┤
|
||||
│ 2. CALL AI → response fragment │
|
||||
├─────────────────────────────────────────────────────────────────┤
|
||||
│ 4. MERGE jsonBase + response │
|
||||
│ ├─ FAILS: repeat prompt, fails++ (if >=3 return fallback) │
|
||||
│ └─ SUCCEEDS: try parse │
|
||||
│ ├─ SUCCEEDS: FINISHED │
|
||||
│ └─ FAILS: → step 5 │
|
||||
├─────────────────────────────────────────────────────────────────┤
|
||||
│ 5. GET CONTEXTS (merge OK, parse failed) │
|
||||
│ getContexts(mergedJson) → │
|
||||
│ - If no cut point: overlapContext = "" │
|
||||
│ - Store contexts for next iteration │
|
||||
├─────────────────────────────────────────────────────────────────┤
|
||||
│ 6. DECIDE │
|
||||
│ ├─ jsonParsingSuccess=true AND overlapContext="": │
|
||||
│ │ FINISHED. return completePart │
|
||||
│ ├─ jsonParsingSuccess=true AND overlapContext!="": │
|
||||
│ │ CONTINUE, fails=0 │
|
||||
│ └─ ELSE: repeat prompt, fails++ │
|
||||
└─────────────────────────────────────────────────────────────────┘
|
||||
|
||||
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from typing import Dict, Any, List, Optional, Callable
|
||||
|
||||
from modules.datamodels.datamodelAi import (
|
||||
AiCallRequest, AiCallOptions
|
||||
)
|
||||
from modules.datamodels.datamodelExtraction import ContentPart
|
||||
from .subJsonResponseHandling import JsonResponseHandler
|
||||
from .subLoopingUseCases import LoopingUseCaseRegistry
|
||||
from modules.workflows.processing.shared.stateTools import checkWorkflowStopped
|
||||
from modules.shared.jsonContinuation import getContexts
|
||||
from modules.shared.jsonUtils import (
    buildContinuationContext, extractJsonString, tryParseJson,
    closeJsonStructures, stripCodeFences, normalizeJsonText
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AiCallLooper:
|
||||
"""Handles AI calls with looping and repair logic."""
|
||||
|
||||
def __init__(self, services, aiService, responseParser):
|
||||
"""Initialize AiCallLooper with service center, AI service, and response parser access."""
|
||||
self.services = services
|
||||
self.aiService = aiService
|
||||
self.responseParser = responseParser
|
||||
self.useCaseRegistry = LoopingUseCaseRegistry() # Initialize use case registry
|
||||
|
||||
async def callAiWithLooping(
|
||||
self,
|
||||
prompt: str,
|
||||
options: AiCallOptions,
|
||||
debugPrefix: str = "ai_call",
|
||||
promptBuilder: Optional[Callable] = None,
|
||||
promptArgs: Optional[Dict[str, Any]] = None,
|
||||
operationId: Optional[str] = None,
|
||||
userPrompt: Optional[str] = None,
|
||||
contentParts: Optional[List[ContentPart]] = None, # ARCHITECTURE: Support ContentParts for large content
|
||||
useCaseId: str = None # REQUIRED: Explicit use case ID - no auto-detection, no fallback
|
||||
) -> str:
|
||||
"""
|
||||
Shared core function for AI calls with repair-based looping system.
|
||||
Automatically repairs broken JSON and continues generation seamlessly.
|
||||
|
||||
Args:
|
||||
prompt: The prompt to send to AI
|
||||
options: AI call configuration options
|
||||
debugPrefix: Prefix for debug file names
|
||||
promptBuilder: Optional function to rebuild prompts for continuation
|
||||
promptArgs: Optional arguments for prompt builder
|
||||
operationId: Optional operation ID for progress tracking
|
||||
userPrompt: Optional user prompt for KPI definition
|
||||
contentParts: Optional content parts for first iteration
|
||||
useCaseId: REQUIRED: Explicit use case ID - no auto-detection, no fallback
|
||||
|
||||
Returns:
|
||||
Complete AI response after all iterations
|
||||
"""
|
||||
# REQUIRED: useCaseId must be provided - no auto-detection, no fallback
|
||||
if not useCaseId:
|
||||
errorMsg = (
|
||||
"useCaseId is REQUIRED for callAiWithLooping. "
|
||||
"No auto-detection - must explicitly specify use case ID. "
|
||||
f"Available use cases: {list(self.useCaseRegistry.useCases.keys())}"
|
||||
)
|
||||
logger.error(errorMsg)
|
||||
raise ValueError(errorMsg)
|
||||
|
||||
# Validate use case exists
|
||||
useCase = self.useCaseRegistry.get(useCaseId)
|
||||
if not useCase:
|
||||
errorMsg = (
|
||||
f"Use case '{useCaseId}' not found in registry. "
|
||||
f"Available use cases: {list(self.useCaseRegistry.useCases.keys())}"
|
||||
)
|
||||
logger.error(errorMsg)
|
||||
raise ValueError(errorMsg)
|
||||
|
||||
maxIterations = 50 # Prevent infinite loops
|
||||
iteration = 0
|
||||
allSections = [] # Accumulate all sections across iterations
|
||||
lastRawResponse = None # Store last raw JSON response for continuation
|
||||
|
||||
# JSON Base Iteration System:
|
||||
# - jsonBase: the merged JSON string (replaces accumulatedDirectJson array)
|
||||
# - After each iteration, new response is merged with jsonBase
|
||||
# - On merge success: check if complete, store contexts for next iteration
|
||||
# - On merge fail: retry with same prompt, increment fails
|
||||
jsonBase = None # Merged JSON string (starts None, set on first response)
|
||||
|
||||
# Merge fail tracking - stop after 3 consecutive merge failures
|
||||
MAX_MERGE_FAILS = 3
|
||||
mergeFailCount = 0 # Global counter for merge failures across entire loop
|
||||
lastValidCompletePart = None # Store last successfully parsed completePart for fallback
|
||||
|
||||
# Get parent operation ID for iteration operations (parentId should be operationId, not log entry ID)
|
||||
parentOperationId = operationId # Use the parent's operationId directly
|
||||
|
||||
while iteration < maxIterations:
|
||||
iteration += 1
|
||||
|
||||
# Create separate operation for each iteration with parent reference
|
||||
iterationOperationId = None
|
||||
if operationId:
|
||||
iterationOperationId = f"{operationId}_iter_{iteration}"
|
||||
self.services.chat.progressLogStart(
|
||||
iterationOperationId,
|
||||
"AI Call",
|
||||
f"Iteration {iteration}",
|
||||
"",
|
||||
parentOperationId=parentOperationId
|
||||
)
|
||||
|
||||
# Build iteration prompt
|
||||
# CRITICAL: Build continuation prompt if we have sections OR if we have a previous response (even if broken)
|
||||
# This ensures continuation prompts are built even when JSON is so broken that no sections can be extracted
|
||||
if (len(allSections) > 0 or lastRawResponse) and promptBuilder and promptArgs:
|
||||
# Extract templateStructure and basePrompt from promptArgs (REQUIRED)
|
||||
templateStructure = promptArgs.get("templateStructure")
|
||||
if not templateStructure:
|
||||
raise ValueError(
|
||||
f"templateStructure is REQUIRED in promptArgs for use case '{useCaseId}'. "
|
||||
"Prompt creation functions must return (prompt, templateStructure) tuple."
|
||||
)
|
||||
|
||||
basePrompt = promptArgs.get("basePrompt")
|
||||
if not basePrompt:
|
||||
# Fallback: use prompt parameter (should be the same)
|
||||
basePrompt = prompt
|
||||
logger.warning(
|
||||
f"basePrompt not found in promptArgs for use case '{useCaseId}', "
|
||||
"using prompt parameter instead. This may indicate a bug."
|
||||
)
|
||||
|
||||
# This is a continuation - build continuation context with raw JSON and rebuild prompt
|
||||
continuationContext = buildContinuationContext(
|
||||
allSections, lastRawResponse, useCaseId, templateStructure
|
||||
)
|
||||
if not lastRawResponse:
|
||||
logger.warning(f"Iteration {iteration}: No previous response available for continuation!")
|
||||
|
||||
# Store valid completePart from continuation context for fallback on merge failures
|
||||
# Use getContexts to check if completePart is parseable and store it
|
||||
if lastRawResponse and not lastValidCompletePart:
|
||||
try:
|
||||
contexts = getContexts(lastRawResponse)
|
||||
if contexts.jsonParsingSuccess and contexts.completePart:
|
||||
lastValidCompletePart = contexts.completePart
|
||||
logger.debug(f"Iteration {iteration}: Stored initial valid completePart ({len(lastValidCompletePart)} chars)")
|
||||
except Exception as e:
|
||||
logger.debug(f"Iteration {iteration}: Failed to extract completePart: {e}")
|
||||
|
||||
# Unified prompt builder call: Continuation builders only need continuationContext, templateStructure, and basePrompt
|
||||
# All initial context (section, userPrompt, etc.) is already in basePrompt, so promptArgs is not needed
|
||||
# Extract templateStructure and basePrompt from promptArgs (they're explicit parameters)
|
||||
iterationPrompt = await promptBuilder(
|
||||
continuationContext=continuationContext,
|
||||
templateStructure=templateStructure,
|
||||
basePrompt=basePrompt
|
||||
)
|
||||
else:
|
||||
# First iteration - use original prompt
|
||||
iterationPrompt = prompt
|
||||
|
||||
# Make AI call
|
||||
try:
|
||||
checkWorkflowStopped(self.services)
|
||||
if iterationOperationId:
|
||||
self.services.chat.progressLogUpdate(iterationOperationId, 0.3, "Calling AI model")
|
||||
# ARCHITECTURE: Pass ContentParts directly to AiCallRequest
|
||||
# This allows model-aware chunking to handle large content properly
|
||||
# ContentParts are only passed in first iteration (continuations don't need them)
|
||||
request = AiCallRequest(
|
||||
prompt=iterationPrompt,
|
||||
context="",
|
||||
options=options,
|
||||
contentParts=contentParts if iteration == 1 else None # Only pass ContentParts in first iteration
|
||||
)
|
||||
|
||||
# Write the ACTUAL prompt sent to AI
|
||||
# For section content generation: write prompt for first iteration and continuation iterations
|
||||
# For document generation: write prompt for each iteration
|
||||
isSectionContent = "_section_" in debugPrefix
|
||||
if iteration == 1:
|
||||
self.services.utils.writeDebugFile(iterationPrompt, f"{debugPrefix}_prompt")
|
||||
elif isSectionContent:
|
||||
# Save continuation prompts for section_content debugging
|
||||
self.services.utils.writeDebugFile(iterationPrompt, f"{debugPrefix}_prompt_iteration_{iteration}")
|
||||
else:
|
||||
# Document generation - save all iteration prompts
|
||||
self.services.utils.writeDebugFile(iterationPrompt, f"{debugPrefix}_prompt_iteration_{iteration}")
|
||||
|
||||
response = await self.aiService.callAi(request)
|
||||
result = response.content
|
||||
|
||||
# Track bytes for progress reporting
|
||||
bytesReceived = len(result.encode('utf-8')) if result else 0
|
||||
totalBytesSoFar = sum(len(section.get('content', '').encode('utf-8')) if isinstance(section.get('content'), str) else 0 for section in allSections) + bytesReceived
|
||||
|
||||
# Update progress after AI call with byte information
|
||||
if iterationOperationId:
|
||||
# Format bytes for display (kB or MB)
|
||||
if totalBytesSoFar < 1024:
|
||||
bytesDisplay = f"{totalBytesSoFar}B"
|
||||
elif totalBytesSoFar < 1024 * 1024:
|
||||
bytesDisplay = f"{totalBytesSoFar / 1024:.1f}kB"
|
||||
else:
|
||||
bytesDisplay = f"{totalBytesSoFar / (1024 * 1024):.1f}MB"
|
||||
self.services.chat.progressLogUpdate(iterationOperationId, 0.6, f"AI response received ({bytesDisplay})")
|
||||
|
||||
# Write raw AI response to debug file
|
||||
# For section content generation: write response for first iteration and continuation iterations
|
||||
# For document generation: write response for each iteration
|
||||
if iteration == 1:
|
||||
self.services.utils.writeDebugFile(result, f"{debugPrefix}_response")
|
||||
elif isSectionContent:
|
||||
# Save continuation responses for section_content debugging
|
||||
self.services.utils.writeDebugFile(result, f"{debugPrefix}_response_iteration_{iteration}")
|
||||
else:
|
||||
# Document generation - save all iteration responses
|
||||
self.services.utils.writeDebugFile(result, f"{debugPrefix}_response_iteration_{iteration}")
|
||||
|
||||
# Note: Stats are now stored centrally in callAi() - no need to duplicate here
|
||||
|
||||
# Check for error response using generic error detection (errorCount > 0 or modelName == "error")
|
||||
if hasattr(response, 'errorCount') and response.errorCount > 0:
|
||||
errorMsg = f"Iteration {iteration}: Error response detected (errorCount={response.errorCount}), stopping loop: {result[:200] if result else 'empty'}"
|
||||
logger.error(errorMsg)
|
||||
break
|
||||
|
||||
if hasattr(response, 'modelName') and response.modelName == "error":
|
||||
errorMsg = f"Iteration {iteration}: Error response detected (modelName=error), stopping loop: {result[:200] if result else 'empty'}"
|
||||
logger.error(errorMsg)
|
||||
break
|
||||
|
||||
if not result or not result.strip():
|
||||
logger.warning(f"Iteration {iteration}: Empty response, stopping")
|
||||
break
|
||||
|
||||
# Check if this is a text response (not document generation)
|
||||
# Text responses don't need JSON parsing - return immediately after first successful response
|
||||
isTextResponse = (promptBuilder is None and promptArgs is None) or debugPrefix == "text"
|
||||
|
||||
if isTextResponse:
|
||||
# For text responses, return the text immediately - no JSON parsing needed
|
||||
logger.info(f"Iteration {iteration}: Text response received, returning immediately")
|
||||
if iterationOperationId:
|
||||
self.services.chat.progressLogFinish(iterationOperationId, True)
|
||||
return result
|
||||
|
||||
# NOTE: Do NOT update lastRawResponse here!
|
||||
# lastRawResponse should only be updated after successful merge
|
||||
# This ensures retry iterations use the correct base context
|
||||
|
||||
# Handle use cases that return JSON directly (no section extraction needed)
|
||||
# Check if use case supports direct return (all registered use cases do)
|
||||
if useCase and not useCase.requiresExtraction:
|
||||
# =====================================================================
|
||||
# ITERATION FLOW (Simplified)
|
||||
# =====================================================================
|
||||
# Step 4: MERGE jsonBase + new response
|
||||
# - FAILS: repeat prompt, increment fails cont (if >=3 return fallback)
|
||||
# - SUCCEEDS: try parse
|
||||
# - SUCCEEDS: FINISHED
|
||||
# - FAILS: proceed to Step 5
|
||||
# Step 5: GET CONTEXTS (merge OK, parse failed)
|
||||
# - getContexts() with repair
|
||||
# - If no cut point: overlapContext = ""
|
||||
# Step 6: DECIDE
|
||||
# - jsonParsingSuccess=true AND overlapContext="": FINISHED
|
||||
# - jsonParsingSuccess=true AND overlapContext!="": continue, fails=0
|
||||
# - ELSE: repeat prompt, increment fails count
|
||||
# =====================================================================
|
||||
|
||||
# STEP 4: MERGE jsonBase + new response
|
||||
# Use candidateJson to hold merged result until we confirm it's valid
|
||||
candidateJson = None
|
||||
|
||||
if jsonBase is None:
|
||||
# First iteration - candidate is the current result
|
||||
candidateJson = result
|
||||
logger.debug(f"Iteration {iteration}: First response, candidateJson ({len(candidateJson)} chars)")
|
||||
else:
|
||||
# Merge jsonBase with new response
|
||||
logger.info(f"Iteration {iteration}: Merging jsonBase ({len(jsonBase)} chars) with new response ({len(result)} chars)")
|
||||
mergedJsonString, hasOverlap = JsonResponseHandler.mergeJsonStringsWithOverlap(jsonBase, result)
|
||||
|
||||
if not hasOverlap:
|
||||
# MERGE FAILED - repeat prompt with unchanged jsonBase
|
||||
mergeFailCount += 1
|
||||
logger.warning(
|
||||
f"Iteration {iteration}: Merge failed, no overlap found "
|
||||
f"(fail {mergeFailCount}/{MAX_MERGE_FAILS})"
|
||||
)
|
||||
|
||||
if mergeFailCount >= MAX_MERGE_FAILS:
|
||||
# Max failures reached - return last valid completePart
|
||||
logger.error(
|
||||
f"Iteration {iteration}: Max merge failures ({MAX_MERGE_FAILS}) reached, "
|
||||
"returning last valid completePart"
|
||||
)
|
||||
if iterationOperationId:
|
||||
self.services.chat.progressLogFinish(iterationOperationId, False)
|
||||
|
||||
if lastValidCompletePart:
|
||||
try:
|
||||
extracted = extractJsonString(lastValidCompletePart)
|
||||
parsed, parseErr, _ = tryParseJson(extracted)
|
||||
if parseErr is None and parsed:
|
||||
normalized = self._normalizeJsonStructure(parsed, useCase)
|
||||
return json.dumps(normalized, indent=2, ensure_ascii=False)
|
||||
except Exception:
|
||||
pass
|
||||
return lastValidCompletePart
|
||||
else:
|
||||
# No valid fallback - return whatever we have
|
||||
return jsonBase if jsonBase else ""
|
||||
|
||||
# Not at max failures - retry with same prompt (jsonBase unchanged)
|
||||
if iterationOperationId:
|
||||
self.services.chat.progressLogUpdate(
|
||||
iterationOperationId, 0.7,
|
||||
f"Merge failed ({mergeFailCount}/{MAX_MERGE_FAILS}), retrying"
|
||||
)
|
||||
self.services.chat.progressLogFinish(iterationOperationId, True)
|
||||
continue
|
||||
|
||||
# MERGE SUCCEEDED - set candidate (don't update jsonBase yet!)
|
||||
candidateJson = mergedJsonString
|
||||
logger.debug(f"Iteration {iteration}: Merge succeeded, candidateJson ({len(candidateJson)} chars)")
|
||||
|
||||
# Update lastRawResponse ONLY after we have a valid candidateJson
|
||||
# (first iteration or successful merge - NOT on merge failure!)
|
||||
# This ensures retry iterations use the correct base context
|
||||
lastRawResponse = candidateJson
|
||||
|
||||
# Try direct parse of candidate
|
||||
try:
|
||||
extracted = extractJsonString(candidateJson)
|
||||
parsed, parseErr, _ = tryParseJson(extracted)
|
||||
if parseErr is None and parsed:
|
||||
# Direct parse succeeded - FINISHED
|
||||
# Commit candidate to jsonBase
|
||||
jsonBase = candidateJson
|
||||
logger.info(f"Iteration {iteration}: Direct parse succeeded, JSON is complete")
|
||||
normalized = self._normalizeJsonStructure(parsed, useCase)
|
||||
result = json.dumps(normalized, indent=2, ensure_ascii=False)
|
||||
|
||||
if iterationOperationId:
|
||||
self.services.chat.progressLogFinish(iterationOperationId, True)
|
||||
|
||||
if not useCase.finalResultHandler:
|
||||
raise ValueError(
|
||||
f"Use case '{useCaseId}' is missing required 'finalResultHandler' callback."
|
||||
)
|
||||
return useCase.finalResultHandler(
|
||||
result, normalized, extracted, debugPrefix, self.services
|
||||
)
|
||||
except Exception as e:
|
||||
logger.debug(f"Iteration {iteration}: Direct parse failed: {e}")
|
||||
|
||||
# STEP 5: GET CONTEXTS (merge OK, parse failed = cut JSON)
|
||||
# Use candidateJson for context extraction
|
||||
contexts = getContexts(candidateJson)
|
||||
overlapInfo = "(empty=complete)" if contexts.overlapContext == "" else f"({len(contexts.overlapContext)} chars)"
|
||||
logger.debug(
|
||||
f"Iteration {iteration}: getContexts() -> "
|
||||
f"jsonParsingSuccess={contexts.jsonParsingSuccess}, "
|
||||
f"overlapContext={overlapInfo}"
|
||||
)
|
||||
|
||||
# STEP 6: DECIDE based on jsonParsingSuccess and overlapContext
|
||||
if contexts.jsonParsingSuccess and contexts.overlapContext == "":
|
||||
# JSON is complete (no cut point) - FINISHED
|
||||
# Use completePart for final result (closed, repaired JSON)
|
||||
# No more merging needed, so we don't need the cut version
|
||||
jsonBase = contexts.completePart
|
||||
logger.info(f"Iteration {iteration}: jsonParsingSuccess=true, overlapContext='', JSON complete")
|
||||
|
||||
# Store and parse completePart
|
||||
lastValidCompletePart = contexts.completePart
|
||||
|
||||
try:
|
||||
extracted = extractJsonString(contexts.completePart)
|
||||
parsed, parseErr, _ = tryParseJson(extracted)
|
||||
if parseErr is None and parsed:
|
||||
normalized = self._normalizeJsonStructure(parsed, useCase)
|
||||
result = json.dumps(normalized, indent=2, ensure_ascii=False)
|
||||
|
||||
if iterationOperationId:
|
||||
self.services.chat.progressLogFinish(iterationOperationId, True)
|
||||
|
||||
if not useCase.finalResultHandler:
|
||||
raise ValueError(
|
||||
f"Use case '{useCaseId}' is missing required 'finalResultHandler' callback."
|
||||
)
|
||||
return useCase.finalResultHandler(
|
||||
result, normalized, extracted, debugPrefix, self.services
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"Iteration {iteration}: Failed to parse completePart: {e}")
|
||||
|
||||
# Fallback: return completePart as-is
|
||||
if iterationOperationId:
|
||||
self.services.chat.progressLogFinish(iterationOperationId, True)
|
||||
return contexts.completePart
|
||||
|
||||
elif contexts.jsonParsingSuccess and contexts.overlapContext != "":
|
||||
# JSON parseable but has cut point - CONTINUE to next iteration
|
||||
# CRITICAL: Use hierarchyContext (CUT json) as jsonBase for next merge!
|
||||
# - hierarchyContext = the truncated JSON at cut point (needed for overlap matching)
|
||||
# - completePart = closed JSON (for validation/fallback only)
|
||||
# The next AI fragment's overlap must match the CUT point, not closed structures
|
||||
jsonBase = contexts.hierarchyContext
|
||||
logger.info(
|
||||
f"Iteration {iteration}: jsonParsingSuccess=true, overlapContext not empty, "
|
||||
f"continuing iteration (jsonBase updated to hierarchyContext: {len(jsonBase)} chars)"
|
||||
)
|
||||
|
||||
# Store valid completePart as fallback (different from jsonBase!)
|
||||
lastValidCompletePart = contexts.completePart
|
||||
|
||||
# Reset fail counter on successful progress
|
||||
mergeFailCount = 0
|
||||
|
||||
# Update lastRawResponse for continuation prompt building
|
||||
# Use the CUT version for prompt context as well
|
||||
lastRawResponse = jsonBase
|
||||
|
||||
if iterationOperationId:
|
||||
self.services.chat.progressLogUpdate(iterationOperationId, 0.7, "JSON incomplete, requesting continuation")
|
||||
self.services.chat.progressLogFinish(iterationOperationId, True)
|
||||
continue
|
||||
|
||||
else:
|
||||
# JSON not parseable after repair - repeat prompt, increment fails
|
||||
# Do NOT update jsonBase - keep previous valid state
|
||||
mergeFailCount += 1
|
||||
logger.warning(
|
||||
f"Iteration {iteration}: jsonParsingSuccess=false, "
|
||||
f"repeat prompt (fail {mergeFailCount}/{MAX_MERGE_FAILS})"
|
||||
)
|
||||
|
||||
if mergeFailCount >= MAX_MERGE_FAILS:
|
||||
# Max failures reached - return last valid completePart
|
||||
logger.error(
|
||||
f"Iteration {iteration}: Max failures ({MAX_MERGE_FAILS}) reached, "
|
||||
"returning last valid completePart"
|
||||
)
|
||||
if iterationOperationId:
|
||||
self.services.chat.progressLogFinish(iterationOperationId, False)
|
||||
|
||||
if lastValidCompletePart:
|
||||
try:
|
||||
extracted = extractJsonString(lastValidCompletePart)
|
||||
parsed, parseErr, _ = tryParseJson(extracted)
|
||||
if parseErr is None and parsed:
|
||||
normalized = self._normalizeJsonStructure(parsed, useCase)
|
||||
return json.dumps(normalized, indent=2, ensure_ascii=False)
|
||||
except Exception:
|
||||
pass
|
||||
return lastValidCompletePart
|
||||
else:
|
||||
return jsonBase if jsonBase else ""
|
||||
|
||||
# Not at max - retry with same prompt
|
||||
# Do NOT update jsonBase or lastRawResponse - keep previous for retry
|
||||
if iterationOperationId:
|
||||
self.services.chat.progressLogUpdate(
|
||||
iterationOperationId, 0.7,
|
||||
f"Parse failed ({mergeFailCount}/{MAX_MERGE_FAILS}), retrying"
|
||||
)
|
||||
self.services.chat.progressLogFinish(iterationOperationId, True)
|
||||
continue
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in AI call iteration {iteration}: {str(e)}")
|
||||
if iterationOperationId:
|
||||
self.services.chat.progressLogFinish(iterationOperationId, False)
|
||||
break
|
||||
|
||||
if iteration >= maxIterations:
|
||||
logger.warning(f"AI call stopped after maximum iterations ({maxIterations})")
|
||||
|
||||
# This code path should never be reached because all registered use cases
|
||||
# return early when JSON is complete. This would only execute for use cases that
|
||||
# require section extraction, but no such use cases are currently registered.
|
||||
logger.error(f"Unexpected code path: reached end of loop without return for use case '{useCaseId}'")
|
||||
return result if result else ""
|
||||
|
||||
def _isJsonStringIncomplete(self, jsonString: str) -> bool:
|
||||
"""
|
||||
Check if JSON string is incomplete (truncated) BEFORE closing/parsing.
|
||||
|
||||
This is critical because if JSON is truncated, closing it makes it appear complete,
|
||||
but we need to detect the truncation to continue iteration.
|
||||
|
||||
Args:
|
||||
jsonString: JSON string to check
|
||||
|
||||
Returns:
|
||||
True if JSON string appears incomplete/truncated, False otherwise
|
||||
"""
|
||||
if not jsonString or not jsonString.strip():
|
||||
return False
|
||||
|
||||
# Normalize JSON string
|
||||
normalized = stripCodeFences(normalizeJsonText(jsonString)).strip()
|
||||
if not normalized:
|
||||
return False
|
||||
|
||||
# Find first '{' or '[' to start
|
||||
startIdx = -1
|
||||
for i, char in enumerate(normalized):
|
||||
if char in '{[':
|
||||
startIdx = i
|
||||
break
|
||||
|
||||
if startIdx == -1:
|
||||
return False
|
||||
|
||||
jsonContent = normalized[startIdx:]
|
||||
|
||||
# Check if structures are balanced (all opened structures are closed)
|
||||
braceCount = 0
|
||||
bracketCount = 0
|
||||
inString = False
|
||||
escapeNext = False
|
||||
|
||||
for char in jsonContent:
|
||||
if escapeNext:
|
||||
escapeNext = False
|
||||
continue
|
||||
|
||||
if char == '\\':
|
||||
escapeNext = True
|
||||
continue
|
||||
|
||||
if char == '"':
|
||||
inString = not inString
|
||||
continue
|
||||
|
||||
if not inString:
|
||||
if char == '{':
|
||||
braceCount += 1
|
||||
elif char == '}':
|
||||
braceCount -= 1
|
||||
elif char == '[':
|
||||
bracketCount += 1
|
||||
elif char == ']':
|
||||
bracketCount -= 1
|
||||
|
||||
# If structures are unbalanced, JSON is incomplete
|
||||
if braceCount > 0 or bracketCount > 0:
|
||||
return True
|
||||
|
||||
# Check if JSON ends with incomplete value (e.g., unclosed string, incomplete number, trailing comma)
|
||||
trimmed = jsonContent.rstrip()
|
||||
if not trimmed:
|
||||
return False
|
||||
|
||||
# Check for trailing comma (might indicate incomplete)
|
||||
if trimmed.endswith(','):
|
||||
# Trailing comma might indicate incomplete, but could also be valid
|
||||
# Check if there's a closing bracket/brace after the comma
|
||||
return False # Trailing comma alone doesn't mean incomplete
|
||||
|
||||
# Check if ends with incomplete string (odd number of quotes)
|
||||
quoteCount = jsonContent.count('"')
|
||||
if quoteCount % 2 == 1:
|
||||
# Odd number of quotes - string is not closed
|
||||
return True
|
||||
|
||||
# Check if ends mid-value (e.g., ends with "417 instead of "4170. 41719"])
|
||||
# Look for patterns that suggest truncation:
|
||||
# - Ends with incomplete number (e.g., "417)
|
||||
# - Ends with incomplete array element (e.g., ["417)
|
||||
# - Ends with incomplete object property (e.g., {"key": "val)
|
||||
|
||||
# If JSON parses successfully without closing, it's complete
|
||||
parsed, parseErr, _ = tryParseJson(jsonContent)
|
||||
if parseErr is None:
|
||||
# Parses successfully - it's complete
|
||||
return False
|
||||
|
||||
# If it doesn't parse, try closing it and see if that helps
|
||||
closed = closeJsonStructures(jsonContent)
|
||||
parsedClosed, parseErrClosed, _ = tryParseJson(closed)
|
||||
|
||||
if parseErrClosed is None:
|
||||
# Only parses after closing - it was incomplete
|
||||
return True
|
||||
|
||||
# Doesn't parse even after closing - might be malformed, but assume incomplete to be safe
|
||||
return True
|
||||
|
||||
def _normalizeJsonStructure(self, parsed: Any, useCase) -> Any:
|
||||
"""
|
||||
Normalize JSON structure to ensure consistent format before merging.
|
||||
Handles different response formats and converts them to expected structure.
|
||||
|
||||
Args:
|
||||
parsed: Parsed JSON object (can be dict, list, or primitive)
|
||||
useCase: LoopingUseCase instance with jsonNormalizer callback
|
||||
|
||||
Returns:
|
||||
Normalized JSON structure
|
||||
"""
|
||||
# Use callback to normalize JSON structure (REQUIRED - no fallback)
|
||||
if not useCase or not useCase.jsonNormalizer:
|
||||
raise ValueError(
|
||||
f"Use case '{useCase.useCaseId if useCase else 'unknown'}' is missing required 'jsonNormalizer' callback. "
|
||||
"All use cases must provide a jsonNormalizer function."
|
||||
)
|
||||
return useCase.jsonNormalizer(parsed, useCase.useCaseId)
|
||||
|
||||
|
|
@@ -1,721 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
|
||||
# All rights reserved.
|
||||
"""
|
||||
Content Extraction Module
|
||||
|
||||
Handles content extraction and preparation, including:
|
||||
- Extracting content from documents based on intents
|
||||
- Processing pre-extracted documents
|
||||
- Vision AI for image text extraction
|
||||
- AI processing of text content
|
||||
"""
|
||||
import json
|
||||
import logging
|
||||
import base64
|
||||
from typing import Dict, Any, List, Optional
|
||||
|
||||
from modules.datamodels.datamodelChat import ChatDocument
|
||||
from modules.datamodels.datamodelExtraction import ContentPart, DocumentIntent, ExtractionOptions, MergeStrategy
|
||||
from modules.workflows.processing.shared.stateTools import checkWorkflowStopped
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ContentExtractor:
|
||||
"""Handles content extraction and preparation."""
|
||||
|
||||
def __init__(self, services, aiService, intentAnalyzer):
|
||||
"""Initialize ContentExtractor with service center, AI service, and intent analyzer access."""
|
||||
self.services = services
|
||||
self.aiService = aiService
|
||||
self.intentAnalyzer = intentAnalyzer
|
||||
|
||||
async def extractAndPrepareContent(
|
||||
self,
|
||||
documents: List[ChatDocument],
|
||||
documentIntents: List[DocumentIntent],
|
||||
parentOperationId: str,
|
||||
getIntentForDocument: callable
|
||||
) -> List[ContentPart]:
|
||||
"""
|
||||
        Phase 5B: Extracts content based on intents and prepares ContentParts with metadata.
        Returns a list of ContentParts in the appropriate format.

        IMPORTANT: A single document can produce multiple ContentParts when it has multiple intents.
        Example: an image with intents=["extract", "render"] produces:
        - ContentPart(contentFormat="object", ...) for rendering
        - ContentPart(contentFormat="extracted", ...) for text analysis

        Args:
            documents: List of documents to process
            documentIntents: List of DocumentIntent objects
            parentOperationId: Parent operation ID for the ChatLog hierarchy
            getIntentForDocument: Callable to get intent for document ID

        Returns:
            List of ContentParts with complete metadata
"""
|
||||
        # Create the operation ID for the extraction step
        extractionOperationId = f"{parentOperationId}_content_extraction"

        # Start the chat log entry with a parent reference
|
||||
self.services.chat.progressLogStart(
|
||||
extractionOperationId,
|
||||
"Content Extraction",
|
||||
"Extraction",
|
||||
f"Extracting from {len(documents)} documents",
|
||||
parentOperationId=parentOperationId
|
||||
)
|
||||
|
||||
try:
|
||||
allContentParts = []
|
||||
|
||||
for document in documents:
|
||||
checkWorkflowStopped(self.services)
|
||||
# Check if document is already a ContentExtracted document (pre-extracted JSON)
|
||||
logger.debug(f"Checking document {document.id} ({document.fileName}, mimeType={document.mimeType}) for pre-extracted content")
|
||||
preExtracted = self.intentAnalyzer.resolvePreExtractedDocument(document)
|
||||
|
||||
if preExtracted:
|
||||
logger.info(f"✅ Found pre-extracted document: {document.fileName} -> Original: {preExtracted['originalDocument']['fileName']}")
|
||||
logger.info(f" Pre-extracted document ID: {document.id}, Original document ID: {preExtracted['originalDocument']['id']}")
|
||||
logger.info(f" ContentParts count: {len(preExtracted['contentExtracted'].parts) if preExtracted['contentExtracted'].parts else 0}")
|
||||
|
||||
# Verwende bereits extrahierte ContentParts direkt
|
||||
contentExtracted = preExtracted["contentExtracted"]
|
||||
|
||||
                    # IMPORTANT: The intent must be looked up for the JSON document, not for the original
                    # (the intent analysis already maps back to the JSON document ID)
|
||||
intent = getIntentForDocument(document.id, documentIntents)
|
||||
logger.info(f" Intent lookup for document {document.id}: found={intent is not None}")
|
||||
if intent:
|
||||
logger.info(f" Intent: {intent.intents}, extractionPrompt: {intent.extractionPrompt[:100] if intent.extractionPrompt else None}...")
|
||||
else:
|
||||
logger.warning(f" ⚠️ No intent found for pre-extracted document {document.id}! Available intent documentIds: {[i.documentId for i in documentIntents]}")
|
||||
|
||||
if contentExtracted.parts:
|
||||
# CRITICAL: Process pre-extracted parts - analyze structure parts for nested content
|
||||
processedParts = []
|
||||
for part in contentExtracted.parts:
|
||||
# Überspringe leere Parts (Container ohne Daten)
|
||||
if not part.data or (isinstance(part.data, str) and len(part.data.strip()) == 0):
|
||||
if part.typeGroup == "container":
|
||||
continue # Überspringe leere Container
|
||||
|
||||
# CRITICAL: Check if structure part contains nested parts (e.g., JSON with documentData.parts)
|
||||
if part.typeGroup == "structure" and part.mimeType == "application/json" and part.data:
|
||||
nestedParts = self._extractNestedPartsFromStructure(part, document, preExtracted, intent)
|
||||
if nestedParts:
|
||||
# Replace structure part with extracted nested parts
|
||||
processedParts.extend(nestedParts)
|
||||
logger.info(f"✅ Extracted {len(nestedParts)} nested parts from structure part {part.id}")
|
||||
continue # Skip original structure part
|
||||
|
||||
# Keep original part if no nested parts found
|
||||
processedParts.append(part)
|
||||
|
||||
# Use processed parts (with nested parts extracted)
|
||||
for part in processedParts:
|
||||
if not part.metadata:
|
||||
part.metadata = {}
|
||||
|
||||
# Ensure metadata is complete
|
||||
if "documentId" not in part.metadata:
|
||||
part.metadata["documentId"] = document.id
|
||||
|
||||
# WICHTIG: Prüfe Intent für dieses Part
|
||||
partIntent = intent.intents if intent else ["extract"]
|
||||
|
||||
# Debug-Logging für Intent-Verarbeitung
|
||||
logger.debug(f"Processing part {part.id}: typeGroup={part.typeGroup}, intents={partIntent}, hasData={bool(part.data)}, dataLength={len(str(part.data)) if part.data else 0}")
|
||||
|
||||
# WICHTIG: Ein Part kann mehrere Intents haben - erstelle für jeden Intent einen ContentPart
|
||||
# Generische Intent-Verarbeitung für ALLE Content-Typen
|
||||
hasReferenceIntent = "reference" in partIntent
|
||||
hasRenderIntent = "render" in partIntent
|
||||
hasExtractIntent = "extract" in partIntent
|
||||
hasPartData = bool(part.data) and (not isinstance(part.data, str) or len(part.data.strip()) > 0)
|
||||
|
||||
logger.debug(f"Part {part.id}: reference={hasReferenceIntent}, render={hasRenderIntent}, extract={hasExtractIntent}, hasData={hasPartData}")
|
||||
|
||||
# SAFETY: For images with any intent, always ensure render is included
|
||||
# This ensures the image object part is always available for later rendering
|
||||
isImage = part.typeGroup == "image" or (part.mimeType and part.mimeType.startswith("image/"))
|
||||
if isImage and hasPartData and not hasRenderIntent:
|
||||
logger.info(f"🖼️ Auto-adding render intent for image {part.id} (original intents: {partIntent})")
|
||||
hasRenderIntent = True
|
||||
|
||||
# Track ob der originale Part bereits hinzugefügt wurde
|
||||
originalPartAdded = False
|
||||
|
||||
# 1. Reference Intent: Erstelle Reference ContentPart
|
||||
if hasReferenceIntent:
|
||||
referencePart = ContentPart(
|
||||
id=f"ref_{document.id}_{part.id}",
|
||||
label=f"Reference: {part.label or 'Content'}",
|
||||
typeGroup="reference",
|
||||
mimeType=part.mimeType or "application/octet-stream",
|
||||
data="", # Leer - nur Referenz
|
||||
metadata={
|
||||
"contentFormat": "reference",
|
||||
"documentId": document.id,
|
||||
"documentReference": f"docItem:{document.id}:{preExtracted['originalDocument']['fileName']}",
|
||||
"intent": "reference",
|
||||
"usageHint": f"Reference: {preExtracted['originalDocument']['fileName']}",
|
||||
"originalFileName": preExtracted["originalDocument"]["fileName"]
|
||||
}
|
||||
)
|
||||
allContentParts.append(referencePart)
|
||||
logger.debug(f"✅ Created reference ContentPart for {part.id}")
|
||||
|
||||
# 2. Render Intent: Erstelle Object ContentPart (für Binary/Image Rendering)
|
||||
if hasRenderIntent and hasPartData:
|
||||
# Prüfe ob es ein Binary/Image ist (kann gerendert werden)
|
||||
isRenderable = (
|
||||
part.typeGroup == "image" or
|
||||
part.typeGroup == "binary" or
|
||||
(part.mimeType and (
|
||||
part.mimeType.startswith("image/") or
|
||||
part.mimeType.startswith("video/") or
|
||||
part.mimeType.startswith("audio/") or
|
||||
self._isBinary(part.mimeType)
|
||||
))
|
||||
)
|
||||
|
||||
if isRenderable:
|
||||
objectPart = ContentPart(
|
||||
id=f"obj_{document.id}_{part.id}",
|
||||
label=f"Object: {part.label or 'Content'}",
|
||||
typeGroup=part.typeGroup,
|
||||
mimeType=part.mimeType or "application/octet-stream",
|
||||
data=part.data, # Base64/Binary data ist bereits vorhanden
|
||||
metadata={
|
||||
"contentFormat": "object",
|
||||
"documentId": document.id,
|
||||
"intent": "render",
|
||||
"usageHint": f"Render as visual element: {preExtracted['originalDocument']['fileName']}",
|
||||
"originalFileName": preExtracted["originalDocument"]["fileName"],
|
||||
"relatedExtractedPartId": f"extracted_{document.id}_{part.id}" if hasExtractIntent else None
|
||||
}
|
||||
)
|
||||
allContentParts.append(objectPart)
|
||||
logger.debug(f"✅ Created object ContentPart for {part.id} (render intent)")
|
||||
else:
|
||||
logger.warning(f"⚠️ Part {part.id} has render intent but is not renderable (typeGroup={part.typeGroup}, mimeType={part.mimeType})")
|
||||
elif hasRenderIntent and not hasPartData:
|
||||
logger.warning(f"⚠️ Part {part.id} has render intent but no data, skipping render part")
|
||||
|
||||
# 3. Extract Intent: Erstelle Extracted ContentPart (NO AI processing here - happens during section generation)
|
||||
if hasExtractIntent:
|
||||
# For images: Keep as image part with extract intent - Vision AI extraction happens during section generation
|
||||
if part.typeGroup == "image" and hasPartData:
|
||||
logger.info(f"📷 Image {part.id} with extract intent - will be processed with Vision AI during section generation")
|
||||
# Keep image part as-is, mark with extract intent
|
||||
part.metadata.update({
|
||||
"contentFormat": "extracted", # Marked for extraction, but not yet extracted
|
||||
"intent": "extract",
|
||||
"originalFileName": preExtracted["originalDocument"]["fileName"],
|
||||
"relatedObjectPartId": f"obj_{document.id}_{part.id}" if hasRenderIntent else None,
|
||||
"extractionPrompt": intent.extractionPrompt if intent and intent.extractionPrompt else "Extract all text content from this image.",
|
||||
"needsVisionExtraction": True # Flag to indicate Vision AI extraction needed
|
||||
})
|
||||
allContentParts.append(part)
|
||||
originalPartAdded = True
|
||||
else:
|
||||
# For text/table content: Use directly as extracted (no AI processing here)
|
||||
# AI processing with extractionPrompt happens during section generation
|
||||
if not originalPartAdded:
|
||||
part.metadata.update({
|
||||
"contentFormat": "extracted",
|
||||
"intent": "extract",
|
||||
"fromExtractContent": True,
|
||||
"skipExtraction": True, # Already extracted (raw extraction)
|
||||
"originalFileName": preExtracted["originalDocument"]["fileName"],
|
||||
"relatedObjectPartId": f"obj_{document.id}_{part.id}" if hasRenderIntent else None,
|
||||
"extractionPrompt": intent.extractionPrompt if intent and intent.extractionPrompt else None
|
||||
})
|
||||
# Stelle sicher dass contentFormat gesetzt ist
|
||||
if "contentFormat" not in part.metadata:
|
||||
part.metadata["contentFormat"] = "extracted"
|
||||
allContentParts.append(part)
|
||||
originalPartAdded = True
|
||||
logger.debug(f"✅ Using pre-extracted ContentPart {part.id} as extracted (no AI processing needed)")
|
||||
|
||||
# 4. Fallback: Wenn kein Intent vorhanden oder Part wurde noch nicht hinzugefügt
|
||||
# (sollte normalerweise nicht vorkommen, da default "extract" ist)
|
||||
if not hasReferenceIntent and not hasRenderIntent and not hasExtractIntent and not originalPartAdded:
|
||||
logger.warning(f"⚠️ Part {part.id} has no recognized intents, adding as extracted by default")
|
||||
part.metadata.update({
|
||||
"contentFormat": "extracted",
|
||||
"intent": "extract",
|
||||
"fromExtractContent": True,
|
||||
"skipExtraction": True,
|
||||
"originalFileName": preExtracted["originalDocument"]["fileName"]
|
||||
})
|
||||
allContentParts.append(part)
|
||||
originalPartAdded = True
|
||||
|
||||
logger.info(f"✅ Using {len([p for p in contentExtracted.parts if p.data and len(str(p.data)) > 0])} pre-extracted ContentParts from ContentExtracted document {document.fileName}")
|
||||
logger.info(f" Original document: {preExtracted['originalDocument']['fileName']}")
|
||||
continue # Skip normal extraction for this document
|
||||
|
||||
# Check if it's standardized JSON format (has "documents" or "sections")
|
||||
if document.mimeType == "application/json":
|
||||
try:
|
||||
docBytes = self.services.interfaceDbComponent.getFileData(document.fileId)
|
||||
if docBytes:
|
||||
docData = docBytes.decode('utf-8')
|
||||
jsonData = json.loads(docData)
|
||||
|
||||
if isinstance(jsonData, dict) and ("documents" in jsonData or "sections" in jsonData):
|
||||
logger.info(f"Document is already in standardized JSON format, using as reference")
|
||||
# Create reference ContentPart for structured JSON
|
||||
contentPart = ContentPart(
|
||||
id=f"ref_{document.id}",
|
||||
label=f"Reference: {document.fileName}",
|
||||
typeGroup="structure",
|
||||
mimeType="application/json",
|
||||
data=docData,
|
||||
metadata={
|
||||
"contentFormat": "reference",
|
||||
"documentId": document.id,
|
||||
"documentReference": f"docItem:{document.id}:{document.fileName}",
|
||||
"skipExtraction": True,
|
||||
"intent": "reference"
|
||||
}
|
||||
)
|
||||
allContentParts.append(contentPart)
|
||||
logger.info(f"✅ Using JSON document directly without extraction")
|
||||
continue # Skip normal extraction for this document
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not parse JSON document {document.fileName}, will extract normally: {str(e)}")
|
||||
# Continue with normal extraction
|
||||
|
||||
# Normal extraction path
|
||||
intent = getIntentForDocument(document.id, documentIntents)
|
||||
|
||||
if not intent:
|
||||
# Try to find intent by similar UUID (fix for AI UUID hallucination)
|
||||
correctedIntent = self._findIntentBySimilarId(document.id, documentIntents)
|
||||
if correctedIntent:
|
||||
logger.warning(f"Found intent for document {document.id} using UUID correction (original: {correctedIntent.documentId})")
|
||||
# Create new intent with correct document ID
|
||||
intent = DocumentIntent(
|
||||
documentId=document.id,
|
||||
intents=correctedIntent.intents,
|
||||
extractionPrompt=correctedIntent.extractionPrompt,
|
||||
reasoning=f"Intent matched by UUID similarity (original: {correctedIntent.documentId})"
|
||||
)
|
||||
else:
|
||||
# Default: extract für alle Dokumente ohne Intent
|
||||
logger.warning(f"No intent found for document {document.id}, using default 'extract'")
|
||||
intent = DocumentIntent(
|
||||
documentId=document.id,
|
||||
intents=["extract"],
|
||||
extractionPrompt="Extract all content from the document",
|
||||
reasoning="Default intent: no specific intent found"
|
||||
)
|
||||
|
||||
# WICHTIG: Prüfe alle Intents - ein Dokument kann mehrere ContentParts erzeugen
|
||||
|
||||
if "reference" in intent.intents:
|
||||
# Erstelle Reference ContentPart
|
||||
contentPart = ContentPart(
|
||||
id=f"ref_{document.id}",
|
||||
label=f"Reference: {document.fileName}",
|
||||
typeGroup="reference",
|
||||
mimeType=document.mimeType,
|
||||
data="",
|
||||
metadata={
|
||||
"contentFormat": "reference",
|
||||
"documentId": document.id,
|
||||
"documentReference": f"docItem:{document.id}:{document.fileName}",
|
||||
"intent": "reference",
|
||||
"usageHint": f"Reference document: {document.fileName}"
|
||||
}
|
||||
)
|
||||
allContentParts.append(contentPart)
|
||||
|
||||
# WICHTIG: "render" und "extract" können beide vorhanden sein!
|
||||
# In diesem Fall erzeugen wir BEIDE ContentParts
|
||||
|
||||
# SAFETY: For images with any intent, always create object part for later rendering
|
||||
isImageDocument = document.mimeType and document.mimeType.startswith("image/")
|
||||
shouldAutoRender = isImageDocument and "render" not in intent.intents and ("extract" in intent.intents or "reference" in intent.intents)
|
||||
if shouldAutoRender:
|
||||
logger.info(f"🖼️ Auto-adding render for image document {document.id} (original intents: {intent.intents})")
|
||||
|
||||
if "render" in intent.intents or shouldAutoRender:
|
||||
# Für Images/Binary: extrahiere als Object
|
||||
if document.mimeType.startswith("image/") or self._isBinary(document.mimeType):
|
||||
try:
|
||||
# Lade Binary-Daten (getFileData ist nicht async - keine await nötig)
|
||||
binaryData = self.services.interfaceDbComponent.getFileData(document.fileId)
|
||||
if not binaryData:
|
||||
logger.warning(f"No binary data found for document {document.id}")
|
||||
continue
|
||||
base64Data = base64.b64encode(binaryData).decode('utf-8')
|
||||
|
||||
contentPart = ContentPart(
|
||||
id=f"obj_{document.id}",
|
||||
label=f"Object: {document.fileName}",
|
||||
typeGroup="image" if document.mimeType.startswith("image/") else "binary",
|
||||
mimeType=document.mimeType,
|
||||
data=base64Data,
|
||||
metadata={
|
||||
"contentFormat": "object",
|
||||
"documentId": document.id,
|
||||
"intent": "render",
|
||||
"usageHint": f"Render as visual element: {document.fileName}",
|
||||
"originalFileName": document.fileName,
|
||||
# Link to the extracted part (if present)
|
||||
"relatedExtractedPartId": f"ext_{document.id}" if "extract" in intent.intents else None
|
||||
}
|
||||
)
|
||||
allContentParts.append(contentPart)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to load binary data for document {document.id}: {str(e)}")
|
||||
|
||||
if "extract" in intent.intents:
|
||||
# Extract content with the extraction service
|
||||
extractionPrompt = intent.extractionPrompt or "Extract all content from the document"
|
||||
|
||||
# Debug log (harmonized)
|
||||
self.services.utils.writeDebugFile(
|
||||
extractionPrompt,
|
||||
f"content_extraction_prompt_{document.id}"
|
||||
)
|
||||
|
||||
# Run the extraction
|
||||
|
||||
extractionOptions = ExtractionOptions(
|
||||
prompt=extractionPrompt,
|
||||
mergeStrategy=MergeStrategy()
|
||||
)
|
||||
|
||||
# extractContent is not async - no await needed
|
||||
checkWorkflowStopped(self.services)
|
||||
extractedResults = self.services.extraction.extractContent(
|
||||
[document],
|
||||
extractionOptions,
|
||||
operationId=extractionOperationId,
|
||||
parentOperationId=extractionOperationId
|
||||
)
|
||||
|
||||
# Convert extracted results to ContentParts with metadata
|
||||
# Check if object part exists (either explicit render or auto-render for images)
|
||||
hasObjectPart = "render" in intent.intents or shouldAutoRender
|
||||
|
||||
for extracted in extractedResults:
|
||||
for part in extracted.parts:
|
||||
# Mark as extracted format
|
||||
part.metadata.update({
|
||||
"contentFormat": "extracted",
|
||||
"documentId": document.id,
|
||||
"extractionPrompt": extractionPrompt,
|
||||
"intent": "extract",
|
||||
"usageHint": f"Use extracted content from {document.fileName}",
|
||||
# Link to the object part (if present - including auto-render for images)
|
||||
"relatedObjectPartId": f"obj_{document.id}" if hasObjectPart else None
|
||||
})
|
||||
|
||||
# For images: Mark that Vision AI extraction is needed during section generation
|
||||
if part.typeGroup == "image":
|
||||
part.metadata["needsVisionExtraction"] = True
|
||||
logger.info(f"📷 Image part {part.id} marked for Vision AI extraction during section generation")
|
||||
|
||||
# Make sure the ID is unique (if an object part exists)
|
||||
if hasObjectPart:
|
||||
part.id = f"ext_{document.id}_{part.id}"
|
||||
allContentParts.append(part)
|
||||
|
||||
# Debug log (harmonized)
|
||||
self.services.utils.writeDebugFile(
|
||||
json.dumps([part.dict() for part in allContentParts], indent=2, default=str),
|
||||
"content_extraction_result"
|
||||
)
|
||||
|
||||
# State 2 Validation: Validate and auto-fix ContentParts
|
||||
validatedParts = []
|
||||
for part in allContentParts:
|
||||
# Validation 2.1: Skip ContentParts without documentId
|
||||
if not part.metadata.get("documentId"):
|
||||
logger.warning(f"Skipping ContentPart {part.id} - missing documentId in metadata")
|
||||
continue
|
||||
|
||||
# Validation 2.2: Skip ContentParts with invalid contentFormat
|
||||
contentFormat = part.metadata.get("contentFormat")
|
||||
if contentFormat not in ["extracted", "object", "reference"]:
|
||||
logger.warning(
|
||||
f"Skipping ContentPart {part.id} - invalid contentFormat: {contentFormat}"
|
||||
)
|
||||
continue
|
||||
|
||||
validatedParts.append(part)
|
||||
|
||||
# Finish the ChatLog
|
||||
self.services.chat.progressLogFinish(extractionOperationId, True)
|
||||
|
||||
return validatedParts
|
||||
|
||||
except Exception as e:
|
||||
self.services.chat.progressLogFinish(extractionOperationId, False)
|
||||
logger.error(f"Error in extractAndPrepareContent: {str(e)}")
|
||||
raise
|
||||
|
||||
async def extractTextFromImage(self, imagePart: ContentPart, extractionPrompt: str) -> Optional[str]:
|
||||
"""
|
||||
Extrahiere Text aus einem Image-Part mit Vision AI.
|
||||
|
||||
Args:
|
||||
imagePart: ContentPart mit typeGroup="image"
|
||||
extractionPrompt: Prompt für die Text-Extraktion
|
||||
|
||||
Returns:
|
||||
Extrahierter Text oder None bei Fehler
|
||||
"""
|
||||
try:
|
||||
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum
|
||||
|
||||
# Final extraction prompt
|
||||
finalPrompt = extractionPrompt or "Extract all text content from this image. Return only the extracted text, no additional formatting."
|
||||
|
||||
# Debug log (harmonized)
|
||||
self.services.utils.writeDebugFile(
|
||||
finalPrompt,
|
||||
f"content_extraction_prompt_image_{imagePart.id}"
|
||||
)
|
||||
|
||||
# Create AI call request with the image part
|
||||
request = AiCallRequest(
|
||||
prompt=finalPrompt,
|
||||
context="",
|
||||
options=AiCallOptions(operationType=OperationTypeEnum.IMAGE_ANALYSE),
|
||||
contentParts=[imagePart]
|
||||
)
|
||||
|
||||
# Use the AI service for Vision AI processing
|
||||
checkWorkflowStopped(self.services)
|
||||
response = await self.aiService.callAi(request)
|
||||
|
||||
# Debug log for response (harmonized)
|
||||
if response and response.content:
|
||||
self.services.utils.writeDebugFile(
|
||||
response.content,
|
||||
f"content_extraction_response_image_{imagePart.id}"
|
||||
)
|
||||
|
||||
if response and response.content:
|
||||
return response.content.strip()
|
||||
|
||||
# No content returned - return an error message for debugging
|
||||
errorMsg = f"Vision AI extraction failed: No content returned for image {imagePart.id}"
|
||||
logger.warning(errorMsg)
|
||||
return f"[ERROR: {errorMsg}]"
|
||||
except Exception as e:
|
||||
errorMsg = f"Vision AI extraction failed for image {imagePart.id}: {str(e)}"
|
||||
logger.error(errorMsg)
|
||||
import traceback
|
||||
logger.debug(f"Traceback: {traceback.format_exc()}")
|
||||
# Return an error message instead of None for debugging
|
||||
return f"[ERROR: {errorMsg}]"
|
||||
|
||||
async def processTextContentWithAi(self, textPart: ContentPart, extractionPrompt: str) -> Optional[str]:
|
||||
"""
|
||||
Verarbeite Text-Content mit AI basierend auf extractionPrompt.
|
||||
|
||||
WICHTIG: Pre-extracted ContentParts von context.extractContent enthalten RAW extrahierten Text
|
||||
(z.B. aus PDF-Text-Layer). Wenn "extract" Intent vorhanden ist, muss dieser Text mit AI
|
||||
verarbeitet werden (Transformation, Strukturierung, etc.) basierend auf extractionPrompt.
|
||||
|
||||
Args:
|
||||
textPart: ContentPart mit typeGroup="text" (oder anderer Text-basierter Typ)
|
||||
extractionPrompt: Prompt für die AI-Verarbeitung des Textes
|
||||
|
||||
Returns:
|
||||
AI-verarbeiteter Text oder None bei Fehler
|
||||
"""
|
||||
try:
|
||||
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum
|
||||
|
||||
# Final extraction prompt
|
||||
finalPrompt = extractionPrompt or "Process and extract the key information from the following text content."
|
||||
|
||||
# Debug log (harmonized) - log prompt with text preview
|
||||
textPreview = textPart.data[:500] + "..." if textPart.data and len(textPart.data) > 500 else (textPart.data or "")
|
||||
promptWithContext = f"{finalPrompt}\n\n--- Text Content (preview) ---\n{textPreview}"
|
||||
self.services.utils.writeDebugFile(
|
||||
promptWithContext,
|
||||
f"content_extraction_prompt_text_{textPart.id}"
|
||||
)
|
||||
|
||||
# Create a text ContentPart for AI processing
# Use the existing text as input
|
||||
textContentPart = ContentPart(
|
||||
id=textPart.id,
|
||||
label=textPart.label,
|
||||
typeGroup="text",
|
||||
mimeType="text/plain",
|
||||
data=textPart.data if textPart.data else "",
|
||||
metadata=textPart.metadata.copy() if textPart.metadata else {}
|
||||
)
|
||||
|
||||
# Create AI call request with the text part
|
||||
request = AiCallRequest(
|
||||
prompt=finalPrompt,
|
||||
context="",
|
||||
options=AiCallOptions(operationType=OperationTypeEnum.DATA_EXTRACT),
|
||||
contentParts=[textContentPart]
|
||||
)
|
||||
|
||||
# Use the AI service for text processing
|
||||
checkWorkflowStopped(self.services)
|
||||
response = await self.aiService.callAi(request)
|
||||
|
||||
# Debug log for response (harmonized)
|
||||
if response and response.content:
|
||||
self.services.utils.writeDebugFile(
|
||||
response.content,
|
||||
f"content_extraction_response_text_{textPart.id}"
|
||||
)
|
||||
|
||||
if response and response.content:
|
||||
return response.content.strip()
|
||||
|
||||
# No content returned - return an error message for debugging
|
||||
errorMsg = f"AI text processing failed: No content returned for text part {textPart.id}"
|
||||
logger.warning(errorMsg)
|
||||
return f"[ERROR: {errorMsg}]"
|
||||
except Exception as e:
|
||||
errorMsg = f"AI text processing failed for text part {textPart.id}: {str(e)}"
|
||||
logger.error(errorMsg)
|
||||
import traceback
|
||||
logger.debug(f"Traceback: {traceback.format_exc()}")
|
||||
# Return an error message instead of None for debugging
|
||||
return f"[ERROR: {errorMsg}]"
|
||||
|
||||
def _isBinary(self, mimeType: str) -> bool:
|
||||
"""Prüfe ob MIME-Type binary ist."""
|
||||
binaryTypes = [
|
||||
"application/octet-stream",
|
||||
"application/pdf",
|
||||
"application/zip",
|
||||
"application/x-zip-compressed"
|
||||
]
|
||||
return mimeType in binaryTypes or mimeType.startswith("image/") or mimeType.startswith("video/") or mimeType.startswith("audio/")
|
||||
|
||||
def _extractNestedPartsFromStructure(
|
||||
self,
|
||||
structurePart: ContentPart,
|
||||
document: ChatDocument,
|
||||
preExtracted: Dict[str, Any],
|
||||
intent: Optional[Any]
|
||||
) -> List[ContentPart]:
|
||||
"""
|
||||
Extract nested parts from a structure ContentPart (e.g., JSON with documentData.parts).
|
||||
|
||||
This is a generic function that analyzes pre-processed ContentParts and extracts
|
||||
any nested parts that are embedded in structure data (typically JSON).
|
||||
|
||||
Works with standard ContentExtracted format: documentData.parts array.
|
||||
Each nested part is extracted as a separate ContentPart with proper metadata.
|
||||
|
||||
Args:
|
||||
structurePart: ContentPart with typeGroup="structure" containing nested parts
|
||||
document: The document this part belongs to
|
||||
preExtracted: Pre-extracted document metadata
|
||||
intent: Document intent for nested parts
|
||||
|
||||
Returns:
|
||||
List of extracted ContentParts, empty if no nested parts found
|
||||
"""
|
||||
nestedParts = []
|
||||
|
||||
try:
|
||||
# Parse JSON structure
|
||||
jsonData = json.loads(structurePart.data)
|
||||
|
||||
# Check for standard ContentExtracted format: documentData.parts
|
||||
if isinstance(jsonData, dict):
|
||||
documentData = jsonData.get("documentData")
|
||||
if isinstance(documentData, dict):
|
||||
parts = documentData.get("parts", [])
|
||||
if isinstance(parts, list) and len(parts) > 0:
|
||||
# Extract each nested part
|
||||
for nestedPartData in parts:
|
||||
if not isinstance(nestedPartData, dict):
|
||||
continue
|
||||
|
||||
nestedPartId = nestedPartData.get("id") or f"nested_{len(nestedParts)}"
|
||||
nestedTypeGroup = nestedPartData.get("typeGroup", "text")
|
||||
nestedMimeType = nestedPartData.get("mimeType", "text/plain")
|
||||
nestedLabel = nestedPartData.get("label", structurePart.label)
|
||||
nestedData = nestedPartData.get("data", "")
|
||||
nestedMetadata = nestedPartData.get("metadata", {})
|
||||
|
||||
# Create ContentPart for nested part
|
||||
nestedPart = ContentPart(
|
||||
id=f"{structurePart.id}_{nestedPartId}",
|
||||
parentId=structurePart.id,
|
||||
label=nestedLabel,
|
||||
typeGroup=nestedTypeGroup,
|
||||
mimeType=nestedMimeType,
|
||||
data=nestedData,
|
||||
metadata={
|
||||
**nestedMetadata,
|
||||
"documentId": document.id,
|
||||
"fromNestedStructure": True,
|
||||
"parentStructurePartId": structurePart.id,
|
||||
"originalFileName": preExtracted["originalDocument"]["fileName"]
|
||||
}
|
||||
)
|
||||
|
||||
nestedParts.append(nestedPart)
|
||||
logger.debug(f"✅ Extracted nested part: {nestedPart.id} (typeGroup={nestedTypeGroup}, mimeType={nestedMimeType})")
|
||||
|
||||
# If no nested parts found, return empty list (original part will be kept)
|
||||
if not nestedParts:
|
||||
logger.debug(f"No nested parts found in structure part {structurePart.id}")
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
logger.warning(f"Could not parse structure part {structurePart.id} as JSON: {str(e)}")
|
||||
except Exception as e:
|
||||
logger.error(f"Error extracting nested parts from structure part {structurePart.id}: {str(e)}")
|
||||
|
||||
return nestedParts
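# Example of the structure data this helper understands (field values are illustrative only):
# structurePart.data = '''
# {
#     "documentData": {
#         "parts": [
#             {"id": "part_1", "typeGroup": "text", "mimeType": "text/plain", "data": "...", "metadata": {}}
#         ]
#     }
# }
# '''
# A nested part like the one above becomes a ContentPart with id "<structurePart.id>_part_1",
# parentId set to the structure part, and metadata flag fromNestedStructure=True.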
|
||||
|
||||
def _findIntentBySimilarId(self, documentId: str, documentIntents: List[DocumentIntent]) -> Optional[DocumentIntent]:
|
||||
"""
|
||||
Versucht ein Intent zu finden, dessen UUID ähnlich zur angegebenen Dokument-ID ist.
|
||||
Dies hilft bei AI UUID-Halluzinationen (z.B. 4451 -> 4551).
|
||||
|
||||
Args:
|
||||
documentId: Die Dokument-ID für die ein Intent gesucht wird
|
||||
documentIntents: Liste aller verfügbaren DocumentIntents
|
||||
|
||||
Returns:
|
||||
DocumentIntent mit ähnlicher UUID falls gefunden, sonst None
|
||||
"""
|
||||
if not documentId or len(documentId) != 36: # UUID Format: 8-4-4-4-12
|
||||
return None
|
||||
|
||||
# Check that it is a UUID (format: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx)
|
||||
if documentId.count('-') != 4:
|
||||
return None
|
||||
|
||||
for intent in documentIntents:
|
||||
intentId = intent.documentId
|
||||
if len(intentId) != 36:
|
||||
continue
|
||||
|
||||
# Count differing characters
|
||||
differences = sum(c1 != c2 for c1, c2 in zip(documentId, intentId))
|
||||
|
||||
# If only 1-2 characters differ, it is most likely a typo
|
||||
if differences <= 2:
|
||||
# Check that the structure is similar (same hyphen positions)
|
||||
if documentId.count('-') == intentId.count('-'):
|
||||
return intent
|
||||
|
||||
return None
|
||||
|
||||
|
|
@@ -1,369 +0,0 @@
|
|||
# Copyright (c) 2025 Patrick Motsch
|
||||
# All rights reserved.
|
||||
"""
|
||||
Document Intent Analysis Module
|
||||
|
||||
Handles analysis of document intents, including:
|
||||
- Clarifying which documents need extraction vs reference
|
||||
- Resolving pre-extracted documents
|
||||
- Building intent analysis prompts
|
||||
"""
|
||||
import json
|
||||
import logging
|
||||
from typing import Dict, Any, List, Optional
|
||||
|
||||
from modules.datamodels.datamodelChat import ChatDocument
|
||||
from modules.datamodels.datamodelExtraction import DocumentIntent, ContentExtracted  # ContentExtracted is used in resolvePreExtractedDocument below; import location assumed
|
||||
from modules.workflows.processing.shared.stateTools import checkWorkflowStopped
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DocumentIntentAnalyzer:
|
||||
"""Handles document intent analysis and resolution."""
|
||||
|
||||
def __init__(self, services, aiService):
|
||||
"""Initialize DocumentIntentAnalyzer with service center and AI service access."""
|
||||
self.services = services
|
||||
self.aiService = aiService
|
||||
|
||||
async def clarifyDocumentIntents(
|
||||
self,
|
||||
documents: List[ChatDocument],
|
||||
userPrompt: str,
|
||||
actionParameters: Dict[str, Any],
|
||||
parentOperationId: str
|
||||
) -> List[DocumentIntent]:
|
||||
"""
|
||||
Phase 5A: Analysiert, welche Dokumente Extraktion vs Referenz benötigen.
|
||||
Gibt DocumentIntent für jedes Dokument zurück.
|
||||
|
||||
Args:
|
||||
documents: Liste der zu verarbeitenden Dokumente
|
||||
userPrompt: User-Anfrage
|
||||
actionParameters: Action-spezifische Parameter (z.B. resultType, outputFormat)
|
||||
parentOperationId: Parent Operation-ID für ChatLog-Hierarchie
|
||||
|
||||
Returns:
|
||||
Liste von DocumentIntent-Objekten
|
||||
"""
|
||||
# Create operation ID for the intent analysis
|
||||
intentOperationId = f"{parentOperationId}_intent_analysis"
|
||||
|
||||
# Start ChatLog with parent reference
|
||||
self.services.chat.progressLogStart(
|
||||
intentOperationId,
|
||||
"Document Intent Analysis",
|
||||
"Intent Analysis",
|
||||
f"Analyzing {len(documents)} documents",
|
||||
parentOperationId=parentOperationId
|
||||
)
|
||||
|
||||
try:
|
||||
# Map pre-extracted JSONs to the original document IDs for intent analysis
|
||||
documentMapping = {} # Maps original doc ID -> JSON doc ID
|
||||
resolvedDocuments = []
|
||||
|
||||
for doc in documents:
|
||||
preExtracted = self.resolvePreExtractedDocument(doc)
|
||||
if preExtracted:
|
||||
originalDocId = preExtracted["originalDocument"]["id"]
|
||||
documentMapping[originalDocId] = doc.id
|
||||
# Create a temporary ChatDocument for the original document
|
||||
originalDoc = ChatDocument(
|
||||
id=originalDocId,
|
||||
fileName=preExtracted["originalDocument"]["fileName"],
|
||||
mimeType=preExtracted["originalDocument"]["mimeType"],
|
||||
fileSize=preExtracted["originalDocument"].get("fileSize", doc.fileSize),
|
||||
fileId=doc.fileId,  # keep the fileId from the JSON
|
||||
messageId=doc.messageId if hasattr(doc, 'messageId') else None  # keep messageId if present
|
||||
)
|
||||
resolvedDocuments.append(originalDoc)
|
||||
else:
|
||||
resolvedDocuments.append(doc)
|
||||
|
||||
# Build the intent analysis prompt with the original documents
|
||||
intentPrompt = self._buildIntentAnalysisPrompt(userPrompt, resolvedDocuments, actionParameters)
|
||||
|
||||
# AI call (use callAiPlanning for simple JSON responses)
# Debug logs are already written by callAiPlanning
|
||||
checkWorkflowStopped(self.services)
|
||||
aiResponse = await self.aiService.callAiPlanning(
|
||||
prompt=intentPrompt,
|
||||
debugType="document_intent_analysis"
|
||||
)
|
||||
|
||||
# Parse the result and map back to JSON document IDs if necessary
|
||||
intentsData = json.loads(self.services.utils.jsonExtractString(aiResponse))
|
||||
documentIntents = []
|
||||
for intent in intentsData.get("intents", []):
|
||||
docId = intent.get("documentId")
|
||||
# If the intent targets the original document, map back to the JSON document ID
|
||||
if docId in documentMapping:
|
||||
intent["documentId"] = documentMapping[docId]
|
||||
documentIntents.append(DocumentIntent(**intent))
|
||||
|
||||
# Debug log (harmonized)
|
||||
self.services.utils.writeDebugFile(
|
||||
json.dumps([intent.dict() for intent in documentIntents], indent=2),
|
||||
"document_intent_analysis_result"
|
||||
)
|
||||
|
||||
# State 1 Validation: Validate and auto-fix document intents
|
||||
documentIds = {d.id for d in documents}
|
||||
validatedIntents = []
|
||||
|
||||
for intent in documentIntents:
|
||||
# Validation 1.2: Skip intents for unknown documents
|
||||
if intent.documentId not in documentIds:
|
||||
# Try to find similar UUID (fix AI hallucination/typo)
|
||||
correctedDocId = self._findSimilarDocumentId(intent.documentId, documentIds)
|
||||
if correctedDocId:
|
||||
logger.warning(f"Corrected UUID typo in AI response: {intent.documentId} -> {correctedDocId}")
|
||||
intent.documentId = correctedDocId
|
||||
else:
|
||||
logger.warning(f"Skipping intent for unknown document: {intent.documentId}")
|
||||
continue
|
||||
validatedIntents.append(intent)
|
||||
|
||||
# Validation 1.1: Documents without intents are OK (not needed)
|
||||
# Intents for non-existing documents are already filtered above
|
||||
documentIntents = validatedIntents
|
||||
|
||||
# Finish the ChatLog
|
||||
self.services.chat.progressLogFinish(intentOperationId, True)
|
||||
|
||||
return documentIntents
|
||||
|
||||
except Exception as e:
|
||||
self.services.chat.progressLogFinish(intentOperationId, False)
|
||||
logger.error(f"Error in clarifyDocumentIntents: {str(e)}")
|
||||
raise
|
||||
|
||||
def resolvePreExtractedDocument(self, document: ChatDocument) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Prüft ob ein JSON-Dokument bereits extrahierte ContentParts enthält.
|
||||
Gibt Dict zurück mit:
|
||||
- originalDocument: ChatDocument-Info des ursprünglichen Dokuments
|
||||
- contentExtracted: ContentExtracted-Objekt mit Parts
|
||||
- parts: Liste der ContentParts
|
||||
|
||||
Returns None wenn kein pre-extracted Format erkannt wird.
|
||||
"""
|
||||
if document.mimeType != "application/json":
|
||||
logger.debug(f"Document {document.id} is not JSON (mimeType={document.mimeType}), skipping pre-extracted check")
|
||||
return None
|
||||
|
||||
try:
|
||||
docBytes = self.services.interfaceDbComponent.getFileData(document.fileId)
|
||||
if not docBytes:
|
||||
return None
|
||||
|
||||
docData = docBytes.decode('utf-8')
|
||||
jsonData = json.loads(docData)
|
||||
|
||||
if not isinstance(jsonData, dict):
|
||||
return None
|
||||
|
||||
# Check for ContentExtracted format
|
||||
# Only format 1 (ActionDocument format with validationMetadata) is supported
|
||||
documentData = None
|
||||
|
||||
validationMetadata = jsonData.get("validationMetadata", {})
|
||||
actionType = validationMetadata.get("actionType")
|
||||
logger.debug(f"JSON document {document.id}: validationMetadata.actionType={actionType}, keys={list(jsonData.keys())}")
|
||||
|
||||
if actionType == "context.extractContent":
|
||||
# Format: {"validationMetadata": {"actionType": "context.extractContent"}, "documentData": {...}}
|
||||
documentData = jsonData.get("documentData")
|
||||
logger.debug(f"Found ContentExtracted via validationMetadata for {document.fileName}, documentData keys: {list(documentData.keys()) if documentData else None}")
|
||||
else:
|
||||
logger.debug(f"JSON document {document.id} does not have actionType='context.extractContent' (got: {actionType})")
|
||||
|
||||
if documentData:
|
||||
|
||||
try:
|
||||
# Stelle sicher, dass "id" vorhanden ist
|
||||
if "id" not in documentData:
|
||||
documentData["id"] = document.id
|
||||
|
||||
contentExtracted = ContentExtracted(**documentData)
|
||||
|
||||
if contentExtracted.parts:
|
||||
# Extract the original document info from the parts
|
||||
originalDocId = None
|
||||
originalFileName = None
|
||||
originalMimeType = None
|
||||
|
||||
for part in contentExtracted.parts:
|
||||
if part.metadata:
|
||||
# Try to find the original document info
|
||||
if not originalDocId and part.metadata.get("documentId"):
|
||||
originalDocId = part.metadata.get("documentId")
|
||||
if not originalFileName and part.metadata.get("originalFileName"):
|
||||
originalFileName = part.metadata.get("originalFileName")
|
||||
if not originalMimeType and part.metadata.get("documentMimeType"):
|
||||
originalMimeType = part.metadata.get("documentMimeType")
|
||||
|
||||
# If not found, try to derive it from the document name
|
||||
if not originalFileName:
|
||||
# Try to derive it from documentName (e.g. "B2025-02c_28_extracted_...json" -> "B2025-02c_28.pdf")
|
||||
if document.fileName and "_extracted_" in document.fileName:
|
||||
originalFileName = document.fileName.split("_extracted_")[0] + ".pdf"
|
||||
|
||||
return {
|
||||
"originalDocument": {
|
||||
"id": originalDocId or document.id,
|
||||
"fileName": originalFileName or document.fileName,
|
||||
"mimeType": originalMimeType or "application/pdf",
|
||||
"fileSize": document.fileSize
|
||||
},
|
||||
"contentExtracted": contentExtracted,
|
||||
"parts": contentExtracted.parts
|
||||
}
|
||||
except Exception as parseError:
|
||||
logger.warning(f"Could not parse ContentExtracted format from {document.fileName}: {str(parseError)}")
|
||||
logger.debug(f"JSON keys: {list(jsonData.keys())}, has parts: {'parts' in jsonData}")
|
||||
import traceback
|
||||
logger.debug(f"Parse error traceback: {traceback.format_exc()}")
|
||||
return None
|
||||
else:
|
||||
logger.debug(f"JSON document {document.id} has no documentData (actionType={actionType})")
|
||||
|
||||
return None
|
||||
except Exception as e:
|
||||
logger.debug(f"Error resolving pre-extracted document {document.fileName}: {str(e)}")
|
||||
return None
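# Sketch of the pre-extracted JSON wrapper this method looks for (field values are illustrative;
# only the validationMetadata.actionType check and documentData.parts layout come from the code above):
# {
#     "validationMetadata": {"actionType": "context.extractContent"},
#     "documentData": {
#         "id": "<ContentExtracted id>",
#         "parts": [
#             {"id": "part_1", "typeGroup": "text", "data": "...",
#              "metadata": {"documentId": "...", "originalFileName": "report.pdf"}}
#         ]
#     }
# }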
|
||||
|
||||
def _buildIntentAnalysisPrompt(
|
||||
self,
|
||||
userPrompt: str,
|
||||
documents: List[ChatDocument],
|
||||
actionParameters: Dict[str, Any]
|
||||
) -> str:
|
||||
"""Baue Prompt für Intent-Analyse."""
|
||||
# Build the document list - show original documents for pre-extracted JSONs
|
||||
docListText = ""
|
||||
for i, doc in enumerate(documents, 1):
|
||||
# Check whether it is a pre-extracted JSON
|
||||
preExtracted = self.resolvePreExtractedDocument(doc)
|
||||
|
||||
if preExtracted:
|
||||
# Show the original document instead of the JSON
|
||||
originalDoc = preExtracted["originalDocument"]
|
||||
partsInfo = f" (contains {len(preExtracted['parts'])} pre-extracted parts: {', '.join([p.typeGroup for p in preExtracted['parts'] if p.data and len(str(p.data)) > 0])})"
|
||||
docListText += f"\n{i}. Document ID: {originalDoc['id']}\n"
|
||||
docListText += f" File Name: {originalDoc['fileName']}{partsInfo}\n"
|
||||
docListText += f" MIME Type: {originalDoc['mimeType']}\n"
|
||||
docListText += f" File Size: {originalDoc.get('fileSize', doc.fileSize)} bytes\n"
|
||||
else:
|
||||
# Regular document
|
||||
docListText += f"\n{i}. Document ID: {doc.id}\n"
|
||||
docListText += f" File Name: {doc.fileName}\n"
|
||||
docListText += f" MIME Type: {doc.mimeType}\n"
|
||||
docListText += f" File Size: {doc.fileSize} bytes\n"
|
||||
|
||||
outputFormat = actionParameters.get("outputFormat", "txt")
|
||||
|
||||
# FENCE user input to prevent prompt injection
|
||||
fencedUserPrompt = f"""```user_request
|
||||
{userPrompt}
|
||||
```"""
|
||||
|
||||
prompt = f"""USER REQUEST:
|
||||
{fencedUserPrompt}
|
||||
|
||||
DOCUMENTS TO ANALYZE:
|
||||
{docListText}
|
||||
|
||||
TASK: For each document, determine its intents (can be multiple):
- "extract": Content extraction needed (text, structure, OCR, etc.)
- "render": Image/binary should be rendered as-is (visual element)
- "reference": Document reference/attachment (no extraction, just reference)

Note: Output format and language are NOT determined here - they will be
determined during structure generation (Phase 3) in the chapter structure JSON
|
||||
|
||||
OUTPUT FORMAT: {outputFormat} (global fallback - for reference only)
|
||||
|
||||
RETURN JSON:
|
||||
{{
|
||||
"intents": [
|
||||
{{
|
||||
"documentId": "doc_1",
|
||||
"intents": ["extract"],
|
||||
"extractionPrompt": "Extract all text content, preserving structure",
|
||||
"reasoning": "User needs text content for document generation"
|
||||
}},
|
||||
{{
|
||||
"documentId": "doc_2",
|
||||
"intents": ["extract", "render"],
|
||||
"extractionPrompt": "Extract text content from image using vision AI",
|
||||
"reasoning": "Image contains text that needs extraction, but also should be rendered visually"
|
||||
}},
|
||||
{{
|
||||
"documentId": "doc_3",
|
||||
"intents": ["reference"],
|
||||
"extractionPrompt": null,
|
||||
"reasoning": "Document is only used as reference, no extraction needed"
|
||||
}}
|
||||
]
|
||||
}}
|
||||
|
||||
CRITICAL RULES:
|
||||
1. For images (mimeType starts with "image/"):
|
||||
- If user wants to "include" or "show" images → add "render"
|
||||
- If user wants to "analyze", "read text", or "extract text" from images → add "extract"
|
||||
- Can have BOTH "extract" and "render" if image needs both text extraction and visual rendering
|
||||
|
||||
2. For text documents:
|
||||
- If user mentions "template" or "structure" → "reference" or "extract" based on context
|
||||
- If user mentions "reference" or "context" → "reference"
|
||||
- Default → "extract"
|
||||
|
||||
3. Consider output format:
|
||||
- For formats like PDF, DOCX, PPTX: images usually need "render"
|
||||
- For formats like CSV, JSON: usually "extract" only
|
||||
- For HTML: can have both "extract" and "render"
|
||||
|
||||
Return ONLY valid JSON following the structure above.
|
||||
"""
|
||||
return prompt
|
||||
|
||||
def _findSimilarDocumentId(self, incorrectId: str, validIds: set) -> Optional[str]:
|
||||
"""
|
||||
Versucht eine ähnliche Dokument-ID zu finden, falls die AI die UUID geändert hat.
|
||||
Prüft auf UUID-Typo (z.B. 4451 -> 4551).
|
||||
|
||||
Args:
|
||||
incorrectId: Die falsche UUID aus der AI-Response
|
||||
validIds: Set von gültigen Dokument-IDs
|
||||
|
||||
Returns:
|
||||
Korrigierte UUID falls gefunden, sonst None
|
||||
"""
|
||||
if not incorrectId or len(incorrectId) != 36: # UUID Format: 8-4-4-4-12
|
||||
return None
|
||||
|
||||
# Check that it is a UUID (format: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx)
|
||||
if incorrectId.count('-') != 4:
|
||||
return None
|
||||
|
||||
# Try a Levenshtein-like search: check whether only 1-2 characters differ
|
||||
for validId in validIds:
|
||||
if len(validId) != 36:
|
||||
continue
|
||||
|
||||
# Count differing characters
|
||||
differences = sum(c1 != c2 for c1, c2 in zip(incorrectId, validId))
|
||||
|
||||
# If only 1-2 characters differ, it is most likely a typo
|
||||
if differences <= 2:
|
||||
# Check that the structure is similar (same hyphen positions)
|
||||
if incorrectId.count('-') == validId.count('-'):
|
||||
return validId
|
||||
|
||||
return None
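# Worked example of the character-difference check above (UUID values are illustrative):
# aiId    = "9a1b4451-0000-4c8e-9f21-abcdefabcdef"   # UUID as returned by the AI
# validId = "9a1b4551-0000-4c8e-9f21-abcdefabcdef"   # actual document UUID
# sum(c1 != c2 for c1, c2 in zip(aiId, validId))      # -> 1, so validId is accepted as the correction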
|
||||
|
||||
File diff suppressed because it is too large
File diff suppressed because it is too large
|
|
@@ -1,293 +0,0 @@
|
|||
# Copyright (c) 2025 Patrick Motsch
|
||||
# All rights reserved.
|
||||
"""
|
||||
Generic Looping Use Case System
|
||||
|
||||
Provides parametrized looping infrastructure supporting different JSON formats and use cases.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict, Any, List, Optional, Callable
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Callback functions for use-case-specific logic
|
||||
|
||||
def _handleSectionContentFinalResult(result: str, parsedJsonForUseCase: Any, extractedJsonForUseCase: str,
|
||||
debugPrefix: str, services: Any) -> str:
|
||||
"""Handle final result for section_content: return raw result to preserve all JSON blocks."""
|
||||
final_json = result # Return raw response to preserve all JSON blocks
|
||||
# Write final merged result for section_content (overwrites iteration 1 response with complete merged result)
|
||||
if services and hasattr(services, 'utils') and hasattr(services.utils, 'writeDebugFile'):
|
||||
services.utils.writeDebugFile(final_json, f"{debugPrefix}_response")
|
||||
return final_json
|
||||
|
||||
|
||||
def _handleChapterStructureFinalResult(result: str, parsedJsonForUseCase: Any, extractedJsonForUseCase: str,
|
||||
debugPrefix: str, services: Any) -> str:
|
||||
"""Handle final result for chapter_structure: format JSON and write debug file."""
|
||||
import json
|
||||
final_json = json.dumps(parsedJsonForUseCase, indent=2, ensure_ascii=False) if parsedJsonForUseCase else (extractedJsonForUseCase or result)
|
||||
# Write final result for chapter structure
|
||||
if services and hasattr(services, 'utils') and hasattr(services.utils, 'writeDebugFile'):
|
||||
services.utils.writeDebugFile(final_json, f"{debugPrefix}_final_result")
|
||||
return final_json
|
||||
|
||||
|
||||
def _handleCodeStructureFinalResult(result: str, parsedJsonForUseCase: Any, extractedJsonForUseCase: str,
|
||||
debugPrefix: str, services: Any) -> str:
|
||||
"""Handle final result for code_structure: format JSON and write debug file."""
|
||||
import json
|
||||
final_json = json.dumps(parsedJsonForUseCase, indent=2, ensure_ascii=False) if parsedJsonForUseCase else (extractedJsonForUseCase or result)
|
||||
# Write final result for code structure
|
||||
if services and hasattr(services, 'utils') and hasattr(services.utils, 'writeDebugFile'):
|
||||
services.utils.writeDebugFile(final_json, f"{debugPrefix}_final_result")
|
||||
return final_json
|
||||
|
||||
|
||||
def _handleCodeContentFinalResult(result: str, parsedJsonForUseCase: Any, extractedJsonForUseCase: str,
|
||||
debugPrefix: str, services: Any) -> str:
|
||||
"""Handle final result for code_content: format JSON."""
|
||||
import json
|
||||
final_json = json.dumps(parsedJsonForUseCase, indent=2, ensure_ascii=False) if parsedJsonForUseCase else (extractedJsonForUseCase or result)
|
||||
return final_json
|
||||
|
||||
|
||||
def _normalizeSectionContentJson(parsed: Any, useCaseId: str) -> Any:
|
||||
"""Normalize JSON structure for section_content use case."""
|
||||
# For section_content, expect {"elements": [...]} structure
|
||||
if isinstance(parsed, list):
|
||||
# Check if list contains strings (invalid format) or element objects
|
||||
if parsed and isinstance(parsed[0], str):
|
||||
# Invalid format - list of strings instead of elements
|
||||
# Try to convert strings to paragraph elements as fallback
|
||||
logger.debug(f"Received list of strings instead of elements array, converting to paragraph elements")
|
||||
elements = []
|
||||
for text in parsed:
|
||||
if isinstance(text, str) and text.strip():
|
||||
elements.append({
|
||||
"type": "paragraph",
|
||||
"content": {
|
||||
"text": text.strip()
|
||||
}
|
||||
})
|
||||
return {"elements": elements} if elements else {"elements": []}
|
||||
else:
|
||||
# Convert plain list of elements to elements structure
|
||||
return {"elements": parsed}
|
||||
elif isinstance(parsed, dict):
|
||||
# If it already has "elements", return as-is
|
||||
if "elements" in parsed:
|
||||
return parsed
|
||||
# If it has "type" and looks like an element, wrap in elements array
|
||||
elif parsed.get("type"):
|
||||
return {"elements": [parsed]}
|
||||
# Otherwise, assume it's already in correct format
|
||||
else:
|
||||
return parsed
|
||||
|
||||
# For other use cases, return as-is (they have their own structures)
|
||||
return parsed
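# Illustrative input/output for the normalizer above:
# _normalizeSectionContentJson(["First paragraph", "Second paragraph"], "section_content")
# -> {"elements": [
#        {"type": "paragraph", "content": {"text": "First paragraph"}},
#        {"type": "paragraph", "content": {"text": "Second paragraph"}}]}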
|
||||
|
||||
|
||||
def _normalizeDefaultJson(parsed: Any, useCaseId: str) -> Any:
|
||||
"""Default normalizer: return as-is."""
|
||||
return parsed
|
||||
|
||||
|
||||
@dataclass
|
||||
class LoopingUseCase:
|
||||
"""Configuration for a specific looping use case."""
|
||||
|
||||
# Identification
|
||||
useCaseId: str # "section_content", "chapter_structure", "code_structure", "code_content"
|
||||
|
||||
# JSON Format Detection
|
||||
jsonTemplate: Dict[str, Any] # Expected JSON structure template
|
||||
detectionKeys: List[str] # Keys to check for format detection (e.g., ["elements"], ["chapters"], ["files"])
|
||||
detectionPath: str # JSONPath to check (e.g., "documents[0].chapters", "files[0].content")
|
||||
|
||||
# Prompt Building
|
||||
initialPromptBuilder: Optional[Callable] = None # Function to build initial prompt
|
||||
continuationPromptBuilder: Optional[Callable] = None # Function to build continuation prompt
|
||||
|
||||
# Accumulation & Merging
|
||||
accumulator: Optional[Callable] = None # Function to accumulate fragments
|
||||
merger: Optional[Callable] = None # Function to merge accumulated data
|
||||
|
||||
# Continuation Context
|
||||
continuationContextBuilder: Optional[Callable] = None # Build continuation context for this format
|
||||
|
||||
# Result Building
|
||||
resultBuilder: Optional[Callable] = None # Build final result from accumulated data
|
||||
|
||||
# Use-case-specific handlers (callbacks to avoid if/elif chains in generic code)
|
||||
finalResultHandler: Optional[Callable] = None # Handle final result formatting and debug file writing
|
||||
jsonNormalizer: Optional[Callable] = None # Normalize JSON structure for this use case
|
||||
|
||||
# Metadata
|
||||
supportsAccumulation: bool = True # Whether this use case supports accumulation
|
||||
requiresExtraction: bool = False # Whether this requires extraction (like sections)
|
||||
|
||||
|
||||
class LoopingUseCaseRegistry:
|
||||
"""Registry of all looping use cases."""
|
||||
|
||||
def __init__(self):
|
||||
self.useCases: Dict[str, LoopingUseCase] = {}
|
||||
self._registerDefaultUseCases()
|
||||
|
||||
def register(self, useCase: LoopingUseCase):
|
||||
"""Register a new use case."""
|
||||
self.useCases[useCase.useCaseId] = useCase
|
||||
logger.debug(f"Registered looping use case: {useCase.useCaseId}")
|
||||
|
||||
def get(self, useCaseId: str) -> Optional[LoopingUseCase]:
|
||||
"""Get use case by ID."""
|
||||
return self.useCases.get(useCaseId)
|
||||
|
||||
def detectUseCase(self, parsedJson: Dict[str, Any]) -> Optional[str]:
|
||||
"""Detect which use case matches the JSON structure."""
|
||||
for useCaseId, useCase in self.useCases.items():
|
||||
if self._matchesFormat(parsedJson, useCase):
|
||||
return useCaseId
|
||||
return None
|
||||
|
||||
def _matchesFormat(self, json: Dict[str, Any], useCase: LoopingUseCase) -> bool:
|
||||
"""Check if JSON matches use case format."""
|
||||
# Check top-level keys
|
||||
for key in useCase.detectionKeys:
|
||||
if key in json:
|
||||
return True
|
||||
|
||||
# Check nested path using simple dictionary traversal (no jsonpath_ng needed)
|
||||
if useCase.detectionPath:
|
||||
try:
|
||||
# Simple path matching without jsonpath_ng
|
||||
# Format: "documents[0].chapters" or "files[0].content"
|
||||
pathParts = useCase.detectionPath.split(".")
|
||||
current = json
|
||||
|
||||
for part in pathParts:
|
||||
# Handle array indices like "documents[0]"
|
||||
if "[" in part and "]" in part:
|
||||
key = part.split("[")[0]
|
||||
index = int(part.split("[")[1].split("]")[0])
|
||||
if isinstance(current, dict) and key in current:
|
||||
if isinstance(current[key], list) and 0 <= index < len(current[key]):
|
||||
current = current[key][index]
|
||||
else:
|
||||
return False
|
||||
else:
|
||||
return False
|
||||
else:
|
||||
# Regular key access
|
||||
if isinstance(current, dict) and part in current:
|
||||
current = current[part]
|
||||
else:
|
||||
return False
|
||||
|
||||
# If we successfully traversed the path, it matches
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.debug(f"Path matching failed for {useCase.useCaseId}: {e}")
|
||||
|
||||
return False
|
||||
|
||||
def _registerDefaultUseCases(self):
|
||||
"""Register default use cases."""
|
||||
|
||||
# Use Case 1: Section Content Generation
|
||||
# Returns JSON with "elements" array directly
|
||||
self.register(LoopingUseCase(
|
||||
useCaseId="section_content",
|
||||
jsonTemplate={"elements": []},
|
||||
detectionKeys=["elements"],
|
||||
detectionPath="",
|
||||
initialPromptBuilder=None, # Will use default prompt builder
|
||||
continuationPromptBuilder=None, # Will use default continuation builder
|
||||
accumulator=None, # Direct return, no accumulation
|
||||
merger=None,
|
||||
continuationContextBuilder=None, # Will use default continuation context
|
||||
resultBuilder=None, # Return JSON directly
|
||||
finalResultHandler=_handleSectionContentFinalResult,
|
||||
jsonNormalizer=_normalizeSectionContentJson,
|
||||
supportsAccumulation=False,
|
||||
requiresExtraction=False
|
||||
))
|
||||
|
||||
# Use Case 2: Chapter Structure Generation
|
||||
# Returns JSON with "documents[0].chapters" structure
|
||||
self.register(LoopingUseCase(
|
||||
useCaseId="chapter_structure",
|
||||
jsonTemplate={"documents": [{"chapters": []}]},
|
||||
detectionKeys=["chapters"],
|
||||
detectionPath="documents[0].chapters",
|
||||
initialPromptBuilder=None,
|
||||
continuationPromptBuilder=None,
|
||||
accumulator=None, # Direct return, no accumulation
|
||||
merger=None,
|
||||
continuationContextBuilder=None,
|
||||
resultBuilder=None, # Return JSON directly
|
||||
finalResultHandler=_handleChapterStructureFinalResult,
|
||||
jsonNormalizer=_normalizeDefaultJson,
|
||||
supportsAccumulation=False,
|
||||
requiresExtraction=False
|
||||
))
|
||||
|
||||
# Use Case 3: Code Structure Generation
|
||||
self.register(LoopingUseCase(
|
||||
useCaseId="code_structure",
|
||||
jsonTemplate={
|
||||
"metadata": {
|
||||
"language": "",
|
||||
"projectType": "single_file|multi_file",
|
||||
"projectName": ""
|
||||
},
|
||||
"files": [
|
||||
{
|
||||
"id": "",
|
||||
"filename": "",
|
||||
"fileType": "",
|
||||
"dependencies": [],
|
||||
"imports": [],
|
||||
"functions": [],
|
||||
"classes": []
|
||||
}
|
||||
]
|
||||
},
|
||||
detectionKeys=["files"],
|
||||
detectionPath="files",
|
||||
initialPromptBuilder=None,
|
||||
continuationPromptBuilder=None,
|
||||
accumulator=None, # Direct return
|
||||
merger=None,
|
||||
continuationContextBuilder=None,
|
||||
resultBuilder=None,
|
||||
finalResultHandler=_handleCodeStructureFinalResult,
|
||||
jsonNormalizer=_normalizeDefaultJson,
|
||||
supportsAccumulation=False,
|
||||
requiresExtraction=False
|
||||
))
|
||||
|
||||
# Use Case 5: Code Content Generation (NEW)
|
||||
self.register(LoopingUseCase(
|
||||
useCaseId="code_content",
|
||||
jsonTemplate={"files": [{"content": "", "functions": []}]},
|
||||
detectionKeys=["content", "functions"],
|
||||
detectionPath="files[0].content",
|
||||
initialPromptBuilder=None,
|
||||
continuationPromptBuilder=None,
|
||||
accumulator=None, # Will use default accumulator
|
||||
merger=None, # Will use default merger
|
||||
continuationContextBuilder=None,
|
||||
resultBuilder=None, # Will use default result builder
|
||||
finalResultHandler=_handleCodeContentFinalResult,
|
||||
jsonNormalizer=_normalizeDefaultJson,
|
||||
supportsAccumulation=True,
|
||||
requiresExtraction=False
|
||||
))
|
||||
|
||||
logger.info(f"Registered {len(self.useCases)} default looping use cases")
|
||||
|
||||
|
|
@@ -1,275 +0,0 @@
|
|||
# Copyright (c) 2025 Patrick Motsch
|
||||
# All rights reserved.
|
||||
"""
|
||||
Response Parsing Module
|
||||
|
||||
Handles parsing of AI responses, including:
|
||||
- Section extraction from responses
|
||||
- JSON completeness detection
|
||||
- Loop detection
|
||||
- Document metadata extraction
|
||||
- Final result building
|
||||
"""
|
||||
import json
|
||||
import logging
|
||||
from typing import Dict, Any, List, Optional, Tuple
|
||||
|
||||
from modules.shared.jsonUtils import extractJsonString, repairBrokenJson, extractSectionsFromDocument
|
||||
from .subJsonResponseHandling import JsonResponseHandler
|
||||
from modules.datamodels.datamodelAi import JsonAccumulationState
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ResponseParser:
|
||||
"""Handles parsing of AI responses and completion detection."""
|
||||
|
||||
def __init__(self, services):
|
||||
"""Initialize ResponseParser with service center access."""
|
||||
self.services = services
|
||||
|
||||
def extractSectionsFromResponse(
|
||||
self,
|
||||
result: str,
|
||||
iteration: int,
|
||||
debugPrefix: str,
|
||||
allSections: List[Dict[str, Any]] = None,
|
||||
accumulationState: Optional[JsonAccumulationState] = None
|
||||
) -> Tuple[List[Dict[str, Any]], bool, Optional[Dict[str, Any]], Optional[JsonAccumulationState]]:
|
||||
"""
|
||||
Extract sections from AI response, handling both valid and broken JSON.
|
||||
|
||||
NEW BEHAVIOR:
|
||||
- First iteration: Check if complete, if not start accumulation
|
||||
- Subsequent iterations: Accumulate strings, parse when complete
|
||||
|
||||
Returns:
|
||||
Tuple of:
|
||||
- sections: Extracted sections
|
||||
- wasJsonComplete: True if JSON is complete
|
||||
- parsedResult: Parsed JSON object
|
||||
- updatedAccumulationState: Updated accumulation state (None if not in accumulation mode)
|
||||
"""
|
||||
if allSections is None:
|
||||
allSections = []
|
||||
|
||||
if iteration == 1:
|
||||
# First iteration - check if complete
|
||||
parsed = None
|
||||
try:
|
||||
extracted = extractJsonString(result)
|
||||
parsed = json.loads(extracted)
|
||||
|
||||
# Check completeness
|
||||
if JsonResponseHandler.isJsonComplete(parsed):
|
||||
# Complete JSON - no accumulation needed
|
||||
sections = extractSectionsFromDocument(parsed)
|
||||
logger.info(f"Iteration 1: Complete JSON detected, no accumulation needed")
|
||||
return sections, True, parsed, None # No accumulation
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Incomplete - try to extract partial sections from broken JSON
|
||||
logger.info(f"Iteration 1: Incomplete JSON detected, attempting to extract partial sections")
|
||||
|
||||
partialSections = []
|
||||
if parsed:
|
||||
# Try to extract sections from parsed (even if incomplete)
|
||||
partialSections = extractSectionsFromDocument(parsed)
|
||||
else:
|
||||
# Try to repair broken JSON and extract sections
|
||||
try:
|
||||
repaired = repairBrokenJson(result)
|
||||
if repaired:
|
||||
partialSections = extractSectionsFromDocument(repaired)
|
||||
parsed = repaired # Use repaired version for accumulation state
|
||||
except Exception:
|
||||
pass # If repair fails, continue with empty sections
|
||||
|
||||
|
||||
# Define KPIs (async call - need to handle this)
|
||||
# For now, create accumulation state without KPIs, will be updated after async call
|
||||
accumulationState = JsonAccumulationState(
|
||||
accumulatedJsonString=result,
|
||||
isAccumulationMode=True,
|
||||
lastParsedResult=parsed,
|
||||
allSections=partialSections,
|
||||
kpis=[]
|
||||
)
|
||||
|
||||
# Note: KPI definition will be done in the caller (async context)
|
||||
return partialSections, False, parsed, accumulationState
|
||||
|
||||
else:
|
||||
# Subsequent iterations - accumulate
|
||||
if accumulationState and accumulationState.isAccumulationMode:
|
||||
accumulated, sections, isComplete, parsedResult = \
|
||||
JsonResponseHandler.accumulateAndParseJsonFragments(
|
||||
accumulationState.accumulatedJsonString,
|
||||
result,
|
||||
allSections,
|
||||
iteration
|
||||
)
|
||||
|
||||
# Update accumulation state
|
||||
accumulationState.accumulatedJsonString = accumulated
|
||||
accumulationState.lastParsedResult = parsedResult
|
||||
accumulationState.allSections = allSections + sections if sections else allSections
|
||||
accumulationState.isAccumulationMode = not isComplete
|
||||
|
||||
# Log accumulated JSON for debugging
|
||||
if parsedResult:
|
||||
accumulated_json_str = json.dumps(parsedResult, indent=2, ensure_ascii=False)
|
||||
self.services.utils.writeDebugFile(accumulated_json_str, f"{debugPrefix}_accumulated_json_iteration_{iteration}.json")
|
||||
|
||||
return sections, isComplete, parsedResult, accumulationState
|
||||
else:
|
||||
# No accumulation mode - process normally (shouldn't happen)
|
||||
logger.warning(f"Iteration {iteration}: No accumulation state but iteration > 1")
|
||||
return [], False, None, None
|
||||
|
||||
def shouldContinueGeneration(
|
||||
self,
|
||||
allSections: List[Dict[str, Any]],
|
||||
iteration: int,
|
||||
wasJsonComplete: bool,
|
||||
rawResponse: str = None
|
||||
) -> bool:
|
||||
"""
|
||||
Determine if AI generation loop should continue.
|
||||
|
||||
CRITICAL: This is ONLY about AI Loop Completion, NOT Action DoD!
|
||||
Action DoD is checked AFTER the AI Loop completes in _refineDecide.
|
||||
|
||||
Simple logic:
|
||||
- If JSON parsing failed or incomplete → continue (needs more content)
|
||||
- If JSON parses successfully and is complete → stop (all content delivered)
|
||||
- Loop detection prevents infinite loops
|
||||
|
||||
CRITICAL: JSON completeness is determined by parsing, NOT by last character check!
|
||||
Returns True if we should continue, False if AI Loop is done.
|
||||
"""
|
||||
if len(allSections) == 0:
|
||||
return True # No sections yet, continue
|
||||
|
||||
# CRITERION 1: If JSON was incomplete/broken (parsing failed or incomplete) - continue to repair/complete
|
||||
if not wasJsonComplete:
|
||||
logger.info(f"Iteration {iteration}: JSON incomplete/broken - continuing to complete")
|
||||
return True
|
||||
|
||||
# CRITERION 2: JSON is complete (parsed successfully) - check for loop detection
|
||||
if self._isStuckInLoop(allSections, iteration):
|
||||
logger.warning(f"Iteration {iteration}: Detected potential infinite loop - stopping AI loop")
|
||||
return False
|
||||
|
||||
# JSON is complete and not stuck in loop - done
|
||||
logger.info(f"Iteration {iteration}: JSON complete - AI loop done")
|
||||
return False
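# Example decisions made by this check (arguments are illustrative):
# shouldContinueGeneration([], 1, wasJsonComplete=False)             # -> True  (no sections yet)
# shouldContinueGeneration(sections, 2, wasJsonComplete=False)       # -> True  (JSON still incomplete)
# shouldContinueGeneration(sections, 3, wasJsonComplete=True)        # -> False (complete, AI loop done)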
|
||||
|
||||
def _isStuckInLoop(
|
||||
self,
|
||||
allSections: List[Dict[str, Any]],
|
||||
iteration: int
|
||||
) -> bool:
|
||||
"""
|
||||
Detect if we're stuck in a loop (same content being repeated).
|
||||
|
||||
Generic approach: Check if recent iterations are adding minimal or duplicate content.
|
||||
"""
|
||||
if iteration < 3:
|
||||
return False # Need at least 3 iterations to detect a loop
|
||||
|
||||
if len(allSections) == 0:
|
||||
return False
|
||||
|
||||
# Check if last section is very small (might be stuck)
|
||||
lastSection = allSections[-1]
|
||||
elements = lastSection.get("elements", [])
|
||||
|
||||
if isinstance(elements, list) and elements:
|
||||
lastElem = elements[-1] if elements else {}
|
||||
else:
|
||||
lastElem = elements if isinstance(elements, dict) else {}
|
||||
|
||||
# Check content size of last section
|
||||
lastSectionSize = 0
|
||||
if isinstance(lastElem, dict):
|
||||
for key, value in lastElem.items():
|
||||
if isinstance(value, str):
|
||||
lastSectionSize += len(value)
|
||||
elif isinstance(value, list):
|
||||
lastSectionSize += len(str(value))
|
||||
|
||||
# If last section is very small and we've done many iterations, might be stuck
|
||||
if lastSectionSize < 100 and iteration > 10:
|
||||
logger.warning(f"Potential loop detected: iteration {iteration}, last section size {lastSectionSize}")
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def extractDocumentMetadata(
|
||||
self,
|
||||
parsedResult: Dict[str, Any]
|
||||
) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Extract document metadata (title, filename) from parsed AI response.
|
||||
Returns dict with 'title' and 'filename' keys if found, None otherwise.
|
||||
"""
|
||||
if not isinstance(parsedResult, dict):
|
||||
return None
|
||||
|
||||
# Try to get from documents array (preferred structure)
|
||||
if "documents" in parsedResult and isinstance(parsedResult["documents"], list) and len(parsedResult["documents"]) > 0:
|
||||
firstDoc = parsedResult["documents"][0]
|
||||
if isinstance(firstDoc, dict):
|
||||
title = firstDoc.get("title")
|
||||
filename = firstDoc.get("filename")
|
||||
if title or filename:
|
||||
return {
|
||||
"title": title,
|
||||
"filename": filename
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
def buildFinalResultFromSections(
|
||||
self,
|
||||
allSections: List[Dict[str, Any]],
|
||||
documentMetadata: Optional[Dict[str, Any]] = None
|
||||
) -> str:
|
||||
"""
|
||||
Build final JSON result from accumulated sections.
|
||||
Uses AI-provided metadata (title, filename) if available.
|
||||
"""
|
||||
if not allSections:
|
||||
return ""
|
||||
|
||||
# Extract metadata from AI response if available
|
||||
title = "Generated Document"
|
||||
filename = "document.json"
|
||||
if documentMetadata:
|
||||
if documentMetadata.get("title"):
|
||||
title = documentMetadata["title"]
|
||||
if documentMetadata.get("filename"):
|
||||
filename = documentMetadata["filename"]
|
||||
|
||||
# Build documents structure
|
||||
# Assuming single document for now
|
||||
documents = [{
|
||||
"id": "doc_1",
|
||||
"title": title,
|
||||
"filename": filename,
|
||||
"sections": allSections
|
||||
}]
|
||||
|
||||
result = {
|
||||
"metadata": {
|
||||
"split_strategy": "single_document",
|
||||
"source_documents": [],
|
||||
"extraction_method": "ai_generation"
|
||||
},
|
||||
"documents": documents
|
||||
}
|
||||
|
||||
return json.dumps(result, indent=2)
|
||||
|
||||
File diff suppressed because it is too large
|
|
@@ -1,508 +0,0 @@
|
|||
# Copyright (c) 2025 Patrick Motsch
|
||||
# All rights reserved.
|
||||
"""
|
||||
Structure Generation Module
|
||||
|
||||
Handles document structure generation, including:
|
||||
- Generating document structure with sections
|
||||
- Building structure prompts
|
||||
"""
|
||||
import json
|
||||
import logging
|
||||
from typing import Dict, Any, List, Optional
|
||||
|
||||
from modules.datamodels.datamodelExtraction import ContentPart
|
||||
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum, PriorityEnum, ProcessingModeEnum
|
||||
from modules.workflows.processing.shared.stateTools import checkWorkflowStopped
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class StructureGenerator:
|
||||
"""Handles document structure generation."""
|
||||
|
||||
def __init__(self, services, aiService):
|
||||
"""Initialize StructureGenerator with service center and AI service access."""
|
||||
self.services = services
|
||||
self.aiService = aiService
|
||||
|
||||
def _getUserLanguage(self) -> str:
|
||||
"""Get user language for document generation"""
|
||||
try:
|
||||
if self.services:
|
||||
# Prefer detected language if available (from user intention analysis)
|
||||
if hasattr(self.services, 'currentUserLanguage') and self.services.currentUserLanguage:
|
||||
return self.services.currentUserLanguage
|
||||
# Fallback to user's preferred language
|
||||
elif hasattr(self.services, 'user') and self.services.user and hasattr(self.services.user, 'language'):
|
||||
return self.services.user.language
|
||||
except Exception:
|
||||
pass
|
||||
return 'en' # Default fallback
|
||||
|
||||
async def generateStructure(
|
||||
self,
|
||||
userPrompt: str,
|
||||
contentParts: List[ContentPart],
|
||||
outputFormat: Optional[str] = None,
|
||||
parentOperationId: str = None
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Phase 5C: Generiert Chapter-Struktur (Table of Contents).
|
||||
Definiert für jedes Chapter:
|
||||
- Level, Title
|
||||
- contentParts (unified object with instruction and/or caption per part)
|
||||
- generationHint
|
||||
|
||||
Generate document structure with per-document format determination.
|
||||
Multiple documents can be produced with different formats (e.g., one PDF, one HTML).
|
||||
AI determines formats per-document from user prompt. The outputFormat parameter is
|
||||
only a validation fallback - used if AI doesn't return format per document.
|
||||
|
||||
Args:
|
||||
userPrompt: User-Anfrage
|
||||
contentParts: Alle vorbereiteten ContentParts mit Metadaten
|
||||
outputFormat: Optional global format fallback. If omitted, formats are determined
|
||||
from user prompt by AI. Used as validation fallback if AI doesn't
|
||||
return format per document. Defaults to "txt" if not provided.
|
||||
parentOperationId: Parent Operation-ID für ChatLog-Hierarchie
|
||||
|
||||
Returns:
|
||||
Struktur-Dict mit documents und chapters (nicht sections!)
|
||||
"""
|
||||
# If outputFormat not provided, use "txt" as fallback for validation
|
||||
# AI will determine formats per document from user prompt
|
||||
if not outputFormat:
|
||||
outputFormat = "txt"
|
||||
logger.debug("outputFormat not provided - using 'txt' as validation fallback, formats determined from prompt")
|
||||
# Create operation ID for the structure generation
|
||||
structureOperationId = f"{parentOperationId}_structure_generation"
|
||||
|
||||
# Start ChatLog with parent reference
|
||||
formatDisplay = outputFormat if outputFormat else "auto-determined"
|
||||
self.services.chat.progressLogStart(
|
||||
structureOperationId,
|
||||
"Chapter Structure Generation",
|
||||
"Structure",
|
||||
f"Generating chapter structure (format: {formatDisplay})",
|
||||
parentOperationId=parentOperationId
|
||||
)
|
||||
|
||||
try:
|
||||
|
||||
|
||||
# AI call for chapter structure generation with looping support
|
||||
# Use _callAiWithLooping instead of callAiPlanning to support continuation if response is cut
|
||||
options = AiCallOptions(
|
||||
operationType=OperationTypeEnum.DATA_GENERATE,
|
||||
priority=PriorityEnum.QUALITY,
|
||||
processingMode=ProcessingModeEnum.DETAILED,
|
||||
compressPrompt=False,
|
||||
compressContext=False,
|
||||
resultFormat="json"
|
||||
)
|
||||
|
||||
# Build the chapter structure prompt with content index
structurePrompt, templateStructure = self._buildChapterStructurePrompt(
|
||||
userPrompt=userPrompt,
|
||||
contentParts=contentParts,
|
||||
outputFormat=outputFormat
|
||||
)
|
||||
|
||||
# Create prompt builder for continuation support
|
||||
async def buildChapterStructurePromptWithContinuation(
|
||||
continuationContext: Any,
|
||||
templateStructure: str,
|
||||
basePrompt: str
|
||||
) -> str:
|
||||
"""Build chapter structure prompt with continuation context. Uses unified signature.
|
||||
|
||||
Note: All initial context (userPrompt, contentParts, outputFormat, etc.) is already
|
||||
contained in basePrompt. This function only adds continuation-specific instructions.
|
||||
"""
|
||||
# Extract continuation context fields (only what's needed for continuation)
|
||||
incompletePart = continuationContext.incomplete_part
|
||||
lastRawJson = continuationContext.last_raw_json
|
||||
|
||||
# Generate both overlap context and hierarchy context using jsonContinuation
|
||||
overlapContext = ""
|
||||
unifiedContext = ""
|
||||
if lastRawJson:
|
||||
# Get contexts directly from jsonContinuation
|
||||
from modules.shared.jsonContinuation import getContexts
|
||||
contexts = getContexts(lastRawJson)
|
||||
overlapContext = contexts.overlapContext
|
||||
unifiedContext = contexts.hierarchyContextForPrompt
|
||||
elif incompletePart:
|
||||
unifiedContext = incompletePart
|
||||
else:
|
||||
unifiedContext = "Unable to extract context - response was completely broken"
|
||||
|
||||
# Build unified continuation prompt format
|
||||
continuationPrompt = f"""{basePrompt}
|
||||
|
||||
--- CONTINUATION REQUEST ---
|
||||
The previous JSON response was incomplete. Continue from where it stopped.
|
||||
|
||||
Context showing structure hierarchy with cut point:
|
||||
```
|
||||
{unifiedContext}
|
||||
```
|
||||
|
||||
Overlap Requirement:
|
||||
To ensure proper merging, your response MUST start EXACTLY with the overlap context shown below, then continue with new content.
|
||||
|
||||
Overlap context (start your response with this exact text):
|
||||
```json
|
||||
{overlapContext if overlapContext else "No overlap context available"}
|
||||
```
|
||||
|
||||
TASK:
|
||||
1. Start your response EXACTLY with the overlap context shown above (character by character)
|
||||
2. Continue seamlessly from where the overlap context ends
|
||||
3. Complete the remaining content following the JSON structure template above
|
||||
4. Return ONLY valid JSON following the structure template - no overlap/continuation wrapper objects
|
||||
|
||||
CRITICAL:
|
||||
- Your response MUST begin with the exact overlap context text (this enables automatic merging)
|
||||
- Continue seamlessly after the overlap context with new content
|
||||
- Your response must be valid JSON matching the structure template above"""
|
||||
return continuationPrompt
|
||||
|
||||
# Call AI with looping support
|
||||
# NOTE: Do NOT pass contentParts here - we only need metadata for structure generation
|
||||
# The contentParts metadata is already included in the prompt (contentPartsIndex)
|
||||
# Actual content extraction happens later during section generation
|
||||
checkWorkflowStopped(self.services)
|
||||
aiResponseJson = await self.aiService.callAiWithLooping(
|
||||
prompt=structurePrompt,
|
||||
options=options,
|
||||
debugPrefix="chapter_structure_generation",
|
||||
promptBuilder=buildChapterStructurePromptWithContinuation,
|
||||
promptArgs={
|
||||
"userPrompt": userPrompt,
|
||||
"outputFormat": outputFormat,
|
||||
"templateStructure": templateStructure,
|
||||
"basePrompt": structurePrompt
|
||||
},
|
||||
useCaseId="chapter_structure", # REQUIRED: Explicit use case ID
|
||||
operationId=structureOperationId,
|
||||
userPrompt=userPrompt,
|
||||
contentParts=None # Do not pass ContentParts - only metadata needed, not content extraction
|
||||
)
|
||||
|
||||
# Parse the complete JSON response (looping system already handles completion)
|
||||
extractedJson = self.services.utils.jsonExtractString(aiResponseJson)
|
||||
parsedJson, parseError, cleanedJson = self.services.utils.jsonTryParse(extractedJson)
|
||||
|
||||
if parseError is not None:
|
||||
# Even with looping, try repair as fallback
|
||||
logger.warning(f"JSON parsing failed after looping: {str(parseError)}. Attempting repair...")
|
||||
from modules.shared import jsonUtils
|
||||
repairedJson = jsonUtils.repairBrokenJson(extractedJson)
|
||||
if repairedJson:
|
||||
parsedJson, parseError, _ = self.services.utils.jsonTryParse(json.dumps(repairedJson))
|
||||
if parseError is None:
|
||||
logger.info("Successfully repaired and parsed JSON structure after looping")
|
||||
structure = parsedJson
|
||||
else:
|
||||
logger.error(f"Failed to parse repaired JSON: {str(parseError)}")
|
||||
raise ValueError(f"Failed to parse JSON structure after repair: {str(parseError)}")
|
||||
else:
|
||||
logger.error(f"Failed to repair JSON. Parse error: {str(parseError)}")
|
||||
logger.error(f"Cleaned JSON preview (first 500 chars): {cleanedJson[:500]}")
|
||||
raise ValueError(f"Failed to parse JSON structure: {str(parseError)}")
|
||||
else:
|
||||
structure = parsedJson
|
||||
|
||||
# State 3 Validation: Validate and auto-fix structure
|
||||
# Validation 3.1: Structure missing 'documents' field
|
||||
if "documents" not in structure:
|
||||
raise ValueError("Structure missing 'documents' field - cannot auto-fix")
|
||||
|
||||
documents = structure["documents"]
|
||||
|
||||
# Validation 3.2: Structure has no documents
|
||||
if not isinstance(documents, list) or len(documents) == 0:
|
||||
raise ValueError("Structure has no documents - cannot generate without documents")
|
||||
|
||||
# Import renderer registry for format validation (existing infrastructure)
|
||||
from modules.services.serviceGeneration.renderers.registry import getRenderer
|
||||
|
||||
# Validate and fix each document
|
||||
for doc in documents:
|
||||
# Validation 3.3 & 3.4: Document outputFormat
|
||||
# outputFormat parameter is optional - if omitted, formats determined from prompt by AI
|
||||
# Use as fallback only if AI doesn't return format per document
|
||||
# Multiple documents can have different formats (e.g., one PDF, one HTML)
|
||||
globalFormatFallback = outputFormat or "txt" # Fallback for validation
|
||||
|
||||
if "outputFormat" not in doc or not doc["outputFormat"]:
|
||||
# AI didn't return format or returned empty - use global fallback
|
||||
doc["outputFormat"] = globalFormatFallback
|
||||
logger.warning(f"Document {doc.get('id')} missing outputFormat - using fallback: {doc['outputFormat']}")
|
||||
else:
|
||||
# AI returned format - validate using existing renderer registry
|
||||
formatName = str(doc["outputFormat"]).lower().strip()
|
||||
renderer = getRenderer(formatName) # Uses existing infrastructure
|
||||
|
||||
if not renderer:
|
||||
# Format doesn't match any renderer - use txt (simple approach)
|
||||
logger.warning(f"Document {doc.get('id')} has format without renderer: {formatName}, using 'txt'")
|
||||
doc["outputFormat"] = "txt"
|
||||
else:
|
||||
# Valid format with renderer - normalize and keep AI result
|
||||
doc["outputFormat"] = formatName
|
||||
logger.debug(f"Document {doc.get('id')} using AI-determined format: {formatName}")
|
||||
|
||||
# Validation 3.5 & 3.6: Document language
|
||||
# Use validated currentUserLanguage (always valid, validated during user intention analysis)
|
||||
# Access via _getUserLanguage() which uses self.services.currentUserLanguage
|
||||
userPromptLanguage = self._getUserLanguage() # Uses validated currentUserLanguage infrastructure
|
||||
|
||||
if "language" not in doc or not isinstance(doc["language"], str) or len(doc["language"]) != 2:
|
||||
# AI didn't return language or invalid format - use validated currentUserLanguage
|
||||
doc["language"] = userPromptLanguage
|
||||
if "language" not in doc:
|
||||
logger.warning(f"Document {doc.get('id')} missing language - using currentUserLanguage: {userPromptLanguage}")
|
||||
else:
|
||||
logger.warning(f"Document {doc.get('id')} has invalid language format from AI: {doc['language']}, using currentUserLanguage")
|
||||
else:
|
||||
# AI returned valid language format - normalize
|
||||
doc["language"] = doc["language"].lower().strip()[:2]
|
||||
logger.debug(f"Document {doc.get('id')} using AI-determined language: {doc['language']}")
|
||||
|
||||
# Validation 3.7: Document missing 'chapters' field
|
||||
if "chapters" not in doc:
|
||||
raise ValueError(f"Document {doc.get('id')} missing 'chapters' field - cannot auto-fix")
|
||||
|
||||
# Validation 3.8: Chapter missing 'contentParts' field
|
||||
for chapter in doc["chapters"]:
|
||||
if "contentParts" not in chapter:
|
||||
raise ValueError(f"Chapter {chapter.get('id')} missing 'contentParts' field - cannot auto-fix")
|
||||
|
||||
# Finish ChatLog
|
||||
self.services.chat.progressLogFinish(structureOperationId, True)
|
||||
|
||||
return structure
|
||||
|
||||
except Exception as e:
|
||||
self.services.chat.progressLogFinish(structureOperationId, False)
|
||||
logger.error(f"Error in generateStructure: {str(e)}")
|
||||
raise
|
||||
|
||||
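For orientation, a minimal usage sketch of `generateStructure` and the shape of the returned structure dict, following the template built below. The caller name, placeholder IDs, and the awaiting context are assumptions for illustration, not part of this file.

```python
# Hedged sketch: generationService, preparedParts and the operation ID are placeholders.
structure = await generationService.generateStructure(
    userPrompt="Create a short PDF report about the attached photo",
    contentParts=preparedParts,   # List[ContentPart] from the extraction phase
    outputFormat=None,            # let the AI determine formats per document
    parentOperationId="op_123",
)

# Expected shape (chapters only - sections are filled in a later phase):
# {
#   "metadata": {"title": "...", "language": "en"},
#   "documents": [{
#     "id": "doc_1", "title": "...", "filename": "report.pdf",
#     "outputFormat": "pdf", "language": "en",
#     "chapters": [{"id": "chapter_1", "level": 1, "title": "...",
#                   "contentParts": {"part_id": {"instruction": "..."}},
#                   "generationHint": "...", "sections": []}]
#   }]
# }
```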
def _buildChapterStructurePrompt(
|
||||
self,
|
||||
userPrompt: str,
|
||||
contentParts: List[ContentPart],
|
||||
outputFormat: str
|
||||
) -> tuple[str, str]:
|
||||
"""Baue Prompt für Chapter-Struktur-Generierung."""
|
||||
# Build ContentParts index - filter out empty parts
|
||||
contentPartsIndex = ""
|
||||
validParts = []
|
||||
filteredParts = []
|
||||
|
||||
for part in contentParts:
|
||||
contentFormat = part.metadata.get("contentFormat", "unknown")
|
||||
|
||||
# IMPORTANT: reference parts intentionally have empty data - always include them
|
||||
if contentFormat == "reference":
|
||||
validParts.append(part)
|
||||
logger.debug(f"Including reference ContentPart {part.id} (intentionally empty data)")
|
||||
continue
|
||||
|
||||
# Skip empty parts (no data, or containers without content)
|
||||
# BUT: reference parts were already handled above
|
||||
if not part.data or (isinstance(part.data, str) and len(part.data.strip()) == 0):
|
||||
# Skip container parts without data
|
||||
if part.typeGroup == "container" and not part.data:
|
||||
filteredParts.append((part.id, "container without data"))
|
||||
continue
|
||||
# Skip other empty parts (but not reference parts, which were already handled)
|
||||
if not part.data:
|
||||
filteredParts.append((part.id, f"no data (format: {contentFormat})"))
|
||||
continue
|
||||
|
||||
validParts.append(part)
|
||||
logger.debug(f"Including ContentPart {part.id}: format={contentFormat}, type={part.typeGroup}, dataLength={len(str(part.data)) if part.data else 0}")
|
||||
|
||||
if filteredParts:
|
||||
logger.debug(f"Filtered out {len(filteredParts)} empty ContentParts: {filteredParts}")
|
||||
|
||||
logger.info(f"Building structure prompt with {len(validParts)} valid ContentParts (from {len(contentParts)} total)")
|
||||
|
||||
# Build the index only for valid parts
|
||||
for i, part in enumerate(validParts, 1):
|
||||
contentFormat = part.metadata.get("contentFormat", "unknown")
|
||||
originalFileName = part.metadata.get('originalFileName', 'N/A')
|
||||
|
||||
contentPartsIndex += f"\n{i}. ContentPart ID: {part.id}\n"
|
||||
contentPartsIndex += f" Format: {contentFormat}\n"
|
||||
contentPartsIndex += f" Type: {part.typeGroup}\n"
|
||||
contentPartsIndex += f" MIME Type: {part.mimeType or 'N/A'}\n"
|
||||
contentPartsIndex += f" Source: {part.metadata.get('documentId', 'unknown')}\n"
|
||||
contentPartsIndex += f" Original file name: {originalFileName}\n"
|
||||
contentPartsIndex += f" Usage hint: {part.metadata.get('usageHint', 'N/A')}\n"
|
||||
|
||||
if not contentPartsIndex:
|
||||
contentPartsIndex = "\n(No content parts available)"
|
||||
|
||||
# Get language from services (user intention analysis)
|
||||
language = self._getUserLanguage()
|
||||
logger.debug(f"Using language from services (user intention analysis) for structure generation: {language}")
|
||||
|
||||
# Create template structure explicitly (not extracted from prompt)
|
||||
# This ensures exact identity between initial and continuation prompts
|
||||
templateStructure = f"""{{
|
||||
"metadata": {{
|
||||
"title": "Document Title",
|
||||
"language": "{language}"
|
||||
}},
|
||||
"documents": [{{
|
||||
"id": "doc_1",
|
||||
"title": "Document Title",
|
||||
"filename": "document.{outputFormat}",
|
||||
"outputFormat": "{outputFormat}",
|
||||
"language": "{language}",
|
||||
"chapters": [
|
||||
{{
|
||||
"id": "chapter_1",
|
||||
"level": 1,
|
||||
"title": "Chapter Title",
|
||||
"contentParts": {{
|
||||
"extracted_part_id": {{
|
||||
"instruction": "Use extracted content with ALL relevant details from user request"
|
||||
}}
|
||||
}},
|
||||
"generationHint": "Detailed description including ALL relevant details from user request for this chapter",
|
||||
"sections": []
|
||||
}}
|
||||
]
|
||||
}}]
|
||||
}}"""
|
||||
|
||||
prompt = f"""# TASK: Plan Document Structure (Documents + Chapters)
|
||||
|
||||
This is a STRUCTURE PLANNING task. You define which documents to create and which chapters each document will have.
|
||||
Chapter CONTENT will be generated in a later step - here you only plan the STRUCTURE and assign content references.
|
||||
Return EXACTLY ONE complete JSON object. Do not generate multiple JSON objects, alternatives, or variations. Do not use separators like "---" between JSON objects.
|
||||
|
||||
## USER REQUEST (for context)
|
||||
```
|
||||
{userPrompt}
|
||||
```
|
||||
|
||||
## AVAILABLE CONTENT PARTS
|
||||
{contentPartsIndex}
|
||||
|
||||
## CONTENT ASSIGNMENT RULE
|
||||
|
||||
CRITICAL: Every chapter MUST have contentParts assigned if it relates to documents/images/data from the user request.
|
||||
If the user request mentions documents/images/data, then EVERY chapter that generates content related to those references MUST assign the relevant ContentParts explicitly.
|
||||
|
||||
Assignment logic:
|
||||
- If chapter DISPLAYS a document/image → assign "object" format ContentPart with "caption"
|
||||
- If chapter generates text content ABOUT a document/image/data → assign ContentPart with "instruction":
|
||||
- Prefer "extracted" format if available (contains analyzed/extracted content)
|
||||
- If only "object" format is available, use "object" format with "instruction" (to write ABOUT the image/document)
|
||||
- If chapter's generationHint or purpose relates to a document/image/data mentioned in user request → it MUST have ContentParts assigned
|
||||
- Multiple chapters might assign the same ContentPart (e.g., one chapter displays image, another writes about it)
|
||||
- Use ContentPart IDs exactly as listed in AVAILABLE CONTENT PARTS above
|
||||
- Empty contentParts are only allowed if chapter generates content WITHOUT referencing any documents/images/data from the user request
|
||||
|
||||
CRITICAL RULE: If the user request mentions BOTH:
|
||||
a) Documents/images/data (listed in AVAILABLE CONTENT PARTS above), AND
|
||||
b) Generic content types (article text, main content, body text, etc.)
|
||||
Then chapters that generate those generic content types MUST assign the relevant ContentParts, because the content should relate to or be based on the provided documents/images/data.
|
||||
|
||||
## CONTENT EFFICIENCY PRINCIPLES
|
||||
- Generate COMPACT content: Focus on essential information only
|
||||
- AVOID verbose, lengthy, or repetitive text - be concise and direct
|
||||
- Prioritize FACTS over filler text - no introductions like "In this chapter..."
|
||||
- Minimize system resources: shorter content = faster processing
|
||||
- Quality over quantity: precise, meaningful content rather than padding
|
||||
|
||||
## CHAPTER STRUCTURE REQUIREMENTS
|
||||
- Generate chapters based on USER REQUEST - analyze what structure the user wants
|
||||
- Create ONLY the minimum chapters needed to cover the user's request - avoid over-structuring
|
||||
- IMPORTANT: Each chapter MUST have ALL these fields:
|
||||
- id: Unique identifier (e.g., "chapter_1")
|
||||
- level: Heading level (1, 2, 3, etc.)
|
||||
- title: Chapter title
|
||||
- contentParts: Object mapping ContentPart IDs to usage instructions (MUST assign if chapter relates to documents/data from user request)
|
||||
- generationHint: Description of what content to generate (including formatting/styling requirements)
|
||||
- sections: Empty array [] (REQUIRED - sections are generated in next phase)
|
||||
- contentParts: {{"partId": {{"instruction": "..."}} or {{"caption": "..."}} or both}} - Assign ContentParts as required by CONTENT ASSIGNMENT RULE above
|
||||
- The "instruction" field for each ContentPart MUST contain ALL relevant details from the USER REQUEST that apply to content extraction for this specific chapter. Include all formatting rules, data requirements, constraints, and specifications mentioned in the user request that are relevant for processing this ContentPart in this chapter.
|
||||
- generationHint: Keep CONCISE but include relevant details from the USER REQUEST. Focus on WHAT to generate, not HOW to phrase it verbosely.
|
||||
- The number of chapters depends on the user request - create only what is requested. Do NOT create chapters for topics without available data.
|
||||
|
||||
CRITICAL: Only create chapters for CONTENT sections, not for formatting/styling requirements. Formatting/styling requirements should be included in the relevant generationHint if needed.
|
||||
|
||||
## DOCUMENT STRUCTURE
|
||||
|
||||
For each document, determine:
|
||||
- outputFormat: From USER REQUEST (explicit mention or infer from purpose/content type). Default: "{outputFormat}". Multiple documents can have different formats.
|
||||
- language: From USER REQUEST (map to ISO 639-1: de, en, fr, it...). Default: "{language}". Multiple documents can have different languages.
|
||||
- chapters: Structure appropriately for the format (e.g., pptx=slides, docx=sections, xlsx=worksheets). Match format capabilities and constraints.
|
||||
|
||||
Required JSON fields:
|
||||
- metadata: {{"title": "...", "language": "..."}}
|
||||
- documents: Array with id, title, filename, outputFormat, language, chapters[]
|
||||
- chapters: Array with id, level, title, contentParts, generationHint, sections[]
|
||||
|
||||
EXAMPLE STRUCTURE (for reference only - adapt to user request):
|
||||
{{
|
||||
"metadata": {{
|
||||
"title": "Document Title",
|
||||
"language": "{language}"
|
||||
}},
|
||||
"documents": [{{
|
||||
"id": "doc_1",
|
||||
"title": "Document Title",
|
||||
"filename": "document.{outputFormat}",
|
||||
"outputFormat": "{outputFormat}",
|
||||
"language": "{language}",
|
||||
"chapters": [
|
||||
{{
|
||||
"id": "chapter_1",
|
||||
"level": 1,
|
||||
"title": "Chapter Title",
|
||||
"contentParts": {{
|
||||
"extracted_part_id": {{
|
||||
"instruction": "Use extracted content with ALL relevant details from user request"
|
||||
}}
|
||||
}},
|
||||
"generationHint": "Detailed description including ALL relevant details from user request for this chapter",
|
||||
"sections": []
|
||||
}}
|
||||
]
|
||||
}}]
|
||||
}}
|
||||
|
||||
CRITICAL INSTRUCTIONS:
|
||||
- Generate chapters based on USER REQUEST, NOT based on the example above
|
||||
- The example shows the JSON structure format, NOT the required chapters
|
||||
- Create only the chapters that match the user's request
|
||||
- Adapt chapter titles and structure to match the user's specific request
|
||||
- Determine outputFormat and language for each document by analyzing the USER REQUEST above
|
||||
- The example uses the fallback values "{outputFormat}" and "{language}" - REPLACE THESE with the actual values determined from the USER REQUEST where they differ
|
||||
|
||||
MANDATORY CONTENT ASSIGNMENT CHECK:
|
||||
For each chapter, verify:
|
||||
1. Does the user request mention documents/images/data? (e.g., "photo", "image", "document", "data", "based on", "about")
|
||||
2. Does this chapter's generationHint, title, or purpose relate to those documents/images/data mentioned in step 1?
|
||||
- Examples: "article about the photo", "text describing the image", "analysis of the document", "content based on the data"
|
||||
- Even if chapter doesn't explicitly say "about the image", if user request mentions both the image AND this chapter's content type → relate them
|
||||
3. If YES to both → chapter MUST have contentParts assigned (cannot be empty {{}})
|
||||
4. If ContentPart is "object" format and chapter needs to write ABOUT it → assign with "instruction" field, not just "caption"
|
||||
|
||||
OUTPUT FORMAT: Start with {{ and end with }}. Do NOT use markdown code fences (```json). Do NOT add explanatory text before or after the JSON. Return ONLY the JSON object itself.
|
||||
"""
|
||||
return prompt, templateStructure
|
||||
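The ContentPart filtering above (keep reference parts, drop empty ones) can be illustrated with a standalone sketch. The `FakePart` stand-in is a simplified assumption; the real datamodel lives in `modules.datamodels.datamodelExtraction`.

```python
# Standalone illustration of the filtering rule used in _buildChapterStructurePrompt;
# FakePart is a simplified stand-in for ContentPart.
from dataclasses import dataclass, field

@dataclass
class FakePart:
    id: str
    typeGroup: str
    data: str = ""
    metadata: dict = field(default_factory=dict)

def filterParts(parts):
    valid, filtered = [], []
    for part in parts:
        contentFormat = part.metadata.get("contentFormat", "unknown")
        if contentFormat == "reference":
            valid.append(part)  # reference parts are intentionally empty - keep them
            continue
        if not part.data or not str(part.data).strip():
            filtered.append((part.id, f"no data (format: {contentFormat})"))
            continue
        valid.append(part)
    return valid, filtered

parts = [
    FakePart("p1", "text", "Some extracted text", {"contentFormat": "extracted"}),
    FakePart("p2", "container", "", {"contentFormat": "container"}),
    FakePart("p3", "reference", "", {"contentFormat": "reference"}),
]
print(filterParts(parts))  # p1 and p3 are kept, p2 is filtered out
```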
|
||||
|
|
@ -1,7 +0,0 @@
|
|||
# Copyright (c) 2025 Patrick Motsch
|
||||
# All rights reserved.
|
||||
"""Billing service module."""
|
||||
|
||||
from .mainServiceBilling import BillingService, getService
|
||||
|
||||
__all__ = ["BillingService", "getService"]
|
||||
|
|
@ -1,417 +0,0 @@
|
|||
# Copyright (c) 2025 Patrick Motsch
|
||||
# All rights reserved.
|
||||
"""
|
||||
Billing Service - Central service for billing operations.
|
||||
|
||||
Handles:
|
||||
- Balance checks before AI operations
|
||||
- Cost recording after AI operations
|
||||
- Provider permission checks via RBAC
|
||||
- Price calculation with markup
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, Any, List, Optional
|
||||
from datetime import datetime
|
||||
|
||||
from modules.datamodels.datamodelUam import User
|
||||
from modules.datamodels.datamodelBilling import (
|
||||
BillingModelEnum,
|
||||
BillingCheckResult,
|
||||
TransactionTypeEnum,
|
||||
ReferenceTypeEnum,
|
||||
BillingTransaction,
|
||||
BillingBalanceResponse,
|
||||
)
|
||||
from modules.interfaces.interfaceDbBilling import getInterface as getBillingInterface
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Markup percentage for internal pricing (+50% for infrastructure and platform service + 50% for currency risk => factor 2.0)
|
||||
BILLING_MARKUP_PERCENT = 100
|
||||
|
||||
# Singleton cache
|
||||
_billingServices: Dict[str, "BillingService"] = {}
|
||||
|
||||
|
||||
def getService(currentUser: User, mandateId: str, featureInstanceId: str = None, featureCode: str = None) -> "BillingService":
|
||||
"""
|
||||
Factory function to get or create a BillingService instance.
|
||||
|
||||
Args:
|
||||
currentUser: Current user object
|
||||
mandateId: Mandate ID for context
|
||||
featureInstanceId: Optional feature instance ID
|
||||
featureCode: Optional feature code (e.g., 'chatplayground', 'automation')
|
||||
|
||||
Returns:
|
||||
BillingService instance
|
||||
"""
|
||||
cacheKey = f"{currentUser.id}_{mandateId}_{featureInstanceId}"
|
||||
|
||||
if cacheKey not in _billingServices:
|
||||
_billingServices[cacheKey] = BillingService(currentUser, mandateId, featureInstanceId, featureCode)
|
||||
else:
|
||||
_billingServices[cacheKey].setContext(currentUser, mandateId, featureInstanceId, featureCode)
|
||||
|
||||
return _billingServices[cacheKey]
|
||||
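A hedged usage sketch of the factory above; `currentUser` is assumed to come from the request context, and the IDs and feature code are placeholders taken from the surrounding docstrings.

```python
# Hedged sketch: currentUser, mandateId and featureCode are placeholders.
billing = getService(
    currentUser=currentUser,      # User object from the request context (assumed)
    mandateId="mandate_123",
    featureCode="chatplayground",
)
# A second call with the same (user, mandate, featureInstance) key returns the cached
# instance; setContext() refreshes user/mandate/feature on that cached object.
```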
|
||||
|
||||
class BillingService:
|
||||
"""
|
||||
Central billing service for AI operations.
|
||||
|
||||
Responsibilities:
|
||||
- Check balance before operations
|
||||
- Record usage costs
|
||||
- Apply pricing markup
|
||||
- Check provider permissions via RBAC
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
currentUser: User,
|
||||
mandateId: str,
|
||||
featureInstanceId: str = None,
|
||||
featureCode: str = None
|
||||
):
|
||||
"""
|
||||
Initialize the billing service.
|
||||
|
||||
Args:
|
||||
currentUser: Current user object
|
||||
mandateId: Mandate ID
|
||||
featureInstanceId: Optional feature instance ID
|
||||
featureCode: Optional feature code
|
||||
"""
|
||||
self.currentUser = currentUser
|
||||
self.mandateId = mandateId
|
||||
self.featureInstanceId = featureInstanceId
|
||||
self.featureCode = featureCode
|
||||
|
||||
# Get billing interface
|
||||
self._billingInterface = getBillingInterface(currentUser, mandateId)
|
||||
|
||||
# Cache settings
|
||||
self._settingsCache = None
|
||||
|
||||
def setContext(
|
||||
self,
|
||||
currentUser: User,
|
||||
mandateId: str,
|
||||
featureInstanceId: str = None,
|
||||
featureCode: str = None
|
||||
):
|
||||
"""Update service context."""
|
||||
self.currentUser = currentUser
|
||||
self.mandateId = mandateId
|
||||
self.featureInstanceId = featureInstanceId
|
||||
self.featureCode = featureCode
|
||||
self._billingInterface = getBillingInterface(currentUser, mandateId)
|
||||
self._settingsCache = None
|
||||
|
||||
def _getSettings(self) -> Optional[Dict[str, Any]]:
|
||||
"""Get billing settings with caching."""
|
||||
if self._settingsCache is None:
|
||||
self._settingsCache = self._billingInterface.getSettings(self.mandateId)
|
||||
return self._settingsCache
|
||||
|
||||
# =========================================================================
|
||||
# Price Calculation
|
||||
# =========================================================================
|
||||
|
||||
def calculatePriceWithMarkup(self, basePriceCHF: float) -> float:
|
||||
"""
|
||||
Calculate final price with markup.
|
||||
|
||||
The AICore plugins return prices in their original currency (USD).
|
||||
This method applies the configured markup percentage.
|
||||
|
||||
Args:
|
||||
basePriceCHF: Base price from AI model (actually USD from provider)
|
||||
|
||||
Returns:
|
||||
Final price in CHF with markup applied
|
||||
"""
|
||||
if basePriceCHF <= 0:
|
||||
return 0.0
|
||||
|
||||
# Apply markup (e.g., 100% markup = multiply by 2.0)
|
||||
markup_multiplier = 1 + (BILLING_MARKUP_PERCENT / 100)
|
||||
return round(basePriceCHF * markup_multiplier, 6)
|
||||
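As a quick check of the markup arithmetic, a standalone sketch independent of the class:

```python
# Standalone check of the arithmetic used in calculatePriceWithMarkup.
BILLING_MARKUP_PERCENT = 100  # +50% infrastructure/platform +50% currency risk => factor 2.0

def with_markup(base_price: float) -> float:
    if base_price <= 0:
        return 0.0
    return round(base_price * (1 + BILLING_MARKUP_PERCENT / 100), 6)

assert with_markup(0.01) == 0.02   # 100% markup doubles the provider price
assert with_markup(-1.0) == 0.0    # non-positive prices are not billed
```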
|
||||
# =========================================================================
|
||||
# Balance Operations
|
||||
# =========================================================================
|
||||
|
||||
def checkBalance(self, estimatedCost: float = 0.0) -> BillingCheckResult:
|
||||
"""
|
||||
Check if the current user/mandate has sufficient balance.
|
||||
|
||||
Args:
|
||||
estimatedCost: Estimated cost of the operation (with markup applied)
|
||||
|
||||
Returns:
|
||||
BillingCheckResult indicating if operation is allowed
|
||||
"""
|
||||
return self._billingInterface.checkBalance(
|
||||
self.mandateId,
|
||||
self.currentUser.id,
|
||||
estimatedCost
|
||||
)
|
||||
|
||||
def hasBalance(self, estimatedCost: float = 0.0) -> bool:
|
||||
"""
|
||||
Quick check if balance is sufficient.
|
||||
|
||||
Args:
|
||||
estimatedCost: Estimated cost with markup
|
||||
|
||||
Returns:
|
||||
True if operation is allowed
|
||||
"""
|
||||
result = self.checkBalance(estimatedCost)
|
||||
return result.allowed
|
||||
|
||||
def getCurrentBalance(self) -> float:
|
||||
"""
|
||||
Get current balance for the user/mandate.
|
||||
|
||||
Returns:
|
||||
Current balance in CHF
|
||||
"""
|
||||
result = self.checkBalance(0.0)
|
||||
return result.currentBalance or 0.0
|
||||
|
||||
# =========================================================================
|
||||
# Usage Recording
|
||||
# =========================================================================
|
||||
|
||||
def recordUsage(
|
||||
self,
|
||||
priceCHF: float,
|
||||
workflowId: str = None,
|
||||
aicoreProvider: str = None,
|
||||
aicoreModel: str = None,
|
||||
description: str = None
|
||||
) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Record AI usage cost as a billing transaction.
|
||||
|
||||
This method:
|
||||
1. Applies the pricing markup
|
||||
2. Creates a DEBIT transaction
|
||||
3. Updates the account balance
|
||||
|
||||
Args:
|
||||
priceCHF: Base price from AI model (before markup)
|
||||
workflowId: Optional workflow ID
|
||||
aicoreProvider: AICore provider name (e.g., 'anthropic', 'openai')
|
||||
aicoreModel: AICore model name (e.g., 'claude-4-sonnet', 'gpt-4o')
|
||||
description: Optional description
|
||||
|
||||
Returns:
|
||||
Created transaction dict or None if not recorded
|
||||
"""
|
||||
if priceCHF <= 0:
|
||||
return None
|
||||
|
||||
# Apply markup
|
||||
finalPrice = self.calculatePriceWithMarkup(priceCHF)
|
||||
|
||||
if finalPrice <= 0:
|
||||
return None
|
||||
|
||||
# Build description
|
||||
if not description:
|
||||
description = f"AI Usage: {aicoreModel or aicoreProvider or 'unknown'}"
|
||||
|
||||
return self._billingInterface.recordUsage(
|
||||
mandateId=self.mandateId,
|
||||
userId=self.currentUser.id,
|
||||
priceCHF=finalPrice,
|
||||
workflowId=workflowId,
|
||||
featureInstanceId=self.featureInstanceId,
|
||||
featureCode=self.featureCode,
|
||||
aicoreProvider=aicoreProvider,
|
||||
aicoreModel=aicoreModel,
|
||||
description=description
|
||||
)
|
||||
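A hedged sketch of recording a completed AI call; the price and IDs are placeholders, and the shape of the returned transaction dict depends on the billing interface and is not shown here.

```python
# Hedged sketch: priceCHF, workflowId and model names are placeholders.
txn = billing.recordUsage(
    priceCHF=0.0123,               # raw provider price; markup is applied inside
    workflowId="wf_42",
    aicoreProvider="anthropic",
    aicoreModel="claude-4-sonnet",
)
if txn is None:
    # nothing was recorded (zero price, or markup produced a non-positive amount)
    pass
```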
|
||||
# =========================================================================
|
||||
# Provider Permission Check (via RBAC)
|
||||
# =========================================================================
|
||||
|
||||
def isProviderAllowed(self, provider: str) -> bool:
|
||||
"""
|
||||
Check if the user has permission to use an AICore provider.
|
||||
|
||||
Uses RBAC to check for resource permission:
|
||||
resource.aicore.{provider}
|
||||
|
||||
Args:
|
||||
provider: Provider name (e.g., 'anthropic', 'openai')
|
||||
|
||||
Returns:
|
||||
True if provider is allowed
|
||||
"""
|
||||
try:
|
||||
from modules.security.rbac import RbacClass
|
||||
from modules.datamodels.datamodelRbac import AccessRuleContext
|
||||
from modules.security.rootAccess import getRootDbAppConnector
|
||||
|
||||
# Get database connector via established pattern
|
||||
dbApp = getRootDbAppConnector()
|
||||
|
||||
rbac = RbacClass(dbApp, dbApp)
|
||||
resourceKey = f"resource.aicore.{provider}"
|
||||
|
||||
# Check if user has view permission for this resource (view = use for RESOURCE context)
|
||||
permissions = rbac.getUserPermissions(
|
||||
self.currentUser,
|
||||
AccessRuleContext.RESOURCE,
|
||||
resourceKey,
|
||||
mandateId=self.mandateId
|
||||
)
|
||||
|
||||
return permissions.view
|
||||
except Exception as e:
|
||||
logger.warning(f"Error checking provider permission: {e}")
|
||||
# Default to allowed if RBAC check fails
|
||||
return True
|
||||
|
||||
def getallowedProviders(self) -> List[str]:
|
||||
"""
|
||||
Get list of AICore providers the user is allowed to use.
|
||||
|
||||
Returns:
|
||||
List of allowed provider names
|
||||
"""
|
||||
try:
|
||||
from modules.aicore.aicoreModelRegistry import modelRegistry
|
||||
|
||||
# Get all available providers
|
||||
connectors = modelRegistry.discoverConnectors()
|
||||
allProviders = [c.getConnectorType() for c in connectors]
|
||||
|
||||
# Filter by RBAC permissions
|
||||
return [p for p in allProviders if self.isProviderAllowed(p)]
|
||||
except Exception as e:
|
||||
logger.warning(f"Error getting allowed providers: {e}")
|
||||
return []
|
||||
|
||||
# =========================================================================
|
||||
# Admin Operations
|
||||
# =========================================================================
|
||||
|
||||
def addCredit(
|
||||
self,
|
||||
amount: float,
|
||||
description: str = "Manual credit",
|
||||
referenceType: ReferenceTypeEnum = ReferenceTypeEnum.ADMIN
|
||||
) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Add credit to the account (admin operation).
|
||||
|
||||
Args:
|
||||
amount: Amount to credit (positive)
|
||||
description: Transaction description
|
||||
referenceType: Reference type (ADMIN, PAYMENT, SYSTEM)
|
||||
|
||||
Returns:
|
||||
Created transaction dict or None
|
||||
"""
|
||||
if amount <= 0:
|
||||
return None
|
||||
|
||||
settings = self._getSettings()
|
||||
if not settings:
|
||||
logger.warning(f"No billing settings for mandate {self.mandateId}")
|
||||
return None
|
||||
|
||||
billingModel = BillingModelEnum(settings.get("billingModel", BillingModelEnum.UNLIMITED.value))
|
||||
|
||||
# Get or create account
|
||||
if billingModel == BillingModelEnum.PREPAY_USER:
|
||||
account = self._billingInterface.getOrCreateUserAccount(
|
||||
self.mandateId,
|
||||
self.currentUser.id,
|
||||
initialBalance=0.0
|
||||
)
|
||||
else:
|
||||
account = self._billingInterface.getOrCreateMandateAccount(
|
||||
self.mandateId,
|
||||
initialBalance=0.0
|
||||
)
|
||||
|
||||
# Create credit transaction
|
||||
transaction = BillingTransaction(
|
||||
accountId=account["id"],
|
||||
transactionType=TransactionTypeEnum.CREDIT,
|
||||
amount=amount,
|
||||
description=description,
|
||||
referenceType=referenceType
|
||||
)
|
||||
|
||||
return self._billingInterface.createTransaction(transaction)
|
||||
|
||||
# =========================================================================
|
||||
# Statistics & Reporting
|
||||
# =========================================================================
|
||||
|
||||
def getBalancesForUser(self) -> List[BillingBalanceResponse]:
|
||||
"""
|
||||
Get all billing balances for the current user.
|
||||
|
||||
Returns:
|
||||
List of balance responses for each mandate
|
||||
"""
|
||||
return self._billingInterface.getBalancesForUser(self.currentUser.id)
|
||||
|
||||
def getTransactionHistory(self, limit: int = 100) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Get transaction history for the user across all mandates.
|
||||
|
||||
Args:
|
||||
limit: Maximum number of transactions
|
||||
|
||||
Returns:
|
||||
List of transactions
|
||||
"""
|
||||
return self._billingInterface.getTransactionsForUser(self.currentUser.id, limit=limit)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Exception Classes
|
||||
# ============================================================================
|
||||
|
||||
class InsufficientBalanceException(Exception):
|
||||
"""Raised when there's insufficient balance for an operation."""
|
||||
|
||||
def __init__(self, currentBalance: float, requiredAmount: float, message: str = None):
|
||||
self.currentBalance = currentBalance
|
||||
self.requiredAmount = requiredAmount
|
||||
self.message = message or f"Insufficient balance. Current: {currentBalance:.2f} CHF, Required: {requiredAmount:.2f} CHF"
|
||||
super().__init__(self.message)
|
||||
|
||||
|
||||
class ProviderNotAllowedException(Exception):
|
||||
"""Raised when a user doesn't have permission to use an AI provider."""
|
||||
|
||||
def __init__(self, provider: str, message: str = None):
|
||||
self.provider = provider
|
||||
self.message = message or f"Provider '{provider}' is not allowed for your role"
|
||||
super().__init__(self.message)
|
||||
|
||||
|
||||
class BillingContextError(Exception):
|
||||
"""Raised when billing context is incomplete (missing mandateId, user, etc.).
|
||||
|
||||
This is a FAIL-SAFE error: AI calls MUST NOT proceed without valid billing context.
|
||||
Acts like a 0 CHF credit card pre-authorization check - validates that billing
|
||||
CAN be recorded before any expensive AI operation starts.
|
||||
"""
|
||||
|
||||
def __init__(self, message: str = None):
|
||||
self.message = message or "Billing context incomplete - AI call blocked"
|
||||
super().__init__(self.message)
|
||||
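A minimal sketch of how these exceptions could be combined with the balance and provider checks before an AI call; the cost estimate and calling convention are assumptions, not prescribed by this module.

```python
# Hedged sketch: the estimate value and calling convention are assumptions.
estimated = billing.calculatePriceWithMarkup(0.01)
result = billing.checkBalance(estimated)
if not result.allowed:
    raise InsufficientBalanceException(
        currentBalance=result.currentBalance or 0.0,
        requiredAmount=estimated,
    )
if not billing.isProviderAllowed("openai"):
    raise ProviderNotAllowedException("openai")
```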
|
|
@ -1,104 +0,0 @@
|
|||
# Copyright (c) 2025 Patrick Motsch
|
||||
# All rights reserved.
|
||||
"""
|
||||
Stripe Checkout service for billing credit top-ups.
|
||||
Creates Checkout Sessions for redirect-based payment flow.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
from modules.shared.configuration import APP_CONFIG
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Server-side allowed amounts in CHF - never trust client
|
||||
ALLOWED_AMOUNTS_CHF = [10, 25, 50, 100, 250, 500]
|
||||
|
||||
|
||||
def create_checkout_session(
|
||||
mandate_id: str,
|
||||
user_id: Optional[str],
|
||||
amount_chf: float
|
||||
) -> str:
|
||||
"""
|
||||
Create a Stripe Checkout Session for credit top-up.
|
||||
|
||||
Amount and currency are validated server-side. The client-provided amount
|
||||
must match an allowed preset.
|
||||
|
||||
Args:
|
||||
mandate_id: Target mandate ID
|
||||
user_id: Target user ID (for PREPAY_USER) or None (for mandate pool)
|
||||
amount_chf: Amount in CHF (must be in ALLOWED_AMOUNTS_CHF)
|
||||
|
||||
Returns:
|
||||
Stripe Checkout Session URL for redirect
|
||||
|
||||
Raises:
|
||||
ValueError: If amount is invalid
|
||||
"""
|
||||
import stripe
|
||||
|
||||
# Validate amount server-side
|
||||
if amount_chf not in ALLOWED_AMOUNTS_CHF:
|
||||
raise ValueError(
|
||||
f"Invalid amount {amount_chf} CHF. Allowed: {ALLOWED_AMOUNTS_CHF}"
|
||||
)
|
||||
|
||||
# Pin API version from config (match Stripe Dashboard)
|
||||
api_version = APP_CONFIG.get("STRIPE_API_VERSION")
|
||||
if api_version:
|
||||
stripe.api_version = api_version
|
||||
|
||||
# Get secrets
|
||||
secret_key = APP_CONFIG.get("STRIPE_SECRET_KEY_SECRET") or APP_CONFIG.get("STRIPE_SECRET_KEY")
|
||||
if not secret_key:
|
||||
raise ValueError("STRIPE_SECRET_KEY_SECRET not configured")
|
||||
|
||||
stripe.api_key = secret_key
|
||||
|
||||
frontend_url = APP_CONFIG.get("APP_FRONTEND_URL", "https://nyla-int.poweron-center.net")
|
||||
base_path = "/admin/billing"
|
||||
success_url = f"{frontend_url.rstrip('/')}{base_path}?success=true&session_id={{CHECKOUT_SESSION_ID}}"
|
||||
cancel_url = f"{frontend_url.rstrip('/')}{base_path}?canceled=true"
|
||||
|
||||
# Amount in cents for Stripe (CHF uses 2 decimal places)
|
||||
amount_cents = int(round(amount_chf * 100))
|
||||
|
||||
metadata = {
|
||||
"mandateId": mandate_id,
|
||||
"amountChf": str(amount_chf),
|
||||
}
|
||||
if user_id:
|
||||
metadata["userId"] = user_id
|
||||
|
||||
session = stripe.checkout.Session.create(
|
||||
mode="payment",
|
||||
line_items=[
|
||||
{
|
||||
"price_data": {
|
||||
"currency": "chf",
|
||||
"unit_amount": amount_cents,
|
||||
"product_data": {
|
||||
"name": "Guthaben aufladen",
|
||||
"description": "AI Service Guthaben (CHF)",
|
||||
},
|
||||
},
|
||||
"quantity": 1,
|
||||
}
|
||||
],
|
||||
success_url=success_url,
|
||||
cancel_url=cancel_url,
|
||||
metadata=metadata,
|
||||
)
|
||||
|
||||
if not session or not session.url:
|
||||
raise ValueError("Stripe Checkout Session creation failed")
|
||||
|
||||
logger.info(
|
||||
f"Created Stripe Checkout Session {session.id} for mandate {mandate_id}, "
|
||||
f"amount {amount_chf} CHF"
|
||||
)
|
||||
|
||||
return session.url
|
||||
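A hedged usage sketch of the checkout flow; mandate and user IDs are placeholders, and the amount must be one of the server-side presets in ALLOWED_AMOUNTS_CHF or a ValueError is raised.

```python
# Hedged sketch: IDs are placeholders.
checkout_url = create_checkout_session(
    mandate_id="mandate_123",
    user_id=None,       # None = credit the mandate pool instead of a single user
    amount_chf=50,
)
# The caller redirects the browser to checkout_url; Stripe sends the user back to
# APP_FRONTEND_URL/admin/billing with success=true or canceled=true.
```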
File diff suppressed because it is too large
|
|
@ -1,7 +0,0 @@
|
|||
# Copyright (c) 2025 Patrick Motsch
|
||||
# All rights reserved.
|
||||
from .mainServiceExtraction import ExtractionService
|
||||
|
||||
__all__ = ["ExtractionService"]
|
||||
|
||||
|
||||
|
|
@ -1,4 +0,0 @@
|
|||
# Copyright (c) 2025 Patrick Motsch
|
||||
# All rights reserved.
|
||||
|
||||
|
||||
|
|
@ -1,184 +0,0 @@
|
|||
# Copyright (c) 2025 Patrick Motsch
|
||||
# All rights reserved.
|
||||
from typing import Any, Dict, List
|
||||
import base64
|
||||
import io
|
||||
|
||||
from modules.datamodels.datamodelExtraction import ContentPart
|
||||
from ..subRegistry import Chunker
|
||||
|
||||
|
||||
class ImageChunker(Chunker):
|
||||
"""Chunker for reducing image size through resizing, compression, and tiling."""
|
||||
|
||||
def chunk(self, part: ContentPart, options: Dict[str, Any]) -> list[Dict[str, Any]]:
|
||||
"""
|
||||
Chunk an image by reducing its size through various strategies.
|
||||
|
||||
Args:
|
||||
part: ContentPart containing image data (base64 encoded)
|
||||
options: Chunking options including:
|
||||
- imageChunkSize: Maximum size in bytes for each chunk
|
||||
- imageMaxPixels: Maximum pixels (width*height) for the image
|
||||
- imageQuality: JPEG quality (0-100, default 85)
|
||||
- imageTileSize: Size for tiling if image is still too large
|
||||
|
||||
Returns:
|
||||
List of image chunks with reduced size
|
||||
"""
|
||||
maxBytes = int(options.get("imageChunkSize", 1000000)) # 1MB default
|
||||
maxPixels = int(options.get("imageMaxPixels", 1024 * 1024)) # 1MP default
|
||||
quality = int(options.get("imageQuality", 85))
|
||||
tileSize = int(options.get("imageTileSize", 512)) # 512x512 tiles
|
||||
|
||||
chunks: List[Dict[str, Any]] = []
|
||||
|
||||
try:
|
||||
# Lazy import PIL to avoid hanging during module import
|
||||
from PIL import Image
|
||||
|
||||
# Decode base64 image data
|
||||
imageData = base64.b64decode(part.data)
|
||||
image = Image.open(io.BytesIO(imageData))
|
||||
|
||||
# Get original dimensions
|
||||
originalWidth, originalHeight = image.size
|
||||
originalPixels = originalWidth * originalHeight
|
||||
|
||||
# Strategy 1: If image is small enough, return as-is
|
||||
if len(part.data) <= maxBytes and originalPixels <= maxPixels:
|
||||
chunks.append({
|
||||
"data": part.data,
|
||||
"size": len(part.data),
|
||||
"order": 0,
|
||||
"metadata": {
|
||||
"originalSize": len(part.data),
|
||||
"originalPixels": originalPixels,
|
||||
"strategy": "original"
|
||||
}
|
||||
})
|
||||
return chunks
|
||||
|
||||
# Strategy 2: Resize to fit within pixel limit
|
||||
if originalPixels > maxPixels:
|
||||
# Calculate new dimensions maintaining aspect ratio
|
||||
scale = (maxPixels / originalPixels) ** 0.5
|
||||
newWidth = int(originalWidth * scale)
|
||||
newHeight = int(originalHeight * scale)
|
||||
|
||||
# Ensure minimum size
|
||||
newWidth = max(newWidth, 64)
|
||||
newHeight = max(newHeight, 64)
|
||||
|
||||
image = image.resize((newWidth, newHeight), Image.Resampling.LANCZOS)
|
||||
|
||||
# Strategy 3: Compress with quality reduction
|
||||
currentSize = len(part.data)
|
||||
currentQuality = quality
|
||||
|
||||
while currentSize > maxBytes and currentQuality > 10:
|
||||
# Compress image
|
||||
output = io.BytesIO()
|
||||
image.save(output, format='JPEG', quality=currentQuality, optimize=True)
|
||||
compressedData = output.getvalue()
|
||||
compressedB64 = base64.b64encode(compressedData).decode('utf-8')
|
||||
currentSize = len(compressedB64)
|
||||
|
||||
if currentSize <= maxBytes:
|
||||
chunks.append({
|
||||
"data": compressedB64,
|
||||
"size": currentSize,
|
||||
"order": 0,
|
||||
"metadata": {
|
||||
"originalSize": len(part.data),
|
||||
"originalPixels": originalPixels,
|
||||
"compressedSize": currentSize,
|
||||
"quality": currentQuality,
|
||||
"strategy": "compressed"
|
||||
}
|
||||
})
|
||||
return chunks
|
||||
|
||||
currentQuality -= 10
|
||||
|
||||
# Strategy 4: Tile the image if still too large
|
||||
if currentSize > maxBytes:
|
||||
chunks = self._tileImage(image, maxBytes, tileSize, quality, originalPixels)
|
||||
return chunks
|
||||
|
||||
# Fallback: Return compressed version even if over limit
|
||||
output = io.BytesIO()
|
||||
image.save(output, format='JPEG', quality=10, optimize=True)
|
||||
compressedData = output.getvalue()
|
||||
compressedB64 = base64.b64encode(compressedData).decode('utf-8')
|
||||
|
||||
chunks.append({
|
||||
"data": compressedB64,
|
||||
"size": len(compressedB64),
|
||||
"order": 0,
|
||||
"metadata": {
|
||||
"originalSize": len(part.data),
|
||||
"originalPixels": originalPixels,
|
||||
"compressedSize": len(compressedB64),
|
||||
"quality": 10,
|
||||
"strategy": "fallback_compressed"
|
||||
}
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
# Fallback: Return original data with error metadata
|
||||
chunks.append({
|
||||
"data": part.data,
|
||||
"size": len(part.data),
|
||||
"order": 0,
|
||||
"metadata": {
|
||||
"originalSize": len(part.data),
|
||||
"strategy": "error_fallback",
|
||||
"error": str(e)
|
||||
}
|
||||
})
|
||||
|
||||
return chunks
|
||||
|
||||
def _tileImage(self, image: Any, maxBytes: int, tileSize: int, quality: int, originalPixels: int) -> List[Dict[str, Any]]:
|
||||
"""Split image into tiles if it's still too large after compression."""
|
||||
chunks = []
|
||||
width, height = image.size
|
||||
|
||||
# Calculate tile grid
|
||||
tilesX = (width + tileSize - 1) // tileSize
|
||||
tilesY = (height + tileSize - 1) // tileSize
|
||||
|
||||
for y in range(tilesY):
|
||||
for x in range(tilesX):
|
||||
# Calculate tile boundaries
|
||||
left = x * tileSize
|
||||
top = y * tileSize
|
||||
right = min(left + tileSize, width)
|
||||
bottom = min(top + tileSize, height)
|
||||
|
||||
# Extract tile
|
||||
tile = image.crop((left, top, right, bottom))
|
||||
|
||||
# Compress tile
|
||||
output = io.BytesIO()
|
||||
tile.save(output, format='JPEG', quality=quality, optimize=True)
|
||||
tileData = output.getvalue()
|
||||
tileB64 = base64.b64encode(tileData).decode('utf-8')
|
||||
|
||||
chunks.append({
|
||||
"data": tileB64,
|
||||
"size": len(tileB64),
|
||||
"order": y * tilesX + x,
|
||||
"metadata": {
|
||||
"originalSize": len(image.tobytes()),
|
||||
"originalPixels": originalPixels,
|
||||
"tileSize": tileSize,
|
||||
"tilePosition": f"{x},{y}",
|
||||
"tileBounds": f"{left},{top},{right},{bottom}",
|
||||
"quality": quality,
|
||||
"strategy": "tiled"
|
||||
}
|
||||
})
|
||||
|
||||
return chunks
|
||||
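A hedged usage sketch of the image chunker; the ContentPart construction mirrors the datamodel fields used elsewhere in this diff, the image file is a placeholder, and direct instantiation assumes the Chunker base class permits it.

```python
# Hedged sketch: photo.jpg is a placeholder file; option values are examples.
import base64
from modules.datamodels.datamodelExtraction import ContentPart

imageB64 = base64.b64encode(open("photo.jpg", "rb").read()).decode("utf-8")
part = ContentPart(id="img_1", parentId=None, label="main", typeGroup="image",
                   mimeType="image/jpeg", data=imageB64, metadata={})

chunks = ImageChunker().chunk(part, {
    "imageChunkSize": 500_000,      # max ~500 KB per chunk
    "imageMaxPixels": 1024 * 1024,
    "imageQuality": 85,
    "imageTileSize": 512,
})
for c in chunks:
    print(c["order"], c["size"], c["metadata"]["strategy"])  # original / compressed / tiled / fallback
```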
|
|
@ -1,91 +0,0 @@
|
|||
# Copyright (c) 2025 Patrick Motsch
|
||||
# All rights reserved.
|
||||
from typing import Any, Dict, List
|
||||
import json
|
||||
|
||||
from modules.datamodels.datamodelExtraction import ContentPart
|
||||
from ..subRegistry import Chunker
|
||||
|
||||
|
||||
class StructureChunker(Chunker):
|
||||
def chunk(self, part: ContentPart, options: Dict[str, Any]) -> list[Dict[str, Any]]:
|
||||
maxBytes = int(options.get("structureChunkSize", 40000))
|
||||
data = part.data or ""
|
||||
# best-effort: try JSON list/object bucketing; else fallback to line-based
|
||||
chunks: List[Dict[str, Any]] = []
|
||||
try:
|
||||
obj = json.loads(data)
|
||||
def emit(bucket: Any):
|
||||
text = json.dumps(bucket, ensure_ascii=False)
|
||||
chunks.append({"data": text, "size": len(text.encode('utf-8')), "order": len(chunks)})
|
||||
if isinstance(obj, list):
|
||||
bucket: list[Any] = []
|
||||
size = 0
|
||||
for item in obj:
|
||||
text = json.dumps(item, ensure_ascii=False)
|
||||
s = len(text.encode('utf-8'))
|
||||
if size + s > maxBytes and bucket:
|
||||
emit(bucket)
|
||||
bucket = [item]
|
||||
size = s
|
||||
else:
|
||||
bucket.append(item)
|
||||
size += s
|
||||
if bucket:
|
||||
emit(bucket)
|
||||
else:
|
||||
# JSON object (dict) - check if it fits
|
||||
text = json.dumps(obj, ensure_ascii=False)
|
||||
textSize = len(text.encode('utf-8'))
|
||||
if textSize <= maxBytes:
|
||||
emit(obj)
|
||||
else:
|
||||
# Object too large - try to split by keys if possible
|
||||
# For large objects, we need to chunk by character boundaries
|
||||
# since we can't split JSON objects arbitrarily
|
||||
if isinstance(obj, dict) and len(obj) > 1:
|
||||
# Try to split object into multiple chunks by keys
|
||||
# This preserves JSON structure better than line-based chunking
|
||||
currentChunk: Dict[str, Any] = {}
|
||||
currentSize = 2 # Start with "{}" overhead
|
||||
for key, value in obj.items():
|
||||
itemText = json.dumps({key: value}, ensure_ascii=False)
|
||||
itemSize = len(itemText.encode('utf-8'))
|
||||
# Account for comma and spacing between items
|
||||
if currentChunk:
|
||||
itemSize += 2 # ", " separator
|
||||
|
||||
if currentSize + itemSize > maxBytes and currentChunk:
|
||||
# Current chunk is full, emit it
|
||||
emit(currentChunk)
|
||||
currentChunk = {key: value}
|
||||
currentSize = len(itemText.encode('utf-8'))
|
||||
else:
|
||||
currentChunk[key] = value
|
||||
currentSize += itemSize
|
||||
|
||||
# Emit remaining chunk
|
||||
if currentChunk:
|
||||
emit(currentChunk)
|
||||
else:
|
||||
# Single large value or can't split - fallback to line chunking
|
||||
raise ValueError("too large")
|
||||
except Exception:
|
||||
current: List[str] = []
|
||||
size = 0
|
||||
for line in data.split('\n'):
|
||||
s = len(line.encode('utf-8')) + 1
|
||||
if size + s > maxBytes and current:
|
||||
text = '\n'.join(current)
|
||||
chunks.append({"data": text, "size": len(text.encode('utf-8')), "order": len(chunks)})
|
||||
current = [line]
|
||||
size = s
|
||||
else:
|
||||
current.append(line)
|
||||
size += s
|
||||
if current:
|
||||
text = '\n'.join(current)
|
||||
chunks.append({"data": text, "size": len(text.encode('utf-8')), "order": len(chunks)})
|
||||
return chunks
|
||||
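A hedged sketch of the JSON bucketing path: a JSON array is split into chunks below the byte limit, and each chunk is itself valid JSON. The stand-in object only needs a `.data` attribute for this chunker.

```python
# Hedged sketch: SimpleNamespace stands in for ContentPart (only .data is used here).
import json
from types import SimpleNamespace

items = [{"id": i, "text": "x" * 200} for i in range(500)]
part = SimpleNamespace(data=json.dumps(items))

chunks = StructureChunker().chunk(part, {"structureChunkSize": 10000})
# Each chunk is a valid JSON list slice, so downstream consumers can parse chunks independently.
assert all(isinstance(json.loads(c["data"]), list) for c in chunks)
print(len(chunks))
```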
|
||||
|
||||
|
|
@ -1,30 +0,0 @@
|
|||
# Copyright (c) 2025 Patrick Motsch
|
||||
# All rights reserved.
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from modules.datamodels.datamodelExtraction import ContentPart
|
||||
from ..subRegistry import Chunker
|
||||
|
||||
|
||||
class TableChunker(Chunker):
|
||||
def chunk(self, part: ContentPart, options: Dict[str, Any]) -> list[Dict[str, Any]]:
|
||||
maxBytes = int(options.get("tableChunkSize", 40000))
|
||||
chunks: List[Dict[str, Any]] = []
|
||||
current: List[str] = []
|
||||
size = 0
|
||||
for line in part.data.split('\n'):
|
||||
lineSize = len(line.encode('utf-8')) + 1
|
||||
if size + lineSize > maxBytes and current:
|
||||
data = '\n'.join(current)
|
||||
chunks.append({"data": data, "size": len(data.encode('utf-8')), "order": len(chunks)})
|
||||
current = [line]
|
||||
size = lineSize
|
||||
else:
|
||||
current.append(line)
|
||||
size += lineSize
|
||||
if current:
|
||||
data = '\n'.join(current)
|
||||
chunks.append({"data": data, "size": len(data.encode('utf-8')), "order": len(chunks)})
|
||||
return chunks
|
||||
|
||||
|
||||
|
|
@ -1,58 +0,0 @@
|
|||
# Copyright (c) 2025 Patrick Motsch
|
||||
# All rights reserved.
|
||||
from typing import Any, Dict, List
|
||||
import logging
|
||||
|
||||
from modules.datamodels.datamodelExtraction import ContentPart
|
||||
from ..subRegistry import Chunker
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TextChunker(Chunker):
|
||||
def chunk(self, part: ContentPart, options: Dict[str, Any]) -> list[Dict[str, Any]]:
|
||||
maxBytes = int(options.get("textChunkSize", 40000))
|
||||
logger.debug(f"TextChunker: textChunkSize from options: {options.get('textChunkSize', 'NOT_FOUND')}")
|
||||
logger.debug(f"TextChunker: using maxBytes: {maxBytes}")
|
||||
chunks: List[Dict[str, Any]] = []
|
||||
|
||||
# Split by lines first (preferred method for text)
|
||||
lines = part.data.split('\n')
|
||||
current: List[str] = []
|
||||
size = 0
|
||||
|
||||
for line in lines:
|
||||
lineSize = len(line.encode('utf-8')) + 1 # +1 for newline character
|
||||
if size + lineSize > maxBytes and current:
|
||||
# Current chunk is full, save it and start new one
|
||||
data = '\n'.join(current)
|
||||
chunks.append({"data": data, "size": len(data.encode('utf-8')), "order": len(chunks)})
|
||||
current = []
|
||||
size = 0
|
||||
|
||||
# If a single line is larger than maxBytes, split it by character boundaries
|
||||
if lineSize > maxBytes:
|
||||
# Split the long line into chunks
|
||||
lineBytes = line.encode('utf-8')
|
||||
lineStart = 0
|
||||
while lineStart < len(lineBytes):
|
||||
chunkBytes = lineBytes[lineStart:lineStart + maxBytes]
|
||||
chunkText = chunkBytes.decode('utf-8', errors='ignore')
|
||||
chunks.append({"data": chunkText, "size": len(chunkBytes), "order": len(chunks)})
|
||||
lineStart += maxBytes
|
||||
# Don't add this line to current, it's already chunked
|
||||
continue
|
||||
|
||||
# Add line to current chunk
|
||||
current.append(line)
|
||||
size += lineSize
|
||||
|
||||
# Add remaining lines as final chunk
|
||||
if current:
|
||||
data = '\n'.join(current)
|
||||
chunks.append({"data": data, "size": len(data.encode('utf-8')), "order": len(chunks)})
|
||||
|
||||
logger.debug(f"TextChunker: Created {len(chunks)} chunks, total input size: {len(part.data.encode('utf-8'))} bytes")
|
||||
return chunks
|
||||
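A hedged sketch of the line-based text chunking; the stand-in object only needs a `.data` attribute, and direct instantiation assumes the Chunker base class permits it.

```python
# Hedged sketch: SimpleNamespace stands in for ContentPart (only .data is used here).
from types import SimpleNamespace

longText = "\n".join(f"line {i}: " + "x" * 80 for i in range(2000))
part = SimpleNamespace(data=longText)

chunks = TextChunker().chunk(part, {"textChunkSize": 40000})
print(len(chunks), max(c["size"] for c in chunks))  # every chunk stays <= 40000 bytes
```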
|
||||
|
||||
|
|
@ -1,4 +0,0 @@
|
|||
# Copyright (c) 2025 Patrick Motsch
|
||||
# All rights reserved.
|
||||
|
||||
|
||||
|
|
@ -1,47 +0,0 @@
|
|||
# Copyright (c) 2025 Patrick Motsch
|
||||
# All rights reserved.
|
||||
from typing import Any, Dict, List
|
||||
import base64
|
||||
|
||||
from ..subUtils import makeId
|
||||
from modules.datamodels.datamodelExtraction import ContentPart
|
||||
from ..subRegistry import Extractor
|
||||
|
||||
|
||||
class BinaryExtractor(Extractor):
|
||||
"""
|
||||
Fallback extractor for unsupported file types.
|
||||
|
||||
This extractor handles any file type that doesn't match other extractors.
|
||||
It encodes the file as base64 and marks it as binary data.
|
||||
|
||||
Supported formats:
|
||||
- All file types (fallback)
|
||||
- MIME types: application/octet-stream (default)
|
||||
- File extensions: All (fallback)
|
||||
"""
|
||||
|
||||
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
|
||||
return True
|
||||
|
||||
def getSupportedExtensions(self) -> list[str]:
|
||||
"""Return list of supported file extensions (all)."""
|
||||
return [] # Accepts all extensions as fallback
|
||||
|
||||
def getSupportedMimeTypes(self) -> list[str]:
|
||||
"""Return list of supported MIME types (all)."""
|
||||
return [] # Accepts all MIME types as fallback
|
||||
|
||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
||||
mimeType = context.get("mimeType") or "application/octet-stream"
|
||||
return [ContentPart(
|
||||
id=makeId(),
|
||||
parentId=None,
|
||||
label="binary",
|
||||
typeGroup="binary",
|
||||
mimeType=mimeType,
|
||||
data=base64.b64encode(fileBytes).decode("utf-8"),
|
||||
metadata={"size": len(fileBytes), "warning": "Unsupported file type"}
|
||||
)]
|
||||
|
||||
|
||||
|
|
@ -1,45 +0,0 @@
|
|||
# Copyright (c) 2025 Patrick Motsch
|
||||
# All rights reserved.
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from modules.datamodels.datamodelExtraction import ContentPart
|
||||
from ..subUtils import makeId
|
||||
from ..subRegistry import Extractor
|
||||
|
||||
|
||||
class CsvExtractor(Extractor):
|
||||
"""
|
||||
Extractor for CSV files.
|
||||
|
||||
Supported formats:
|
||||
- MIME types: text/csv
|
||||
- File extensions: .csv
|
||||
- Special handling: Treats as table data
|
||||
"""
|
||||
|
||||
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
|
||||
return mimeType == "text/csv" or (fileName or "").lower().endswith(".csv")
|
||||
|
||||
def getSupportedExtensions(self) -> list[str]:
|
||||
"""Return list of supported file extensions."""
|
||||
return [".csv"]
|
||||
|
||||
def getSupportedMimeTypes(self) -> list[str]:
|
||||
"""Return list of supported MIME types."""
|
||||
return ["text/csv"]
|
||||
|
||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
|
||||
fileName = context.get("fileName")
|
||||
mimeType = context.get("mimeType") or "text/csv"
|
||||
data = fileBytes.decode("utf-8", errors="replace")
|
||||
return [ContentPart(
|
||||
id=makeId(),
|
||||
parentId=None,
|
||||
label="main",
|
||||
typeGroup="table",
|
||||
mimeType=mimeType,
|
||||
data=data,
|
||||
metadata={"size": len(fileBytes)}
|
||||
)]
|
||||
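A short sketch of the detect/extract contract using the CSV extractor above; the sample bytes are illustrative.

```python
# Hedged sketch of the Extractor contract with illustrative CSV bytes.
csvBytes = b"name,amount\nalpha,1\nbeta,2\n"
extractor = CsvExtractor()

if extractor.detect("data.csv", "text/csv", csvBytes[:64]):
    parts = extractor.extract(csvBytes, {"fileName": "data.csv", "mimeType": "text/csv"})
    assert parts[0].typeGroup == "table"
    print(parts[0].data)  # the decoded CSV text, ready for the table chunker
```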
|
||||
|
||||
|
|
@@ -1,109 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List
import io

from ..subUtils import makeId
from modules.datamodels.datamodelExtraction import ContentPart
from ..subRegistry import Extractor


class DocxExtractor(Extractor):
    """
    Extractor for Microsoft Word documents.

    Supported formats:
    - MIME types: application/vnd.openxmlformats-officedocument.wordprocessingml.document
    - File extensions: .docx
    - Special handling: Extracts paragraphs and tables (converts tables to CSV)
    - Dependencies: python-docx
    """

    def __init__(self):
        self._loaded = False
        self._haveLibs = False

    def _load(self):
        if self._loaded:
            return
        self._loaded = True
        try:
            global docx
            import docx  # python-docx
            self._haveLibs = True
        except Exception:
            self._haveLibs = False

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        return mimeType == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" or (fileName or "").lower().endswith(".docx")

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return [".docx"]

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"]

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        self._load()
        parts: List[ContentPart] = []
        rootId = makeId()
        parts.append(ContentPart(
            id=rootId,
            parentId=None,
            label="docx",
            typeGroup="container",
            mimeType="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            data="",
            metadata={"size": len(fileBytes)}
        ))

        if not self._haveLibs:
            parts.append(ContentPart(
                id=makeId(),
                parentId=rootId,
                label="binary",
                typeGroup="binary",
                mimeType="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                data="",
                metadata={"size": len(fileBytes), "warning": "DOCX lib not available"}
            ))
            return parts

        with io.BytesIO(fileBytes) as buf:
            d = docx.Document(buf)
            # paragraphs
            for i, para in enumerate(d.paragraphs):
                text = para.text or ""
                if text.strip():
                    parts.append(ContentPart(
                        id=makeId(),
                        parentId=rootId,
                        label=f"p_{i+1}",
                        typeGroup="text",
                        mimeType="text/plain",
                        data=text,
                        metadata={"size": len(text.encode('utf-8'))}
                    ))
            # tables → CSV rows
            for ti, table in enumerate(d.tables):
                rows: list[str] = []
                for row in table.rows:
                    cells = [(cell.text or "").replace('"', '""') for cell in row.cells]
                    rows.append(",".join([f'"{c}"' for c in cells]))
                csvData = "\n".join(rows)
                if csvData:
                    parts.append(ContentPart(
                        id=makeId(),
                        parentId=rootId,
                        label=f"table_{ti+1}",
                        typeGroup="table",
                        mimeType="text/csv",
                        data=csvData,
                        metadata={"size": len(csvData.encode('utf-8'))}
                    ))

        return parts
@@ -1,50 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List
from bs4 import BeautifulSoup

from modules.datamodels.datamodelExtraction import ContentPart
from ..subUtils import makeId
from ..subRegistry import Extractor


class HtmlExtractor(Extractor):
    """
    Extractor for HTML files.

    Supported formats:
    - MIME types: text/html
    - File extensions: .html, .htm
    - Special handling: Uses BeautifulSoup for parsing
    - Dependencies: beautifulsoup4
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        return mimeType == "text/html" or (fileName or "").lower().endswith((".html", ".htm"))

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return [".html", ".htm"]

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return ["text/html"]

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        mimeType = context.get("mimeType") or "text/html"
        text = fileBytes.decode("utf-8", errors="replace")
        try:
            BeautifulSoup(text, "html.parser")
        except Exception:
            pass
        return [ContentPart(
            id=makeId(),
            parentId=None,
            label="main",
            typeGroup="structure",
            mimeType=mimeType,
            data=text,
            metadata={"size": len(fileBytes)}
        )]
@@ -1,77 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List
import base64
import logging

from ..subUtils import makeId
from modules.datamodels.datamodelExtraction import ContentPart
from ..subRegistry import Extractor

logger = logging.getLogger(__name__)


class ImageExtractor(Extractor):
    """
    Extractor for image files.

    Supported formats:
    - MIME types: image/jpeg, image/png, image/gif, image/webp, image/bmp, image/tiff
    - File extensions: .jpg, .jpeg, .png, .gif, .webp, .bmp, .tiff
    - Special handling: GIF files are converted to PNG during extraction
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        return ((mimeType or "").startswith("image/") or
                (fileName or "").lower().endswith((".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff")))

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return [".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff"]

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return ["image/jpeg", "image/png", "image/gif", "image/webp", "image/bmp", "image/tiff"]

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        mimeType = context.get("mimeType") or "image/unknown"
        fileName = context.get("fileName", "")

        # Convert GIF to PNG during extraction
        if mimeType.lower() == "image/gif":
            try:
                from PIL import Image
                import io

                # Open GIF and convert to PNG
                with Image.open(io.BytesIO(fileBytes)) as img:
                    # Convert to RGB (removes animation)
                    if img.mode in ('RGBA', 'LA', 'P'):
                        img = img.convert('RGB')

                    # Save as PNG in memory
                    png_buffer = io.BytesIO()
                    img.save(png_buffer, format='PNG')
                    png_data = png_buffer.getvalue()

                # Update mimeType and fileBytes
                mimeType = "image/png"
                fileBytes = png_data

                logger.info(f"GIF converted to PNG during extraction: {fileName}, original={len(fileBytes)} bytes, converted={len(png_data)} bytes")

            except Exception as e:
                logger.warning(f"GIF conversion failed during extraction for {fileName}: {str(e)}, using original")
                # Keep original GIF data if conversion fails

        return [ContentPart(
            id=makeId(),
            parentId=None,
            label="image",
            typeGroup="image",
            mimeType=mimeType,
            data=base64.b64encode(fileBytes).decode("utf-8"),
            metadata={"size": len(fileBytes)}
        )]
@@ -1,50 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List
import json

from modules.datamodels.datamodelExtraction import ContentPart
from ..subUtils import makeId
from ..subRegistry import Extractor


class JsonExtractor(Extractor):
    """
    Extractor for JSON files.

    Supported formats:
    - MIME types: application/json
    - File extensions: .json
    - Special handling: Validates JSON format, falls back to text if invalid
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        return mimeType == "application/json" or (fileName or "").lower().endswith(".json")

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return [".json"]

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return ["application/json"]

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        mimeType = context.get("mimeType") or "application/json"
        text = fileBytes.decode("utf-8", errors="replace")
        # verify JSON is well-formed; fall back to text if not
        try:
            json.loads(text)
        except Exception:
            pass
        return [ContentPart(
            id=makeId(),
            parentId=None,
            label="main",
            typeGroup="structure",
            mimeType=mimeType,
            data=text,
            metadata={"size": len(fileBytes)}
        )]
@@ -1,156 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List
import base64
import io

from ..subUtils import makeId
from modules.datamodels.datamodelExtraction import ContentPart
from ..subRegistry import Extractor


class PdfExtractor(Extractor):
    """
    Extractor for PDF files.

    Supported formats:
    - MIME types: application/pdf
    - File extensions: .pdf
    - Special handling: Extracts text per page and embedded images
    - Dependencies: PyPDF2, PyMuPDF (fitz)
    """

    def __init__(self):
        self._loaded = False
        self._haveLibs = False

    def _load(self):
        if self._loaded:
            return
        self._loaded = True
        try:
            global PyPDF2, fitz
            import PyPDF2
            import fitz  # PyMuPDF
            self._haveLibs = True
        except Exception:
            self._haveLibs = False

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        return mimeType == "application/pdf" or (fileName or "").lower().endswith(".pdf")

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return [".pdf"]

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return ["application/pdf"]

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        self._load()
        parts: List[ContentPart] = []
        rootId = makeId()
        parts.append(ContentPart(
            id=rootId,
            parentId=None,
            label="pdf",
            typeGroup="container",
            mimeType="application/pdf",
            data="",
            metadata={"size": len(fileBytes)}
        ))

        if not self._haveLibs:
            parts.append(ContentPart(
                id=makeId(),
                parentId=rootId,
                label="binary",
                typeGroup="binary",
                mimeType="application/pdf",
                data=base64.b64encode(fileBytes).decode("utf-8"),
                metadata={"size": len(fileBytes), "warning": "PDF libs not available"}
            ))
            return parts

        # Extract text per page with PyMuPDF (same lib as in-place search - ensures extraction matches PDF text layer)
        try:
            with io.BytesIO(fileBytes) as buf:
                doc = fitz.open(stream=buf.getvalue(), filetype="pdf")
                for i in range(len(doc)):
                    try:
                        page = doc[i]
                        text = page.get_text() or ""
                        if text.strip():
                            parts.append(ContentPart(
                                id=makeId(),
                                parentId=rootId,
                                label=f"page_{i+1}",
                                typeGroup="text",
                                mimeType="text/plain",
                                data=text,
                                metadata={"pages": 1, "pageIndex": i, "size": len(text.encode('utf-8'))}
                            ))
                    except Exception:
                        continue
                doc.close()
        except Exception:
            pass

        # Fallback to PyPDF2 if PyMuPDF text extraction failed or returned nothing
        has_text = any(getattr(p, 'typeGroup', '') == "text" for p in parts)
        if not has_text:
            try:
                with io.BytesIO(fileBytes) as buf:
                    reader = PyPDF2.PdfReader(buf)
                    for i, page in enumerate(reader.pages):
                        try:
                            text = page.extract_text() or ""
                            if text.strip():
                                parts.append(ContentPart(
                                    id=makeId(),
                                    parentId=rootId,
                                    label=f"page_{i+1}",
                                    typeGroup="text",
                                    mimeType="text/plain",
                                    data=text,
                                    metadata={"pages": 1, "pageIndex": i, "size": len(text.encode('utf-8'))}
                                ))
                        except Exception:
                            continue
            except Exception:
                pass

        # Extract images with PyMuPDF
        try:
            with io.BytesIO(fileBytes) as buf2:
                doc = fitz.open(stream=buf2.getvalue(), filetype="pdf")
                for i in range(len(doc)):
                    page = doc[i]
                    images = page.get_images(full=True)
                    for j, img in enumerate(images):
                        try:
                            xref = img[0]
                            baseImage = doc.extract_image(xref)
                            if baseImage:
                                imgBytes = baseImage.get("image", b"")
                                ext = baseImage.get("ext", "png")
                                if imgBytes:
                                    parts.append(ContentPart(
                                        id=makeId(),
                                        parentId=rootId,
                                        label=f"image_{i+1}_{j}",
                                        typeGroup="image",
                                        mimeType=f"image/{ext}",
                                        data=base64.b64encode(imgBytes).decode("utf-8"),
                                        metadata={"pageIndex": i, "size": len(imgBytes)}
                                    ))
                        except Exception:
                            continue
                doc.close()
        except Exception:
            pass

        return parts
@@ -1,227 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
import logging
import base64
from typing import List, Dict, Any, Optional
from modules.datamodels.datamodelExtraction import ContentPart, ContentExtracted
from ..subRegistry import Extractor

logger = logging.getLogger(__name__)


class PptxExtractor(Extractor):
    """
    Extractor for PowerPoint files.

    Supported formats:
    - MIME types: application/vnd.openxmlformats-officedocument.presentationml.presentation, application/vnd.ms-powerpoint
    - File extensions: .pptx, .ppt
    - Special handling: Extracts slide content, tables, and images
    - Dependencies: python-pptx
    """

    def __init__(self):
        self._loaded = False
        self._haveLibs = False

    def _load(self):
        if self._loaded:
            return
        self._loaded = True
        try:
            global Presentation
            from pptx import Presentation
            self._haveLibs = True
        except Exception:
            self._haveLibs = False

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        return (mimeType in [
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
            "application/vnd.ms-powerpoint"
        ]) or (fileName or "").lower().endswith((".pptx", ".ppt"))

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return [".pptx", ".ppt"]

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return [
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
            "application/vnd.ms-powerpoint"
        ]

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """
        Extract content from PowerPoint files.

        Args:
            fileBytes: Raw file data as bytes
            context: Context dictionary with file information

        Returns:
            List of ContentPart objects with extracted content
        """
        self._load()

        if not self._haveLibs:
            logger.error("python-pptx library not installed. Install with: pip install python-pptx")
            return [ContentPart(
                id="error",
                label="PowerPoint Extraction Error",
                typeGroup="text",
                mimeType="text/plain",
                data="Error: python-pptx library not installed",
                metadata={"error": True, "error_message": "python-pptx library not installed"}
            )]

        try:
            import io

            # Load presentation from bytes
            presentation = Presentation(io.BytesIO(fileBytes))

            parts = []
            slide_index = 0

            # Extract content from each slide
            for slide in presentation.slides:
                slide_index += 1
                slide_content = []

                # Extract text from slide
                for shape in slide.shapes:
                    if hasattr(shape, "text") and shape.text.strip():
                        slide_content.append(shape.text.strip())

                # Extract table data
                for shape in slide.shapes:
                    if shape.has_table:
                        table = shape.table
                        table_data = []
                        for row in table.rows:
                            row_data = []
                            for cell in row.cells:
                                row_data.append(cell.text.strip())
                            table_data.append(row_data)

                        if table_data:
                            # Convert table to markdown format
                            table_md = self._table_to_markdown(table_data)
                            slide_content.append(table_md)

                # Extract images
                for shape in slide.shapes:
                    if shape.shape_type == 13:  # MSO_SHAPE_TYPE.PICTURE
                        try:
                            image = shape.image
                            image_bytes = image.blob
                            image_b64 = base64.b64encode(image_bytes).decode('utf-8')

                            # Create image part
                            image_part = ContentPart(
                                id=f"slide_{slide_index}_image_{len(parts)}",
                                label=f"Slide {slide_index} Image",
                                typeGroup="image",
                                mimeType="image/png",  # Default to PNG
                                data=image_b64,
                                metadata={
                                    "slide_number": slide_index,
                                    "shape_type": "image",
                                    "extracted_from": "powerpoint"
                                }
                            )
                            parts.append(image_part)
                        except Exception as e:
                            logger.warning(f"Failed to extract image from slide {slide_index}: {str(e)}")

                # Create slide content part
                if slide_content:
                    slide_text = f"# Slide {slide_index}\n\n" + "\n\n".join(slide_content)

                    slide_part = ContentPart(
                        id=f"slide_{slide_index}",
                        label=f"Slide {slide_index} Content",
                        typeGroup="structure",
                        mimeType="text/plain",
                        data=slide_text,
                        metadata={
                            "slide_number": slide_index,
                            "content_type": "slide",
                            "extracted_from": "powerpoint",
                            "text_length": len(slide_text)
                        }
                    )
                    parts.append(slide_part)

            # Create presentation overview
            file_name = context.get("fileName", "presentation.pptx")
            overview_text = f"# PowerPoint Presentation: {file_name}\n\n"
            overview_text += f"**Total Slides:** {len(presentation.slides)}\n\n"
            overview_text += f"**Content Parts:** {len(parts)}\n\n"

            # Add slide summaries
            for i, slide in enumerate(presentation.slides, 1):
                slide_text_parts = []
                for shape in slide.shapes:
                    if hasattr(shape, "text") and shape.text.strip():
                        slide_text_parts.append(shape.text.strip())

                if slide_text_parts:
                    overview_text += f"## Slide {i}\n"
                    overview_text += "\n".join(slide_text_parts[:3])  # First 3 text elements
                    overview_text += "\n\n"

            # Create overview part
            overview_part = ContentPart(
                id="presentation_overview",
                label="Presentation Overview",
                typeGroup="text",
                mimeType="text/plain",
                data=overview_text,
                metadata={
                    "content_type": "overview",
                    "extracted_from": "powerpoint",
                    "total_slides": len(presentation.slides),
                    "text_length": len(overview_text)
                }
            )
            parts.insert(0, overview_part)  # Insert at beginning

            return parts

        except Exception as e:
            logger.error(f"Error extracting PowerPoint content: {str(e)}")
            return [ContentPart(
                id="error",
                label="PowerPoint Extraction Error",
                typeGroup="text",
                mimeType="text/plain",
                data=f"Error extracting PowerPoint content: {str(e)}",
                metadata={"error": True, "error_message": str(e)}
            )]

    def _table_to_markdown(self, table_data: List[List[str]]) -> str:
        """Convert table data to markdown format."""
        if not table_data:
            return ""

        markdown_lines = []

        # Header row
        if table_data:
            header = "| " + " | ".join(table_data[0]) + " |"
            markdown_lines.append(header)

            # Separator row
            separator = "| " + " | ".join(["---"] * len(table_data[0])) + " |"
            markdown_lines.append(separator)

        # Data rows
        for row in table_data[1:]:
            data_row = "| " + " | ".join(row) + " |"
            markdown_lines.append(data_row)

        return "\n".join(markdown_lines)
@@ -1,58 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List

from modules.datamodels.datamodelExtraction import ContentPart
from ..subUtils import makeId
from ..subRegistry import Extractor


class SqlExtractor(Extractor):
    """
    Extractor for SQL files.

    Supported formats:
    - MIME types: text/x-sql, application/sql
    - File extensions: .sql, .ddl, .dml, .dcl, .tcl
    - Special handling: Treats as structured text with SQL syntax
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        return (mimeType in ("text/x-sql", "application/sql") or
                (fileName or "").lower().endswith((".sql", ".ddl", ".dml", ".dcl", ".tcl")))

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return [".sql", ".ddl", ".dml", ".dcl", ".tcl"]

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return ["text/x-sql", "application/sql"]

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        fileName = context.get("fileName")
        mimeType = context.get("mimeType") or "text/x-sql"
        data = fileBytes.decode("utf-8", errors="replace")

        # Add SQL-specific metadata
        metadata = {
            "size": len(fileBytes),
            "file_type": "sql",
            "line_count": len(data.splitlines()),
            "has_select": "SELECT" in data.upper(),
            "has_insert": "INSERT" in data.upper(),
            "has_update": "UPDATE" in data.upper(),
            "has_delete": "DELETE" in data.upper(),
            "has_create": "CREATE" in data.upper(),
            "has_drop": "DROP" in data.upper()
        }

        return [ContentPart(
            id=makeId(),
            parentId=None,
            label="main",
            typeGroup="structure",
            mimeType=mimeType,
            data=data,
            metadata=metadata
        )]
@@ -1,105 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List

from modules.datamodels.datamodelExtraction import ContentPart
from ..subUtils import makeId
from ..subRegistry import Extractor


class TextExtractor(Extractor):
    """
    Extractor for plain text files and code files.

    Supported formats:
    - MIME types: text/plain, text/markdown, text/x-python, text/x-java-source, text/javascript, etc.
    - File extensions: .txt, .md, .log, .java, .js, .jsx, .ts, .tsx, .py, .config, .ini, .cfg, .conf, .properties, .yaml, .yml, .toml, .sh, .bat, .ps1, .sql, .css, .scss, .sass, .less, .xml, .json, .csv, .tsv, .rtf, .tex, .rst, .adoc, .org, .pod, .man, .1, .2, .3, .4, .5, .6, .7, .8, .9, .n, .l, .m, .r, .t, .x, .y, .z
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        # Check MIME types
        if mimeType and mimeType.startswith("text/"):
            return True

        # Check file extensions
        if fileName:
            ext = fileName.lower()
            return ext.endswith((
                # Basic text files
                ".txt", ".md", ".log", ".rtf", ".tex", ".rst", ".adoc", ".org", ".pod",
                # Programming languages
                ".java", ".js", ".jsx", ".ts", ".tsx", ".py", ".rb", ".go", ".rs", ".cpp", ".c", ".h", ".hpp", ".cc", ".cxx",
                ".cs", ".php", ".swift", ".kt", ".scala", ".clj", ".hs", ".ml", ".fs", ".vb", ".dart", ".r", ".m", ".pl", ".sh",
                # Web technologies
                ".html", ".htm", ".css", ".scss", ".sass", ".less", ".vue", ".svelte",
                # Configuration files
                ".config", ".ini", ".cfg", ".conf", ".properties", ".yaml", ".yml", ".toml", ".json", ".xml",
                # Scripts and automation
                ".bat", ".ps1", ".psm1", ".psd1", ".vbs", ".wsf", ".cmd", ".com",
                # Data files
                ".csv", ".tsv", ".tab", ".dat", ".data",
                # Documentation
                ".man", ".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".n", ".l", ".m", ".r", ".t", ".x", ".y", ".z",
                # Other text formats
                ".diff", ".patch", ".gitignore", ".dockerignore", ".editorconfig", ".gitattributes",
                ".env", ".env.local", ".env.development", ".env.production", ".env.test",
                ".lock", ".lockb", ".lockfile", ".pkg-lock", ".yarn-lock"
            ))

        return False

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return [
            # Basic text files
            ".txt", ".md", ".log", ".rtf", ".tex", ".rst", ".adoc", ".org", ".pod",
            # Programming languages
            ".java", ".js", ".jsx", ".ts", ".tsx", ".py", ".rb", ".go", ".rs", ".cpp", ".c", ".h", ".hpp", ".cc", ".cxx",
            ".cs", ".php", ".swift", ".kt", ".scala", ".clj", ".hs", ".ml", ".fs", ".vb", ".dart", ".r", ".m", ".pl", ".sh",
            # Web technologies
            ".html", ".htm", ".css", ".scss", ".sass", ".less", ".vue", ".svelte",
            # Configuration files
            ".config", ".ini", ".cfg", ".conf", ".properties", ".yaml", ".yml", ".toml", ".json", ".xml",
            # Scripts and automation
            ".bat", ".ps1", ".psm1", ".psd1", ".vbs", ".wsf", ".cmd", ".com",
            # Data files
            ".csv", ".tsv", ".tab", ".dat", ".data",
            # Documentation
            ".man", ".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".n", ".l", ".m", ".r", ".t", ".x", ".y", ".z",
            # Other text formats
            ".diff", ".patch", ".gitignore", ".dockerignore", ".editorconfig", ".gitattributes",
            ".env", ".env.local", ".env.development", ".env.production", ".env.test",
            ".lock", ".lockb", ".lockfile", ".pkg-lock", ".yarn-lock"
        ]

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return [
            "text/plain", "text/markdown", "text/x-python", "text/x-java-source",
            "text/javascript", "text/x-javascript", "text/typescript", "text/x-typescript",
            "text/x-c", "text/x-c++", "text/x-csharp", "text/x-php", "text/x-ruby",
            "text/x-go", "text/x-rust", "text/x-scala", "text/x-swift", "text/x-kotlin",
            "text/x-sql", "text/x-sh", "text/x-shellscript", "text/x-yaml", "text/x-toml",
            "text/x-ini", "text/x-config", "text/x-properties", "text/x-log",
            "text/html", "text/css", "text/x-scss", "text/x-sass", "text/x-less",
            "text/xml", "text/csv", "text/tab-separated-values", "text/rtf",
            "text/x-tex", "text/x-rst", "text/x-asciidoc", "text/x-org",
            "application/x-yaml", "application/x-toml", "application/x-ini",
            "application/x-config", "application/x-properties", "application/x-log"
        ]

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        fileName = context.get("fileName")
        mimeType = context.get("mimeType") or "text/plain"
        data = fileBytes.decode("utf-8", errors="replace")
        return [ContentPart(
            id=makeId(),
            parentId=None,
            label="main",
            typeGroup="text",
            mimeType=mimeType,
            data=data,
            metadata={"size": len(fileBytes)}
        )]
@@ -1,114 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List
import io
from datetime import datetime

from ..subUtils import makeId
from modules.datamodels.datamodelExtraction import ContentPart
from ..subRegistry import Extractor


class XlsxExtractor(Extractor):
    """
    Extractor for Microsoft Excel spreadsheets.

    Supported formats:
    - MIME types: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
    - File extensions: .xlsx, .xlsm
    - Special handling: Extracts all sheets as CSV data
    - Dependencies: openpyxl
    """

    def __init__(self):
        self._loaded = False
        self._haveLibs = False

    def _load(self):
        if self._loaded:
            return
        self._loaded = True
        try:
            global openpyxl
            import openpyxl
            self._haveLibs = True
        except Exception:
            self._haveLibs = False

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        mt = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        return mimeType == mt or (fileName or "").lower().endswith((".xlsx", ".xlsm"))

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return [".xlsx", ".xlsm"]

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        self._load()
        parts: List[ContentPart] = []
        rootId = makeId()
        parts.append(ContentPart(
            id=rootId,
            parentId=None,
            label="xlsx",
            typeGroup="container",
            mimeType="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            data="",
            metadata={"size": len(fileBytes)}
        ))

        if not self._haveLibs:
            parts.append(ContentPart(
                id=makeId(),
                parentId=rootId,
                label="binary",
                typeGroup="binary",
                mimeType="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                data="",
                metadata={"size": len(fileBytes), "warning": "openpyxl not available"}
            ))
            return parts

        with io.BytesIO(fileBytes) as buf:
            wb = openpyxl.load_workbook(buf, data_only=True)
            for sheetName in wb.sheetnames:
                ws = wb[sheetName]
                # extract rectangular data region by min/max
                min_row = ws.min_row
                max_row = ws.max_row
                min_col = ws.min_column
                max_col = ws.max_column
                lines: list[str] = []
                for r in range(min_row, max_row + 1):
                    cells: list[str] = []
                    for c in range(min_col, max_col + 1):
                        cell = ws.cell(row=r, column=c)
                        v = cell.value
                        if v is None:
                            cells.append("")
                        elif isinstance(v, (int, float)):
                            cells.append(str(v))
                        elif isinstance(v, datetime):
                            cells.append(v.strftime("%Y-%m-%d %H:%M:%S"))
                        else:
                            escaped_value = str(v).replace('"', '""')
                            cells.append(f'"{escaped_value}"')
                    lines.append(",".join(cells))
                csvData = "\n".join(lines)
                parts.append(ContentPart(
                    id=makeId(),
                    parentId=rootId,
                    label=f"sheet_{sheetName}",
                    typeGroup="table",
                    mimeType="text/csv",
                    data=csvData,
                    metadata={"sheet": sheetName, "size": len(csvData.encode('utf-8'))}
                ))

        return parts
@@ -1,49 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List
import xml.etree.ElementTree as ET

from modules.datamodels.datamodelExtraction import ContentPart
from ..subUtils import makeId
from ..subRegistry import Extractor


class XmlExtractor(Extractor):
    """
    Extractor for XML files.

    Supported formats:
    - MIME types: application/xml
    - File extensions: .xml, .rss, .atom
    - Special handling: Uses ElementTree for parsing
    """

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        return mimeType == "application/xml" or (fileName or "").lower().endswith((".xml", ".rss", ".atom"))

    def getSupportedExtensions(self) -> list[str]:
        """Return list of supported file extensions."""
        return [".xml", ".rss", ".atom"]

    def getSupportedMimeTypes(self) -> list[str]:
        """Return list of supported MIME types."""
        return ["application/xml"]

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        mimeType = context.get("mimeType") or "application/xml"
        text = fileBytes.decode("utf-8", errors="replace")
        try:
            ET.fromstring(text)
        except Exception:
            pass
        return [ContentPart(
            id=makeId(),
            parentId=None,
            label="main",
            typeGroup="structure",
            mimeType=mimeType,
            data=text,
            metadata={"size": len(fileBytes)}
        )]
File diff suppressed because it is too large
@@ -1,2 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
@@ -1,13 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List
from modules.datamodels.datamodelExtraction import ContentPart, MergeStrategy


class DefaultMerger:
    def merge(self, parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
        """
        Default merger that passes through parts unchanged.
        Used for image, binary, metadata, container typeGroups.
        """
        return parts
@@ -1,154 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List
from modules.datamodels.datamodelExtraction import ContentPart, MergeStrategy
from ..subUtils import makeId


class TableMerger:
    def merge(self, parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
        """
        Merge table parts based on strategy.
        Strategy options:
        - groupBy: "parentId" (default), "documentId", "sheet", "none"
        - maxSize: maximum size per merged part
        - combineSheets: bool - whether to combine multiple sheets into one table
        """
        if not parts:
            return parts

        groupBy = strategy.groupBy
        maxSize = strategy.maxSize or 0
        combineSheets = strategy.tableMerge.get("combineSheets", False) if strategy.tableMerge else False

        # Group parts
        groups = self._groupParts(parts, groupBy, combineSheets)

        merged: List[ContentPart] = []
        for groupKey, groupParts in groups.items():
            if maxSize > 0:
                merged.extend(self._mergeWithSizeLimit(groupParts, maxSize, groupKey))
            else:
                merged.extend(self._mergeGroup(groupParts, groupKey))

        return merged

    def _groupParts(self, parts: List[ContentPart], groupBy: str, combineSheets: bool) -> Dict[str, List[ContentPart]]:
        groups: Dict[str, List[ContentPart]] = {}

        for part in parts:
            if part.typeGroup != "table":
                # Non-table parts go in their own group
                key = f"nontable_{part.id}"
                if key not in groups:
                    groups[key] = []
                groups[key].append(part)
                continue

            if groupBy == "parentId":
                key = part.parentId or "root"
            elif groupBy == "documentId":
                key = part.metadata.get("documentId", "unknown")
            elif groupBy == "sheet" and not combineSheets:
                key = part.metadata.get("sheet", "unknown")
            else:  # "none" or combineSheets=True
                key = "all_tables"

            if key not in groups:
                groups[key] = []
            groups[key].append(part)

        return groups

    def _mergeGroup(self, parts: List[ContentPart], groupKey: str) -> List[ContentPart]:
        if not parts:
            return []
        if len(parts) == 1:
            return parts

        # For tables, we typically keep them separate unless explicitly combining
        # But we can add metadata about the group
        for i, part in enumerate(parts):
            part.metadata["groupKey"] = groupKey
            part.metadata["groupIndex"] = i
            part.metadata["groupSize"] = len(parts)

        return parts

    def _mergeWithSizeLimit(self, parts: List[ContentPart], maxSize: int, groupKey: str) -> List[ContentPart]:
        if not parts:
            return []

        # For tables, we typically don't merge across different tables
        # Instead, we chunk individual large tables
        merged: List[ContentPart] = []

        for part in parts:
            partSize = part.metadata.get("size", 0)

            if partSize <= maxSize:
                # Part fits within limit
                part.metadata["groupKey"] = groupKey
                merged.append(part)
            else:
                # Chunk the large table
                chunks = self._chunkTable(part, maxSize)
                merged.extend(chunks)

        return merged

    def _chunkTable(self, part: ContentPart, maxSize: int) -> List[ContentPart]:
        """Chunk a large table by rows while preserving CSV structure."""
        lines = part.data.split('\n')
        if not lines:
            return [part]

        chunks: List[ContentPart] = []
        currentChunk: List[str] = []
        currentSize = 0

        for line in lines:
            lineSize = len(line.encode('utf-8')) + 1  # +1 for newline

            if currentSize + lineSize > maxSize and currentChunk:
                # Flush current chunk
                chunkData = '\n'.join(currentChunk)
                chunks.append(ContentPart(
                    id=makeId(),
                    parentId=part.parentId,
                    label=f"{part.label}_chunk_{len(chunks)}",
                    typeGroup="table",
                    mimeType=part.mimeType,
                    data=chunkData,
                    metadata={
                        "size": len(chunkData.encode('utf-8')),
                        "chunk": True,
                        "originalPart": part.id,
                        "chunkIndex": len(chunks)
                    }
                ))
                currentChunk = [line]
                currentSize = lineSize
            else:
                currentChunk.append(line)
                currentSize += lineSize

        # Flush remaining chunk
        if currentChunk:
            chunkData = '\n'.join(currentChunk)
            chunks.append(ContentPart(
                id=makeId(),
                parentId=part.parentId,
                label=f"{part.label}_chunk_{len(chunks)}",
                typeGroup="table",
                mimeType=part.mimeType,
                data=chunkData,
                metadata={
                    "size": len(chunkData.encode('utf-8')),
                    "chunk": True,
                    "originalPart": part.id,
                    "chunkIndex": len(chunks)
                }
            ))

        return chunks
@@ -1,138 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import Any, Dict, List
from modules.datamodels.datamodelExtraction import ContentPart, MergeStrategy
from ..subUtils import makeId


class TextMerger:
    def merge(self, parts: List[ContentPart], strategy: MergeStrategy) -> List[ContentPart]:
        """
        Merge text parts based on strategy.
        Strategy options:
        - groupBy: "parentId" (default), "documentId", "none"
        - orderBy: "label", "pageIndex", "sheetIndex", "none"
        - maxSize: maximum size per merged part
        """
        if not parts:
            return parts

        groupBy = strategy.groupBy
        orderBy = strategy.orderBy
        maxSize = strategy.maxSize or 0

        # Group parts
        groups = self._groupParts(parts, groupBy)

        merged: List[ContentPart] = []
        for groupKey, groupParts in groups.items():
            # Sort within group
            sortedParts = self._sortParts(groupParts, orderBy)

            # Merge respecting maxSize
            if maxSize > 0:
                merged.extend(self._mergeWithSizeLimit(sortedParts, maxSize))
            else:
                merged.extend(self._mergeGroup(sortedParts, groupKey))

        return merged

    def _groupParts(self, parts: List[ContentPart], groupBy: str) -> Dict[str, List[ContentPart]]:
        groups: Dict[str, List[ContentPart]] = {}

        for part in parts:
            if part.typeGroup != "text":
                # Non-text parts go in their own group
                key = f"nontext_{part.id}"
                if key not in groups:
                    groups[key] = []
                groups[key].append(part)
                continue

            if groupBy == "parentId":
                key = part.parentId or "root"
            elif groupBy == "documentId":
                key = part.metadata.get("documentId", "unknown")
            else:  # "none"
                key = "all"

            if key not in groups:
                groups[key] = []
            groups[key].append(part)

        return groups

    def _sortParts(self, parts: List[ContentPart], orderBy: str) -> List[ContentPart]:
        if orderBy == "pageIndex":
            return sorted(parts, key=lambda p: p.metadata.get("pageIndex", 0))
        elif orderBy == "sheetIndex":
            return sorted(parts, key=lambda p: p.metadata.get("sheetIndex", 0))
        elif orderBy == "label":
            return sorted(parts, key=lambda p: p.label)
        else:  # "none"
            return parts

    def _mergeGroup(self, parts: List[ContentPart], groupKey: str) -> List[ContentPart]:
        if not parts:
            return []
        if len(parts) == 1:
            return parts

        # Merge all text parts in group
        textParts = [p for p in parts if p.typeGroup == "text"]
        nonTextParts = [p for p in parts if p.typeGroup != "text"]

        if not textParts:
            return nonTextParts

        # Combine text data
        combinedData = "\n".join([p.data for p in textParts])
        totalSize = sum(p.metadata.get("size", 0) for p in textParts)

        mergedPart = ContentPart(
            id=makeId(),
            parentId=textParts[0].parentId,
            label=f"merged_{groupKey}",
            typeGroup="text",
            mimeType="text/plain",
            data=combinedData,
            metadata={
                "size": totalSize,
                "merged": len(textParts),
                "originalParts": [p.id for p in textParts]
            }
        )

        return [mergedPart] + nonTextParts

    def _mergeWithSizeLimit(self, parts: List[ContentPart], maxSize: int) -> List[ContentPart]:
        if not parts:
            return []

        textParts = [p for p in parts if p.typeGroup == "text"]
        nonTextParts = [p for p in parts if p.typeGroup != "text"]

        if not textParts:
            return nonTextParts

        merged: List[ContentPart] = []
        currentGroup: List[ContentPart] = []
        currentSize = 0

        for part in textParts:
            partSize = part.metadata.get("size", 0)

            if currentSize + partSize > maxSize and currentGroup:
                # Flush current group
                merged.extend(self._mergeGroup(currentGroup, f"chunk_{len(merged)}"))
                currentGroup = [part]
                currentSize = partSize
            else:
                currentGroup.append(part)
                currentSize += partSize

        # Flush remaining group
        if currentGroup:
            merged.extend(self._mergeGroup(currentGroup, f"chunk_{len(merged)}"))

        return merged + nonTextParts
@@ -1,211 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Intelligent Token-Aware Merger for optimizing AI calls based on LLM token limits.
"""
from typing import List, Dict, Any
import logging
from modules.datamodels.datamodelExtraction import ContentPart
from .subUtils import makeId

logger = logging.getLogger(__name__)


class IntelligentTokenAwareMerger:
    """
    Intelligent merger that groups chunks based on LLM token limits to minimize AI calls.

    Strategy:
    1. Calculate token count for each chunk
    2. Group chunks to maximize token usage without exceeding limits
    3. Preserve document structure and semantic boundaries
    4. Minimize total number of AI calls
    """

    def __init__(self, modelCapabilities: Dict[str, Any]):
        self.maxTokens = modelCapabilities.get("maxTokens", 4000)
        self.safetyMargin = modelCapabilities.get("safetyMargin", 0.1)
        self.effectiveMaxTokens = int(self.maxTokens * (1 - self.safetyMargin))
        self.charsPerToken = modelCapabilities.get("charsPerToken", 4)  # Rough estimation

    def mergeChunksIntelligently(self, chunks: List[ContentPart], prompt: str = "") -> List[ContentPart]:
        """
        Merge chunks intelligently based on token limits.

        Args:
            chunks: List of ContentPart chunks to merge
            prompt: AI prompt to account for in token calculation

        Returns:
            List of optimally merged ContentPart objects
        """
        if not chunks:
            return chunks

        logger.info(f"🧠 Intelligent merging: {len(chunks)} chunks, maxTokens={self.effectiveMaxTokens}")

        # Calculate tokens for prompt
        promptTokens = self._estimateTokens(prompt)
        availableTokens = self.effectiveMaxTokens - promptTokens

        logger.info(f"📊 Prompt tokens: {promptTokens}, Available for content: {availableTokens}")

        # Group chunks by document and type for semantic coherence
        groupedChunks = self._groupChunksByDocumentAndType(chunks)

        mergedParts = []

        for groupKey, groupChunks in groupedChunks.items():
            logger.info(f"📁 Processing group: {groupKey} ({len(groupChunks)} chunks)")

            # Merge chunks within this group optimally
            groupMerged = self._mergeGroupOptimally(groupChunks, availableTokens)
            mergedParts.extend(groupMerged)

        logger.info(f"✅ Intelligent merging complete: {len(chunks)} → {len(mergedParts)} parts")
        return mergedParts

    def _groupChunksByDocumentAndType(self, chunks: List[ContentPart]) -> Dict[str, List[ContentPart]]:
        """Group chunks by document and type for semantic coherence."""
        groups = {}

        for chunk in chunks:
            # Create group key: document_id + type_group
            docId = chunk.metadata.get("documentId", "unknown")
            typeGroup = chunk.typeGroup
            groupKey = f"{docId}_{typeGroup}"

            if groupKey not in groups:
                groups[groupKey] = []
            groups[groupKey].append(chunk)

        return groups

    def _mergeGroupOptimally(self, chunks: List[ContentPart], availableTokens: int) -> List[ContentPart]:
        """Merge chunks within a group optimally to minimize AI calls."""
        if not chunks:
            return []

        # Sort chunks by size (smallest first for better packing)
        sortedChunks = sorted(chunks, key=lambda c: self._estimateTokens(c.data))

        mergedParts = []
        currentGroup = []
        currentTokens = 0

        for chunk in sortedChunks:
            chunkTokens = self._estimateTokens(chunk.data)

            # Special case: If single chunk is already at max size, process it alone
            if chunkTokens >= availableTokens * 0.9:  # 90% of available tokens
                # Finalize current group if it exists
                if currentGroup:
                    mergedPart = self._createMergedPart(currentGroup, currentTokens)
                    mergedParts.append(mergedPart)
                    currentGroup = []
                    currentTokens = 0

                # Process large chunk individually
                mergedParts.append(chunk)
                logger.debug(f"🔍 Large chunk processed individually: {chunkTokens} tokens")
                continue

            # If adding this chunk would exceed limit, finalize current group
            if currentTokens + chunkTokens > availableTokens and currentGroup:
                mergedPart = self._createMergedPart(currentGroup, currentTokens)
                mergedParts.append(mergedPart)
                currentGroup = [chunk]
                currentTokens = chunkTokens
            else:
                currentGroup.append(chunk)
                currentTokens += chunkTokens

        # Finalize remaining group
        if currentGroup:
            mergedPart = self._createMergedPart(currentGroup, currentTokens)
            mergedParts.append(mergedPart)

        logger.info(f"📦 Group merged: {len(chunks)} → {len(mergedParts)} parts")
        return mergedParts

    def _createMergedPart(self, chunks: List[ContentPart], totalTokens: int) -> ContentPart:
        """Create a merged ContentPart from multiple chunks."""
        if len(chunks) == 1:
            return chunks[0]  # No need to merge single chunk

        # Combine data with semantic separators
        combinedData = self._combineChunkData(chunks)

        # Use metadata from first chunk as base
        baseChunk = chunks[0]
        mergedMetadata = baseChunk.metadata.copy()
        mergedMetadata.update({
            "merged": True,
            "originalChunkCount": len(chunks),
            "totalTokens": totalTokens,
            "originalChunkIds": [c.id for c in chunks],
            "size": len(combinedData.encode('utf-8'))
        })

        mergedPart = ContentPart(
            id=makeId(),
            parentId=baseChunk.parentId,
            label=f"merged_{len(chunks)}_chunks",
            typeGroup=baseChunk.typeGroup,
            mimeType=baseChunk.mimeType,
            data=combinedData,
            metadata=mergedMetadata
        )

        logger.debug(f"🔗 Created merged part: {len(chunks)} chunks, {totalTokens} tokens")
        return mergedPart

    def _combineChunkData(self, chunks: List[ContentPart]) -> str:
        """Combine chunk data with appropriate separators."""
        if not chunks:
            return ""

        # Use different separators based on content type
        if chunks[0].typeGroup == "text":
            separator = "\n\n---\n\n"  # Clear text separation
        elif chunks[0].typeGroup == "table":
            separator = "\n\n[TABLE BREAK]\n\n"  # Table separation
        else:
            separator = "\n\n---\n\n"  # Default separation

        return separator.join([chunk.data for chunk in chunks])

    def _estimateTokens(self, text: str) -> int:
        """Estimate token count for text."""
        if not text:
            return 0
        return len(text) // self.charsPerToken

    def calculateOptimizationStats(self, originalChunks: List[ContentPart], mergedParts: List[ContentPart]) -> Dict[str, Any]:
        """Calculate optimization statistics with detailed analysis."""
        originalCalls = len(originalChunks)
        optimizedCalls = len(mergedParts)
        reductionPercent = ((originalCalls - optimizedCalls) / originalCalls * 100) if originalCalls > 0 else 0

        # Analyze chunk sizes
        largeChunks = [c for c in originalChunks if self._estimateTokens(c.data) >= self.effectiveMaxTokens * 0.9]
        smallChunks = [c for c in originalChunks if self._estimateTokens(c.data) < self.effectiveMaxTokens * 0.9]

        # Calculate theoretical maximum optimization (if all small chunks could be merged)
        theoreticalMinCalls = len(largeChunks) + max(1, len(smallChunks) // 3)  # Assume 3 small chunks per call
        theoreticalReduction = ((originalCalls - theoreticalMinCalls) / originalCalls * 100) if originalCalls > 0 else 0

        return {
            "original_ai_calls": originalCalls,
            "optimized_ai_calls": optimizedCalls,
            "reduction_percent": round(reductionPercent, 1),
            "cost_savings": f"{reductionPercent:.1f}%",
            "efficiency_gain": f"{originalCalls / optimizedCalls:.1f}x" if optimizedCalls > 0 else "∞",
            "analysis": {
                "large_chunks": len(largeChunks),
                "small_chunks": len(smallChunks),
                "theoretical_min_calls": theoreticalMinCalls,
                "theoretical_reduction": round(theoreticalReduction, 1),
                "optimization_potential": "high" if reductionPercent > 50 else "moderate" if reductionPercent > 20 else "low"
            }
        }
@@ -1,48 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from typing import List
import logging

from modules.datamodels.datamodelExtraction import ContentExtracted, ContentPart, ExtractionOptions, MergeStrategy
from .subUtils import makeId
from .subRegistry import ExtractorRegistry, ChunkerRegistry

logger = logging.getLogger(__name__)


# REMOVED: _mergeParts function - unused, functionality replaced by applyMerging in interfaceAiObjects.py


def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: ChunkerRegistry, documentBytes: bytes, fileName: str, mimeType: str, options: ExtractionOptions) -> ContentExtracted:

    extractor = extractorRegistry.resolve(mimeType, fileName)
    if extractor is None:
        # fallback: single binary part
        part = ContentPart(
            id=makeId(),
            parentId=None,
            label="file",
            typeGroup="binary",
            mimeType=mimeType or "application/octet-stream",
            data="",
            metadata={"warning": "No extractor registered"}
        )
        return ContentExtracted(id=makeId(), parts=[part])

    parts = extractor.extract(documentBytes, {"fileName": fileName, "mimeType": mimeType})

    # REMOVED: poolAndLimit(parts, chunkerRegistry, options)
    # REMOVED: Chunking logic - now handled in AI call phase

    # Apply merging strategy if provided (preserve existing logic)
    if options.mergeStrategy:
        # Use module-level applyMerging function
        from .mainServiceExtraction import applyMerging
        parts = applyMerging(parts, options.mergeStrategy)

    return ContentExtracted(id=makeId(), parts=parts)


# REMOVED: poolAndLimit function - chunking now handled in AI call phase
# REMOVED: applyMerging function - moved to interfaceAiObjects.py for proper interface-level access
@@ -1,214 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Prompt builder for document extraction.
This module builds prompts for extracting content from documents.
"""

import json
import logging
from typing import Dict, Any, Optional
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum

# Type hint for renderer parameter
from typing import TYPE_CHECKING
if TYPE_CHECKING:
    from modules.services.serviceGeneration.renderers.documentRendererBaseTemplate import BaseRenderer
    _RendererLike = BaseRenderer
else:
    _RendererLike = Any

logger = logging.getLogger(__name__)


async def buildExtractionPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    aiService=None,
    services=None,
    renderer: _RendererLike = None
) -> str:
    """
    Build unified extraction prompt for extracting content from documents.
    Always uses multi-file format (single doc = multi with n=1).

    Args:
        outputFormat: Target output format
        userPrompt: User's prompt describing what to extract
        title: Document title
        aiService: Optional AI service for intent parsing
        services: Services instance
        renderer: Optional renderer for format-specific guidelines

    Returns:
        Complete extraction prompt string
    """

    # Flat extraction format - returns extracted content as structured data, not documents/sections
    # This format allows merging multiple contentParts into one response
    json_example = {
        "extracted_content": {
            "text": "Extracted text content from the document...",
            "tables": [
                {
                    "headers": ["Column 1", "Column 2"],
                    "rows": [
                        ["Value 1", "Value 2"],
                        ["Value 3", "Value 4"]
                    ]
                }
            ],
            "headings": [
                {
                    "level": 1,
                    "text": "Main Heading"
                },
                {
                    "level": 2,
                    "text": "Subheading"
                }
            ],
            "lists": [
                {
                    "type": "bullet",
                    "items": ["Item 1", "Item 2", "Item 3"]
                }
            ],
            "images": [
                {
                    "description": "Description of image content, including all visible text, tables, and visual elements"
                }
            ]
        }
    }

    structure_instruction = """CRITICAL EXTRACTION REQUIREMENTS:
1. Extract content from the provided ContentPart(s) - process what is provided in this call
2. If this ContentPart contains tables, extract them with proper structure (headers and rows)
3. If this ContentPart contains text, extract it as structured text
4. Return ONE JSON object with extracted content from this ContentPart
5. Preserve all original data - do not summarize or interpret
6. The system will merge results from multiple ContentParts automatically - focus on extracting this ContentPart's content accurately"""

    # Parse extraction intent if AI service is available
    extraction_intent = await _parseExtractionIntent(userPrompt, outputFormat, aiService, services) if aiService else userPrompt

    # Extract user language for document language instruction
    userLanguage = 'en'  # Default fallback
    if services:
        try:
            # Prefer detected language if available
            if hasattr(services, 'currentUserLanguage') and services.currentUserLanguage:
                userLanguage = services.currentUserLanguage
            elif hasattr(services, 'user') and services.user and hasattr(services.user, 'language'):
                userLanguage = services.user.language
        except Exception:
            pass

    # Build base prompt with clear user prompt markers
    sanitized_user_prompt = services.utils.sanitizePromptContent(userPrompt, 'userinput') if services else userPrompt
    adaptive_prompt = f"""
{'='*80}
USER REQUEST / USER PROMPT:
{'='*80}
{sanitized_user_prompt}
{'='*80}
END OF USER REQUEST / USER PROMPT
{'='*80}

You are a document processing assistant that extracts content from documents. Your task is to analyze the provided ContentPart(s) and extract their content into a structured JSON format.

TASK: Extract content from the provided ContentPart(s). Extract all tables, text, headings, lists, and other content types accurately. The system processes ContentParts individually and merges results automatically.

LANGUAGE REQUIREMENT: All extracted content must be in the language '{userLanguage}'. Extract and preserve content in this language.

{extraction_intent}

{structure_instruction}

OUTPUT FORMAT: Return only valid JSON in this exact structure:
{json.dumps(json_example, indent=2)}

CRITICAL EXTRACTION RULES:
- Extract only content that is ACTUALLY PRESENT in the ContentPart - never create fake or placeholder data
- Return empty arrays [] or empty strings "" when content is missing - this is normal and expected
- Extract all tables, text, headings, lists accurately with proper structure
- Preserve all original data - do not summarize or interpret
- Return ONE JSON object per ContentPart (the system merges multiple ContentParts automatically)

Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements

Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses

Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.

Extract only actual content from the ContentPart. Return empty arrays/strings when content is missing - never create fake data.
""".strip()

    # Add renderer-specific guidelines if provided
    if renderer:
        try:
            if hasattr(renderer, 'getExtractionGuidelines'):
                formatGuidelines = renderer.getExtractionGuidelines()
                adaptive_prompt = f"{adaptive_prompt}\n\n{formatGuidelines}".strip()
        except Exception:
            pass
||||
|
||||
# Save extraction prompt to debug file - only if debug enabled
|
||||
from modules.shared.debugLogger import writeDebugFile
|
||||
writeDebugFile(adaptive_prompt, "extraction_prompt")
|
||||
|
||||
return adaptive_prompt
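# Illustrative only: a minimal, hedged sketch of calling this builder without an AI
# service or services container (both parameters are optional); the format, prompt and
# title values are hypothetical and not taken from the original call sites.
async def _exampleBuildExtractionPrompt() -> str:
    return await buildExtractionPrompt(
        outputFormat="md",
        userPrompt="Extract all tables and headings from the attached files",
        title="Quarterly report",
        aiService=None,   # skips intent parsing; the raw user prompt is used as intent
        services=None     # falls back to language 'en' and skips prompt sanitization
    )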
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None, services=None) -> str:
|
||||
"""
|
||||
Parse user prompt to extract the core extraction intent.
|
||||
"""
|
||||
if not aiService:
|
||||
return f"Extract content from the provided documents and create a {outputFormat} report."
|
||||
|
||||
try:
|
||||
analysis_prompt = f"""
|
||||
Analyze this user request and extract the core extraction intent:
|
||||
|
||||
User request: "{userPrompt}"
|
||||
Target format: {outputFormat}
|
||||
|
||||
Extract the main intent and requirements for document processing. Focus on:
|
||||
1. What content needs to be extracted
|
||||
2. How it should be organized
|
||||
3. Any specific requirements or preferences
|
||||
|
||||
Respond with a clear, concise statement of the extraction intent.
|
||||
"""
|
||||
request_options = AiCallOptions()
|
||||
request_options.operationType = OperationTypeEnum.DATA_GENERATE
|
||||
|
||||
request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
|
||||
response = await aiService.aiObjects.call(request)
|
||||
|
||||
if response and response.content:
|
||||
return response.content.strip()
|
||||
else:
|
||||
return f"Extract content from the provided documents and create a {outputFormat} report."
|
||||
|
||||
except Exception as e:
|
||||
services.utils.debugLogToFile(f"Extraction intent analysis failed: {str(e)}", "PROMPT_BUILDER")
|
||||
return f"Extract content from the provided documents and create a {outputFormat} report."
@@ -1,208 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
|
||||
# All rights reserved.
|
||||
from typing import Any, Dict, Optional
|
||||
import logging
|
||||
|
||||
from modules.datamodels.datamodelExtraction import ContentPart
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Extractor:
|
||||
"""
|
||||
Base class for all document extractors.
|
||||
|
||||
Each extractor should implement:
|
||||
- detect(): Check if this extractor can handle the given file
|
||||
- extract(): Extract content from the file
|
||||
- getSupportedExtensions(): Return supported file extensions
|
||||
- getSupportedMimeTypes(): Return supported MIME types
|
||||
"""
|
||||
|
||||
def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
|
||||
"""Check if this extractor can handle the given file."""
|
||||
return False
|
||||
|
||||
def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> list[ContentPart]:
|
||||
"""Extract content from the file bytes."""
|
||||
raise NotImplementedError
|
||||
|
||||
def getSupportedExtensions(self) -> list[str]:
|
||||
"""Return list of supported file extensions (including dots)."""
|
||||
return []
|
||||
|
||||
def getSupportedMimeTypes(self) -> list[str]:
|
||||
"""Return list of supported MIME types."""
|
||||
return []
|
||||
|
||||
|
||||
class Chunker:
|
||||
def chunk(self, part: ContentPart, options: Dict[str, Any]) -> list[Dict[str, Any]]:
|
||||
return []
|
||||
|
||||
|
||||
class ExtractorRegistry:
|
||||
def __init__(self):
|
||||
self._map: Dict[str, Extractor] = {}
|
||||
self._fallback: Optional[Extractor] = None
|
||||
self._auto_discover_extractors()
|
||||
|
||||
def _auto_discover_extractors(self):
|
||||
"""Auto-discover and register all extractors from the extractors directory."""
|
||||
try:
|
||||
import os
|
||||
import importlib
|
||||
from pathlib import Path
|
||||
|
||||
# Get the extractors directory
|
||||
current_dir = Path(__file__).parent
|
||||
extractors_dir = current_dir / "extractors"
|
||||
|
||||
if not extractors_dir.exists():
|
||||
logger.error(f"Extractors directory not found: {extractors_dir}")
|
||||
return
|
||||
|
||||
# Import all extractor modules
|
||||
extractor_modules = []
|
||||
for file_path in extractors_dir.glob("extractor*.py"):
|
||||
if file_path.name == "__init__.py":
|
||||
continue
|
||||
|
||||
module_name = file_path.stem
|
||||
try:
|
||||
# Import the module
|
||||
module = importlib.import_module(f".{module_name}", package="modules.services.serviceExtraction.extractors")
|
||||
|
||||
# Find all extractor classes in the module
|
||||
for attr_name in dir(module):
|
||||
attr = getattr(module, attr_name)
|
||||
if (isinstance(attr, type) and
|
||||
issubclass(attr, Extractor) and
|
||||
attr != Extractor and
|
||||
not attr_name.startswith('_')):
|
||||
|
||||
# Create instance and auto-register
|
||||
extractor_instance = attr()
|
||||
self._auto_register_extractor(extractor_instance)
|
||||
extractor_modules.append(attr_name)
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to import {module_name}: {str(e)}")
|
||||
continue
|
||||
|
||||
# Set fallback extractor
|
||||
try:
|
||||
from .extractors.extractorBinary import BinaryExtractor
|
||||
self.setFallback(BinaryExtractor())
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to set fallback extractor: {str(e)}")
|
||||
|
||||
logger.info(f"ExtractorRegistry: Auto-discovered and registered {len(extractor_modules)} extractor classes: {', '.join(extractor_modules)}")
|
||||
logger.info(f"ExtractorRegistry: Total registered formats: {len(self._map)}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"ExtractorRegistry: Failed to auto-discover extractors: {str(e)}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
def _auto_register_extractor(self, extractor: Extractor):
|
||||
"""Auto-register an extractor based on its declared supported formats."""
|
||||
try:
|
||||
# Register MIME types
|
||||
mime_types = extractor.getSupportedMimeTypes()
|
||||
for mime_type in mime_types:
|
||||
self.register(mime_type, extractor)
|
||||
|
||||
# Register file extensions
|
||||
extensions = extractor.getSupportedExtensions()
|
||||
for ext in extensions:
|
||||
# Remove leading dot for registry key
|
||||
ext_key = ext.lstrip('.')
|
||||
self.register(ext_key, extractor)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to auto-register {extractor.__class__.__name__}: {str(e)}")
|
||||
|
||||
def register(self, key: str, extractor: Extractor):
|
||||
self._map[key] = extractor
|
||||
|
||||
def setFallback(self, extractor: Extractor):
|
||||
self._fallback = extractor
|
||||
|
||||
def resolve(self, mimeType: str, fileName: str) -> Optional[Extractor]:
|
||||
if mimeType in self._map:
|
||||
return self._map[mimeType]
|
||||
# simple extension fallback
|
||||
if "." in fileName:
|
||||
ext = fileName.lower().rsplit(".", 1)[-1]
|
||||
if ext in self._map:
|
||||
return self._map[ext]
|
||||
return self._fallback
|
||||
|
||||
def getAllSupportedFormats(self) -> Dict[str, Dict[str, list[str]]]:
|
||||
"""
|
||||
Get all supported formats from all registered extractors.
|
||||
|
||||
Returns:
|
||||
Dictionary with format information:
|
||||
{
|
||||
"extensions": {
|
||||
"extractor_name": [".ext1", ".ext2", ...]
|
||||
},
|
||||
"mime_types": {
|
||||
"extractor_name": ["mime/type1", "mime/type2", ...]
|
||||
}
|
||||
}
|
||||
"""
|
||||
formats = {"extensions": {}, "mime_types": {}}
|
||||
|
||||
# Get formats from registered extractors
|
||||
for key, extractor in self._map.items():
|
||||
if hasattr(extractor, 'getSupportedExtensions'):
|
||||
extensions = extractor.getSupportedExtensions()
|
||||
if extensions:
|
||||
formats["extensions"][key] = extensions
|
||||
|
||||
if hasattr(extractor, 'getSupportedMimeTypes'):
|
||||
mime_types = extractor.getSupportedMimeTypes()
|
||||
if mime_types:
|
||||
formats["mime_types"][key] = mime_types
|
||||
|
||||
# Add fallback extractor info
|
||||
if self._fallback and hasattr(self._fallback, 'getSupportedExtensions'):
|
||||
formats["extensions"]["fallback"] = self._fallback.getSupportedExtensions()
|
||||
if self._fallback and hasattr(self._fallback, 'getSupportedMimeTypes'):
|
||||
formats["mime_types"]["fallback"] = self._fallback.getSupportedMimeTypes()
|
||||
|
||||
return formats
|
||||
|
||||
|
||||
class ChunkerRegistry:
|
||||
def __init__(self):
|
||||
self._map: Dict[str, Chunker] = {}
|
||||
self._noop = Chunker()
|
||||
# Register default chunkers
|
||||
try:
|
||||
from .chunking.chunkerText import TextChunker
|
||||
from .chunking.chunkerTable import TableChunker
|
||||
from .chunking.chunkerStructure import StructureChunker
|
||||
from .chunking.chunkerImage import ImageChunker
|
||||
self.register("text", TextChunker())
|
||||
self.register("table", TableChunker())
|
||||
self.register("structure", StructureChunker())
|
||||
self.register("image", ImageChunker())
|
||||
# Use text chunker for container and binary content
|
||||
self.register("container", TextChunker())
|
||||
self.register("binary", TextChunker())
|
||||
except Exception as e:
|
||||
logger.error(f"ChunkerRegistry: Failed to register chunkers: {str(e)}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
def register(self, typeGroup: str, chunker: Chunker):
|
||||
self._map[typeGroup] = chunker
|
||||
|
||||
def resolve(self, typeGroup: str) -> Chunker:
|
||||
return self._map.get(typeGroup, self._noop)
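# Illustrative only: a hedged sketch of how the two registries above were meant to be
# used together - resolve an extractor by MIME type (with the file extension as
# fallback), extract ContentParts, then pick a chunker per typeGroup. The file name
# and MIME type below are hypothetical.
def _exampleResolveAndChunk(fileBytes: bytes):
    extractors = ExtractorRegistry()
    chunkers = ChunkerRegistry()
    extractor = extractors.resolve("application/pdf", "report.pdf")
    parts = extractor.extract(fileBytes, context={}) if extractor else []
    return [chunkers.resolve(part.typeGroup).chunk(part, options={}) for part in parts]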
@@ -1,7 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
import uuid


def makeId() -> str:
    return str(uuid.uuid4())
@@ -1,587 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
|
||||
# All rights reserved.
|
||||
import logging
|
||||
import uuid
|
||||
import base64
|
||||
import traceback
|
||||
from typing import Any, Dict, List, Optional, Callable
|
||||
from modules.datamodels.datamodelDocument import RenderedDocument
|
||||
from modules.datamodels.datamodelChat import ChatDocument
|
||||
from modules.services.serviceGeneration.subDocumentUtility import (
|
||||
getFileExtension,
|
||||
getMimeTypeFromExtension,
|
||||
detectMimeTypeFromContent,
|
||||
detectMimeTypeFromData,
|
||||
convertDocumentDataToString
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class GenerationService:
|
||||
def __init__(self, serviceCenter=None):
|
||||
# Directly use interfaces from the provided service center (no self.service calls)
|
||||
self.services = serviceCenter
|
||||
self.interfaceDbComponent = serviceCenter.interfaceDbComponent
|
||||
self.interfaceDbChat = serviceCenter.interfaceDbChat
|
||||
|
||||
def processActionResultDocuments(self, actionResult, action) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Process documents produced by AI actions and convert them to ChatDocument format.
|
||||
This function handles AI-generated document data, not document references.
|
||||
Returns a list of processed document dictionaries.
|
||||
"""
|
||||
try:
|
||||
# Read documents from the standard documents field (not data.documents)
|
||||
documents = actionResult.documents if actionResult and hasattr(actionResult, 'documents') else []
|
||||
|
||||
if not documents:
|
||||
return []
|
||||
|
||||
# Process each document from the AI action result
|
||||
processedDocuments = []
|
||||
for doc in documents:
|
||||
processedDoc = self.processSingleDocument(doc, action)
|
||||
if processedDoc:
|
||||
processedDocuments.append(processedDoc)
|
||||
|
||||
return processedDocuments
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing action result documents: {str(e)}")
|
||||
return []
|
||||
|
||||
def processSingleDocument(self, doc: Any, action) -> Optional[Dict[str, Any]]:
|
||||
"""Process a single document from action result with simplified logic"""
|
||||
try:
|
||||
# ActionDocument objects have documentName, documentData, and mimeType
|
||||
mime_type = doc.mimeType
|
||||
if mime_type == "application/octet-stream":
|
||||
content = doc.documentData
|
||||
# Detect MIME without relying on a service center
|
||||
mime_type = detectMimeTypeFromContent(content, doc.documentName)
|
||||
|
||||
# IMPORTANT: For ActionDocuments with validationMetadata (e.g. context.extractContent)
# we must serialize the entire ActionDocument, not just documentData
|
||||
document_data = doc.documentData
|
||||
if hasattr(doc, 'validationMetadata') and doc.validationMetadata:
|
||||
# If validationMetadata is present, serialize the complete ActionDocument format
|
||||
if mime_type == "application/json":
|
||||
# Build the ActionDocument format with validationMetadata and documentData
|
||||
if hasattr(document_data, 'model_dump'):
|
||||
# Pydantic v2
|
||||
document_data_dict = document_data.model_dump()
|
||||
elif hasattr(document_data, 'dict'):
|
||||
# Pydantic v1
|
||||
document_data_dict = document_data.dict()
|
||||
elif isinstance(document_data, dict):
|
||||
document_data_dict = document_data
|
||||
elif isinstance(document_data, str):
|
||||
# JSON string: parse and store as a dict (e.g. from outlook.composeAndDraftEmailWithContext)
|
||||
import json
|
||||
try:
|
||||
document_data_dict = json.loads(document_data)
|
||||
except json.JSONDecodeError:
|
||||
# Not valid JSON - store as plain text
|
||||
document_data_dict = {"data": document_data}
|
||||
else:
|
||||
document_data_dict = {"data": str(document_data)}
|
||||
|
||||
# Erstelle ActionDocument-Format
|
||||
document_data = {
|
||||
"validationMetadata": doc.validationMetadata,
|
||||
"documentData": document_data_dict
|
||||
}
|
||||
|
||||
return {
|
||||
'fileName': doc.documentName,
|
||||
'fileSize': len(str(document_data)),
|
||||
'mimeType': mime_type,
|
||||
'content': document_data,
|
||||
'document': doc
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing single document: {str(e)}")
|
||||
return None
|
||||
|
||||
def createDocumentsFromActionResult(self, actionResult, action, workflow, message_id=None) -> List[Any]:
|
||||
"""
|
||||
Create actual document objects from action result and store them in the system.
|
||||
Returns a list of created document objects with proper workflow context.
|
||||
"""
|
||||
try:
|
||||
processed_docs = self.processActionResultDocuments(actionResult, action)
|
||||
|
||||
createdDocuments = []
|
||||
for i, doc_data in enumerate(processed_docs):
|
||||
try:
|
||||
documentName = doc_data['fileName']
|
||||
documentData = doc_data['content']
|
||||
mimeType = doc_data['mimeType']
|
||||
|
||||
# Handle binary data (images, PDFs, Office docs) differently from text
|
||||
# Check if this is a binary MIME type
|
||||
binaryMimeTypes = {
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
||||
"application/pdf",
|
||||
"image/png", "image/jpeg", "image/jpg", "image/gif", "image/webp", "image/bmp", "image/svg+xml",
|
||||
}
|
||||
|
||||
isBinaryMimeType = mimeType in binaryMimeTypes
|
||||
base64encoded = False
|
||||
content = None
|
||||
|
||||
if isBinaryMimeType:
|
||||
# For binary data, handle bytes vs base64 string vs regular string
|
||||
if isinstance(documentData, bytes):
|
||||
# Already bytes - encode to base64 string for storage
|
||||
# base64 is already imported at module level
|
||||
content = base64.b64encode(documentData).decode('utf-8')
|
||||
base64encoded = True
|
||||
elif isinstance(documentData, str):
|
||||
# Check if it's already valid base64
|
||||
# base64 is already imported at module level
|
||||
try:
|
||||
# Try to decode to verify it's base64
|
||||
base64.b64decode(documentData, validate=True)
|
||||
# Valid base64 - use as is
|
||||
content = documentData
|
||||
base64encoded = True
|
||||
except Exception:
|
||||
# Not valid base64 - might be raw string, try encoding
|
||||
try:
|
||||
content = base64.b64encode(documentData.encode('utf-8')).decode('utf-8')
|
||||
base64encoded = True
|
||||
except Exception:
|
||||
logger.warning(f"Could not process binary data for {documentName}, skipping")
|
||||
continue
|
||||
else:
|
||||
# Other types - convert to string then base64
|
||||
# base64 is already imported at module level
|
||||
try:
|
||||
content = base64.b64encode(str(documentData).encode('utf-8')).decode('utf-8')
|
||||
base64encoded = True
|
||||
except Exception:
|
||||
logger.warning(f"Could not encode binary data for {documentName}, skipping")
|
||||
continue
|
||||
else:
|
||||
# Text data - convert to string
|
||||
content = convertDocumentDataToString(documentData, getFileExtension(documentName))
|
||||
|
||||
# Skip empty or minimal content
|
||||
minimalContentPatterns = ['{}', '[]', 'null', '""', "''"]
|
||||
if not content or content.strip() == "" or content.strip() in minimalContentPatterns:
|
||||
logger.warning(f"Empty or minimal content for document {documentName}, skipping")
|
||||
continue
|
||||
|
||||
# Normalize file extension based on mime type if missing or incorrect
|
||||
try:
|
||||
mime_to_ext = {
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
|
||||
"application/pdf": ".pdf",
|
||||
"text/html": ".html",
|
||||
"text/markdown": ".md",
|
||||
"text/plain": ".txt",
|
||||
"application/json": ".json",
|
||||
"image/png": ".png",
|
||||
"image/jpeg": ".jpg",
|
||||
"image/jpg": ".jpg",
|
||||
"image/gif": ".gif",
|
||||
"image/webp": ".webp",
|
||||
"image/bmp": ".bmp",
|
||||
"image/svg+xml": ".svg",
|
||||
}
|
||||
expectedExt = mime_to_ext.get(mimeType)
|
||||
if expectedExt:
|
||||
if not documentName.lower().endswith(expectedExt):
|
||||
# Append/replace extension to match mime type
|
||||
if "." in documentName:
|
||||
documentName = documentName.rsplit(".", 1)[0] + expectedExt
|
||||
else:
|
||||
documentName = documentName + expectedExt
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Create document with file in one step using interfaces directly
|
||||
document = self._createDocument(
|
||||
fileName=documentName,
|
||||
mimeType=mimeType,
|
||||
content=content,
|
||||
base64encoded=base64encoded,
|
||||
messageId=message_id
|
||||
)
|
||||
if document:
|
||||
# Set workflow context on the document if possible
|
||||
self._setDocumentWorkflowContext(document, action, workflow)
|
||||
createdDocuments.append(document)
|
||||
else:
|
||||
logger.error(f"Failed to create ChatDocument object for {documentName}")
|
||||
except Exception as e:
|
||||
logger.error(f"Error creating document {doc_data.get('fileName', 'unknown')}: {str(e)}")
|
||||
continue
|
||||
|
||||
return createdDocuments
|
||||
except Exception as e:
|
||||
logger.error(f"Error creating documents from action result: {str(e)}")
|
||||
return []
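# Illustrative only: a standalone sketch (an assumption, not the original helper) of the
# binary-content normalization rule applied above - raw bytes are base64-encoded, strings
# are kept if they already decode as base64, anything else is stringified and encoded.
def _exampleNormalizeBinaryContent(documentData) -> str:
    if isinstance(documentData, bytes):
        return base64.b64encode(documentData).decode('utf-8')
    if isinstance(documentData, str):
        try:
            base64.b64decode(documentData, validate=True)
            return documentData  # already valid base64, use as-is
        except Exception:
            pass
    return base64.b64encode(str(documentData).encode('utf-8')).decode('utf-8')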
|
||||
|
||||
def _setDocumentWorkflowContext(self, document, action, workflow):
|
||||
"""Set workflow context on a document for proper routing and labeling"""
|
||||
try:
|
||||
# Get current workflow context directly from workflow object
|
||||
workflowContext = self._getWorkflowContext(workflow)
|
||||
workflowStats = self._getWorkflowStats(workflow)
|
||||
|
||||
currentRound = workflowContext.get('currentRound', 0)
|
||||
currentTask = workflowContext.get('currentTask', 0)
|
||||
currentAction = workflowContext.get('currentAction', 0)
|
||||
|
||||
# Try to set workflow context attributes if they exist
|
||||
if hasattr(document, 'roundNumber'):
|
||||
document.roundNumber = currentRound
|
||||
if hasattr(document, 'taskNumber'):
|
||||
document.taskNumber = currentTask
|
||||
if hasattr(document, 'actionNumber'):
|
||||
document.actionNumber = currentAction
|
||||
if hasattr(document, 'actionId'):
|
||||
document.actionId = action.id if hasattr(action, 'id') else None
|
||||
|
||||
# Set additional workflow metadata if available
|
||||
if hasattr(document, 'workflowId'):
|
||||
document.workflowId = workflowStats.get('workflowId', workflow.id if hasattr(workflow, 'id') else None)
|
||||
if hasattr(document, 'workflowStatus'):
|
||||
document.workflowStatus = workflowStats.get('workflowStatus', workflow.status if hasattr(workflow, 'status') else 'unknown')
|
||||
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not set workflow context on document: {str(e)}")
|
||||
|
||||
def _createDocument(self, fileName: str, mimeType: str, content: str, base64encoded: bool = True, messageId: str = None) -> Optional[ChatDocument]:
|
||||
"""Create file and ChatDocument using interfaces without service indirection."""
|
||||
try:
|
||||
if not self.interfaceDbComponent:
|
||||
logger.error("Component interface not available for document creation")
|
||||
return None
|
||||
# Convert content to bytes
|
||||
if base64encoded:
|
||||
# base64 is already imported at module level
|
||||
content_bytes = base64.b64decode(content)
|
||||
else:
|
||||
content_bytes = content.encode('utf-8')
|
||||
# Create file and store data
|
||||
file_item = self.interfaceDbComponent.createFile(
|
||||
name=fileName,
|
||||
mimeType=mimeType,
|
||||
content=content_bytes
|
||||
)
|
||||
self.interfaceDbComponent.createFileData(file_item.id, content_bytes)
|
||||
# Collect file info
|
||||
file_info = self._getFileInfo(file_item.id)
|
||||
if not file_info:
|
||||
logger.error(f"Could not get file info for fileId: {file_item.id}")
|
||||
return None
|
||||
# Build ChatDocument
|
||||
document = ChatDocument(
|
||||
id=str(uuid.uuid4()),
|
||||
messageId=messageId or "",
|
||||
fileId=file_item.id,
|
||||
fileName=file_info.get("fileName", fileName),
|
||||
fileSize=file_info.get("size", 0),
|
||||
mimeType=file_info.get("mimeType", mimeType)
|
||||
)
|
||||
# Ensure document can access component interface later
|
||||
if hasattr(document, 'setComponentInterface') and self.interfaceDbComponent:
|
||||
try:
|
||||
document.setComponentInterface(self.interfaceDbComponent)
|
||||
except Exception:
|
||||
pass
|
||||
return document
|
||||
except Exception as e:
|
||||
logger.error(f"Error creating document: {str(e)}")
|
||||
return None
|
||||
|
||||
def _getFileInfo(self, fileId: str) -> Optional[Dict[str, Any]]:
|
||||
try:
|
||||
if not self.interfaceDbComponent:
|
||||
return None
|
||||
file_item = self.interfaceDbComponent.getFile(fileId)
|
||||
if file_item:
|
||||
return {
|
||||
"id": file_item.id,
|
||||
"fileName": file_item.fileName,
|
||||
"size": file_item.fileSize,
|
||||
"mimeType": file_item.mimeType,
|
||||
"fileHash": getattr(file_item, 'fileHash', None),
|
||||
"creationDate": getattr(file_item, 'creationDate', None)
|
||||
}
|
||||
return None
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting file info for {fileId}: {str(e)}")
|
||||
return None
|
||||
|
||||
def _getWorkflowContext(self, workflow) -> Dict[str, int]:
|
||||
try:
|
||||
return {
|
||||
'currentRound': getattr(workflow, 'currentRound', 0),
|
||||
'currentTask': getattr(workflow, 'currentTask', 0),
|
||||
'currentAction': getattr(workflow, 'currentAction', 0)
|
||||
}
|
||||
except Exception:
|
||||
return {'currentRound': 0, 'currentTask': 0, 'currentAction': 0}
|
||||
|
||||
def _getWorkflowStats(self, workflow) -> Dict[str, Any]:
|
||||
try:
|
||||
context = self._getWorkflowContext(workflow)
|
||||
return {
|
||||
'currentRound': context['currentRound'],
|
||||
'currentTask': context['currentTask'],
|
||||
'currentAction': context['currentAction'],
|
||||
'totalTasks': getattr(workflow, 'totalTasks', 0),
|
||||
'totalActions': getattr(workflow, 'totalActions', 0),
|
||||
'workflowStatus': getattr(workflow, 'status', 'unknown'),
|
||||
'workflowId': getattr(workflow, 'id', 'unknown')
|
||||
}
|
||||
except Exception:
|
||||
return {
|
||||
'currentRound': 0,
|
||||
'currentTask': 0,
|
||||
'currentAction': 0,
|
||||
'totalTasks': 0,
|
||||
'totalActions': 0,
|
||||
'workflowStatus': 'unknown',
|
||||
'workflowId': 'unknown'
|
||||
}
|
||||
|
||||
async def renderReport(self, extractedContent: Dict[str, Any], outputFormat: str, language: str, title: str, userPrompt: str = None, aiService=None, parentOperationId: Optional[str] = None) -> List[RenderedDocument]:
|
||||
"""
|
||||
Render extracted JSON content to the specified output format.
|
||||
Processes EACH document separately and calls renderer for each.
|
||||
Each renderer can return 1..n documents (e.g., HTML + images).
|
||||
|
||||
Per-document format and language are extracted from structure (validated in State 3).
|
||||
Multiple documents can have different formats and languages.
|
||||
|
||||
Args:
|
||||
extractedContent: Structured JSON document with documents array
|
||||
outputFormat: Target format (html, pdf, docx, txt, md, json, csv, xlsx) - Global fallback
|
||||
language: Language (global fallback) - Per-document language extracted from structure
|
||||
title: Report title
|
||||
userPrompt: User's original prompt for report generation
|
||||
aiService: AI service instance for generation prompt creation
|
||||
parentOperationId: Optional parent operation ID for hierarchical logging
|
||||
|
||||
Returns:
|
||||
List of RenderedDocument objects.
|
||||
Each RenderedDocument represents one rendered file (main document or supporting file)
|
||||
"""
|
||||
try:
|
||||
# Validate JSON input
|
||||
if not isinstance(extractedContent, dict):
|
||||
raise ValueError("extractedContent must be a JSON dictionary")
|
||||
|
||||
# Unified approach: Always expect "documents" array
|
||||
if "documents" not in extractedContent:
|
||||
raise ValueError("extractedContent must contain 'documents' array")
|
||||
|
||||
documents = extractedContent["documents"]
|
||||
if len(documents) == 0:
|
||||
raise ValueError("No documents found in 'documents' array")
|
||||
|
||||
metadata = extractedContent.get("metadata", {})
|
||||
allRenderedDocuments = []
|
||||
|
||||
# Process EACH document separately
|
||||
for docIndex, doc in enumerate(documents):
|
||||
if not isinstance(doc, dict):
|
||||
logger.warning(f"Skipping invalid document at index {docIndex}")
|
||||
continue
|
||||
|
||||
if "sections" not in doc:
|
||||
logger.warning(f"Document {doc.get('id', docIndex)} has no sections, skipping")
|
||||
continue
|
||||
|
||||
# Determine format for this document
|
||||
# Check outputFormat field first (per-document), then format field (legacy), then global fallback
|
||||
docFormat = doc.get("outputFormat") or doc.get("format") or outputFormat
|
||||
|
||||
# Determine language for this document
|
||||
# Extract per-document language from structure (validated in State 3), fallback to global
|
||||
docLanguage = doc.get("language") or language
|
||||
|
||||
# Validate language format (should be 2-character ISO code, validated in State 3)
|
||||
if not isinstance(docLanguage, str) or len(docLanguage) != 2:
|
||||
logger.warning(f"Document {doc.get('id')} has invalid language format: {docLanguage}, using fallback")
|
||||
docLanguage = language # Use global fallback
|
||||
|
||||
# Get renderer for this document's format
|
||||
renderer = self._getFormatRenderer(docFormat)
|
||||
if not renderer:
|
||||
logger.warning(f"Unsupported format '{docFormat}' for document {doc.get('id', docIndex)}, skipping")
|
||||
continue
|
||||
|
||||
# Check output style classification (code/document/image/etc.) from renderer
|
||||
from modules.services.serviceGeneration.renderers.registry import getOutputStyle
|
||||
outputStyle = getOutputStyle(docFormat)
|
||||
if outputStyle:
|
||||
logger.debug(f"Document {doc.get('id', docIndex)} format '{docFormat}' classified as '{outputStyle}' style")
|
||||
# Store style in document metadata for potential use in processing paths
|
||||
if "metadata" not in doc:
|
||||
doc["metadata"] = {}
|
||||
doc["metadata"]["outputStyle"] = outputStyle
|
||||
|
||||
# Create JSON structure with single document (preserving metadata)
|
||||
singleDocContent = {
|
||||
"metadata": {**metadata, "language": docLanguage}, # Add per-document language to metadata
|
||||
"documents": [doc] # Only this document
|
||||
}
|
||||
|
||||
# Use document title or fallback to provided title
|
||||
docTitle = doc.get("title", title)
|
||||
|
||||
# Render this document (can return multiple files, e.g., HTML + images)
|
||||
renderedDocs = await renderer.render(singleDocContent, docTitle, userPrompt, aiService)
|
||||
allRenderedDocuments.extend(renderedDocs)
|
||||
|
||||
logger.info(f"Rendered {len(documents)} document(s) into {len(allRenderedDocuments)} file(s)")
|
||||
return allRenderedDocuments
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error rendering JSON report to {outputFormat}: {str(e)}")
|
||||
raise
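# Illustrative only: a hedged sketch of the input shape renderReport expects - a
# 'documents' array where each entry carries its own sections plus an optional
# per-document outputFormat and two-letter language code. All field values here
# are hypothetical.
_EXAMPLE_EXTRACTED_CONTENT = {
    "metadata": {"source": "example"},
    "documents": [
        {
            "id": "doc_1",
            "title": "Summary",
            "outputFormat": "html",  # per-document format, falls back to the global outputFormat
            "language": "en",        # must be a 2-character code, otherwise the global fallback is used
            "sections": []
        }
    ]
}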
|
||||
|
||||
async def generateDocumentWithTwoPhases(
|
||||
self,
|
||||
userPrompt: str,
|
||||
cachedContent: Optional[Dict[str, Any]] = None,
|
||||
contentParts: Optional[List[Any]] = None,
|
||||
maxSectionLength: int = 500,
|
||||
parallelGeneration: bool = True,
|
||||
progressCallback: Optional[Callable] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Generate document using two-phase approach:
|
||||
1. Generate structure skeleton with empty sections
|
||||
2. Generate content for each section iteratively
|
||||
|
||||
This is the core logic for document generation in AI calls.
|
||||
|
||||
Args:
|
||||
userPrompt: User's original prompt
|
||||
cachedContent: Optional extracted content cache (from extraction phase)
|
||||
contentParts: Optional list of ContentParts to use for structure generation
|
||||
maxSectionLength: Maximum words for simple sections
|
||||
parallelGeneration: Enable parallel section generation
|
||||
progressCallback: Optional callback function(progress, total, message) for progress updates
|
||||
|
||||
Returns:
|
||||
Complete document structure with populated elements ready for rendering
|
||||
"""
|
||||
try:
|
||||
from modules.services.serviceGeneration.subStructureGenerator import StructureGenerator
|
||||
from modules.services.serviceGeneration.subContentGenerator import ContentGenerator
|
||||
|
||||
# Phase 1: Generate structure skeleton
|
||||
if progressCallback:
|
||||
progressCallback(0, 100, "Generating document structure...")
|
||||
|
||||
structureGenerator = StructureGenerator(self.services)
|
||||
|
||||
# Extract imageDocuments from cachedContent if available
|
||||
existingImages = None
|
||||
if cachedContent and cachedContent.get("imageDocuments"):
|
||||
existingImages = cachedContent.get("imageDocuments")
|
||||
|
||||
structure = await structureGenerator.generateStructure(
|
||||
userPrompt=userPrompt,
|
||||
documentList=None, # Not used in current implementation
|
||||
cachedContent=cachedContent,
|
||||
contentParts=contentParts, # Pass ContentParts for structure generation
|
||||
maxSectionLength=maxSectionLength,
|
||||
existingImages=existingImages
|
||||
)
|
||||
|
||||
if progressCallback:
|
||||
progressCallback(30, 100, "Structure generated, starting content generation...")
|
||||
|
||||
# Phase 2: Generate content for each section
|
||||
contentGenerator = ContentGenerator(self.services)
|
||||
|
||||
# Create progress callback wrapper for content generation phase (30-90%)
|
||||
def contentProgressCallback(sectionIndex: int, totalSections: int, message: str):
|
||||
if progressCallback:
|
||||
# Map section progress to overall progress (30% to 90%)
|
||||
if totalSections > 0:
|
||||
overallProgress = 30 + int(60 * (sectionIndex / totalSections))
|
||||
else:
|
||||
overallProgress = 30
|
||||
progressCallback(overallProgress, 100, f"Section {sectionIndex}/{totalSections}: {message}")
|
||||
|
||||
completeStructure = await contentGenerator.generateContent(
|
||||
structure=structure,
|
||||
cachedContent=cachedContent,
|
||||
userPrompt=userPrompt,
|
||||
contentParts=contentParts, # Pass ContentParts for content generation
|
||||
progressCallback=contentProgressCallback,
|
||||
parallelGeneration=parallelGeneration
|
||||
)
|
||||
|
||||
if progressCallback:
|
||||
progressCallback(100, 100, "Document generation complete")
|
||||
|
||||
return completeStructure
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in two-phase document generation: {str(e)}")
|
||||
logger.debug(traceback.format_exc())
|
||||
raise
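# Illustrative only: a minimal sketch of driving the two-phase generation with a progress
# callback; structure generation reports 0-30% and per-section content generation is
# mapped onto 30-90%. The prompt text below is hypothetical.
async def _exampleTwoPhaseGeneration(generationService: "GenerationService"):
    def onProgress(progress: int, total: int, message: str):
        print(f"[{progress}/{total}] {message}")

    return await generationService.generateDocumentWithTwoPhases(
        userPrompt="Create a short project status report",
        parallelGeneration=True,
        progressCallback=onProgress
    )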
|
||||
|
||||
async def getAdaptiveExtractionPrompt(
|
||||
self,
|
||||
outputFormat: str,
|
||||
userPrompt: str,
|
||||
title: str,
|
||||
aiService=None
|
||||
) -> str:
|
||||
"""Get adaptive extraction prompt."""
|
||||
from modules.services.serviceExtraction.subPromptBuilderExtraction import buildExtractionPrompt
|
||||
return await buildExtractionPrompt(
|
||||
outputFormat=outputFormat,
|
||||
userPrompt=userPrompt,
|
||||
title=title,
|
||||
aiService=aiService,
|
||||
services=self.services
|
||||
)
|
||||
|
||||
|
||||
def _getFormatRenderer(self, output_format: str):
|
||||
"""Get the appropriate document renderer for the specified format."""
|
||||
try:
|
||||
from .renderers.registry import getRenderer, getSupportedFormats
|
||||
renderer = getRenderer(output_format, services=self.services, outputStyle='document')
|
||||
|
||||
if renderer:
|
||||
return renderer
|
||||
|
||||
# Log available formats for debugging
|
||||
availableFormats = getSupportedFormats()
|
||||
logger.error(
|
||||
f"No renderer found for format '{output_format}'. "
|
||||
f"Available formats: {availableFormats}"
|
||||
)
|
||||
|
||||
# Fallback to text renderer if no specific renderer found
|
||||
logger.warning(f"Falling back to text renderer for format {output_format}")
|
||||
fallbackRenderer = getRenderer('text', services=self.services, outputStyle='document')
|
||||
if fallbackRenderer:
|
||||
return fallbackRenderer
|
||||
|
||||
logger.error("Even text renderer fallback failed")
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting renderer for {output_format}: {str(e)}")
|
||||
# traceback is already imported at module level
|
||||
logger.debug(traceback.format_exc())
|
||||
return None
@@ -1,939 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
|
||||
# All rights reserved.
|
||||
"""
|
||||
Code Generation Path
|
||||
|
||||
Handles code generation with multi-file project support, dependency handling,
|
||||
and proper cross-file references.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
import re
|
||||
from typing import Dict, Any, List, Optional
|
||||
from modules.datamodels.datamodelWorkflow import AiResponse, AiResponseMetadata, DocumentData
|
||||
from modules.datamodels.datamodelExtraction import ContentPart
|
||||
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum
|
||||
from modules.shared.jsonUtils import extractJsonString
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CodeGenerationPath:
|
||||
"""Code generation path."""
|
||||
|
||||
def __init__(self, services):
|
||||
self.services = services
|
||||
|
||||
|
||||
async def generateCode(
|
||||
self,
|
||||
userPrompt: str,
|
||||
outputFormat: str = None,
|
||||
contentParts: Optional[List[ContentPart]] = None,
|
||||
title: str = "Generated Code",
|
||||
parentOperationId: Optional[str] = None
|
||||
) -> AiResponse:
|
||||
"""
|
||||
Generate code files with multi-file project support.
|
||||
|
||||
Returns: AiResponse with code files as documents
|
||||
"""
|
||||
# Create operation ID
|
||||
workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
|
||||
codeOperationId = f"code_gen_{workflowId}_{int(time.time())}"
|
||||
|
||||
# Start progress tracking
|
||||
self.services.chat.progressLogStart(
|
||||
codeOperationId,
|
||||
"Code Generation",
|
||||
"Code Generation",
|
||||
f"Format: {outputFormat or 'txt'}",
|
||||
parentOperationId=parentOperationId
|
||||
)
|
||||
|
||||
try:
|
||||
# Detect language and project type from prompt or outputFormat
|
||||
language, projectType = self._detectLanguageAndProjectType(userPrompt, outputFormat)
|
||||
|
||||
# Phase 1: Code structure generation (with looping)
|
||||
self.services.chat.progressLogUpdate(codeOperationId, 0.2, "Generating code structure")
|
||||
codeStructure = await self._generateCodeStructure(
|
||||
userPrompt=userPrompt,
|
||||
language=language,
|
||||
outputFormat=outputFormat,
|
||||
contentParts=contentParts
|
||||
)
|
||||
|
||||
# Phase 2: Code content generation (with dependency handling)
|
||||
self.services.chat.progressLogUpdate(codeOperationId, 0.5, "Generating code content")
|
||||
codeFiles = await self._generateCodeContent(
|
||||
codeStructure,
|
||||
codeOperationId,
|
||||
userPrompt=userPrompt,
|
||||
contentParts=contentParts
|
||||
)
|
||||
|
||||
# Phase 3: Code formatting & validation
|
||||
self.services.chat.progressLogUpdate(codeOperationId, 0.8, "Formatting code files")
|
||||
formattedFiles = await self._formatAndValidateCode(codeFiles)
|
||||
|
||||
# Phase 4: Code Rendering (Renderer-Based)
|
||||
self.services.chat.progressLogUpdate(codeOperationId, 0.9, "Rendering code files")
|
||||
|
||||
# Group files by format
|
||||
filesByFormat = {}
|
||||
for file in formattedFiles:
|
||||
fileType = file.get("fileType", outputFormat or "txt")
|
||||
if fileType not in filesByFormat:
|
||||
filesByFormat[fileType] = []
|
||||
filesByFormat[fileType].append(file)
|
||||
|
||||
# Render each format group using appropriate renderer
|
||||
allRenderedDocuments = []
|
||||
for fileType, files in filesByFormat.items():
|
||||
# Get renderer for this format
|
||||
renderer = self._getCodeRenderer(fileType)
|
||||
|
||||
if renderer:
|
||||
# Use code renderer
|
||||
renderedDocs = await renderer.renderCodeFiles(
|
||||
codeFiles=files,
|
||||
metadata=codeStructure.get("metadata", {}),
|
||||
userPrompt=userPrompt
|
||||
)
|
||||
allRenderedDocuments.extend(renderedDocs)
|
||||
else:
|
||||
# Fallback: output directly (for formats without renderers)
|
||||
for file in files:
|
||||
mimeType = self._getMimeType(file.get("fileType", "txt"))
|
||||
content = file.get("content", "")
|
||||
contentBytes = content.encode('utf-8') if isinstance(content, str) else content
|
||||
|
||||
from modules.datamodels.datamodelDocument import RenderedDocument
|
||||
allRenderedDocuments.append(
|
||||
RenderedDocument(
|
||||
documentData=contentBytes,
|
||||
mimeType=mimeType,
|
||||
filename=file.get("filename", "generated.txt"),
|
||||
metadata=codeStructure.get("metadata", {})
|
||||
)
|
||||
)
|
||||
|
||||
# Convert RenderedDocument to DocumentData
|
||||
documents = []
|
||||
for renderedDoc in allRenderedDocuments:
|
||||
documents.append(DocumentData(
|
||||
documentName=renderedDoc.filename,
|
||||
documentData=renderedDoc.documentData,
|
||||
mimeType=renderedDoc.mimeType,
|
||||
sourceJson=renderedDoc.metadata if hasattr(renderedDoc, 'metadata') else None
|
||||
))
|
||||
|
||||
metadata = AiResponseMetadata(
|
||||
title=title,
|
||||
operationType=OperationTypeEnum.DATA_GENERATE.value
|
||||
)
|
||||
|
||||
# Create summary JSON for content field
|
||||
summaryContent = {
|
||||
"type": "code_generation",
|
||||
"metadata": codeStructure.get("metadata", {}),
|
||||
"files": [
|
||||
{
|
||||
"filename": doc.documentName,
|
||||
"mimeType": doc.mimeType
|
||||
}
|
||||
for doc in documents
|
||||
],
|
||||
"fileCount": len(documents)
|
||||
}
|
||||
|
||||
self.services.chat.progressLogFinish(codeOperationId, True)
|
||||
|
||||
return AiResponse(
|
||||
documents=documents,
|
||||
content=json.dumps(summaryContent, ensure_ascii=False),
|
||||
metadata=metadata
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in code generation: {str(e)}")
|
||||
self.services.chat.progressLogFinish(codeOperationId, False)
|
||||
raise
|
||||
|
||||
def _detectLanguageAndProjectType(self, userPrompt: str, outputFormat: Optional[str]) -> tuple:
|
||||
"""Detect programming language and project type from prompt or format."""
|
||||
promptLower = userPrompt.lower()
|
||||
|
||||
# Detect language
|
||||
language = None
|
||||
if outputFormat:
|
||||
if outputFormat == "py":
|
||||
language = "python"
|
||||
elif outputFormat in ["js", "ts"]:
|
||||
language = outputFormat
|
||||
elif outputFormat == "html":
|
||||
language = "html"
|
||||
|
||||
if not language:
|
||||
if "python" in promptLower or ".py" in promptLower:
|
||||
language = "python"
|
||||
elif "javascript" in promptLower or ".js" in promptLower:
|
||||
language = "javascript"
|
||||
elif "typescript" in promptLower or ".ts" in promptLower:
|
||||
language = "typescript"
|
||||
elif "html" in promptLower:
|
||||
language = "html"
|
||||
else:
|
||||
language = "python" # Default
|
||||
|
||||
# Detect project type
|
||||
projectType = "single_file"
|
||||
if "multi" in promptLower or "multiple files" in promptLower or "project" in promptLower:
|
||||
projectType = "multi_file"
|
||||
|
||||
return language, projectType
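# Illustrative only: expected behaviour of the heuristic above for two hypothetical
# inputs (an explicit outputFormat takes precedence over keywords in the prompt):
#   _detectLanguageAndProjectType("build a multi file project", "py")  -> ("python", "multi_file")
#   _detectLanguageAndProjectType("write a typescript helper", None)   -> ("typescript", "single_file")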
|
||||
|
||||
async def _generateCodeStructure(
|
||||
self,
|
||||
userPrompt: str,
|
||||
language: str,
|
||||
outputFormat: Optional[str],
|
||||
contentParts: Optional[List[ContentPart]]
|
||||
) -> Dict[str, Any]:
|
||||
"""Generate code structure using looping system."""
|
||||
|
||||
# Build content parts index (similar to document generation)
|
||||
contentPartsIndex = ""
|
||||
if contentParts:
|
||||
validParts = []
|
||||
for part in contentParts:
|
||||
contentFormat = part.metadata.get("contentFormat", "unknown")
|
||||
originalFileName = part.metadata.get('originalFileName', 'N/A')
|
||||
|
||||
# Include reference parts and parts with data
|
||||
if contentFormat == "reference" or (part.data and len(str(part.data).strip()) > 0):
|
||||
validParts.append(part)
|
||||
|
||||
if validParts:
|
||||
contentPartsIndex = "\n## AVAILABLE CONTENT PARTS\n"
|
||||
for i, part in enumerate(validParts, 1):
|
||||
contentFormat = part.metadata.get("contentFormat", "unknown")
|
||||
originalFileName = part.metadata.get('originalFileName', 'N/A')
|
||||
|
||||
contentPartsIndex += f"\n{i}. ContentPart ID: {part.id}\n"
|
||||
contentPartsIndex += f" Format: {contentFormat}\n"
|
||||
contentPartsIndex += f" Type: {part.typeGroup}\n"
|
||||
contentPartsIndex += f" MIME Type: {part.mimeType or 'N/A'}\n"
|
||||
contentPartsIndex += f" Source: {part.metadata.get('documentId', 'unknown')}\n"
|
||||
contentPartsIndex += f" Original file name: {originalFileName}\n"
|
||||
contentPartsIndex += f" Usage hint: {part.metadata.get('usageHint', 'N/A')}\n"
|
||||
|
||||
if not contentPartsIndex:
|
||||
contentPartsIndex = "\n(No content parts available)"
|
||||
|
||||
# Create template structure explicitly (not extracted from prompt)
|
||||
templateStructure = f"""{{
|
||||
"metadata": {{
|
||||
"language": "{language}",
|
||||
"projectType": "single_file|multi_file",
|
||||
"projectName": ""
|
||||
}},
|
||||
"files": [
|
||||
{{
|
||||
"id": "",
|
||||
"filename": "",
|
||||
"fileType": "",
|
||||
"dependencies": [],
|
||||
"imports": [],
|
||||
"functions": [],
|
||||
"classes": []
|
||||
}}
|
||||
]
|
||||
}}"""
|
||||
|
||||
# Build structure generation prompt
|
||||
structurePrompt = f"""# TASK: Generate Code Project Structure
|
||||
|
||||
This is a PLANNING task. Return EXACTLY ONE complete JSON object. Do not generate multiple JSON objects, alternatives, or variations. Do not use separators like "---" between JSON objects.
|
||||
|
||||
## USER REQUEST (for context)
|
||||
```
|
||||
{userPrompt}
|
||||
```
|
||||
{contentPartsIndex}
|
||||
|
||||
## LANGUAGE
|
||||
{language}
|
||||
|
||||
## TASK DESCRIPTION
|
||||
Analyze the USER REQUEST above and create a project structure that fulfills ALL requirements mentioned in the request.
|
||||
|
||||
IMPORTANT: If the request mentions multiple files (e.g., "3 files", "config.json and customers.json", etc.), you MUST include ALL requested files in the files array. Set projectType to "multi_file" when multiple files are requested.
|
||||
|
||||
## CONTENT PARTS USAGE (if available)
|
||||
If AVAILABLE CONTENT PARTS are listed above, use them to inform the file structure:
|
||||
|
||||
**Analyzing Content Parts:**
|
||||
- Review each ContentPart's format, type, original file name, and usage hint
|
||||
- Content parts with "reference" format = documents/images that will be processed/extracted
|
||||
- Content parts with "extracted" format = pre-processed data ready to use
|
||||
- Content parts with "object" format = images/documents to be displayed or processed
|
||||
|
||||
**Mapping Content Parts to Files:**
|
||||
- If content parts contain data (e.g., expense receipts, customer lists), create data files (JSON/CSV) that will store/represent that data
|
||||
- If content parts are documents to be processed (e.g., PDFs), you may need code files that parse/process them
|
||||
- Use the original file names and usage hints to determine appropriate filenames and file types
|
||||
|
||||
**Populating File Structure Fields:**
|
||||
- **dependencies**: List file IDs that this file depends on (e.g., if a Python script reads a JSON config file, the script depends on the config file)
|
||||
- **imports**: For code files, list imports needed based on content parts (e.g., if processing PDFs: ["import PyPDF2"], if processing CSV: ["import csv"], if processing JSON: ["import json"])
|
||||
- **functions**: For CODE files only - list function signatures if the USER REQUEST specifies functionality (e.g., {{"name": "parseReceipt", "signature": "def parseReceipt(pdf_path: str) -> dict"}})
|
||||
- **classes**: For CODE files only - list class definitions if the USER REQUEST specifies OOP structure
|
||||
- **functions/classes for DATA files**: Leave as empty arrays [] - data files (JSON/CSV/XML) don't contain executable code
|
||||
|
||||
## FILE STRUCTURE REQUIREMENTS
|
||||
Create a JSON structure with:
|
||||
1. metadata: {{"language": "{language}", "projectType": "single_file|multi_file", "projectName": "..."}}
|
||||
- projectName: Derive from USER REQUEST or content parts (e.g., "expense-tracker", "customer-manager")
|
||||
|
||||
2. files: Array of file structures, each with:
|
||||
- id: Unique identifier (e.g., "file_1", "file_2")
|
||||
- filename: File name matching USER REQUEST requirements (e.g., "config.json", "customers.json", "expenses.csv")
|
||||
- fileType: File extension matching the requested format (e.g., "json", "py", "js", "csv", "xml")
|
||||
- dependencies: List of file IDs this file depends on (for multi-file projects where files reference each other)
|
||||
- imports: List of import statements that this file will need (e.g., ["import json", "import csv"] for Python files processing JSON/CSV)
|
||||
- functions: Array of function signatures {{"name": "...", "signature": "..."}} - ONLY if the file will contain executable code (not for pure data files like JSON/CSV)
|
||||
- classes: Array of class definitions {{"name": "...", "signature": "..."}} - ONLY if the file will contain executable code (not for pure data files like JSON/CSV)
|
||||
|
||||
IMPORTANT FOR DATA FILES (JSON, CSV, XML):
|
||||
- For pure data files (config.json, customers.json, expenses.csv), leave functions and classes as empty arrays []
|
||||
- These files contain structured data, not executable code
|
||||
- Use imports only if the file will be processed by code (e.g., a Python script that reads the CSV)
|
||||
|
||||
IMPORTANT FOR CODE FILES (Python, JavaScript, etc.):
|
||||
- Include functions/classes if the USER REQUEST specifies functionality
|
||||
- Use dependencies to indicate which data files this code file reads/processes
|
||||
- Use imports to specify what libraries/modules are needed
|
||||
|
||||
For single-file projects, return one file. For multi-file projects, include ALL requested files in the files array.
|
||||
|
||||
Return ONLY valid JSON matching the request above.
|
||||
"""
|
||||
|
||||
# Build continuation prompt builder
|
||||
async def buildCodeStructurePromptWithContinuation(
|
||||
continuationContext: Any,
|
||||
templateStructure: str,
|
||||
basePrompt: str
|
||||
) -> str:
|
||||
"""Build code structure prompt with continuation context. Uses unified signature.
|
||||
|
||||
Note: All initial context (userPrompt, contentParts, etc.) is already
|
||||
contained in basePrompt. This function only adds continuation-specific instructions.
|
||||
"""
|
||||
# Extract continuation context fields (only what's needed for continuation)
|
||||
incompletePart = continuationContext.incomplete_part
|
||||
lastRawJson = continuationContext.last_raw_json
|
||||
|
||||
# Generate both overlap context and hierarchy context using jsonContinuation
|
||||
overlapContext = ""
|
||||
unifiedContext = ""
|
||||
if lastRawJson:
|
||||
# Get contexts directly from jsonContinuation
|
||||
from modules.shared.jsonContinuation import getContexts
|
||||
contexts = getContexts(lastRawJson)
|
||||
overlapContext = contexts.overlapContext
|
||||
unifiedContext = contexts.hierarchyContextForPrompt
|
||||
elif incompletePart:
|
||||
unifiedContext = incompletePart
|
||||
else:
|
||||
unifiedContext = "Unable to extract context - response was completely broken"
|
||||
|
||||
# Build unified continuation prompt format
|
||||
continuationPrompt = f"""{basePrompt}
|
||||
|
||||
--- CONTINUATION REQUEST ---
|
||||
The previous JSON response was incomplete. Continue from where it stopped.
|
||||
|
||||
Context showing structure hierarchy with cut point:
|
||||
```
|
||||
{unifiedContext}
|
||||
```
|
||||
|
||||
Overlap Requirement:
|
||||
To ensure proper merging, your response MUST start EXACTLY with the overlap context shown below, then continue with new content.
|
||||
|
||||
Overlap context (start your response with this exact text):
|
||||
```json
|
||||
{overlapContext if overlapContext else "No overlap context available"}
|
||||
```
|
||||
|
||||
TASK:
|
||||
1. Start your response EXACTLY with the overlap context shown above (character by character)
|
||||
2. Continue seamlessly from where the overlap context ends
|
||||
3. Complete the remaining content following the JSON structure template above
|
||||
4. Return ONLY valid JSON following the structure template - no overlap/continuation wrapper objects
|
||||
|
||||
CRITICAL:
|
||||
- Your response MUST begin with the exact overlap context text (this enables automatic merging)
|
||||
- Continue seamlessly after the overlap context with new content
|
||||
- Your response must be valid JSON matching the structure template above"""
|
||||
return continuationPrompt
|
||||
|
||||
# Use generic looping system with code_structure use case
|
||||
options = AiCallOptions(
|
||||
operationType=OperationTypeEnum.DATA_GENERATE,
|
||||
resultFormat="json"
|
||||
)
|
||||
|
||||
structureJson = await self.services.ai.callAiWithLooping(
|
||||
prompt=structurePrompt,
|
||||
options=options,
|
||||
promptBuilder=buildCodeStructurePromptWithContinuation,
|
||||
promptArgs={
|
||||
"userPrompt": userPrompt,
|
||||
"contentParts": contentParts,
|
||||
"templateStructure": templateStructure,
|
||||
"basePrompt": structurePrompt
|
||||
},
|
||||
useCaseId="code_structure",
|
||||
debugPrefix="code_structure_generation",
|
||||
contentParts=contentParts
|
||||
)
|
||||
|
||||
# Extract JSON from markdown fences if present
|
||||
extractedJson = extractJsonString(structureJson)
|
||||
parsed = json.loads(extractedJson)
|
||||
return parsed
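# Illustrative only: a self-contained approximation (an assumption, not the real
# modules.shared.jsonUtils implementation) of what extractJsonString does before
# json.loads - strip a surrounding ```json ... ``` fence if the model returned one.
def _exampleExtractJsonString(raw: str) -> str:
    match = re.search(r"```(?:json)?\s*(.*?)\s*```", raw, re.DOTALL)
    return match.group(1) if match else raw.strip()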
|
||||
|
||||
async def _generateCodeContent(
|
||||
self,
|
||||
codeStructure: Dict[str, Any],
|
||||
parentOperationId: str,
|
||||
userPrompt: str = None,
|
||||
contentParts: Optional[List[ContentPart]] = None
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Generate code content for each file with dependency handling."""
|
||||
files = codeStructure.get("files", [])
|
||||
metadata = codeStructure.get("metadata", {})
|
||||
|
||||
if not files:
|
||||
raise ValueError("No files found in code structure")
|
||||
|
||||
# Step 1: Resolve dependency order
|
||||
orderedFiles = self._resolveDependencyOrder(files)
|
||||
|
||||
# Step 2: Generate dependency files first (requirements.txt, package.json, etc.)
|
||||
dependencyFiles = await self._generateDependencyFiles(metadata, orderedFiles)
|
||||
|
||||
# Step 3: Generate code files in dependency order (not fully parallel)
|
||||
codeFiles = []
|
||||
generatedFileContext = {} # Track what's been generated for cross-file references
|
||||
|
||||
for idx, fileStructure in enumerate(orderedFiles):
|
||||
# Update progress
|
||||
progress = 0.5 + (0.4 * (idx / len(orderedFiles)))
|
||||
self.services.chat.progressLogUpdate(
|
||||
parentOperationId,
|
||||
progress,
|
||||
f"Generating {fileStructure.get('filename', 'file')}"
|
||||
)
|
||||
|
||||
# Provide context about already-generated files for proper imports
|
||||
fileContext = self._buildFileContext(generatedFileContext, fileStructure)
|
||||
|
||||
# Generate this file with context
|
||||
fileContent = await self._generateSingleFileContent(
|
||||
fileStructure,
|
||||
fileContext=fileContext,
|
||||
allFilesStructure=orderedFiles,
|
||||
metadata=metadata,
|
||||
userPrompt=userPrompt,
|
||||
contentParts=contentParts
|
||||
)
|
||||
|
||||
codeFiles.append(fileContent)
|
||||
|
||||
# Update context with generated file info (for next files)
|
||||
generatedFileContext[fileStructure["id"]] = {
|
||||
"filename": fileContent.get("filename", fileStructure.get("filename")),
|
||||
"functions": fileContent.get("functions", []),
|
||||
"classes": fileContent.get("classes", []),
|
||||
"exports": fileContent.get("exports", [])
|
||||
}
|
||||
|
||||
# Combine dependency files and code files
|
||||
return dependencyFiles + codeFiles
|
||||
|
||||
def _resolveDependencyOrder(self, files: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||
"""Resolve file generation order based on dependencies using topological sort."""
|
||||
# Build dependency graph
|
||||
fileMap = {f["id"]: f for f in files}
|
||||
dependencies = {}
|
||||
|
||||
for file in files:
|
||||
fileId = file["id"]
|
||||
deps = file.get("dependencies", []) # List of file IDs this file depends on
|
||||
dependencies[fileId] = deps
|
||||
|
||||
# Topological sort
|
||||
ordered = []
|
||||
visited = set()
|
||||
tempMark = set()
|
||||
|
||||
def visit(fileId: str):
|
||||
if fileId in tempMark:
|
||||
# Circular dependency detected - break it
|
||||
logger.warning(f"Circular dependency detected involving {fileId}")
|
||||
return
|
||||
if fileId in visited:
|
||||
return
|
||||
|
||||
tempMark.add(fileId)
|
||||
for depId in dependencies.get(fileId, []):
|
||||
if depId in fileMap:
|
||||
visit(depId)
|
||||
tempMark.remove(fileId)
|
||||
visited.add(fileId)
|
||||
ordered.append(fileMap[fileId])
|
||||
|
||||
for file in files:
|
||||
if file["id"] not in visited:
|
||||
visit(file["id"])
|
||||
|
||||
return ordered
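# Illustrative sketch (not part of the original file): expected behaviour of the
# topological ordering above for a small, hypothetical input.
#
#   files = [
#       {"id": "main", "filename": "main.py", "dependencies": ["utils"]},
#       {"id": "utils", "filename": "utils.py", "dependencies": []},
#   ]
#   ordered = self._resolveDependencyOrder(files)
#   # -> utils.py first, then main.py: a file's dependencies are visited before
#   #    the file itself, so imported modules are generated before their importers.
#   # A circular dependency is not raised as an error; the cycle is broken at the
#   #    revisited node and only a warning is logged.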
|
||||
|
||||
async def _generateDependencyFiles(
|
||||
self,
|
||||
metadata: Dict[str, Any],
|
||||
files: List[Dict[str, Any]]
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Generate dependency files (requirements.txt, package.json, etc.)."""
|
||||
language = metadata.get("language", "").lower()
|
||||
dependencyFiles = []
|
||||
|
||||
# Generate requirements.txt for Python
|
||||
if language in ["python", "py"]:
|
||||
requirementsContent = await self._generateRequirementsTxt(files)
|
||||
if requirementsContent:
|
||||
dependencyFiles.append({
|
||||
"filename": "requirements.txt",
|
||||
"content": requirementsContent,
|
||||
"fileType": "txt",
|
||||
"id": "requirements_txt"
|
||||
})
|
||||
|
||||
# Generate package.json for JavaScript/TypeScript
|
||||
elif language in ["javascript", "typescript", "js", "ts"]:
|
||||
packageJson = await self._generatePackageJson(files, metadata)
|
||||
if packageJson:
|
||||
dependencyFiles.append({
|
||||
"filename": "package.json",
|
||||
"content": json.dumps(packageJson, indent=2),
|
||||
"fileType": "json",
|
||||
"id": "package_json"
|
||||
})
|
||||
|
||||
return dependencyFiles
|
||||
|
||||
async def _generateRequirementsTxt(
|
||||
self,
|
||||
files: List[Dict[str, Any]]
|
||||
) -> Optional[str]:
|
||||
"""Generate requirements.txt content from Python imports."""
|
||||
pythonPackages = set()
|
||||
|
||||
for file in files:
|
||||
imports = file.get("imports", [])
|
||||
if isinstance(imports, list):
|
||||
for imp in imports:
|
||||
if isinstance(imp, str):
|
||||
# Extract package name from import
|
||||
# Handle: "from flask import", "import flask", "from flask import Flask"
|
||||
imp = imp.strip()
|
||||
if "import" in imp:
|
||||
if "from" in imp:
|
||||
# "from package import ..."
|
||||
parts = imp.split("from")
|
||||
if len(parts) > 1:
|
||||
package = parts[1].split("import")[0].strip()
|
||||
if package and not package.startswith("."):
|
||||
pythonPackages.add(package.split(".")[0]) # Get root package
|
||||
else:
|
||||
# "import package" or "import package.module"
|
||||
parts = imp.split("import")
|
||||
if len(parts) > 1:
|
||||
package = parts[1].strip().split(".")[0].strip()
|
||||
if package and not package.startswith("."):
|
||||
pythonPackages.add(package)
|
||||
|
||||
if pythonPackages:
|
||||
return "\n".join(sorted(pythonPackages))
|
||||
return None
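# Illustrative sketch (hypothetical inputs): the string-splitting heuristic above
# maps raw import statements to root package names roughly as follows.
#
#   "from flask import Flask"         -> "flask"
#   "import pandas.core.frame"        -> "pandas"
#   "from .localmodule import thing"  -> skipped (relative import)
#
# The resulting requirements.txt is just the sorted root package names, one per
# line, without version pins.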
|
||||
|
||||
async def _generatePackageJson(
|
||||
self,
|
||||
files: List[Dict[str, Any]],
|
||||
metadata: Dict[str, Any]
|
||||
) -> Optional[Dict[str, Any]]:
|
||||
"""Generate package.json content from JavaScript/TypeScript imports."""
|
||||
npmPackages = {}
|
||||
|
||||
for file in files:
|
||||
imports = file.get("imports", [])
|
||||
if isinstance(imports, list):
|
||||
for imp in imports:
|
||||
if isinstance(imp, str):
|
||||
# Extract npm package from import
|
||||
# Handle: "import express from 'express'", "const express = require('express')"
|
||||
imp = imp.strip()
|
||||
if "from" in imp:
|
||||
# ES6 import: "import ... from 'package'"
|
||||
parts = imp.split("from")
|
||||
if len(parts) > 1:
|
||||
package = parts[1].strip().strip("'\"")
|
||||
if package and not package.startswith(".") and not package.startswith("/"):
|
||||
npmPackages[package] = "*"
|
||||
elif "require" in imp:
|
||||
# CommonJS: "require('package')"
|
||||
match = re.search(r"require\(['\"]([^'\"]+)['\"]\)", imp)
|
||||
if match:
|
||||
package = match.group(1)
|
||||
if not package.startswith(".") and not package.startswith("/"):
|
||||
npmPackages[package] = "*"
|
||||
|
||||
if npmPackages:
|
||||
return {
|
||||
"name": metadata.get("projectName", "generated-project"),
|
||||
"version": "1.0.0",
|
||||
"dependencies": npmPackages
|
||||
}
|
||||
return None
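# Illustrative sketch (hypothetical inputs): how the ES6/CommonJS parsing above
# feeds the generated package.json.
#
#   "import express from 'express'"    -> dependencies["express"] = "*"
#   "const axios = require('axios')"   -> dependencies["axios"] = "*"
#   "import helper from './helper'"    -> skipped (relative path)
#
# Every dependency is pinned to "*", so concrete versions are left to the package
# manager at install time.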
|
||||
|
||||
def _buildFileContext(
|
||||
self,
|
||||
generatedFileContext: Dict[str, Dict[str, Any]],
|
||||
currentFile: Dict[str, Any]
|
||||
) -> Dict[str, Any]:
|
||||
"""Build context about other files for proper imports/references."""
|
||||
context = {
|
||||
"availableFiles": [],
|
||||
"availableFunctions": {},
|
||||
"availableClasses": {}
|
||||
}
|
||||
|
||||
# Add info about already-generated files
|
||||
for fileId, fileInfo in generatedFileContext.items():
|
||||
context["availableFiles"].append({
|
||||
"id": fileId,
|
||||
"filename": fileInfo["filename"],
|
||||
"functions": fileInfo.get("functions", []),
|
||||
"classes": fileInfo.get("classes", []),
|
||||
"exports": fileInfo.get("exports", [])
|
||||
})
|
||||
|
||||
# Build function/class maps for easy lookup
|
||||
for func in fileInfo.get("functions", []):
|
||||
funcName = func.get("name", "")
|
||||
if funcName:
|
||||
context["availableFunctions"][funcName] = {
|
||||
"file": fileInfo["filename"],
|
||||
"signature": func.get("signature", "")
|
||||
}
|
||||
|
||||
for cls in fileInfo.get("classes", []):
|
||||
className = cls.get("name", "")
|
||||
if className:
|
||||
context["availableClasses"][className] = {
|
||||
"file": fileInfo["filename"]
|
||||
}
|
||||
|
||||
return context
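# Illustrative sketch (hypothetical values): shape of the context returned above
# after a single file "utils.py" has been generated.
#
#   {
#       "availableFiles": [{"id": "utils", "filename": "utils.py",
#                           "functions": [{"name": "loadData", "signature": "loadData(path)"}],
#                           "classes": [], "exports": []}],
#       "availableFunctions": {"loadData": {"file": "utils.py", "signature": "loadData(path)"}},
#       "availableClasses": {}
#   }
#
# Later files receive this context in their prompts so that cross-file imports can
# reference names that actually exist.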
|
||||
|
||||
async def _generateSingleFileContent(
|
||||
self,
|
||||
fileStructure: Dict[str, Any],
|
||||
fileContext: Dict[str, Any] = None,
|
||||
allFilesStructure: List[Dict[str, Any]] = None,
|
||||
metadata: Dict[str, Any] = None,
|
||||
userPrompt: str = None,
|
||||
contentParts: Optional[List[ContentPart]] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""Generate code content for a single file with context about other files."""
|
||||
|
||||
# Build prompt with context about other files for proper imports
|
||||
filename = fileStructure.get("filename", "generated.py")
|
||||
fileType = fileStructure.get("fileType", "py")
|
||||
dependencies = fileStructure.get("dependencies", [])
|
||||
functions = fileStructure.get("functions", [])
|
||||
classes = fileStructure.get("classes", [])
|
||||
|
||||
contextInfo = ""
|
||||
if fileContext and fileContext.get("availableFiles"):
|
||||
contextInfo = "\n\nAvailable files and their exports:\n"
|
||||
for fileInfo in fileContext["availableFiles"]:
|
||||
contextInfo += f"- {fileInfo['filename']}: "
|
||||
funcs = [f.get("name", "") for f in fileInfo.get("functions", [])]
|
||||
cls = [c.get("name", "") for c in fileInfo.get("classes", [])]
|
||||
exports = []
|
||||
if funcs:
|
||||
exports.extend(funcs)
|
||||
if cls:
|
||||
exports.extend(cls)
|
||||
if exports:
|
||||
contextInfo += ", ".join(exports)
|
||||
contextInfo += "\n"
|
||||
|
||||
# Build content parts section if available
|
||||
contentPartsSection = ""
|
||||
if contentParts:
|
||||
relevantParts = []
|
||||
for part in contentParts:
|
||||
# Include parts that might be relevant to this file
|
||||
usageHint = part.metadata.get('usageHint', '').lower()
|
||||
originalFileName = part.metadata.get('originalFileName', '').lower()
|
||||
filenameLower = filename.lower()
|
||||
|
||||
# Check if this content part is relevant to this file
|
||||
if (filenameLower in usageHint or
|
||||
filenameLower in originalFileName or
|
||||
part.metadata.get('contentFormat') == 'reference' or
|
||||
(part.data and len(str(part.data).strip()) > 0)):
|
||||
relevantParts.append(part)
|
||||
|
||||
if relevantParts:
|
||||
contentPartsSection = "\n## AVAILABLE CONTENT PARTS\n"
|
||||
for i, part in enumerate(relevantParts, 1):
|
||||
contentFormat = part.metadata.get("contentFormat", "unknown")
|
||||
originalFileName = part.metadata.get('originalFileName', 'N/A')
|
||||
contentPartsSection += f"\n{i}. ContentPart ID: {part.id}\n"
|
||||
contentPartsSection += f" Format: {contentFormat}\n"
|
||||
contentPartsSection += f" Type: {part.typeGroup}\n"
|
||||
contentPartsSection += f" Original file name: {originalFileName}\n"
|
||||
contentPartsSection += f" Usage hint: {part.metadata.get('usageHint', 'N/A')}\n"
|
||||
# Include actual content if it's small enough (for data files like CSV, JSON)
|
||||
if part.data and isinstance(part.data, str) and len(part.data) < 2000:
|
||||
contentPartsSection += f" Content preview: {part.data[:500]}...\n"
|
||||
|
||||
# Build user request section
|
||||
userRequestSection = ""
|
||||
if userPrompt:
|
||||
userRequestSection = f"""
|
||||
## ORIGINAL USER REQUEST
|
||||
```
|
||||
{userPrompt}
|
||||
```
|
||||
"""
|
||||
|
||||
# Create template structure explicitly (not extracted from prompt)
|
||||
templateStructure = f"""{{
|
||||
"files": [
|
||||
{{
|
||||
"filename": "{filename}",
|
||||
"content": "// Complete code here",
|
||||
"functions": {json.dumps(functions, indent=2) if functions else '[]'},
|
||||
"classes": {json.dumps(classes, indent=2) if classes else '[]'}
|
||||
}}
|
||||
]
|
||||
}}"""
|
||||
|
||||
# Build base prompt
|
||||
contentPrompt = f"""# TASK: Generate Code File Content
|
||||
|
||||
Generate complete, executable code for the file: {filename}
|
||||
{userRequestSection}## FILE SPECIFICATIONS
|
||||
|
||||
File Type: {fileType}
|
||||
Language: {metadata.get('language', 'python') if metadata else 'python'}
|
||||
{contentPartsSection}
|
||||
|
||||
Required functions:
|
||||
{json.dumps(functions, indent=2) if functions else 'None specified'}
|
||||
|
||||
Required classes:
|
||||
{json.dumps(classes, indent=2) if classes else 'None specified'}
|
||||
|
||||
Dependencies on other files: {', '.join(dependencies) if dependencies else 'None'}
|
||||
{contextInfo}
|
||||
|
||||
Generate complete, production-ready code with:
|
||||
1. Proper imports (including imports from other files in the project if dependencies exist)
|
||||
2. All required functions and classes
|
||||
3. Error handling
|
||||
4. Documentation/docstrings
|
||||
5. Type hints where appropriate
|
||||
|
||||
Return ONLY valid JSON in this format:
|
||||
{templateStructure}
|
||||
"""
|
||||
|
||||
# Build continuation prompt builder
|
||||
async def buildCodeContentPromptWithContinuation(
|
||||
continuationContext: Any,
|
||||
templateStructure: str,
|
||||
basePrompt: str
|
||||
) -> str:
|
||||
"""Build code content prompt with continuation context. Uses unified signature.
|
||||
|
||||
Note: All initial context (filename, fileType, functions, etc.) is already
|
||||
contained in basePrompt. This function only adds continuation-specific instructions.
|
||||
"""
|
||||
# Extract continuation context fields (only what's needed for continuation)
|
||||
incompletePart = continuationContext.incomplete_part
|
||||
lastRawJson = continuationContext.last_raw_json
|
||||
|
||||
# Generate both overlap context and hierarchy context using jsonContinuation
|
||||
overlapContext = ""
|
||||
unifiedContext = ""
|
||||
if lastRawJson:
|
||||
# Get contexts directly from jsonContinuation
|
||||
from modules.shared.jsonContinuation import getContexts
|
||||
contexts = getContexts(lastRawJson)
|
||||
overlapContext = contexts.overlapContext
|
||||
unifiedContext = contexts.hierarchyContextForPrompt
|
||||
elif incompletePart:
|
||||
unifiedContext = incompletePart
|
||||
else:
|
||||
unifiedContext = "Unable to extract context - response was completely broken"
|
||||
|
||||
# Build unified continuation prompt format
|
||||
continuationPrompt = f"""{basePrompt}
|
||||
|
||||
--- CONTINUATION REQUEST ---
|
||||
The previous JSON response was incomplete. Continue from where it stopped.
|
||||
|
||||
Context showing structure hierarchy with cut point:
|
||||
```
|
||||
{unifiedContext}
|
||||
```
|
||||
|
||||
Overlap Requirement:
|
||||
To ensure proper merging, your response MUST start EXACTLY with the overlap context shown below, then continue with new content.
|
||||
|
||||
Overlap context (start your response with this exact text):
|
||||
```json
|
||||
{overlapContext if overlapContext else "No overlap context available"}
|
||||
```
|
||||
|
||||
TASK:
|
||||
1. Start your response EXACTLY with the overlap context shown above (character by character)
|
||||
2. Continue seamlessly from where the overlap context ends
|
||||
3. Complete the remaining content following the JSON structure template above
|
||||
4. Return ONLY valid JSON following the structure template - no overlap/continuation wrapper objects
|
||||
|
||||
CRITICAL:
|
||||
- Your response MUST begin with the exact overlap context text (this enables automatic merging)
|
||||
- Continue seamlessly after the overlap context with new content
|
||||
- Your response must be valid JSON matching the structure template above"""
|
||||
return continuationPrompt
|
||||
|
||||
# Use generic looping system with code_content use case
|
||||
options = AiCallOptions(
|
||||
operationType=OperationTypeEnum.DATA_GENERATE,
|
||||
resultFormat="json"
|
||||
)
|
||||
|
||||
contentJson = await self.services.ai.callAiWithLooping(
|
||||
prompt=contentPrompt,
|
||||
options=options,
|
||||
promptBuilder=buildCodeContentPromptWithContinuation,
|
||||
promptArgs={
|
||||
"filename": filename,
|
||||
"fileType": fileType,
|
||||
"functions": functions,
|
||||
"classes": classes,
|
||||
"dependencies": dependencies,
|
||||
"metadata": metadata,
|
||||
"userPrompt": userPrompt,
|
||||
"contentParts": contentParts,
|
||||
"contextInfo": contextInfo,
|
||||
"templateStructure": templateStructure,
|
||||
"basePrompt": contentPrompt
|
||||
},
|
||||
useCaseId="code_content",
|
||||
debugPrefix=f"code_content_{fileStructure.get('id', 'file')}",
|
||||
)
|
||||
|
||||
# Extract JSON from markdown fences if present
|
||||
extractedJson = extractJsonString(contentJson)
|
||||
parsed = json.loads(extractedJson)
|
||||
|
||||
# Extract file content and metadata
|
||||
files = parsed.get("files", [])
|
||||
if files and len(files) > 0:
|
||||
fileData = files[0]
|
||||
return {
|
||||
"filename": fileData.get("filename", filename),
|
||||
"content": fileData.get("content", ""),
|
||||
"fileType": fileType,
|
||||
"functions": fileData.get("functions", functions),
|
||||
"classes": fileData.get("classes", classes),
|
||||
"id": fileStructure.get("id")
|
||||
}
|
||||
|
||||
# Fallback if structure is different
|
||||
return {
|
||||
"filename": filename,
|
||||
"content": parsed.get("content", ""),
|
||||
"fileType": fileType,
|
||||
"functions": functions,
|
||||
"classes": classes,
|
||||
"id": fileStructure.get("id")
|
||||
}
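# Minimal sketch of the overlap-based merge that the continuation prompt above
# relies on. The real merge lives in modules.shared.jsonContinuation and is not
# shown in this file; the helper below is a hypothetical illustration only.
#
# def mergeWithOverlap(previousRaw: str, continuation: str, overlap: str) -> str:
#     """Stitch a continuation onto a truncated response via the shared overlap text."""
#     if overlap and continuation.startswith(overlap):
#         # The model repeated the overlap as instructed: drop the duplicate, then append.
#         return previousRaw + continuation[len(overlap):]
#     # Fallback: append as-is and let downstream JSON repair handle the seam.
#     return previousRaw + continuation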
|
||||
|
||||
async def _formatAndValidateCode(self, codeFiles: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||
"""Format and validate generated code files."""
|
||||
# For now, just return files as-is
|
||||
# TODO: Add code formatting (black, prettier, etc.) and validation
|
||||
formatted = []
|
||||
for file in codeFiles:
|
||||
content = file.get("content", "")
|
||||
# Basic cleanup: remove markdown code fences if present
|
||||
if isinstance(content, str):
|
||||
content = re.sub(r'^```[\w]*\n', '', content, flags=re.MULTILINE)
|
||||
content = re.sub(r'\n```$', '', content, flags=re.MULTILINE)
|
||||
file["content"] = content.strip()
|
||||
formatted.append(file)
|
||||
return formatted
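# Illustrative sketch (hypothetical input): effect of the fence cleanup above.
#
#   input content:   "```python\nprint('hi')\n```"
#   output content:  "print('hi')"
#
# Only leading and trailing markdown fences are stripped; the code itself is left
# unformatted (formatting via black/prettier is still a TODO).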
|
||||
|
||||
def _getMimeType(self, fileType: str) -> str:
|
||||
"""Get MIME type for file type."""
|
||||
mimeTypes = {
|
||||
"py": "text/x-python",
|
||||
"js": "application/javascript",
|
||||
"ts": "application/typescript",
|
||||
"html": "text/html",
|
||||
"css": "text/css",
|
||||
"json": "application/json",
|
||||
"txt": "text/plain",
|
||||
"md": "text/markdown",
|
||||
"java": "text/x-java-source",
|
||||
"cpp": "text/x-c++src",
|
||||
"c": "text/x-csrc",
|
||||
"csv": "text/csv",
|
||||
"xml": "application/xml"
|
||||
}
|
||||
return mimeTypes.get(fileType.lower(), "text/plain")
|
||||
|
||||
def _getCodeRenderer(self, fileType: str):
|
||||
"""Get code renderer for file type."""
|
||||
from modules.services.serviceGeneration.renderers.registry import getRenderer
|
||||
|
||||
# Map file types to renderer formats (code path)
|
||||
formatMap = {
|
||||
'json': 'json',
|
||||
'csv': 'csv',
|
||||
'xml': 'xml'
|
||||
}
|
||||
|
||||
rendererFormat = formatMap.get(fileType.lower())
|
||||
if rendererFormat:
|
||||
renderer = getRenderer(rendererFormat, self.services, outputStyle='code')
|
||||
# Check if renderer supports code rendering
|
||||
if renderer and hasattr(renderer, 'renderCodeFiles'):
|
||||
return renderer
|
||||
|
||||
return None
|
||||
|
|
@@ -1,214 +0,0 @@
|
|||
# Copyright (c) 2025 Patrick Motsch
|
||||
# All rights reserved.
|
||||
"""
|
||||
Document Generation Path
|
||||
|
||||
Handles document generation using existing chapter/section model.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
import copy
|
||||
from typing import Dict, Any, List, Optional
|
||||
from modules.datamodels.datamodelWorkflow import AiResponse, AiResponseMetadata, DocumentData
|
||||
from modules.datamodels.datamodelExtraction import ContentPart, DocumentIntent
|
||||
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum
|
||||
from modules.datamodels.datamodelDocument import RenderedDocument
|
||||
from modules.workflows.processing.shared.stateTools import checkWorkflowStopped
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DocumentGenerationPath:
|
||||
"""Document generation path (existing functionality, refactored)."""
|
||||
|
||||
def __init__(self, services):
|
||||
self.services = services
|
||||
|
||||
async def generateDocument(
|
||||
self,
|
||||
userPrompt: str,
|
||||
documentList: Optional[Any] = None, # DocumentReferenceList
|
||||
documentIntents: Optional[List[DocumentIntent]] = None,
|
||||
contentParts: Optional[List[ContentPart]] = None,
|
||||
outputFormat: str = "txt",
|
||||
title: Optional[str] = None,
|
||||
parentOperationId: Optional[str] = None
|
||||
) -> AiResponse:
|
||||
"""
|
||||
Generate document using existing chapter/section model.
|
||||
|
||||
Returns: AiResponse with documents list
|
||||
"""
|
||||
# Create operation ID
|
||||
workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
|
||||
docOperationId = f"doc_gen_{workflowId}_{int(time.time())}"
|
||||
|
||||
# Start progress tracking
|
||||
self.services.chat.progressLogStart(
|
||||
docOperationId,
|
||||
"Document Generation",
|
||||
"Document Generation",
|
||||
f"Format: {outputFormat}",
|
||||
parentOperationId=parentOperationId
|
||||
)
|
||||
|
||||
try:
|
||||
# Step 5A: Clarify document intents
|
||||
documents = []
|
||||
if documentList:
|
||||
documents = self.services.chat.getChatDocumentsFromDocumentList(documentList)
|
||||
|
||||
# Filter: remove original documents when pre-extracted JSONs already exist
|
||||
# (to avoid duplicates - pre-extracted JSONs already contain the ContentParts)
|
||||
# Step 1: Identify all original document IDs that are covered by pre-extracted JSONs
|
||||
originalDocIdsCoveredByPreExtracted = set()
|
||||
for doc in documents:
|
||||
preExtracted = self.services.ai.intentAnalyzer.resolvePreExtractedDocument(doc)
|
||||
if preExtracted:
|
||||
originalDocId = preExtracted["originalDocument"]["id"]
|
||||
originalDocIdsCoveredByPreExtracted.add(originalDocId)
|
||||
logger.debug(f"Found pre-extracted JSON {doc.id} covering original document {originalDocId}")
|
||||
|
||||
# Step 2: Filter documents - remove original documents that are already covered by pre-extracted JSONs
|
||||
filteredDocuments = []
|
||||
for doc in documents:
|
||||
preExtracted = self.services.ai.intentAnalyzer.resolvePreExtractedDocument(doc)
|
||||
if preExtracted:
|
||||
# Keep the pre-extracted JSON
|
||||
filteredDocuments.append(doc)
|
||||
elif doc.id in originalDocIdsCoveredByPreExtracted:
|
||||
# Original document already covered by a pre-extracted JSON - remove it
|
||||
logger.info(f"Skipping original document {doc.id} ({doc.fileName}) - already covered by pre-extracted JSON")
|
||||
else:
|
||||
# Regular document without a pre-extracted JSON - keep it
|
||||
filteredDocuments.append(doc)
|
||||
|
||||
documents = filteredDocuments
|
||||
|
||||
checkWorkflowStopped(self.services)
|
||||
|
||||
if not documentIntents and documents:
|
||||
documentIntents = await self.services.ai.clarifyDocumentIntents(
|
||||
documents,
|
||||
userPrompt,
|
||||
{"outputFormat": outputFormat},
|
||||
docOperationId
|
||||
)
|
||||
|
||||
checkWorkflowStopped(self.services)
|
||||
|
||||
# Step 5B: Extract and prepare content
|
||||
if documents:
|
||||
preparedContentParts = await self.services.ai.extractAndPrepareContent(
|
||||
documents,
|
||||
documentIntents or [],
|
||||
docOperationId
|
||||
)
|
||||
|
||||
# Merge with provided contentParts (if any)
|
||||
if contentParts:
|
||||
# Check for pre-extracted content
|
||||
for part in contentParts:
|
||||
if part.metadata.get("skipExtraction", False):
|
||||
# Already extracted - use as-is and make sure the metadata is complete
|
||||
part.metadata.setdefault("contentFormat", "extracted")
|
||||
part.metadata.setdefault("isPreExtracted", True)
|
||||
preparedContentParts.extend(contentParts)
|
||||
|
||||
contentParts = preparedContentParts
|
||||
|
||||
# Schritt 5B.5: Documents are converted to contentParts (like pre-processed JSON files)
|
||||
# No AI extraction here - AI extraction happens during section generation
|
||||
if contentParts:
|
||||
logger.info(f"Using {len(contentParts)} content parts for generation (no AI extraction at this stage)")
|
||||
|
||||
checkWorkflowStopped(self.services)
|
||||
|
||||
# Step 5C: Generate structure
|
||||
structure = await self.services.ai.generateStructure(
|
||||
userPrompt,
|
||||
contentParts or [],
|
||||
outputFormat,
|
||||
docOperationId
|
||||
)
|
||||
|
||||
checkWorkflowStopped(self.services)
|
||||
|
||||
# Step 5D: Fill structure
|
||||
# Language will be extracted from services (user intention analysis) in fillStructure
|
||||
filledStructure = await self.services.ai.fillStructure(
|
||||
structure,
|
||||
contentParts or [],
|
||||
userPrompt,
|
||||
docOperationId
|
||||
)
|
||||
|
||||
checkWorkflowStopped(self.services)
|
||||
|
||||
# Step 5E: Render result
|
||||
# Each document is rendered individually and may return 1..n files (e.g. HTML + images)
|
||||
# Language is already validated in structure (State 3) and preserved in filled structure (State 4)
|
||||
# Per-document language will be extracted in renderReport() from filledStructure
|
||||
# Use validated currentUserLanguage as global fallback (always valid infrastructure)
|
||||
language = self.services.currentUserLanguage if hasattr(self.services, 'currentUserLanguage') and self.services.currentUserLanguage else "en"
|
||||
|
||||
# IMPORTANT: Create deep copy BEFORE renderResult to preserve filledStructure with elements
|
||||
# renderResult might modify the structure, so we need to preserve the original for sourceJson
|
||||
# This ensures sourceJson contains the complete structure with elements for validation
|
||||
filledStructureForSourceJson = copy.deepcopy(filledStructure) if filledStructure else None
|
||||
|
||||
renderedDocuments = await self.services.ai.renderResult(
|
||||
filledStructure,
|
||||
outputFormat,
|
||||
language, # Global fallback (per-document language extracted from structure in renderReport)
|
||||
title or "Generated Document",
|
||||
userPrompt,
|
||||
docOperationId
|
||||
)
|
||||
|
||||
# Build response: convert all rendered documents to DocumentData
|
||||
documentDataList = []
|
||||
for renderedDoc in renderedDocuments:
|
||||
try:
|
||||
# Create DocumentData for each rendered document
|
||||
# Use the preserved filledStructureForSourceJson (with elements) for sourceJson
|
||||
docDataObj = DocumentData(
|
||||
documentName=renderedDoc.filename,
|
||||
documentData=renderedDoc.documentData,
|
||||
mimeType=renderedDoc.mimeType,
|
||||
sourceJson=filledStructureForSourceJson if len(documentDataList) == 0 else None # Only for the first document
|
||||
)
|
||||
documentDataList.append(docDataObj)
|
||||
logger.debug(f"Added rendered document: {renderedDoc.filename} ({len(renderedDoc.documentData)} bytes, {renderedDoc.mimeType})")
|
||||
except Exception as e:
|
||||
logger.warning(f"Error creating document {renderedDoc.filename}: {str(e)}")
|
||||
|
||||
if not documentDataList:
|
||||
raise ValueError("No documents were rendered")
|
||||
|
||||
metadata = AiResponseMetadata(
|
||||
title=title or filledStructure.get("metadata", {}).get("title", "Generated Document"),
|
||||
operationType=OperationTypeEnum.DATA_GENERATE.value
|
||||
)
|
||||
|
||||
# Debug log (harmonized)
|
||||
self.services.utils.writeDebugFile(
|
||||
json.dumps(filledStructure, indent=2, ensure_ascii=False, default=str),
|
||||
"document_generation_response"
|
||||
)
|
||||
|
||||
self.services.chat.progressLogFinish(docOperationId, True)
|
||||
|
||||
return AiResponse(
|
||||
content=json.dumps(filledStructure),
|
||||
metadata=metadata,
|
||||
documents=documentDataList
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in document generation: {str(e)}")
|
||||
self.services.chat.progressLogFinish(docOperationId, False)
|
||||
raise
|
||||
|
||||
|
|
@@ -1,128 +0,0 @@
|
|||
# Copyright (c) 2025 Patrick Motsch
|
||||
# All rights reserved.
|
||||
"""
|
||||
Image Generation Path
|
||||
|
||||
Handles image generation with support for single and batch generation.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import time
|
||||
from typing import List, Optional
|
||||
from modules.datamodels.datamodelWorkflow import AiResponse, AiResponseMetadata, DocumentData
|
||||
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum, AiCallRequest
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ImageGenerationPath:
|
||||
"""Image generation path."""
|
||||
|
||||
def __init__(self, services):
|
||||
self.services = services
|
||||
|
||||
async def generateImages(
|
||||
self,
|
||||
userPrompt: str,
|
||||
count: int = 1,
|
||||
style: Optional[str] = None,
|
||||
format: str = "png",
|
||||
title: Optional[str] = None,
|
||||
parentOperationId: Optional[str] = None
|
||||
) -> AiResponse:
|
||||
"""
|
||||
Generate image files.
|
||||
|
||||
Returns: AiResponse with image files as documents
|
||||
"""
|
||||
# Create operation ID
|
||||
workflowId = self.services.workflow.id if self.services.workflow else f"no-workflow-{int(time.time())}"
|
||||
imageOperationId = f"image_gen_{workflowId}_{int(time.time())}"
|
||||
|
||||
# Start progress tracking
|
||||
self.services.chat.progressLogStart(
|
||||
imageOperationId,
|
||||
"Image Generation",
|
||||
"Image Generation",
|
||||
f"Format: {format}",
|
||||
parentOperationId=parentOperationId
|
||||
)
|
||||
|
||||
try:
|
||||
self.services.chat.progressLogUpdate(imageOperationId, 0.4, "Calling AI for image generation")
|
||||
|
||||
# Build prompt with style if provided
|
||||
imagePrompt = userPrompt
|
||||
if style:
|
||||
imagePrompt = f"{userPrompt}\n\nStyle: {style}"
|
||||
|
||||
# Use IMAGE_GENERATE operation
|
||||
options = AiCallOptions(
|
||||
operationType=OperationTypeEnum.IMAGE_GENERATE,
|
||||
resultFormat=format
|
||||
)
|
||||
|
||||
request = AiCallRequest(
|
||||
prompt=imagePrompt,
|
||||
context="",
|
||||
options=options
|
||||
)
|
||||
|
||||
response = await self.services.ai.callAi(request)
|
||||
|
||||
if not response.content:
|
||||
errorMsg = f"No image data returned: {response.content}"
|
||||
logger.error(f"Error in AI image generation: {errorMsg}")
|
||||
self.services.chat.progressLogFinish(imageOperationId, False)
|
||||
raise ValueError(errorMsg)
|
||||
|
||||
# Handle response content (could be base64 string or bytes)
|
||||
imageData = response.content
|
||||
if isinstance(imageData, str):
|
||||
# Assume base64 encoded string
|
||||
import base64
|
||||
try:
|
||||
imageData = base64.b64decode(imageData)
|
||||
except Exception:
|
||||
# If not base64, try encoding as bytes
|
||||
imageData = imageData.encode('utf-8')
|
||||
elif not isinstance(imageData, bytes):
|
||||
imageData = bytes(imageData)
|
||||
|
||||
# Create document
|
||||
imageDoc = DocumentData(
|
||||
documentName=f"generated_image.{format}",
|
||||
documentData=imageData,
|
||||
mimeType=f"image/{format}"
|
||||
)
|
||||
|
||||
metadata = AiResponseMetadata(
|
||||
title=title or "Generated Image",
|
||||
operationType=OperationTypeEnum.IMAGE_GENERATE.value
|
||||
)
|
||||
|
||||
# Note: Stats are now stored centrally in callAi() - no need to duplicate here
|
||||
|
||||
self.services.chat.progressLogUpdate(imageOperationId, 0.9, "Image generated")
|
||||
self.services.chat.progressLogFinish(imageOperationId, True)
|
||||
|
||||
# Create content string describing the image generation
|
||||
import json
|
||||
contentJson = json.dumps({
|
||||
"type": "image",
|
||||
"format": format,
|
||||
"prompt": userPrompt,
|
||||
"filename": imageDoc.documentName
|
||||
}, ensure_ascii=False)
|
||||
|
||||
return AiResponse(
|
||||
content=contentJson, # JSON string describing the image generation
|
||||
metadata=metadata,
|
||||
documents=[imageDoc]
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in image generation: {str(e)}")
|
||||
self.services.chat.progressLogFinish(imageOperationId, False)
|
||||
raise
|
||||
|
||||
|
|
@@ -1,45 +0,0 @@
|
|||
# Copyright (c) 2025 Patrick Motsch
|
||||
# All rights reserved.
|
||||
"""
|
||||
Base renderer class for code format renderers.
|
||||
"""
|
||||
|
||||
from abc import abstractmethod
|
||||
from .documentRendererBaseTemplate import BaseRenderer
|
||||
from modules.datamodels.datamodelDocument import RenderedDocument
|
||||
from typing import Dict, Any, List, Optional
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class BaseCodeRenderer(BaseRenderer):
|
||||
"""Base class for code format renderers."""
|
||||
|
||||
@abstractmethod
|
||||
async def renderCodeFiles(
|
||||
self,
|
||||
codeFiles: List[Dict[str, Any]],
|
||||
metadata: Dict[str, Any],
|
||||
userPrompt: str = None
|
||||
) -> List[RenderedDocument]:
|
||||
"""
|
||||
Render code files to format-specific output.
|
||||
|
||||
Args:
|
||||
codeFiles: List of file dictionaries with:
|
||||
- filename: str
|
||||
- fileType: str (json, csv, xml, etc.)
|
||||
- content: str (generated code)
|
||||
- id: str (optional)
|
||||
metadata: Project metadata (language, projectType, etc.)
|
||||
userPrompt: Original user prompt
|
||||
|
||||
Returns:
|
||||
List of RenderedDocument objects (can be 1..n files)
|
||||
"""
|
||||
pass
|
||||
|
||||
def _validateCodeFile(self, codeFile: Dict[str, Any]) -> bool:
|
||||
"""Validate code file structure."""
|
||||
required = ['filename', 'fileType', 'content']
|
||||
return all(key in codeFile for key in required)
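# Minimal sketch of a concrete subclass (illustrative only; the class name is
# hypothetical and the RenderedDocument constructor arguments are assumed from how
# its filename/documentData/mimeType fields are consumed elsewhere in this diff):
#
# class PlainTextCodeRenderer(BaseCodeRenderer):
#     """Wraps each generated code file in its own RenderedDocument."""
#
#     async def renderCodeFiles(self, codeFiles, metadata, userPrompt=None):
#         rendered = []
#         for codeFile in codeFiles:
#             if not self._validateCodeFile(codeFile):
#                 continue  # skip entries missing filename/fileType/content
#             rendered.append(RenderedDocument(
#                 filename=codeFile["filename"],
#                 documentData=codeFile["content"].encode("utf-8"),
#                 mimeType="text/plain",
#             ))
#         return rendered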
|
||||
|
|
@@ -1,484 +0,0 @@
|
|||
# Copyright (c) 2025 Patrick Motsch
|
||||
# All rights reserved.
|
||||
"""
|
||||
Base renderer class for all format renderers.
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict, Any, List, Tuple, Optional
|
||||
from modules.datamodels.datamodelJson import supportedSectionTypes
|
||||
from modules.datamodels.datamodelDocument import RenderedDocument
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from datetime import datetime, UTC
|
||||
import base64
|
||||
import io
|
||||
from PIL import Image
|
||||
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class BaseRenderer(ABC):
|
||||
"""Base class for all format renderers."""
|
||||
|
||||
def __init__(self, services=None):
|
||||
self.logger = logger
|
||||
self.services = services # Add services attribute
|
||||
|
||||
@classmethod
|
||||
def getSupportedFormats(cls) -> List[str]:
|
||||
"""
|
||||
Return list of supported format names for this renderer.
|
||||
Override this method in subclasses to specify supported formats.
|
||||
"""
|
||||
return []
|
||||
|
||||
@classmethod
|
||||
def getFormatAliases(cls) -> List[str]:
|
||||
"""
|
||||
Return list of format aliases for this renderer.
|
||||
Override this method in subclasses to specify format aliases.
|
||||
"""
|
||||
return []
|
||||
|
||||
@classmethod
|
||||
def getPriority(cls) -> int:
|
||||
"""
|
||||
Return priority for this renderer (higher number = higher priority).
|
||||
Used when multiple renderers support the same format.
|
||||
"""
|
||||
return 0
|
||||
|
||||
@classmethod
|
||||
def getOutputStyle(cls, formatName: Optional[str] = None) -> str:
|
||||
"""
|
||||
Return the output style classification for this renderer.
|
||||
Returns: 'code', 'document', 'image', or other (e.g., 'video' for future use)
|
||||
Override this method in subclasses to specify the output style.
|
||||
|
||||
Args:
|
||||
formatName: Optional format name (e.g., 'txt', 'js', 'csv') - useful for renderers
|
||||
that handle multiple formats with different styles (e.g., RendererText)
|
||||
"""
|
||||
return 'document' # Default to document style
|
||||
|
||||
@classmethod
|
||||
def getAcceptedSectionTypes(cls, formatName: Optional[str] = None) -> List[str]:
|
||||
"""
|
||||
Return list of section content types that this renderer accepts.
|
||||
This allows renderers to declare which section types they can process.
|
||||
|
||||
Default implementation returns all supported section types.
|
||||
Override this method in subclasses to restrict accepted types.
|
||||
|
||||
Args:
|
||||
formatName: Optional format name (e.g., 'txt', 'js', 'csv') - useful for renderers
|
||||
that handle multiple formats with different accepted types (e.g., RendererText)
|
||||
|
||||
Returns:
|
||||
List of accepted section content types (e.g., ["table", "paragraph", "heading"])
|
||||
Valid types: "table", "bullet_list", "heading", "paragraph", "code_block", "image"
|
||||
"""
|
||||
# Default: accept all section types
|
||||
return list(supportedSectionTypes)
|
||||
|
||||
@abstractmethod
|
||||
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
|
||||
"""
|
||||
Render extracted JSON content to multiple documents.
|
||||
Each renderer must implement this method.
|
||||
Can return 1..n documents (e.g., HTML + images).
|
||||
|
||||
Args:
|
||||
extractedContent: Structured JSON content with sections and metadata (contains single document)
|
||||
title: Report title
|
||||
userPrompt: Original user prompt for context
|
||||
aiService: AI service instance for additional processing
|
||||
|
||||
Returns:
|
||||
List of RenderedDocument objects.
|
||||
First document is the main document, additional documents are supporting files (e.g., images).
|
||||
Even if only one document is returned, it must be wrapped in a list.
|
||||
"""
|
||||
pass
|
||||
|
||||
def _determineFilename(self, title: str, mimeType: str) -> str:
|
||||
"""Determine filename from title and mimeType."""
|
||||
import re
|
||||
# Get extension from mimeType
|
||||
extensionMap = {
|
||||
"text/html": "html",
|
||||
"application/pdf": "pdf",
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
|
||||
"text/plain": "txt",
|
||||
"text/markdown": "md",
|
||||
"application/json": "json",
|
||||
"text/csv": "csv"
|
||||
}
|
||||
extension = extensionMap.get(mimeType, "txt")
|
||||
|
||||
# Sanitize title for filename
|
||||
sanitized = re.sub(r"[^a-zA-Z0-9._-]", "_", title)
|
||||
sanitized = re.sub(r"_+", "_", sanitized).strip("_")
|
||||
if not sanitized:
|
||||
sanitized = "document"
|
||||
|
||||
return f"{sanitized}.{extension}"
|
||||
|
||||
def _extractSections(self, reportData: Dict[str, Any]) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Extract sections from standardized schema: {metadata: {...}, documents: [{sections: [...]}]}
|
||||
Phase 5: Supports multiple documents - extracts all sections from all documents.
|
||||
"""
|
||||
if "documents" not in reportData:
|
||||
raise ValueError("Report data must follow standardized schema with 'documents' array")
|
||||
|
||||
documents = reportData.get("documents", [])
|
||||
if not isinstance(documents, list) or len(documents) == 0:
|
||||
raise ValueError("Standardized schema must contain at least one document in 'documents' array")
|
||||
|
||||
# Phase 5: Extract sections from ALL documents
|
||||
all_sections = []
|
||||
for doc in documents:
|
||||
if isinstance(doc, dict) and "sections" in doc:
|
||||
sections = doc.get("sections", [])
|
||||
if isinstance(sections, list):
|
||||
all_sections.extend(sections)
|
||||
|
||||
if not all_sections:
|
||||
raise ValueError("No sections found in any document")
|
||||
|
||||
return all_sections
|
||||
|
||||
def _extractMetadata(self, reportData: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Extract metadata from standardized schema: {metadata: {...}, documents: [{sections: [...]}]}
|
||||
"""
|
||||
if "metadata" not in reportData:
|
||||
raise ValueError("Report data must follow standardized schema with 'metadata' field")
|
||||
|
||||
metadata = reportData.get("metadata", {})
|
||||
if not isinstance(metadata, dict):
|
||||
raise ValueError("Metadata in standardized schema must be a dictionary")
|
||||
|
||||
return metadata
|
||||
|
||||
def _getTitle(self, reportData: Dict[str, Any], fallbackTitle: str) -> str:
|
||||
"""Get title from report data or use fallback."""
|
||||
metadata = reportData.get('metadata', {})
|
||||
return metadata.get('title', fallbackTitle)
|
||||
|
||||
def _validateJsonStructure(self, jsonContent: Dict[str, Any]) -> bool:
|
||||
"""
|
||||
Validate that JSON content follows standardized schema: {metadata: {...}, documents: [{sections: [...]}]}
|
||||
"""
|
||||
if not isinstance(jsonContent, dict):
|
||||
return False
|
||||
|
||||
# Validate metadata field exists
|
||||
if "metadata" not in jsonContent:
|
||||
return False
|
||||
|
||||
if not isinstance(jsonContent.get("metadata"), dict):
|
||||
return False
|
||||
|
||||
# Validate documents array exists and is not empty
|
||||
if "documents" not in jsonContent:
|
||||
return False
|
||||
|
||||
documents = jsonContent.get("documents", [])
|
||||
if not isinstance(documents, list) or len(documents) == 0:
|
||||
return False
|
||||
|
||||
# Validate first document has sections
|
||||
firstDoc = documents[0]
|
||||
if not isinstance(firstDoc, dict) or "sections" not in firstDoc:
|
||||
return False
|
||||
|
||||
sections = firstDoc.get("sections", [])
|
||||
if not isinstance(sections, list):
|
||||
return False
|
||||
|
||||
# Validate each section has content_type and elements
|
||||
for section in sections:
|
||||
if not isinstance(section, dict):
|
||||
return False
|
||||
if "content_type" not in section or "elements" not in section:
|
||||
return False
|
||||
|
||||
return True
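# Illustrative sketch (hypothetical values): the smallest payload that passes the
# validation above.
#
#   {
#       "metadata": {"title": "Example report"},
#       "documents": [
#           {"sections": [
#               {"content_type": "paragraph", "elements": [{"text": "Hello"}]}
#           ]}
#       ]
#   }
#
# Only the presence of content_type and elements is checked per section; the inner
# element shape shown here is an assumption and is not validated by this method.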
|
||||
|
||||
def _getSectionType(self, section: Dict[str, Any]) -> str:
|
||||
"""Get the type of a section; default to 'paragraph' for non-dict inputs."""
|
||||
if isinstance(section, dict):
|
||||
return section.get("content_type", "paragraph")
|
||||
# If section is a list or any other type, treat as paragraph elements
|
||||
return "paragraph"
|
||||
|
||||
def _getSectionData(self, section: Dict[str, Any]) -> List[Dict[str, Any]]:
|
||||
"""Get the elements of a section; if a list is provided directly, return it."""
|
||||
if isinstance(section, dict):
|
||||
return section.get("elements", [])
|
||||
if isinstance(section, list):
|
||||
return section
|
||||
return []
|
||||
|
||||
def _getSectionId(self, section: Dict[str, Any]) -> str:
|
||||
"""Get the ID of a section (if available)."""
|
||||
if isinstance(section, dict):
|
||||
return section.get("id", "unknown")
|
||||
return "unknown"
|
||||
|
||||
def _validateImageData(self, base64Data: str, altText: str) -> bool:
|
||||
"""Validate image data."""
|
||||
if not base64Data:
|
||||
self.logger.warning("Image section has no base64 data")
|
||||
return False
|
||||
|
||||
if not altText:
|
||||
self.logger.warning("Image section has no alt text")
|
||||
return False
|
||||
|
||||
# Basic base64 validation
|
||||
try:
|
||||
base64.b64decode(base64Data, validate=True)
|
||||
return True
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Invalid base64 image data: {str(e)}")
|
||||
return False
|
||||
|
||||
def _getImageDimensions(self, base64Data: str) -> Tuple[int, int]:
|
||||
"""
|
||||
Get image dimensions from base64 data.
|
||||
This is a helper method that format-specific renderers can use.
|
||||
"""
|
||||
try:
|
||||
# Decode base64 data
|
||||
imageData = base64.b64decode(base64Data)
|
||||
image = Image.open(io.BytesIO(imageData))
|
||||
|
||||
return image.size # Returns (width, height)
|
||||
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Could not determine image dimensions: {str(e)}")
|
||||
return (0, 0)
|
||||
|
||||
def _resizeImageIfNeeded(self, base64Data: str, maxWidth: int = 800, maxHeight: int = 600) -> str:
|
||||
"""
|
||||
Resize image if it exceeds maximum dimensions.
|
||||
Returns the resized image as base64 string.
|
||||
"""
|
||||
try:
|
||||
# Decode base64 data
|
||||
imageData = base64.b64decode(base64Data)
|
||||
image = Image.open(io.BytesIO(imageData))
|
||||
|
||||
# Check if resizing is needed
|
||||
width, height = image.size
|
||||
if width <= maxWidth and height <= maxHeight:
|
||||
return base64Data # No resizing needed
|
||||
|
||||
# Calculate new dimensions maintaining aspect ratio
|
||||
ratio = min(maxWidth / width, maxHeight / height)
|
||||
newWidth = int(width * ratio)
|
||||
newHeight = int(height * ratio)
|
||||
|
||||
# Resize image
|
||||
resizedImage = image.resize((newWidth, newHeight), Image.Resampling.LANCZOS)
|
||||
|
||||
# Convert back to base64
|
||||
buffer = io.BytesIO()
|
||||
resizedImage.save(buffer, format=image.format or 'PNG')
|
||||
resizedData = buffer.getvalue()
|
||||
|
||||
return base64.b64encode(resizedData).decode('utf-8')
|
||||
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Could not resize image: {str(e)}")
|
||||
return base64Data # Return original if resize fails
|
||||
|
||||
def _getSupportedSectionTypes(self) -> List[str]:
|
||||
"""Return list of supported section types (from unified schema)."""
|
||||
return supportedSectionTypes
|
||||
|
||||
def _isValidSectionType(self, sectionType: str) -> bool:
|
||||
"""Check if a section type is valid."""
|
||||
return sectionType in self._getSupportedSectionTypes()
|
||||
|
||||
def _formatTimestamp(self, timestamp: str = None) -> str:
|
||||
"""Format timestamp for display."""
|
||||
if timestamp:
|
||||
return timestamp
|
||||
return datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S UTC")
|
||||
|
||||
# ===== GENERIC AI STYLING HELPERS =====
|
||||
|
||||
async def _getAiStyles(self, aiService, styleTemplate: str, defaultStyles: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Generic AI styling method that can be used by all renderers.
|
||||
|
||||
Args:
|
||||
aiService: AI service instance
|
||||
styleTemplate: Format-specific style template
|
||||
defaultStyles: Default styles to fall back to
|
||||
|
||||
Returns:
|
||||
Dict with styling definitions
|
||||
"""
|
||||
# DEBUG: Show which renderer is calling this method
|
||||
|
||||
if not aiService:
|
||||
return defaultStyles
|
||||
|
||||
try:
|
||||
|
||||
requestOptions = AiCallOptions()
|
||||
requestOptions.operationType = OperationTypeEnum.DATA_GENERATE
|
||||
|
||||
request = AiCallRequest(prompt=styleTemplate, context="", options=requestOptions)
|
||||
|
||||
# DEBUG: Show the actual prompt being sent to AI
|
||||
self.logger.debug(f"AI Style Template Prompt:")
|
||||
self.logger.debug(f"{styleTemplate}")
|
||||
|
||||
response = await aiService.callAi(request)
|
||||
|
||||
# Save styling prompt and response to debug (fire and forget - don't block on slow file I/O)
|
||||
# The writeDebugFile calls os.listdir() which can be slow with many files
|
||||
# Run in background thread to avoid blocking rendering
|
||||
import threading
|
||||
def _writeDebugFiles():
|
||||
try:
|
||||
self.services.utils.writeDebugFile(styleTemplate, "renderer_styling_prompt")
|
||||
self.services.utils.writeDebugFile(response.content or '', "renderer_styling_response")
|
||||
except Exception:
|
||||
pass # Silently fail - debug writing should never block rendering
|
||||
|
||||
threading.Thread(target=_writeDebugFiles, daemon=True).start()
|
||||
|
||||
# Clean and parse JSON
|
||||
result = response.content.strip() if response and response.content else ""
|
||||
|
||||
# Check if result is empty
|
||||
if not result:
|
||||
self.logger.warning("AI styling returned empty response, using defaults")
|
||||
return defaultStyles
|
||||
|
||||
# Extract JSON from markdown if present
|
||||
jsonMatch = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL)
|
||||
if jsonMatch:
|
||||
result = jsonMatch.group(1).strip()
|
||||
elif result.startswith('```json'):
|
||||
result = re.sub(r'^```json\s*', '', result)
|
||||
result = re.sub(r'\s*```$', '', result)
|
||||
elif result.startswith('```'):
|
||||
result = re.sub(r'^```\s*', '', result)
|
||||
result = re.sub(r'\s*```$', '', result)
|
||||
|
||||
# Try to parse JSON
|
||||
try:
|
||||
styles = json.loads(result)
|
||||
except json.JSONDecodeError as jsonError:
|
||||
self.logger.warning(f"AI styling returned invalid JSON: {jsonError}")
|
||||
|
||||
# Write the full response to a debug file to avoid log truncation
|
||||
self.services.utils.debugLogToFile(f"FULL AI RESPONSE THAT FAILED TO PARSE: {result}", "RENDERER")
|
||||
self.services.utils.debugLogToFile(f"RESPONSE LENGTH: {len(result)} characters", "RENDERER")
|
||||
|
||||
self.logger.warning(f"Raw content that failed to parse: {result}")
|
||||
|
||||
# Try to fix incomplete JSON by adding missing closing braces
|
||||
openBraces = result.count('{')
|
||||
closeBraces = result.count('}')
|
||||
|
||||
if openBraces > closeBraces:
|
||||
# JSON is incomplete, add missing closing braces
|
||||
missingBraces = openBraces - closeBraces
|
||||
result = result + '}' * missingBraces
|
||||
self.logger.info(f"Added {missingBraces} missing closing brace(s)")
|
||||
self.logger.debug(f"Fixed JSON: {result}")
|
||||
|
||||
# Try parsing the fixed JSON
|
||||
try:
|
||||
styles = json.loads(result)
|
||||
self.logger.info("Successfully fixed incomplete JSON")
|
||||
except json.JSONDecodeError as fixError:
|
||||
self.logger.warning(f"Fixed JSON still invalid: {fixError}")
|
||||
self.logger.warning(f"Fixed JSON content: {result}")
|
||||
# Try to extract just the JSON part if it's embedded in text
|
||||
jsonStart = result.find('{')
|
||||
jsonEnd = result.rfind('}')
|
||||
if jsonStart != -1 and jsonEnd != -1 and jsonEnd > jsonStart:
|
||||
jsonPart = result[jsonStart:jsonEnd+1]
|
||||
try:
|
||||
styles = json.loads(jsonPart)
|
||||
self.logger.info("Successfully extracted JSON from explanatory text")
|
||||
except json.JSONDecodeError:
|
||||
self.logger.warning("Could not extract valid JSON from response, using defaults")
|
||||
return defaultStyles
|
||||
else:
|
||||
return defaultStyles
|
||||
else:
|
||||
# Try to extract just the JSON part if it's embedded in text
|
||||
jsonStart = result.find('{')
|
||||
jsonEnd = result.rfind('}')
|
||||
if jsonStart != -1 and jsonEnd != -1 and jsonEnd > jsonStart:
|
||||
jsonPart = result[jsonStart:jsonEnd+1]
|
||||
try:
|
||||
styles = json.loads(jsonPart)
|
||||
self.logger.info("Successfully extracted JSON from explanatory text")
|
||||
except json.JSONDecodeError:
|
||||
self.logger.warning("Could not extract valid JSON from response, using defaults")
|
||||
return defaultStyles
|
||||
else:
|
||||
return defaultStyles
|
||||
|
||||
# Convert colors to appropriate format
|
||||
styles = self._convertColorsFormat(styles)
|
||||
|
||||
return styles
|
||||
|
||||
except Exception as e:
|
||||
self.logger.warning(f"AI styling failed: {str(e)}, using defaults")
|
||||
return defaultStyles
|
||||
|
||||
def _convertColorsFormat(self, styles: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Convert colors to appropriate format based on renderer type.
|
||||
Override this method in subclasses for format-specific color handling.
|
||||
"""
|
||||
return styles
|
||||
|
||||
def _createAiStyleTemplate(self, formatName: str, userPrompt: str, styleSchema: Dict[str, Any]) -> str:
|
||||
"""
|
||||
Create a standardized AI style template for any format.
|
||||
|
||||
Args:
|
||||
formatName: Name of the format (e.g., "docx", "xlsx", "pptx")
|
||||
userPrompt: User's original prompt
|
||||
styleSchema: Format-specific style schema
|
||||
|
||||
Returns:
|
||||
Formatted prompt string
|
||||
"""
|
||||
schemaJson = json.dumps(styleSchema, indent=4)
|
||||
|
||||
# DEBUG: Show the schema being sent
|
||||
|
||||
return f"""You are a professional document styling expert. Generate a complete JSON styling configuration for {formatName.upper()} documents.
|
||||
|
||||
User request: {userPrompt}
|
||||
|
||||
Use this schema as a template:
|
||||
{schemaJson}
|
||||
|
||||
Requirements:
|
||||
- Return ONLY the complete JSON object (no markdown, no explanations)
|
||||
- If the user request contains style/formatting/design instructions (in any language), customize the styling accordingly (adapt styles and add styles if needed)
|
||||
- If the user request has NO style instructions, return the default schema values unchanged
|
||||
- Ensure all objects are properly closed with closing braces
|
||||
- Only modify styles if style instructions are present in the user request
|
||||
|
||||
Return the complete JSON:"""
|
||||
|
|
@@ -1,238 +0,0 @@
|
|||
# Copyright (c) 2025 Patrick Motsch
|
||||
# All rights reserved.
|
||||
"""
|
||||
Renderer registry for automatic discovery and registration of renderers.
|
||||
|
||||
Renderers are indexed by (format, outputStyle) so that document generation
|
||||
and code generation each get the correct renderer for the same format.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import importlib
|
||||
from typing import Dict, Type, List, Optional, Tuple
|
||||
from .documentRendererBaseTemplate import BaseRenderer
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RendererRegistry:
|
||||
"""Registry for automatic renderer discovery and management.
|
||||
|
||||
Maintains separate renderer mappings per outputStyle ('document', 'code', etc.)
|
||||
so that document-generation and code-generation paths each resolve to the
|
||||
correct renderer, even when both support the same format (e.g. 'csv').
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
# Key: (formatName, outputStyle) -> rendererClass
|
||||
self._renderers: Dict[Tuple[str, str], Type[BaseRenderer]] = {}
|
||||
self._format_mappings: Dict[str, str] = {}
|
||||
self._discovered = False
|
||||
|
||||
def discoverRenderers(self) -> None:
|
||||
"""Automatically discover and register all renderers by scanning files."""
|
||||
if self._discovered:
|
||||
return
|
||||
|
||||
try:
|
||||
from pathlib import Path
|
||||
|
||||
currentDir = Path(__file__).parent
|
||||
packageName = __name__.rsplit('.', 1)[0]
|
||||
|
||||
for filePath in currentDir.glob("*.py"):
|
||||
if filePath.name in ['registry.py', 'documentRendererBaseTemplate.py', 'codeRendererBaseTemplate.py', '__init__.py']:
|
||||
continue
|
||||
|
||||
moduleName = filePath.stem
|
||||
|
||||
try:
|
||||
fullModuleName = f"{packageName}.{moduleName}"
|
||||
module = importlib.import_module(fullModuleName)
|
||||
|
||||
                    for attrName in dir(module):
                        attr = getattr(module, attrName)
                        if (isinstance(attr, type) and
                                issubclass(attr, BaseRenderer) and
                                attr != BaseRenderer and
                                hasattr(attr, 'getSupportedFormats')):
                            self._registerRendererClass(attr)

                except Exception as e:
                    logger.warning(f"Could not load renderer from {moduleName}: {str(e)}")
                    continue

            self._discovered = True

        except Exception as e:
            logger.error(f"Error during renderer discovery: {str(e)}")
            self._discovered = True

    def _registerRendererClass(self, rendererClass: Type[BaseRenderer]) -> None:
        """Register a renderer class keyed by (format, outputStyle)."""
        try:
            supportedFormats = rendererClass.getSupportedFormats()
            outputStyle = rendererClass.getOutputStyle() if hasattr(rendererClass, 'getOutputStyle') else 'document'
            priority = rendererClass.getPriority() if hasattr(rendererClass, 'getPriority') else 0

            for formatName in supportedFormats:
                formatKey = formatName.lower()
                registryKey = (formatKey, outputStyle)

                if registryKey in self._renderers:
                    existingRenderer = self._renderers[registryKey]
                    existingPriority = existingRenderer.getPriority() if hasattr(existingRenderer, 'getPriority') else 0

                    if priority > existingPriority:
                        logger.debug(f"Replacing {existingRenderer.__name__} with {rendererClass.__name__} for ({formatKey}, {outputStyle}) (priority {priority} > {existingPriority})")
                        self._renderers[registryKey] = rendererClass
                    else:
                        logger.debug(f"Keeping {existingRenderer.__name__} for ({formatKey}, {outputStyle}) (priority {existingPriority} >= {priority})")
                else:
                    self._renderers[registryKey] = rendererClass

            # Register aliases
            if hasattr(rendererClass, 'getFormatAliases'):
                aliases = rendererClass.getFormatAliases()
                for alias in aliases:
                    self._format_mappings[alias.lower()] = formatKey

            logger.debug(f"Registered {rendererClass.__name__} for formats={supportedFormats}, style={outputStyle}, priority={priority}")

        except Exception as e:
            logger.error(f"Error registering renderer {rendererClass.__name__}: {str(e)}")

    def getRenderer(self, outputFormat: str, services=None, outputStyle: str = None) -> Optional[BaseRenderer]:
        """Get a renderer instance for the specified format and style.

        Args:
            outputFormat: Format name (e.g. 'csv', 'json', 'pdf')
            services: Services instance passed to renderer constructor
            outputStyle: 'document' or 'code'. If None, returns the first match
                with preference: document > code (most callers are on the document path).
        """
        if not self._discovered:
            self.discoverRenderers()

        formatName = outputFormat.lower().strip()
        if formatName in self._format_mappings:
            formatName = self._format_mappings[formatName]

        rendererClass = None

        if outputStyle:
            # Exact match by style
            rendererClass = self._renderers.get((formatName, outputStyle))
        else:
            # No style specified: prefer 'document', then 'code', then any
            for style in ['document', 'code']:
                rendererClass = self._renderers.get((formatName, style))
                if rendererClass:
                    break
            # Fallback: check any registered style
            if not rendererClass:
                for key, cls in self._renderers.items():
                    if key[0] == formatName:
                        rendererClass = cls
                        break

        if rendererClass:
            try:
                return rendererClass(services=services)
            except Exception as e:
                logger.error(f"Error creating renderer instance for {formatName}: {str(e)}")
                return None

        logger.warning(f"No renderer found for format={outputFormat}, style={outputStyle}")
        return None

    def getSupportedFormats(self) -> List[str]:
        """Get list of all supported formats."""
        if not self._discovered:
            self.discoverRenderers()

        formats = set()
        for (fmt, _style) in self._renderers.keys():
            formats.add(fmt)
        formats.update(self._format_mappings.keys())
        return sorted(formats)

    def getRendererInfo(self) -> Dict[str, Dict[str, str]]:
        """Get information about all registered renderers."""
        if not self._discovered:
            self.discoverRenderers()

        info = {}
        for (formatName, style), rendererClass in self._renderers.items():
            key = f"{formatName}:{style}"
            info[key] = {
                'class_name': rendererClass.__name__,
                'module': rendererClass.__module__,
                'outputStyle': style,
                'description': getattr(rendererClass, '__doc__', 'No description').strip().split('\n')[0] if rendererClass.__doc__ else 'No description'
            }

        return info

    def getOutputStyle(self, outputFormat: str) -> Optional[str]:
        """
        Get the output style classification for a given format.
        When both 'document' and 'code' renderers exist for a format,
        returns the default ('document') since this is called during document generation.
        """
        if not self._discovered:
            self.discoverRenderers()

        formatName = outputFormat.lower().strip()
        if formatName in self._format_mappings:
            formatName = self._format_mappings[formatName]

        # Check document first, then code
        for style in ['document', 'code']:
            rendererClass = self._renderers.get((formatName, style))
            if rendererClass:
                try:
                    return rendererClass.getOutputStyle(formatName)
                except Exception:
                    pass

        # Fallback: any style
        for key, rendererClass in self._renderers.items():
            if key[0] == formatName:
                try:
                    return rendererClass.getOutputStyle(formatName)
                except Exception:
                    pass

        logger.warning(f"No renderer found for format: {outputFormat}, cannot determine output style")
        return None


# Global registry instance
_registry = RendererRegistry()


def getRenderer(outputFormat: str, services=None, outputStyle: str = None) -> Optional[BaseRenderer]:
    """Get a renderer instance for the specified format and style.

    Args:
        outputFormat: Format name (e.g. 'csv', 'json', 'pdf')
        services: Services instance
        outputStyle: 'document' or 'code'. If None, prefers document renderer.
    """
    return _registry.getRenderer(outputFormat, services, outputStyle=outputStyle)


def getSupportedFormats() -> List[str]:
    """Get list of all supported formats."""
    return _registry.getSupportedFormats()


def getRendererInfo() -> Dict[str, Dict[str, str]]:
    """Get information about all registered renderers."""
    return _registry.getRendererInfo()


def getOutputStyle(outputFormat: str) -> Optional[str]:
    """Get the output style classification for a given format."""
    return _registry.getOutputStyle(outputFormat)
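For orientation, a minimal caller sketch for the module-level helpers deleted above; the pickRenderer wrapper, its error handling, and the assumption that getRenderer/getOutputStyle are imported into scope are illustrative, not taken from this commit.

# Hypothetical caller sketch (not part of this diff): resolve a renderer through the
# module-level helpers above, assuming they are imported into the caller's scope.
def pickRenderer(outputFormat: str, services=None):
    # Ask the registry how the format is classified ('document' vs 'code') ...
    style = getOutputStyle(outputFormat)
    # ... then request a matching renderer instance; None means nothing is registered.
    renderer = getRenderer(outputFormat, services=services, outputStyle=style)
    if renderer is None:
        raise ValueError(f"Unsupported output format: {outputFormat}")
    return renderer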
@@ -1,159 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
CSV code renderer for code generation.
"""

from .codeRendererBaseTemplate import BaseCodeRenderer
from modules.datamodels.datamodelDocument import RenderedDocument
from typing import Dict, Any, List, Optional
import csv
import io


class RendererCodeCsv(BaseCodeRenderer):
    """Renders CSV code files."""

    @classmethod
    def getSupportedFormats(cls) -> List[str]:
        """Return supported CSV formats."""
        return ['csv']

    @classmethod
    def getFormatAliases(cls) -> List[str]:
        """Return format aliases."""
        return []

    @classmethod
    def getPriority(cls) -> int:
        """Return priority for CSV code renderer."""
        return 75  # Higher than document renderer (70) for code generation

    @classmethod
    def getOutputStyle(cls, formatName: Optional[str] = None) -> str:
        """Return output style classification: CSV requires specific structure."""
        return 'code'

    async def renderCodeFiles(
        self,
        codeFiles: List[Dict[str, Any]],
        metadata: Dict[str, Any],
        userPrompt: str = None
    ) -> List[RenderedDocument]:
        """
        Render CSV code files.
        For single file: output as-is (validate structure)
        For multiple files: output separately (each is independent CSV)
        """
        renderedDocs = []

        for codeFile in codeFiles:
            if not self._validateCodeFile(codeFile):
                self.logger.warning(f"Invalid code file: {codeFile.get('filename', 'unknown')}")
                continue

            filename = codeFile['filename']
            content = codeFile['content']

            # Validate CSV structure (header row, consistent columns)
            validatedContent = self._validateAndFixCsv(content)

            # Extract CSV statistics for validation
            csvStats = self._extractCsvStatistics(validatedContent)

            # Merge file-specific metadata with project metadata
            fileMetadata = dict(metadata) if metadata else {}
            fileMetadata.update({
                "filename": filename,
                "fileType": "csv",
                "statistics": csvStats
            })

            renderedDocs.append(
                RenderedDocument(
                    documentData=validatedContent.encode('utf-8'),
                    mimeType="text/csv",
                    filename=filename,
                    metadata=fileMetadata
                )
            )

        return renderedDocs

    async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
        """
        Render method for document generation compatibility.
        Delegates to document renderer if needed, or handles code files directly.
        """
        # Check if this is code generation (has files array) or document generation (has documents array)
        if "files" in extractedContent:
            # Code generation path - use renderCodeFiles
            files = extractedContent.get("files", [])
            metadata = extractedContent.get("metadata", {})
            return await self.renderCodeFiles(files, metadata, userPrompt)
        else:
            # Document generation path - delegate to document renderer
            from .rendererCsv import RendererCsv
            documentRenderer = RendererCsv(self.services)
            return await documentRenderer.render(extractedContent, title, userPrompt, aiService)

    def _validateAndFixCsv(self, content: str) -> str:
        """Validate CSV structure and fix common issues."""
        try:
            # Parse CSV to validate structure
            reader = csv.reader(io.StringIO(content))
            rows = list(reader)

            if not rows:
                return content  # Empty CSV

            # Check header row exists
            headerRow = rows[0]
            headerCount = len(headerRow)

            # Validate all rows have same column count
            fixedRows = [headerRow]  # Start with header

            for i, row in enumerate(rows[1:], 1):
                if len(row) != headerCount:
                    self.logger.debug(f"Row {i} has {len(row)} columns, expected {headerCount}. Auto-fixing...")
                    # Pad or truncate to match header
                    if len(row) < headerCount:
                        row.extend([''] * (headerCount - len(row)))
                    else:
                        row = row[:headerCount]
                fixedRows.append(row)

            # Convert back to CSV string
            output = io.StringIO()
            writer = csv.writer(output)
            for row in fixedRows:
                writer.writerow(row)

            return output.getvalue()

        except Exception as e:
            self.logger.warning(f"CSV validation failed: {e}, returning original content")
            return content

    def _extractCsvStatistics(self, content: str) -> Dict[str, Any]:
        """Extract CSV statistics for validation (row count, column count, headers)."""
        try:
            reader = csv.reader(io.StringIO(content))
            rows = list(reader)

            if not rows:
                return {"rowCount": 0, "columnCount": 0, "headerRow": []}

            headerRow = rows[0]
            columnCount = len(headerRow)
            rowCount = len(rows) - 1  # Exclude header

            return {
                "rowCount": rowCount,
                "columnCount": columnCount,
                "headerRow": headerRow,
                "dataRowCount": rowCount
            }
        except Exception as e:
            self.logger.warning(f"CSV statistics extraction failed: {e}")
            return {}
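For context, renderCodeFiles() above consumes a list of {filename, content} dicts. A minimal driving sketch follows; the sample data, the services=None constructor argument, and the asyncio wrapper are illustrative assumptions, and _validateCodeFile() is inherited from the base class and not shown in this diff.

import asyncio

# Illustrative input: each code file is a dict with 'filename' and 'content',
# matching what _validateCodeFile() and renderCodeFiles() read above.
sampleFiles = [{"filename": "report.csv", "content": "id,name\n1,Alice\n2,Bob"}]

async def main():
    renderer = RendererCodeCsv(services=None)  # services=None is a placeholder
    docs = await renderer.renderCodeFiles(sampleFiles, metadata={"project": "demo"})
    # Each RenderedDocument carries the validated CSV bytes plus per-file statistics.
    print(docs[0].filename, docs[0].metadata["statistics"])

asyncio.run(main())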
@@ -1,141 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
JSON code renderer for code generation.
"""

from .codeRendererBaseTemplate import BaseCodeRenderer
from modules.datamodels.datamodelDocument import RenderedDocument
from typing import Dict, Any, List, Optional
import json


class RendererCodeJson(BaseCodeRenderer):
    """Renders JSON code files."""

    @classmethod
    def getSupportedFormats(cls) -> List[str]:
        """Return supported JSON formats."""
        return ['json']

    @classmethod
    def getFormatAliases(cls) -> List[str]:
        """Return format aliases."""
        return []

    @classmethod
    def getPriority(cls) -> int:
        """Return priority for JSON code renderer."""
        return 85  # Higher than document renderer (80) for code generation

    @classmethod
    def getOutputStyle(cls, formatName: Optional[str] = None) -> str:
        """Return output style classification: JSON is structured data format."""
        return 'code'

    async def renderCodeFiles(
        self,
        codeFiles: List[Dict[str, Any]],
        metadata: Dict[str, Any],
        userPrompt: str = None
    ) -> List[RenderedDocument]:
        """
        Render JSON code files.
        For single file: output as-is
        For multiple files: output separately (each file is independent JSON)
        """
        renderedDocs = []

        for codeFile in codeFiles:
            if not self._validateCodeFile(codeFile):
                self.logger.warning(f"Invalid code file: {codeFile.get('filename', 'unknown')}")
                continue

            filename = codeFile['filename']
            content = codeFile['content']

            # Validate JSON syntax and extract statistics
            parsed = None
            try:
                parsed = json.loads(content)  # Validate JSON
            except json.JSONDecodeError as e:
                self.logger.warning(f"Invalid JSON in {filename}: {e}")
                # Could fix/format JSON here if needed

            # Format JSON (pretty print)
            try:
                if parsed is None:
                    parsed = json.loads(content)
                formattedContent = json.dumps(parsed, indent=2, ensure_ascii=False)
            except Exception:
                formattedContent = content  # Use original if formatting fails

            # Extract JSON statistics for validation
            jsonStats = self._extractJsonStatistics(parsed) if parsed else {}

            # Merge file-specific metadata with project metadata
            fileMetadata = dict(metadata) if metadata else {}
            fileMetadata.update({
                "filename": filename,
                "fileType": "json",
                "statistics": jsonStats
            })

            renderedDocs.append(
                RenderedDocument(
                    documentData=formattedContent.encode('utf-8'),
                    mimeType="application/json",
                    filename=filename,
                    metadata=fileMetadata
                )
            )

        return renderedDocs

    async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
        """
        Render method for document generation compatibility.
        Delegates to document renderer if needed, or handles code files directly.
        """
        # Check if this is code generation (has files array) or document generation (has documents array)
        if "files" in extractedContent:
            # Code generation path - use renderCodeFiles
            files = extractedContent.get("files", [])
            metadata = extractedContent.get("metadata", {})
            return await self.renderCodeFiles(files, metadata, userPrompt)
        else:
            # Document generation path - delegate to document renderer
            # Import here to avoid circular dependency
            from .rendererJson import RendererJson
            documentRenderer = RendererJson(self.services)
            return await documentRenderer.render(extractedContent, title, userPrompt, aiService)

    def _extractJsonStatistics(self, parsed: Any) -> Dict[str, Any]:
        """Extract JSON statistics for validation (object count, array count, key count)."""
        try:
            stats = {
                "isArray": isinstance(parsed, list),
                "isObject": isinstance(parsed, dict),
                "itemCount": 0,
                "keyCount": 0
            }

            if isinstance(parsed, list):
                stats["itemCount"] = len(parsed)
                # Count nested objects/arrays
                objectCount = sum(1 for item in parsed if isinstance(item, dict))
                arrayCount = sum(1 for item in parsed if isinstance(item, list))
                stats["objectCount"] = objectCount
                stats["arrayCount"] = arrayCount
            elif isinstance(parsed, dict):
                stats["keyCount"] = len(parsed)
                stats["keys"] = list(parsed.keys())
                # Count nested objects/arrays
                objectCount = sum(1 for v in parsed.values() if isinstance(v, dict))
                arrayCount = sum(1 for v in parsed.values() if isinstance(v, list))
                stats["objectCount"] = objectCount
                stats["arrayCount"] = arrayCount

            return stats
        except Exception as e:
            self.logger.warning(f"JSON statistics extraction failed: {e}")
            return {}
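To illustrate what the statistics helper above reports, a small sketch; the sample payload and the services=None constructor argument are illustrative assumptions, not taken from this commit.

import json

# Illustrative payload: a top-level object with one nested object and one nested array.
payload = json.loads('{"name": "demo", "owner": {"id": 1}, "tags": ["a", "b"]}')

# For a dict input, _extractJsonStatistics() above reports key names and counts of
# nested objects/arrays; for a list input it fills itemCount instead of keyCount.
# Expected shape: {"isArray": False, "isObject": True, "itemCount": 0, "keyCount": 3,
#                  "keys": ["name", "owner", "tags"], "objectCount": 1, "arrayCount": 1}
stats = RendererCodeJson(services=None)._extractJsonStatistics(payload)
print(stats)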
@@ -1,148 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
XML code renderer for code generation.
"""

from .codeRendererBaseTemplate import BaseCodeRenderer
from modules.datamodels.datamodelDocument import RenderedDocument
from typing import Dict, Any, List, Optional
import xml.etree.ElementTree as ET
from xml.dom import minidom


class RendererCodeXml(BaseCodeRenderer):
    """Renders XML code files."""

    @classmethod
    def getSupportedFormats(cls) -> List[str]:
        """Return supported XML formats."""
        return ['xml']

    @classmethod
    def getFormatAliases(cls) -> List[str]:
        """Return format aliases."""
        return []

    @classmethod
    def getPriority(cls) -> int:
        """Return priority for XML code renderer."""
        return 80

    @classmethod
    def getOutputStyle(cls, formatName: Optional[str] = None) -> str:
        """Return output style classification: XML is structured data format."""
        return 'code'

    async def renderCodeFiles(
        self,
        codeFiles: List[Dict[str, Any]],
        metadata: Dict[str, Any],
        userPrompt: str = None
    ) -> List[RenderedDocument]:
        """
        Render XML code files.
        Validates XML syntax and formats (pretty print).
        """
        renderedDocs = []

        for codeFile in codeFiles:
            if not self._validateCodeFile(codeFile):
                self.logger.warning(f"Invalid code file: {codeFile.get('filename', 'unknown')}")
                continue

            filename = codeFile['filename']
            content = codeFile['content']

            # Validate and format XML
            formattedContent = self._validateAndFormatXml(content)

            # Extract XML statistics for validation
            xmlStats = self._extractXmlStatistics(formattedContent)

            # Merge file-specific metadata with project metadata
            fileMetadata = dict(metadata) if metadata else {}
            fileMetadata.update({
                "filename": filename,
                "fileType": "xml",
                "statistics": xmlStats
            })

            renderedDocs.append(
                RenderedDocument(
                    documentData=formattedContent.encode('utf-8'),
                    mimeType="application/xml",
                    filename=filename,
                    metadata=fileMetadata
                )
            )

        return renderedDocs

    async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
        """
        Render method for document generation compatibility.
        For XML, we only support code generation (no document renderer exists yet).
        """
        # Check if this is code generation (has files array)
        if "files" in extractedContent:
            # Code generation path - use renderCodeFiles
            files = extractedContent.get("files", [])
            metadata = extractedContent.get("metadata", {})
            return await self.renderCodeFiles(files, metadata, userPrompt)
        else:
            # Document generation path - not supported yet, return error
            self.logger.warning("XML document generation not supported, only code generation")
            return [
                RenderedDocument(
                    documentData="XML document generation not yet supported".encode('utf-8'),
                    mimeType="text/plain",
                    filename="error.txt",
                    metadata={}
                )
            ]

    def _validateAndFormatXml(self, content: str) -> str:
        """Validate XML syntax and format (pretty print)."""
        try:
            # Parse XML to validate
            root = ET.fromstring(content)

            # Format XML (pretty print)
            rough_string = ET.tostring(root, encoding='unicode')
            reparsed = minidom.parseString(rough_string)
            formatted = reparsed.toprettyxml(indent=" ")

            # Remove extra blank lines
            lines = [line for line in formatted.split('\n') if line.strip()]
            return '\n'.join(lines)

        except ET.ParseError as e:
            self.logger.warning(f"Invalid XML: {e}, returning original content")
            return content
        except Exception as e:
            self.logger.warning(f"XML formatting failed: {e}, returning original content")
            return content

    def _extractXmlStatistics(self, content: str) -> Dict[str, Any]:
        """Extract XML statistics for validation (element count, attribute count, root element)."""
        try:
            root = ET.fromstring(content)

            # Count all elements recursively
            elementCount = len(list(root.iter()))

            # Count attributes
            attributeCount = sum(len(elem.attrib) for elem in root.iter())

            # Get root element name
            rootElement = root.tag

            return {
                "elementCount": elementCount,
                "attributeCount": attributeCount,
                "rootElement": rootElement,
                "hasRoot": True
            }
        except Exception as e:
            self.logger.warning(f"XML statistics extraction failed: {e}")
            return {}
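_validateAndFormatXml() above round-trips the document through ElementTree and minidom; a standalone sketch of the same idea, independent of the class and using only the standard library, with an illustrative input document:

import xml.etree.ElementTree as ET
from xml.dom import minidom

def prettyPrintXml(content: str) -> str:
    # Parse first: a ParseError here means the input is not well-formed XML,
    # in which case the renderer above falls back to returning the original text.
    root = ET.fromstring(content)
    reparsed = minidom.parseString(ET.tostring(root, encoding='unicode'))
    # toprettyxml() inserts blank lines between elements; dropping empty lines
    # mirrors the cleanup step in _validateAndFormatXml().
    return '\n'.join(line for line in reparsed.toprettyxml(indent=" ").split('\n') if line.strip())

print(prettyPrintXml("<root><item id='1'>x</item></root>"))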
@@ -1,415 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
CSV renderer for report generation.
"""

from .documentRendererBaseTemplate import BaseRenderer
from modules.datamodels.datamodelDocument import RenderedDocument
from typing import Dict, Any, List, Optional


class RendererCsv(BaseRenderer):
    """Renders content to CSV format with format-specific extraction."""

    @classmethod
    def getSupportedFormats(cls) -> List[str]:
        """Return supported CSV formats."""
        return ['csv']

    @classmethod
    def getFormatAliases(cls) -> List[str]:
        """Return format aliases."""
        return ['spreadsheet', 'table']

    @classmethod
    def getPriority(cls) -> int:
        """Return priority for CSV renderer."""
        return 70

    @classmethod
    def getOutputStyle(cls, formatName: Optional[str] = None) -> str:
        """Return output style classification: CSV document renderer converts structured document content to CSV."""
        return 'document'

    @classmethod
    def getAcceptedSectionTypes(cls, formatName: Optional[str] = None) -> List[str]:
        """
        Return list of section content types that CSV renderer accepts.
        CSV renderer accepts table sections and code_block sections (for raw CSV content).
        """
        return ["table", "code_block"]

    async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
        """Render extracted JSON content to CSV format. Produces one CSV file per table section."""
        try:
            # Validate JSON structure
            if not self._validateJsonStructure(extractedContent):
                raise ValueError("JSON content must follow standardized schema: {metadata: {...}, documents: [{sections: [...]}]}")

            # Extract sections and metadata
            sections = self._extractSections(extractedContent)
            metadata = self._extractMetadata(extractedContent)

            # Determine base filename from document or title
            documents = extractedContent.get("documents", [])
            baseFilename = None
            if documents and isinstance(documents[0], dict):
                baseFilename = documents[0].get("filename")
            if not baseFilename:
                baseFilename = self._determineFilename(title, "text/csv")

            # Remove extension from base filename if present
            if baseFilename.endswith('.csv'):
                baseFilename = baseFilename[:-4]

            # Collect CSV-producing sections: table sections AND code_block sections with CSV language
            tableSections = []
            codeBlockCsvSections = []
            for section in sections:
                sectionType = section.get("content_type", "paragraph")
                if sectionType == "table":
                    tableSections.append(section)
                elif sectionType == "code_block":
                    # Check if any element is a code_block with language "csv"
                    for element in section.get("elements", []):
                        content = element.get("content", {})
                        if isinstance(content, dict) and content.get("language", "").lower() == "csv":
                            codeBlockCsvSections.append(section)
                            break

            # If no usable sections found, return empty CSV
            if not tableSections and not codeBlockCsvSections:
                self.logger.warning("No table or CSV code_block sections found in CSV document - returning empty CSV")
                emptyCsv = self._convertRowsToCsv([["No table data available"]])
                return [
                    RenderedDocument(
                        documentData=emptyCsv.encode('utf-8'),
                        mimeType="text/csv",
                        filename=self._determineFilename(title, "text/csv"),
                        documentType=metadata.get("documentType") if isinstance(metadata, dict) else None,
                        metadata=metadata if isinstance(metadata, dict) else None
                    )
                ]

            allCsvSections = tableSections + codeBlockCsvSections

            # Generate one CSV file per section
            renderedDocuments = []
            for i, csvSection in enumerate(allCsvSections):
                sectionType = csvSection.get("content_type", "paragraph")
                sectionTitle = csvSection.get("title")
                csvContent = ""

                if sectionType == "code_block":
                    # Extract raw CSV content directly from code_block elements
                    rawCsvParts = []
                    for element in csvSection.get("elements", []):
                        content = element.get("content", {})
                        if isinstance(content, dict) and content.get("language", "").lower() == "csv":
                            code = content.get("code", "")
                            if code:
                                rawCsvParts.append(code)
                    csvContent = "\n".join(rawCsvParts)
                else:
                    # Table section: render via table logic
                    csvRows = []
                    if sectionTitle:
                        csvRows.append([sectionTitle])
                        csvRows.append([])  # Empty row after title

                    elements = csvSection.get("elements", [])
                    for element in elements:
                        tableRows = self._renderJsonTableToCsv(element)
                        if tableRows:
                            csvRows.extend(tableRows)

                    csvContent = self._convertRowsToCsv(csvRows)

                # Determine filename
                if len(allCsvSections) == 1:
                    filename = f"{baseFilename}.csv"
                else:
                    sectionId = csvSection.get("id", f"csv_{i+1}")
                    if sectionTitle:
                        safeTitle = "".join(c for c in sectionTitle if c.isalnum() or c in (' ', '-', '_')).strip()
                        safeTitle = safeTitle.replace(' ', '_')[:30]
                        filename = f"{baseFilename}_{safeTitle}.csv"
                    else:
                        filename = f"{baseFilename}_{sectionId}.csv"

                documentType = metadata.get("documentType") if isinstance(metadata, dict) else None

                renderedDocuments.append(
                    RenderedDocument(
                        documentData=csvContent.encode('utf-8'),
                        mimeType="text/csv",
                        filename=filename,
                        documentType=documentType,
                        metadata=metadata if isinstance(metadata, dict) else None
                    )
                )

            return renderedDocuments

        except Exception as e:
            self.logger.error(f"Error rendering CSV: {str(e)}")
            # Return minimal CSV fallback
            fallbackCsv = self._convertRowsToCsv([["Title", "Content"], [title, f"Error rendering report: {str(e)}"]])
            return [
                RenderedDocument(
                    documentData=fallbackCsv.encode('utf-8'),
                    mimeType="text/csv",
                    filename=self._determineFilename(title, "text/csv"),
                    metadata=extractedContent.get("metadata", {}) if extractedContent else None
                )
            ]

    async def _generateCsvFromJson(self, jsonContent: Dict[str, Any], title: str) -> str:
        """Generate CSV content from structured JSON document. DEPRECATED: Use render() method instead."""
        # This method is kept for backward compatibility but is no longer used
        # The render() method now handles CSV generation directly
        try:
            # Validate JSON structure (standardized schema: {metadata: {...}, documents: [{sections: [...]}]})
            if not self._validateJsonStructure(jsonContent):
                raise ValueError("JSON content must follow standardized schema: {metadata: {...}, documents: [{sections: [...]}]}")

            # Extract sections and metadata from standardized schema
            sections = self._extractSections(jsonContent)
            metadata = self._extractMetadata(jsonContent)

            # Use provided title (which comes from documents[].title) as primary source
            # Fallback to metadata.title only if title parameter is empty
            documentTitle = title if title else metadata.get("title", "Generated Document")

            # Generate CSV content
            csvRows = []

            # Add title row
            if documentTitle:
                csvRows.append([documentTitle])
                csvRows.append([])  # Empty row

            # Process each section in order - only table sections
            for section in sections:
                sectionType = section.get("content_type", "paragraph")
                if sectionType == "table":
                    sectionCsv = self._renderJsonSectionToCsv(section)
                    if sectionCsv:
                        csvRows.extend(sectionCsv)
                        csvRows.append([])  # Empty row between sections

            # Convert to CSV string
            csvContent = self._convertRowsToCsv(csvRows)

            return csvContent

        except Exception as e:
            self.logger.error(f"Error generating CSV from JSON: {str(e)}")
            raise Exception(f"CSV generation failed: {str(e)}")

    def _renderJsonSectionToCsv(self, section: Dict[str, Any]) -> List[List[str]]:
        """Render a single JSON section to CSV rows."""
        try:
            sectionType = section.get("content_type", "paragraph")
            elements = section.get("elements", [])

            csvRows = []

            # Add section title if available
            sectionTitle = section.get("title")
            if sectionTitle:
                csvRows.append([f"# {sectionTitle}"])

            # Process each element in the section
            for element in elements:
                if sectionType == "table":
                    csvRows.extend(self._renderJsonTableToCsv(element))
                elif sectionType == "list":
                    csvRows.extend(self._renderJsonListToCsv(element))
                elif sectionType == "heading":
                    csvRows.extend(self._renderJsonHeadingToCsv(element))
                elif sectionType == "paragraph":
                    csvRows.extend(self._renderJsonParagraphToCsv(element))
                elif sectionType == "code":
                    csvRows.extend(self._renderJsonCodeToCsv(element))
                else:
                    # Fallback to paragraph for unknown types
                    csvRows.extend(self._renderJsonParagraphToCsv(element))

            return csvRows

        except Exception as e:
            self.logger.warning(f"Error rendering section {section.get('id', 'unknown')}: {str(e)}")
            return [["[Error rendering section]"]]

    def _renderJsonTableToCsv(self, tableData: Dict[str, Any]) -> List[List[str]]:
        """Render a JSON table to CSV rows."""
        try:
            # Extract from nested content structure
            content = tableData.get("content", {})
            if not isinstance(content, dict):
                return []
            headers = content.get("headers", [])
            rows = content.get("rows", [])

            csvRows = []

            if headers:
                csvRows.append(headers)

            if rows:
                csvRows.extend(rows)

            return csvRows

        except Exception as e:
            self.logger.warning(f"Error rendering table: {str(e)}")
            return [["[Error rendering table]"]]

    def _renderJsonListToCsv(self, listData: Dict[str, Any]) -> List[List[str]]:
        """Render a JSON list to CSV rows."""
        try:
            # Extract from nested content structure
            content = listData.get("content", {})
            if not isinstance(content, dict):
                return []
            items = content.get("items", [])
            csvRows = []

            for item in items:
                if isinstance(item, dict):
                    text = item.get("text", "")
                    subitems = item.get("subitems", [])
                    csvRows.append([text])

                    # Add subitems as indented rows
                    for subitem in subitems:
                        if isinstance(subitem, dict):
                            csvRows.append([f" - {subitem.get('text', '')}"])
                        else:
                            csvRows.append([f" - {subitem}"])
                else:
                    csvRows.append([str(item)])

            return csvRows

        except Exception as e:
            self.logger.warning(f"Error rendering list: {str(e)}")
            return [["[Error rendering list]"]]

    def _renderJsonHeadingToCsv(self, headingData: Dict[str, Any]) -> List[List[str]]:
        """Render a JSON heading to CSV rows."""
        try:
            # Extract from nested content structure
            content = headingData.get("content", {})
            if not isinstance(content, dict):
                return []
            text = content.get("text", "")
            level = content.get("level", 1)

            if text:
                # Use # symbols for heading levels
                headingText = f"{'#' * level} {text}"
                return [[headingText]]

            return []

        except Exception as e:
            self.logger.warning(f"Error rendering heading: {str(e)}")
            return [["[Error rendering heading]"]]

    def _renderJsonParagraphToCsv(self, paragraphData: Dict[str, Any]) -> List[List[str]]:
        """Render a JSON paragraph to CSV rows."""
        try:
            # Extract from nested content structure
            content = paragraphData.get("content", {})
            if isinstance(content, dict):
                text = content.get("text", "")
            elif isinstance(content, str):
                text = content
            else:
                text = ""

            if text:
                # Split long paragraphs into multiple rows if needed
                if len(text) > 100:
                    words = text.split()
                    rows = []
                    currentRow = []
                    currentLength = 0

                    for word in words:
                        if currentLength + len(word) > 100 and currentRow:
                            rows.append([" ".join(currentRow)])
                            currentRow = [word]
                            currentLength = len(word)
                        else:
                            currentRow.append(word)
                            currentLength += len(word) + 1

                    if currentRow:
                        rows.append([" ".join(currentRow)])

                    return rows
                else:
                    return [[text]]

            return []

        except Exception as e:
            self.logger.warning(f"Error rendering paragraph: {str(e)}")
            return [["[Error rendering paragraph]"]]

    def _renderJsonCodeToCsv(self, codeData: Dict[str, Any]) -> List[List[str]]:
        """Render a JSON code block to CSV rows."""
        try:
            # Extract from nested content structure
            content = codeData.get("content", {})
            if not isinstance(content, dict):
                return []
            code = content.get("code", "")
            language = content.get("language", "")

            csvRows = []

            if language:
                csvRows.append([f"Code ({language}):"])

            if code:
                # Split code into lines
                codeLines = code.split('\n')
                for line in codeLines:
                    csvRows.append([f" {line}"])

            return csvRows

        except Exception as e:
            self.logger.warning(f"Error rendering code block: {str(e)}")
            return [["[Error rendering code block]"]]

    def _convertRowsToCsv(self, rows: List[List[str]]) -> str:
        """Convert rows to CSV string."""
        import csv
        import io

        output = io.StringIO()
        writer = csv.writer(output)

        for row in rows:
            if row:  # Only write non-empty rows
                writer.writerow(row)

        return output.getvalue()

    def _cleanCsvContent(self, content: str, title: str) -> str:
        """Clean and validate CSV content from AI."""
        content = content.strip()

        # Remove markdown code blocks if present
        if content.startswith("```") and content.endswith("```"):
            lines = content.split('\n')
            if len(lines) > 2:
                content = '\n'.join(lines[1:-1]).strip()

        return content
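The document renderer above consumes the standardized {metadata, documents: [{sections: [...]}]} schema and emits one CSV per table section. A minimal input sketch follows; all field values are illustrative, and _validateJsonStructure(), _extractSections() and _determineFilename() are inherited from the base renderer and not shown in this diff.

# Illustrative extractedContent for RendererCsv.render(): one document with a single
# table section, matching how the render() method above reads the schema.
sampleContent = {
    "metadata": {"documentType": "report", "title": "Quarterly Numbers"},
    "documents": [{
        "title": "Quarterly Numbers",
        "filename": "quarterly_numbers.csv",
        "sections": [{
            "id": "sec_1",
            "title": "Revenue by Region",
            "content_type": "table",
            "elements": [{
                "content": {
                    "headers": ["Region", "Q1", "Q2"],
                    "rows": [["EMEA", "120", "135"], ["APAC", "98", "110"]]
                }
            }]
        }]
    }]
}
# render(sampleContent, title="Quarterly Numbers") would then produce a single
# quarterly_numbers.csv whose rows are the section title, a blank line, and the table.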
File diff suppressed because it is too large
Some files were not shown because too many files have changed in this diff.